In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# In the "Credit card attrition decision tree" notebook, we determined that low transaction counts are a red flag,
# but a logistic regression chart can show us the threshold range below which the number of transactions becomes a warning sign

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# First get the data and have a look: read the BankChurners CSV from the
# Kaggle input directory into `df` and display its first 10 rows.

df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")
df.head(10)

In [None]:
# We only want the transaction count column and the target column.
# Take an explicit copy so that df1 is an independent frame: columns added to
# df1 later are then not written into a slice of df (which would raise
# pandas' SettingWithCopyWarning).
df1 = df[["Attrition_Flag", "Total_Trans_Ct"]].copy()

df1.head(10)

In [None]:
# Convert attrition flag to numeric

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()

# LabelEncoder numbers the classes in sorted label order (inspect enc.classes_).
# assign() builds a new frame with the encoded column instead of writing into
# df1 in place, so no SettingWithCopyWarning is raised and re-running this
# cell gives the same result.
df1 = df1.assign(Attrition_Flag_Cat=enc.fit_transform(df1['Attrition_Flag']))

# Single predictor used by the model
features = ['Total_Trans_Ct']

df1.head(10)

In [None]:
# Assign x and y columns:
# X is a one-column frame holding the transaction count feature,
# y is the numerically encoded attrition flag.
X = df1[features]
y = df1['Attrition_Flag_Cat']

In [None]:
# Fit a logistic regression of the encoded attrition flag on the transaction
# count. Only the solver and the random seed are set explicitly; every other
# hyperparameter is left at its scikit-learn default.
model = LogisticRegression(solver='liblinear', random_state=0).fit(X,y)


In [None]:
# Check attributes: the class labels the fitted model learned from y
model.classes_

In [None]:


# Fitted intercept term of the logistic regression
model.intercept_

In [None]:
# Fitted coefficient for the single feature (Total_Trans_Ct)
model.coef_

In [None]:
# Per-row class probabilities; columns follow the order of model.classes_
model.predict_proba(X)

In [None]:
# Predicted class label for every row of X
model.predict(X)

In [None]:
# Mean accuracy on the same X, y the model was fitted on
# (an in-sample figure, not a held-out estimate)
model.score(X, y)

In [None]:
# Wrap the per-class probabilities in a two-column frame; the names are
# applied positionally to the columns returned by predict_proba.
predictions = pd.DataFrame(
    model.predict_proba(X),
    columns=['attrited', 'existing'],
)

predictions.head()



In [None]:
# Check the number of rows in df1 before concatenating
len(df1.index)


In [None]:

# Number of rows in predictions
len(predictions.index)

In [None]:
# Join the source columns and the predicted probabilities side by side.
# A column-wise concat pairs rows by index label, so a length mismatch would
# silently produce NaN-padded rows; fail loudly instead.
assert len(df1) == len(predictions), "df1 and predictions must have the same number of rows"

# Reset indexes to align for join. reset_index returns new frames here, so
# df1 and predictions themselves are left unmodified.
frames = [df1.reset_index(drop=True), predictions.reset_index(drop=True)]
result = pd.concat(frames, axis=1)

result.head(10)




In [None]:
# Number of rows in the concatenated frame
len(result.index)

In [None]:
# Plot, against total transaction count, the model's predicted probability of
# the 'attrited' class together with each customer's actual encoded outcome.
fig, ax = plt.subplots(figsize=(15, 10))

ax.set_title('Predicted attrition probability vs. total transaction count', fontsize=18)
ax.set_xlabel('Total Transaction Count', fontsize=16)
ax.set_ylabel('Probability', fontsize=16)

# Predicted probability based on the logistic regression. This is the
# 'attrited' column, so the label says so explicitly: the actual-outcome
# series below uses 1 for existing customers, i.e. the opposite class.
ax.scatter(result['Total_Trans_Ct'], result['attrited'], marker='o',
           label='Predicted probability of attrition', color='green')

# Actual binary result
ax.scatter(result['Total_Trans_Ct'], result['Attrition_Flag_Cat'], marker='o',
           label='binary result: 0=attrited, 1=existing customer')

ax.legend(loc='center right', fontsize=12)
ax.grid(True)

plt.show()

Note that the lower row of points (0 = attrited) cuts off at just under 100 transactions. In the detail data, you can see that no customers with more than 100 transactions have attrited.