# Kaggle Datasets - Predict credit card customer churn. 

### https://www.kaggle.com/sakshigoyal7/credit-card-customers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load the data, split training and test sets. 

In [None]:
# Read the Kaggle export and keep every column except the last two.
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv').iloc[:, :-2]

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

### Encode target variable. 

In [None]:
# Binary target: 1 where the flag is 'Attrited Customer', 0 otherwise.
df['target'] = df['Attrition_Flag'].eq('Attrited Customer').astype(int)

### The classes of the target variable are imbalanced, so we will use stratified sampling.

In [None]:
# Number of rows per target value (1 = attrited, 0 = otherwise).
df['target'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the raw label and its binary encoding.
X = df.drop(['Attrition_Flag', 'target'], axis=1)
y = df['target']

# Stratify on y so both sets keep the class proportions. The split is seeded
# (like the random forest trained later) so that every run of the notebook
# evaluates on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42)

### Make sure we have the same proportion of positives and negatives in training and testing. 

In [None]:
# Share of each class among the training labels.
y_train.value_counts() / len(y_train)

In [None]:
# Share of each class among the test labels.
y_test.value_counts() / len(y_test)

### Ok, the test set and the training set have the same proportion of positive examples. 

In [None]:
def get_positives_prop(series, positive=1):
    """Return the fraction of entries in ``series`` equal to ``positive``.

    The target is stored as 0/1 integers, so the positive class defaults
    to 1. Comparing an encoded series against the raw 'Attrited Customer'
    label never matches and always yields 0, which makes any check built
    on this function pass vacuously.

    Args:
        series: pandas Series (or NumPy array) of class labels.
        positive: label counted as the positive class. Pass
            'Attrited Customer' when ``series`` holds the raw flag.

    Returns:
        The proportion as a float; NaN when ``series`` is empty, so that a
        comparison against a tolerance fails instead of passing silently.
    """
    return (series == positive).mean()

# The positive rates of the training and test labels must agree to within 0.001.
diff = get_positives_prop(y_train) - get_positives_prop(y_test)
assert abs(diff) < 0.001

# 2. Exploratory Data Analysis with Pandas Profiling. 

In [None]:
# Put the training features and the training target side by side, so the
# exploration below only ever looks at training rows.
df_eda = pd.concat([X_train, y_train], axis='columns')
df_eda.shape

In [None]:
# NOTE(review): profile_report() is presumably attached to DataFrame by
# importing pandas_profiling, so keep this import even though the module name
# is never referenced directly - confirm against the installed version.
import pandas_profiling
# NOTE(review): os.path is not used in any of the visible cells.
import os.path

# Build the profiling report for the training data (features + target).
profile = df_eda.profile_report()

In [None]:
# Display the report built in the previous cell.
profile

# 3. Data processing. 

In [None]:
# Ordered categories of each ordinal feature, lowest rank first.
# The position in the list (plus one) becomes the encoded value.
ordinals = dict(
    Education_Level=[
        'Uneducated',
        'High School',
        'College',
        'Graduate',
        'Post-Graduate',
        'Doctorate',
    ],
    Income_Category=[
        'Less than $40K',
        '$40K - $60K',
        '$60K - $80K',
        '$80K - $120K',
        '$120K +',
    ],
    Card_Category=[
        'Blue',
        'Silver',
        'Gold',
        'Platinum',
    ],
)

def encode_ordinals(df, ordinals):
    """Return a copy of ``df`` with ordinal columns replaced by 1-based ranks.

    Args:
        df: DataFrame containing every column named in ``ordinals``.
        ordinals: mapping of column name to its categories, ordered from
            lowest to highest. The first category is encoded as 1.

    Returns:
        A new DataFrame; ``df`` itself is left untouched. Missing entries
        stay missing (NaN) so they can be imputed afterwards.

    Raises:
        ValueError: if a column holds a non-missing value that is not
            listed in its category order.
    """
    sel = df.copy()
    for var, values in ordinals.items():
        ranks = {value: rank for rank, value in enumerate(values, start=1)}
        col = sel[var]
        # dropna() recognises every missing marker (np.nan, None, any other
        # NaN object), whereas an identity test against np.nan only catches
        # that one singleton.
        unexpected = set(col.dropna().unique()) - set(ranks)
        if unexpected:
            raise ValueError(
                f"Unexpected values in {var!r}: {sorted(map(str, unexpected))}"
            )
        # Values absent from the dict (only missing ones at this point) map to NaN.
        sel[var] = col.map(ranks)
    return sel

In [None]:
# List the feature columns so they can be sorted into groups below.
X_train.columns

In [None]:
# Group the features by the preprocessing they need.

# Nominal categories: one-hot encoded.
cat_vars = [
    'Gender',
    'Marital_Status',
]

# Ordered categories: encoded as ranks.
ord_vars = [
    'Education_Level',
    'Income_Category',
    'Card_Category',
]

# Numeric features: used as they are after imputation.
quant_vars = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

In [None]:
# Columns of X_train that belong to none of the three variable groups.
known_vars = set(cat_vars) | set(ord_vars) | set(quant_vars)
[col for col in X_train.columns if col not in known_vars]

In [None]:
def train_pipeline(X):
    """Fit the preprocessing steps on ``X`` and return the prepared matrix.

    Steps: 'Unknown' entries become NaN, ordinal columns are rank-encoded,
    each variable group is imputed (most frequent value for categoricals,
    median for ordinals and numerics), and categoricals are one-hot encoded.

    Args:
        X: DataFrame with the columns named in cat_vars, ord_vars and
            quant_vars.

    Returns:
        A tuple ``(X_prep, params)``. ``X_prep`` is the array of one-hot
        columns followed by the ordinal and then the numeric columns.
        ``params`` is the list
        ``[cat_imputer, ord_imputer, quant_imputer, one_hot_encoder]`` of
        fitted objects, in the order test_pipeline unpacks them.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder

    # Mark 'Unknown' as missing, then turn ordinal labels into ranks.
    cleaned = encode_ordinals(X.replace('Unknown', np.nan), ordinals)

    cat_imputer = SimpleImputer(strategy="most_frequent")
    ord_imputer = SimpleImputer(strategy="median")
    quant_imputer = SimpleImputer(strategy="median")
    one_hot_encoder = OneHotEncoder()

    # Fill missing values separately for each variable group.
    cat_filled = cat_imputer.fit_transform(cleaned[cat_vars])
    ord_filled = ord_imputer.fit_transform(cleaned[ord_vars])
    quant_filled = quant_imputer.fit_transform(cleaned[quant_vars])

    # The encoder is fitted on the imputed categoricals.
    cat_encoded = one_hot_encoder.fit_transform(cat_filled).toarray()

    fitted = [cat_imputer, ord_imputer, quant_imputer, one_hot_encoder]
    prepared = np.concatenate([cat_encoded, ord_filled, quant_filled], axis=1)
    return prepared, fitted

In [None]:
def test_pipeline(X, params):
    """Prepare ``X`` with preprocessing steps that are already fitted.

    Nothing is fitted here: every step only calls ``transform``.

    Args:
        X: DataFrame with the columns named in cat_vars, ord_vars and
            quant_vars.
        params: the fitted objects
            ``[cat_imputer, ord_imputer, quant_imputer, one_hot_encoder]``,
            in that order.

    Returns:
        The array of one-hot columns followed by the ordinal and then the
        numeric columns.
    """
    cat_imputer, ord_imputer, quant_imputer, one_hot_encoder = params

    # Mark 'Unknown' as missing, then turn ordinal labels into ranks.
    cleaned = encode_ordinals(X.replace('Unknown', np.nan), ordinals)

    # Fill missing values with the statistics stored in the imputers.
    cat_filled = cat_imputer.transform(cleaned[cat_vars])
    ord_filled = ord_imputer.transform(cleaned[ord_vars])
    quant_filled = quant_imputer.transform(cleaned[quant_vars])

    cat_encoded = one_hot_encoder.transform(cat_filled).toarray()
    return np.concatenate([cat_encoded, ord_filled, quant_filled], axis=1)

# 4. Build a model.

In [None]:
# Fit the preprocessing on the training set only, then apply the fitted
# objects (params) to the test set.
X_train_prep, params = train_pipeline(X_train)
X_test_prep = test_pipeline(X_test, params)

In [None]:
# Shapes of the prepared training and test matrices.
X_train_prep.shape, X_test_prep.shape

In [None]:
from sklearn.metrics import accuracy_score, f1_score, make_scorer

# Wrap f1_score so it can be passed as the `scoring` argument below.
scorer = make_scorer(f1_score)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Named param_grid rather than params so that the fitted preprocessing
# objects returned by train_pipeline (which test_pipeline needs in order to
# prepare any further data) are not overwritten.
param_grid = {
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 12],
    'n_estimators': [120, 130, 140, 150, 160, 170],
}

rfc = RandomForestClassifier(random_state=42)

# Every combination in the grid is scored with the F1 scorer.
rfc_grid = GridSearchCV(rfc,
                        param_grid,
                        scoring=scorer)

rfc_grid.fit(X_train_prep, y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
rfc_grid.best_params_

In [None]:
# GridSearchCV refits the winning combination on the whole training set by
# default (refit=True), using a clone of the seeded estimator it was given.
# Reuse that fitted model instead of training an identical forest again.
rfc = rfc_grid.best_estimator_

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated F1 of the selected model on the training data.
scores = cross_val_score(
    rfc,
    X_train_prep,
    y_train,
    scoring=scorer,
    cv=5,
    verbose=2,
)
print(scores)
print(scores.mean(), scores.std())

## Final accuracy and F1 score on the test set.

In [None]:
from sklearn.metrics import accuracy_score

# Predict on the held-out test set.
y_test_pred = rfc.predict(X_test_prep)
# sklearn metrics take (y_true, y_pred), in that order.
accuracy_score(y_test, y_test_pred)

In [None]:
# Predict on the held-out test set.
y_test_pred = rfc.predict(X_test_prep)
# sklearn metrics take (y_true, y_pred), in that order.
f1_score(y_test, y_test_pred)

# Feature importances. 

In [None]:
# Raw importances of the fitted forest, before grouping one-hot columns.
rfc.feature_importances_

In [None]:
#let's get the index of every feature in our decision trees
#our categorical features are one hot encoded, so our model sees them as multiple features

# The prepared matrix starts with the one-hot columns, in cat_vars order, and
# continues with one column per ordinal and numeric feature. The number of
# one-hot columns per variable is derived from the training data instead of
# being hard-coded. 'Unknown' does not count as a category, because the
# pipeline replaces it with NaN and imputes the most frequent value.
features = {}
start = 0

for feat in cat_vars:
    n_categories = X_train[feat].replace('Unknown', np.nan).nunique()
    features[feat] = list(range(start, start + n_categories))
    start += n_categories

for feat in ord_vars + quant_vars:
    features[feat] = [start]
    start += 1

# Guard against the index map drifting out of sync with the model input.
assert start == len(rfc.feature_importances_)

print(features)

In [None]:
# Total importance per original feature: the importances of all the columns
# a feature occupies are added up (several columns for one-hot features).
feat_importances = sorted(
    (
        (name, sum(rfc.feature_importances_[i] for i in column_ids))
        for name, column_ids in features.items()
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever one is installed.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Unpack into `labels` so that the `features` index map built earlier is not
# overwritten and the previous cells can be re-run.
labels, importances = zip(*feat_importances)

y_pos = np.arange(len(labels))

fig, ax = plt.subplots(figsize=(8, 8))

ax.barh(y_pos, importances, align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Importance')
ax.set_title('Feature Importances')

plt.show()