In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled training set and the unlabelled test set, both keyed by
# the customer 'ID' column, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in (
        '../input/customerattritionprediction/train.csv',
        '../input/customerattritionprediction/test.csv',
    )
)
train.head()

In [None]:
# Replace the only N/A value in the test set's 'GrandPayment' column with the
# column's average (pre-computed constant).
# A single .loc[row, column] assignment writes into `test` itself; the chained
# form test[col][row] = value may write to a temporary and is a silent no-op
# once pandas copy-on-write is active.
test.loc['TNU13T8BB6KQZTN515SK', 'GrandPayment'] = 2290.544207

In [None]:
# Report the row count, how many rows have at least one missing value, and
# the row count after those incomplete rows are discarded.
rows_with_nan = train.isnull().any(axis=1)
print(train.shape[0])
print(int(rows_with_nan.sum()))
train = train.dropna()
print(train.shape[0])

In [None]:
# The last column of `train` is the target; every column before it is a feature.
prediction = train.columns[-1]
features = train.columns[:-1]
cont_features = ['ServiceSpan','QuarterlyPayment', 'GrandPayment']

# Rows whose target equals 'Yes' (used by the distribution plots).
pos_train = train.loc[train[prediction] == 'Yes']

# Feature matrix, and the target encoded as integers by a LabelEncoder.
X = train.drop(columns=prediction)
le = LabelEncoder()
y = le.fit_transform(train[prediction])

# Columns stored with object dtype are treated as categorical.
cat_features = list(X.columns[X.dtypes == object])

In [None]:
# One panel per continuous feature, overlaying the density of the whole
# training set with the density of the rows whose target is 'Yes'.
fig, ax = plt.subplots(len(cont_features), 1, figsize=(30, 50))

for (i,feature) in enumerate(cont_features):
    # `fill` replaces the deprecated `shade` argument. The old
    # `hue_order=prediction` was dropped: no `hue` is mapped, so it had no effect.
    sns.kdeplot(data=train, x=feature, ax=ax[i], fill=True, label='All rows')
    sns.kdeplot(data=pos_train, x=feature, ax=ax[i], fill=True, label=f'{prediction} = Yes')
    # Label the panel so the two overlaid curves can be told apart.
    ax[i].set_title(feature)
    ax[i].legend()
plt.show()

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the
# train and test values together so a category that appears in only one of
# the two frames still receives a code.
for cat_f in cat_features:
    encoder = LabelEncoder()
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    tot = pd.concat([X[cat_f], test[cat_f]])
    encoder.fit(tot)
    X[cat_f] = encoder.transform(X[cat_f])
    test[cat_f] = encoder.transform(test[cat_f])

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler


# Define resampling: first oversample to a 0.98 class ratio, then undersample
# to a 1:1 ratio. Both samplers draw rows at random, so they are seeded to
# make the resampled set (and every score computed from it) reproducible
# across kernel restarts.
over = RandomOverSampler(sampling_strategy=0.98, random_state=1)
under = RandomUnderSampler(sampling_strategy=1, random_state=1)

# Define pipeline: the samplers are applied in order, oversampling first.
pipeline = Pipeline(steps=[('o', over), ('u', under)])

X_sampled, y_sampled = pipeline.fit_resample(X, y)

In [None]:
# Show the (rows, columns) of the resampled feature matrix.
X_sampled.shape

In [None]:
# One row per categorical feature: left panel shows class counts before
# resampling, right panel shows them after resampling.
fig, ax = plt.subplots(len(cat_features), 2, figsize=(20, 60))

for i, feature in enumerate(cat_features):
    before_ax, after_ax = ax[i]

    sns.countplot(data=X, x=feature, hue=y, ax=before_ax)
    sns.countplot(data=X_sampled, x=feature, hue=y_sampled, ax=after_ax)

    # Write each bar's height just above the bar, on both panels.
    for panel in (after_ax, before_ax):
        for p in panel.patches:
            bar_height = p.get_height()
            panel.annotate('{:.1f}'.format(bar_height), (p.get_x()+0.05, bar_height+50))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold

def eval_model(model):
    """Print cross-validated classification metrics for `model`.

    Out-of-fold predictions are obtained with 10-fold cross-validation on the
    module-level `X_sampled` / `y_sampled`, then compared with `y_sampled`.

    Prints accuracy, per-class recall and precision, the confusion matrix and
    the weighted F1 score, and draws the confusion matrix as an image.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed to `cross_val_predict`.

    Returns
    -------
    None
    """
    # The unused RepeatedStratifiedKFold splitter was removed: it was never
    # passed to cross_val_predict, which needs a splitter that puts each
    # sample in exactly one test fold.
    # NOTE(review): X_sampled comes from random oversampling, so duplicated
    # rows may land in both the training and validation folds and make these
    # scores optimistic -- confirm whether resampling should happen per fold.
    y_pred = cross_val_predict(model, X_sampled, y_sampled, cv=10)
    cm = confusion_matrix(y_sampled, y_pred)
    print("Accuracy: " + str(accuracy_score(y_sampled, y_pred)))
    # average=None yields one recall/precision value per class.
    print("Recall: " + str(recall_score(y_sampled, y_pred, average=None)))
    print("Precision: " + str(precision_score(y_sampled, y_pred, average=None)))
    print(cm)
    plt.imshow(cm, cmap='binary')
    print("f1-score: " + str(f1_score(y_sampled, y_pred, average='weighted')))
    
def grid_search(model,parameters):
    """Run an exhaustive, accuracy-scored 10-fold grid search and print the winner.

    The search is fitted on the module-level `X_sampled` / `y_sampled` using
    all available cores. The best parameter combination and its mean
    cross-validated score are printed; nothing is returned.
    """
    searcher = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        verbose=5,
    ).fit(X_sampled, y_sampled)
    print(searcher.best_params_)
    print(searcher.best_score_)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Grid-search a random forest over forest size, split criterion and the
# number of features considered at each split.
clf1 = RandomForestClassifier()
parameters = [{
          'n_estimators' : [250,300],
          'criterion': ['entropy', 'gini'],
          # 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed
          # in scikit-learn 1.3 and now fails parameter validation.
          'max_features': ['sqrt', 'log2']
          }]
grid_search(clf1, parameters)

In [None]:
# Evaluate a default random forest. The estimator created here is the one
# evaluated; previously `clf` was built but `clf1` was passed instead,
# leaving `clf` unused.
# NOTE(review): the best parameters printed by the grid search are not
# applied here -- confirm whether a tuned forest was intended.
clf = RandomForestClassifier()
eval_model(clf)

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
# Evaluate imbalanced-learn's BalancedRandomForestClassifier with its default
# settings, using the cross-validated report printed by eval_model.
brf = BalancedRandomForestClassifier()
eval_model(brf)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Evaluate an ExtraTreesClassifier with its default settings; `clf2` is also
# the estimator later refitted to produce the exported predictions.
clf2 = ExtraTreesClassifier()
eval_model(clf2)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
# Stack the three tree ensembles and let a logistic regression combine their
# outputs, then evaluate the stacked model like the individual ones.
base_learners = [
    ('randomforest', clf1),
    ('extratree', clf2),
    ('bal_randomforest', brf),
]
stack2 = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
eval_model(stack2)

# Exporting the Prediction

In [None]:
# Fit the final model on the resampled training data, i.e. the same data
# every model was evaluated on; fitting on the raw X, y would export a model
# trained differently from the one whose scores were reported.
clf2.fit(X_sampled, y_sampled)
# Select the test columns in training order so the features line up with
# what the model was fitted on.
y_pred = clf2.predict(test[X.columns])

In [None]:
# Turn the integer predictions back into the original 'Yes' / 'No' labels.
# NOTE(review): assumes the target encoder assigned No -> 0 and Yes -> 1 -- confirm.
label_names = {1:'Yes', 0:'No'}
y_pred = pd.Series(y_pred, name='CustomerAttrition').map(label_names)

In [None]:
# Display the mapped predictions as a quick check before exporting them.
y_pred

In [None]:
# Build the submission table: one row per test customer ID with its predicted
# 'CustomerAttrition' label.
output = pd.DataFrame({'ID': test.index, 'CustomerAttrition': y_pred})
# Write without the DataFrame's own index so the file has only the two columns above.
output.to_csv('extra-trees-classifier.csv', index=False)