In [1]:
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
import joblib

import missingno
import pandas_profiling
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train

Unnamed: 0,ID,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category,Is_Churn
0,84e2fcc9,36,Female,5L - 10L,563266.44,4,0,1,0,Average,1
1,57fea15e,53,Female,Less than 5L,875572.11,2,1,1,1,Poor,0
2,8df34ef3,35,Female,More than 15L,701607.06,2,1,2,0,Poor,0
3,c5c0788b,43,Female,More than 15L,1393922.16,0,1,2,1,Poor,1
4,951d69c4,39,Female,More than 15L,893146.23,1,1,1,1,Good,1
...,...,...,...,...,...,...,...,...,...,...,...
6645,a8216afd,61,Male,5L - 10L,1354526.19,4,1,2,1,Poor,1
6646,153a306b,42,Female,5L - 10L,596286.54,3,1,2,1,Average,0
6647,d4075a95,29,Male,Less than 5L,979298.19,2,0,2,0,Poor,0
6648,5855b142,66,Female,10L - 15L,395233.65,3,0,2,1,Poor,1


In [4]:
df_test

Unnamed: 0,ID,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category
0,55480787,50,Female,More than 15L,1008636.39,2,1,2,1,Average
1,9aededf2,36,Male,5L - 10L,341460.72,2,0,2,1,Average
2,a5034a09,25,Female,10L - 15L,439460.10,0,0,2,1,Good
3,b3256702,41,Male,Less than 5L,28581.93,0,1,2,1,Poor
4,dc28adb5,48,Male,More than 15L,1104540.03,2,1,3+,0,Good
...,...,...,...,...,...,...,...,...,...,...
2846,19e40adf,40,Female,10L - 15L,1338458.22,0,0,1,1,Poor
2847,52d5bc8d,48,Female,More than 15L,1448280.27,0,1,2,1,Average
2848,f708121b,59,Male,More than 15L,1100555.64,3,0,1,1,Poor
2849,f008715d,34,Female,5L - 10L,1502818.92,2,0,1,1,Poor


In [5]:
df = df_train.drop("ID", axis=1)
df

Unnamed: 0,Age,Gender,Income,Balance,Vintage,Transaction_Status,Product_Holdings,Credit_Card,Credit_Category,Is_Churn
0,36,Female,5L - 10L,563266.44,4,0,1,0,Average,1
1,53,Female,Less than 5L,875572.11,2,1,1,1,Poor,0
2,35,Female,More than 15L,701607.06,2,1,2,0,Poor,0
3,43,Female,More than 15L,1393922.16,0,1,2,1,Poor,1
4,39,Female,More than 15L,893146.23,1,1,1,1,Good,1
...,...,...,...,...,...,...,...,...,...,...
6645,61,Male,5L - 10L,1354526.19,4,1,2,1,Poor,1
6646,42,Female,5L - 10L,596286.54,3,1,2,1,Average,0
6647,29,Male,Less than 5L,979298.19,2,0,2,0,Poor,0
6648,66,Female,10L - 15L,395233.65,3,0,2,1,Poor,1


In [6]:
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,Age,Balance,Vintage,Transaction_Status,Credit_Card,Is_Churn,Gender_Male,Income_5L - 10L,Income_Less than 5L,Income_More than 15L,Product_Holdings_2,Product_Holdings_3+,Credit_Category_Good,Credit_Category_Poor
0,36,563266.44,4,0,0,1,0,1,0,0,0,0,0,0
1,53,875572.11,2,1,1,0,0,0,1,0,0,0,0,1
2,35,701607.06,2,1,0,0,0,0,0,1,1,0,0,1
3,43,1393922.16,0,1,1,1,0,0,0,1,1,0,0,1
4,39,893146.23,1,1,1,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6645,61,1354526.19,4,1,1,1,1,1,0,0,1,0,0,1
6646,42,596286.54,3,1,1,0,0,1,0,0,1,0,0,0
6647,29,979298.19,2,0,0,0,1,0,1,0,1,0,0,1
6648,66,395233.65,3,0,1,1,0,0,0,0,1,0,0,1


In [7]:
X = df.drop('Is_Churn', axis=1)
Y = df['Is_Churn']

# adding samples to make all the categorical label values same
oversample = SMOTE()
X, Y = oversample.fit_resample(X, Y)

Y.value_counts()

0    5113
1    5113
Name: Is_Churn, dtype: int64

In [8]:
# Feature Scaling on training data

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,Age,Balance,Vintage,Transaction_Status,Credit_Card,Gender_Male,Income_5L - 10L,Income_Less than 5L,Income_More than 15L,Product_Holdings_2,Product_Holdings_3+,Credit_Category_Good,Credit_Category_Poor
0,-0.670859,-0.503355,1.354384,-0.817595,-1.181512,-0.878964,1.969471,-0.462974,-0.415580,-0.787783,-0.165617,-0.444797,-0.799403
1,1.156722,0.094894,-0.089972,1.223100,0.846373,-0.878964,-0.507751,2.159948,-0.415580,-0.787783,-0.165617,-0.444797,1.250934
2,-0.778364,-0.238352,-0.089972,1.223100,-1.181512,-0.878964,-0.507751,-0.462974,2.406278,1.269385,-0.165617,-0.444797,1.250934
3,0.081675,1.087839,-1.534328,1.223100,0.846373,-0.878964,-0.507751,-0.462974,2.406278,1.269385,-0.165617,-0.444797,1.250934
4,-0.348344,0.128559,-0.812150,1.223100,0.846373,-0.878964,-0.507751,-0.462974,2.406278,-0.787783,-0.165617,2.248215,-0.799403
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10221,0.834208,0.073113,1.354384,-0.817595,-1.181512,-0.878964,-0.507751,-0.462974,-0.415580,-0.787783,-0.165617,-0.444797,-0.799403
10222,-0.348344,0.179718,-0.089972,-0.817595,0.846373,1.137703,-0.507751,-0.462974,-0.415580,-0.787783,-0.165617,-0.444797,-0.799403
10223,1.264227,-0.237914,0.632206,1.223100,0.846373,-0.878964,-0.507751,-0.462974,-0.415580,-0.787783,-0.165617,-0.444797,1.250934
10224,1.049218,0.894526,-1.534328,-0.817595,-1.181512,-0.878964,-0.507751,-0.462974,-0.415580,-0.787783,-0.165617,-0.444797,-0.799403


In [9]:
# Classification Model Function

def classify(model, X, Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=604)
    
    # Training the model
    model.fit(X_train, Y_train)
    
    # Predicting Y_test
    pred = model.predict(X_test)
    
    # Classification Report
    class_report = classification_report(Y_test, pred)
    print("\nClassification Report:\n", class_report)
    
    # Accuracy Score
    acc_score = (accuracy_score(Y_test, pred))*100
    print("Accuracy Score:", acc_score)
    
    # F1 Score
    f_one_score = (f1_score(Y_test, pred, average='macro'))*100
    print("F1 Score:", f_one_score)
    
    # Cross Validation Score
    cv_score = (cross_val_score(model, X, Y, cv=5).mean())*100
    print("Cross Validation Score:", cv_score)
    
    # Result of accuracy minus cv scores
    result = acc_score - cv_score
    print("\nAccuracy Score - Cross Validation Score is", result)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=604)

# Choosing LGBM Classifier best parameters
fmod_param = {'learning_rate' : [0.1, 0.01, 0.001],
              'n_jobs' : [-2, -1, 1],
              'random_state' : [10, 42, 1000],
              'max_depth' : [0, 15, 30],
              'n_estimators' : [100, 200, 300]
             }

GSCV = GridSearchCV(lgb.LGBMClassifier(), fmod_param, cv=5)
GSCV.fit(X_train,Y_train)
GSCV.best_params_

{'learning_rate': 0.1,
 'max_depth': 15,
 'n_estimators': 300,
 'n_jobs': -2,
 'random_state': 10}

In [12]:
# LGBM Classifier

model=lgb.LGBMClassifier(learning_rate=0.1, max_depth=15, n_estimators=300, n_jobs=-2, random_state=10)
classify(model, X, Y)


Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.81      0.78      1230
           1       0.81      0.75      0.78      1327

    accuracy                           0.78      2557
   macro avg       0.78      0.78      0.78      2557
weighted avg       0.78      0.78      0.78      2557

Accuracy Score: 78.06022682831443
F1 Score: 78.06021340589102
Cross Validation Score: 77.16802061150985

Accuracy Score - Cross Validation Score is 0.8922062168045812


In [13]:
# Data Preprocessing and Feature Engineering on Testing data

X = df_test.drop("ID", axis=1)
X = pd.get_dummies(X, drop_first=True)

# Feature Scaling

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

Predicted_Churn = model.predict(X)

# Checking the predicted churn details and storing in dataframe format
predicted_output = pd.DataFrame()
predicted_output['ID'] = df_test["ID"]
predicted_output['Is_Churn'] = Predicted_Churn
predicted_output

Unnamed: 0,ID,Is_Churn
0,55480787,0
1,9aededf2,0
2,a5034a09,0
3,b3256702,0
4,dc28adb5,0
...,...,...
2846,19e40adf,1
2847,52d5bc8d,0
2848,f708121b,1
2849,f008715d,0


In [14]:
predicted_output.to_csv("sample_submission8.csv", index=False)