In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from imblearn.combine import SMOTEENN

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
df = train.drop('ID', axis=1)
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,Age,Balance,Vintage,Transaction_Status,Credit_Card,Is_Churn,Gender_Male,Income_5L - 10L,Income_Less than 5L,Income_More than 15L,Product_Holdings_2,Product_Holdings_3+,Credit_Category_Good,Credit_Category_Poor
0,36,563266.44,4,0,0,1,0,1,0,0,0,0,0,0
1,53,875572.11,2,1,1,0,0,0,1,0,0,0,0,1
2,35,701607.06,2,1,0,0,0,0,0,1,1,0,0,1
3,43,1393922.16,0,1,1,1,0,0,0,1,1,0,0,1
4,39,893146.23,1,1,1,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6645,61,1354526.19,4,1,1,1,1,1,0,0,1,0,0,1
6646,42,596286.54,3,1,1,0,0,1,0,0,1,0,0,0
6647,29,979298.19,2,0,0,0,1,0,1,0,1,0,0,1
6648,66,395233.65,3,0,1,1,0,0,0,0,1,0,0,1


In [4]:
X = df.drop('Is_Churn', axis=1)
Y = df['Is_Churn']

# Feature Scaling
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=604)

In [5]:
# LGBM

model_lgbm=lgb.LGBMClassifier()
model_lgbm.fit(x_train,y_train)
y_pred=model_lgbm.predict(x_test)
score = model_lgbm.score(x_test,y_test)
print("DT test data score", score)

DT test data score 0.7699248120300752


In [6]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.78      0.98      0.87      1028
           1       0.46      0.07      0.12       302

    accuracy                           0.77      1330
   macro avg       0.62      0.52      0.49      1330
weighted avg       0.71      0.77      0.70      1330



In [7]:
# Using SMOTEENN (UpSampling+ENN) to handle class imbalance
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X,Y)
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)
model_lgbm_smote=lgb.LGBMClassifier()
model_lgbm_smote.fit(xr_train,yr_train)
yr_predict = model_lgbm_smote.predict(xr_test)
model_score_smote = model_lgbm_smote.score(xr_test, yr_test)
print("Balanced Class test data score is:", model_score_smote)
f_one_score = (f1_score(yr_test, yr_predict, average='macro'))*100
print("F1 Score:", f_one_score)
print(metrics.classification_report(yr_test, yr_predict))

Balanced Class test data score is: 0.8726554787759131
F1 Score: 86.93973129289434
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       426
           1       0.89      0.89      0.89       587

    accuracy                           0.87      1013
   macro avg       0.87      0.87      0.87      1013
weighted avg       0.87      0.87      0.87      1013



In [8]:
# Data Preprocessing and Feature Engineering on Testing data

X = test.drop("ID", axis=1)
X = pd.get_dummies(X, drop_first=True)

# Feature Scaling

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

Predicted_Churn = model_lgbm_smote.predict(X)

# Checking the predicted churn details and storing in dataframe format
predicted_output = pd.DataFrame()
predicted_output['ID'] = test["ID"]
predicted_output['Is_Churn'] = Predicted_Churn
predicted_output

Unnamed: 0,ID,Is_Churn
0,55480787,1
1,9aededf2,1
2,a5034a09,1
3,b3256702,1
4,dc28adb5,1
...,...,...
2846,19e40adf,1
2847,52d5bc8d,1
2848,f708121b,1
2849,f008715d,1


In [9]:
predicted_output.to_csv("sample_submission_solution.csv", index=False)