In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ShuffleSplit, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, classification_report, plot_confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('customer_segmentation.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,Customer Id,Age,Edu,Years Employed,Income,Card Debt,Other Debt,Defaulted,DebtIncomeRatio
0,0,1,41,2,6,19,0.124,1.073,0.0,6.3
1,1,2,47,1,26,100,4.582,8.218,0.0,12.8
2,2,3,33,2,10,57,6.111,5.802,1.0,20.9
3,3,4,29,2,4,19,0.681,0.516,0.0,6.3
4,4,5,47,1,31,253,9.308,8.908,0.0,7.2
...,...,...,...,...,...,...,...,...,...,...
845,845,846,27,1,5,26,0.548,1.220,,6.8
846,846,847,28,2,7,34,0.359,2.021,0.0,7.0
847,847,848,25,4,0,18,2.802,3.210,1.0,33.4
848,848,849,32,1,12,28,0.116,0.696,0.0,2.9


In [4]:
df.isna().sum()

Unnamed: 0           0
Customer Id          0
Age                  0
Edu                  0
Years Employed       0
Income               0
Card Debt            0
Other Debt           0
Defaulted          150
DebtIncomeRatio      0
dtype: int64

In [5]:
df = df.dropna()

In [6]:
df

Unnamed: 0.1,Unnamed: 0,Customer Id,Age,Edu,Years Employed,Income,Card Debt,Other Debt,Defaulted,DebtIncomeRatio
0,0,1,41,2,6,19,0.124,1.073,0.0,6.3
1,1,2,47,1,26,100,4.582,8.218,0.0,12.8
2,2,3,33,2,10,57,6.111,5.802,1.0,20.9
3,3,4,29,2,4,19,0.681,0.516,0.0,6.3
4,4,5,47,1,31,253,9.308,8.908,0.0,7.2
...,...,...,...,...,...,...,...,...,...,...
844,844,845,41,1,7,43,0.694,1.198,0.0,4.4
846,846,847,28,2,7,34,0.359,2.021,0.0,7.0
847,847,848,25,4,0,18,2.802,3.210,1.0,33.4
848,848,849,32,1,12,28,0.116,0.696,0.0,2.9


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 849
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       700 non-null    int64  
 1   Customer Id      700 non-null    int64  
 2   Age              700 non-null    int64  
 3   Edu              700 non-null    int64  
 4   Years Employed   700 non-null    int64  
 5   Income           700 non-null    int64  
 6   Card Debt        700 non-null    float64
 7   Other Debt       700 non-null    float64
 8   Defaulted        700 non-null    float64
 9   DebtIncomeRatio  700 non-null    float64
dtypes: float64(4), int64(6)
memory usage: 60.2 KB


In [8]:
df = df.drop(columns=['Unnamed: 0','Customer Id','Income','Card Debt','Other Debt'])

In [9]:
df

Unnamed: 0,Age,Edu,Years Employed,Defaulted,DebtIncomeRatio
0,41,2,6,0.0,6.3
1,47,1,26,0.0,12.8
2,33,2,10,1.0,20.9
3,29,2,4,0.0,6.3
4,47,1,31,0.0,7.2
...,...,...,...,...,...
844,41,1,7,0.0,4.4
846,28,2,7,0.0,7.0
847,25,4,0,1.0,33.4
848,32,1,12,0.0,2.9


In [10]:
df['Defaulted'].value_counts() / len(df) * 100

0.0    73.857143
1.0    26.142857
Name: Defaulted, dtype: float64

In [11]:
X = df.drop(columns=['Defaulted'])
y = df['Defaulted']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 40)

In [13]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [14]:
models = [logreg,knn,dt,rf,xgb]
score=[]
rata=[]
std=[]

for i in models:
    skfold=StratifiedKFold(n_splits=5)
    estimator=Pipeline([('model',i)])
    model_cv=cross_val_score(estimator,X_train,y_train,cv=skfold,scoring='accuracy')
    score.append(model_cv)
    rata.append(model_cv.mean())
    std.append(model_cv.std())
    
pd.DataFrame({'model':['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost'],'mean accuracy':rata, 'sdev':std}).set_index('model')



Unnamed: 0_level_0,mean accuracy,sdev
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.782143,0.03977
KNN,0.766071,0.042783
Decision Tree,0.721429,0.03312
Random Forest,0.769643,0.030093
XGBoost,0.748214,0.038878


In [15]:
models = [logreg,knn,dt,rf,xgb]
score_f1 = []

def y_pred_func(i):
    estimator=Pipeline([('model',i)])
    X_train,X_test
    
    estimator.fit(X_train,y_train)
    return(estimator,estimator.predict(X_test),X_test)

for i,j in zip(models, ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost']):
    estimator,y_pred,X_test = y_pred_func(i)
    y_pred = estimator.predict(X_test)
    print(j,'\n', classification_report(y_test,y_pred))

Logistic Regression 
               precision    recall  f1-score   support

         0.0       0.81      0.88      0.85       103
         1.0       0.57      0.43      0.49        37

    accuracy                           0.76       140
   macro avg       0.69      0.66      0.67       140
weighted avg       0.75      0.76      0.75       140

KNN 
               precision    recall  f1-score   support

         0.0       0.82      0.84      0.83       103
         1.0       0.53      0.49      0.51        37

    accuracy                           0.75       140
   macro avg       0.68      0.67      0.67       140
weighted avg       0.74      0.75      0.75       140

Decision Tree 
               precision    recall  f1-score   support

         0.0       0.78      0.69      0.73       103
         1.0       0.35      0.46      0.40        37

    accuracy                           0.63       140
   macro avg       0.56      0.57      0.56       140
weighted avg       0.67      0

In [16]:
### Random Forest

In [17]:
### Hyper Parameter Tuning

In [18]:
pipe_RF = Pipeline([
    ('algo', RandomForestClassifier())
])

In [19]:
param_RF = {
    "algo__n_estimators" : np.arange(1, 80,1),
}

In [20]:
skf = StratifiedKFold(n_splits=350)

In [21]:
GS_RF = GridSearchCV(pipe_RF, param_RF, cv = skf, scoring='accuracy', verbose = 1, n_jobs=-1)

In [22]:
RS_RF = RandomizedSearchCV(pipe_RF, param_RF,cv = skf, scoring='accuracy', verbose = 1, n_jobs=-1 )

In [23]:
RS_RF.fit(X_train, y_train)

Fitting 350 folds for each of 10 candidates, totalling 3500 fits


RandomizedSearchCV(cv=StratifiedKFold(n_splits=350, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('algo',
                                              RandomForestClassifier())]),
                   n_jobs=-1,
                   param_distributions={'algo__n_estimators': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79])},
                   scoring='accuracy', verbose=1)

In [24]:
RS_RF.best_params_

{'algo__n_estimators': 46}

In [25]:
RF_Tuned = RS_RF.best_estimator_

In [26]:
print(classification_report(y_test, RF_Tuned.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.85      0.84      0.85       103
         1.0       0.58      0.59      0.59        37

    accuracy                           0.78       140
   macro avg       0.72      0.72      0.72       140
weighted avg       0.78      0.78      0.78       140

