# IMPORTING LIBRARIES

In [197]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection  import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix , recall_score

In [164]:
df = pd.read_csv('customer_churn.csv')

In [165]:
df.head()

Unnamed: 0,Customer_ID,Age,Tenure,ServicePlan,PaymentMethod,MonthlyUsage,SupportCalls,Churn
0,0,56,51,Premium,BankTransfer,,0,0
1,1,69,13,Premium,Cash,82.0,0,1
2,2,46,57,Basic,BankTransfer,284.0,0,0
3,3,32,55,Standard,BankTransfer,84.0,3,1
4,4,60,28,Premium,CreditCard,94.0,1,0


In [166]:
df.info(), df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23053 entries, 0 to 23052
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Customer_ID    23053 non-null  int64 
 1   Age            23053 non-null  int64 
 2   Tenure         23053 non-null  int64 
 3   ServicePlan    23053 non-null  object
 4   PaymentMethod  23053 non-null  object
 5   MonthlyUsage   20747 non-null  object
 6   SupportCalls   23053 non-null  object
 7   Churn          23053 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 1.4+ MB


(None,
 Customer_ID         0
 Age                 0
 Tenure              0
 ServicePlan         0
 PaymentMethod       0
 MonthlyUsage     2306
 SupportCalls        0
 Churn               0
 dtype: int64)

In [167]:
df['MonthlyUsage'] = pd.to_numeric(df['MonthlyUsage'] , errors='coerce')
df['SupportCalls'] = pd.to_numeric(df['SupportCalls'] , errors='coerce')


In [168]:
df.head()

Unnamed: 0,Customer_ID,Age,Tenure,ServicePlan,PaymentMethod,MonthlyUsage,SupportCalls,Churn
0,0,56,51,Premium,BankTransfer,,0.0,0
1,1,69,13,Premium,Cash,82.0,0.0,1
2,2,46,57,Basic,BankTransfer,284.0,0.0,0
3,3,32,55,Standard,BankTransfer,84.0,3.0,1
4,4,60,28,Premium,CreditCard,94.0,1.0,0


In [169]:
df.duplicated().any().sum()

np.int64(0)

# MISSING VALUE IMPUTATION

In [170]:
imputer = KNNImputer(n_neighbors= 5)
 
cols_to_impute = ['Age', 'Tenure', 'MonthlyUsage', 'SupportCalls']

df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])

In [171]:
df.isna().sum()

Customer_ID      0
Age              0
Tenure           0
ServicePlan      0
PaymentMethod    0
MonthlyUsage     0
SupportCalls     0
Churn            0
dtype: int64

In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23053 entries, 0 to 23052
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer_ID    23053 non-null  int64  
 1   Age            23053 non-null  float64
 2   Tenure         23053 non-null  float64
 3   ServicePlan    23053 non-null  object 
 4   PaymentMethod  23053 non-null  object 
 5   MonthlyUsage   23053 non-null  float64
 6   SupportCalls   23053 non-null  float64
 7   Churn          23053 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 1.4+ MB


# ONE HOT ENCODING

In [173]:
df['PaymentMethod'].unique() , df['ServicePlan'].unique()

(array(['BankTransfer', 'Cash', 'CreditCard'], dtype=object),
 array(['Premium', 'Basic', 'Standard'], dtype=object))

In [174]:
df = pd.get_dummies(df , columns=['ServicePlan' ,'PaymentMethod' ] , drop_first=True).astype(int)

In [175]:
df

Unnamed: 0,Customer_ID,Age,Tenure,MonthlyUsage,SupportCalls,Churn,ServicePlan_Premium,ServicePlan_Standard,PaymentMethod_Cash,PaymentMethod_CreditCard
0,0,56,51,175,0,0,1,0,0,0
1,1,69,13,82,0,1,1,0,1,0
2,2,46,57,284,0,0,0,0,0,0
3,3,32,55,84,3,1,0,1,0,0
4,4,60,28,94,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
23048,23048,31,23,73,4,1,0,1,0,1
23049,23049,29,61,196,6,0,1,0,0,0
23050,23050,67,7,165,5,0,0,0,1,0
23051,23051,21,44,153,8,0,1,0,0,1


# SPLITING AND SCALING

In [177]:
x = df.drop(columns=['Customer_ID' ,'Churn' ])
y = df['Churn']

In [179]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [182]:
x_train , x_test , y_train , y_test = train_test_split(x_scaled , y , test_size=0.2 , random_state=30)

# FEATURE SELECTION

In [198]:
selector  = RFE(LogisticRegression() , n_features_to_select=5)
x_train = selector.fit_transform(x_train , y_train)
x_test = selector.transform(x_test)

# KNN WITH GRIDSEARCH 

In [199]:
knn = KNeighborsClassifier()

param_grids = {
    'n_neighbors' : list(range(1,25)) , 
    'weights' : ['uniform' , 'distance'],
    'metric' : ['euclidean', 'manhattan']
}

model_knn = GridSearchCV(
    knn,
    param_grid=param_grids,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

model_knn.fit(x_train , y_train)

In [201]:
print("Best Params:", model_knn.best_params_)
best_knn = model_knn.best_estimator_

Best Params: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}


In [202]:
y_pred = best_knn.predict(x_test)

# EVALUATION

In [204]:
print('recall score = ' , recall_score(y_test , y_pred))
print('accuracy score = ' , accuracy_score(y_test , y_pred))
print('confusion matrix = ' , confusion_matrix(y_test , y_pred))
print('classification report = \n' , classification_report(y_test , y_pred))

recall score =  0.5052356020942408
accuracy score =  0.5001084363478638
confusion matrix =  [[1148 1171]
 [1134 1158]]
classification report = 
               precision    recall  f1-score   support

           0       0.50      0.50      0.50      2319
           1       0.50      0.51      0.50      2292

    accuracy                           0.50      4611
   macro avg       0.50      0.50      0.50      4611
weighted avg       0.50      0.50      0.50      4611

