In [47]:
import pandas as pd 
import numpy as np

In [48]:
import warnings 
warnings.filterwarnings("ignore")

In [49]:
df = pd.read_csv('churn.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       1000 non-null   int64  
 1   Age              1000 non-null   int64  
 2   Gender           1000 non-null   object 
 3   Tenure           1000 non-null   int64  
 4   MonthlyCharges   1000 non-null   float64
 5   ContractType     1000 non-null   object 
 6   InternetService  703 non-null    object 
 7   TotalCharges     1000 non-null   float64
 8   TechSupport      1000 non-null   object 
 9   Churn            1000 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 78.3+ KB


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


### **Data Prep**

In [50]:
X = df[['Age','Gender','Tenure','MonthlyCharges']]
y = df[['Churn']]

X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,Male,4,88.35
1,43,Male,0,36.67
2,51,Female,2,63.79
3,60,Female,8,102.34
4,42,Male,32,69.01
...,...,...,...,...
995,42,Male,41,37.14
996,62,Male,9,80.93
997,51,Female,15,111.72
998,39,Male,68,65.67


In [51]:
# Transform gender column

X['Gender'] = np.where(df['Gender'] == 'Female', 1, 0)
X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,0,4,88.35
1,43,0,0,36.67
2,51,1,2,63.79
3,60,1,8,102.34
4,42,0,32,69.01
...,...,...,...,...
995,42,0,41,37.14
996,62,0,9,80.93
997,51,1,15,111.72
998,39,0,68,65.67


In [52]:
# Transform target columns 
y['Churn'] = y['Churn'].apply(lambda x: 1 if x == "Yes" else 0)
y

Unnamed: 0,Churn
0,1
1,1
2,1
3,1
4,1
...,...
995,1
996,1
997,1
998,1


In [53]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 4) (300, 4) (700, 1) (300, 1)


In [54]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)


#### Important Notes 

If we're using scaler in a X_test before import it to file, that might be a data leakage

In [55]:
import joblib 

joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [56]:
X_test = scaler.fit_transform(X_test)

### **Modelling**

#### *Define model success metrics*

In [65]:
from sklearn.metrics import accuracy_score

def modelperformance(test, pred):
    print("Accuracy Score on this model is {}".format(accuracy_score(test,pred)))

#### Import tuning

In [58]:
from sklearn.model_selection import GridSearchCV 

##### logistic regression

In [59]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

In [60]:
log_model.fit(X_train, y_train)

In [None]:
y_pred_log = log_model.predict(X_test)

In [66]:
accuracy_score(y_test, y_pred_log)

modelperformance(y_test, y_pred_log)

Accuracy Score on this model is 0.8733333333333333


#### KNN classifier

In [68]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

param_knn = {
    "n_neighbors" : [3,5,7,9],
    "weights"      : ['uniform','distance']
}

##### Set the model & grid in 1 model

In [69]:
gridknn = GridSearchCV(knn_model, param_knn, cv=5)

In [71]:
gridknn.fit(X_train,y_train)

print(gridknn.best_estimator_,gridknn.best_params_)

KNeighborsClassifier(n_neighbors=3) {'n_neighbors': 3, 'weights': 'uniform'}


In [73]:
y_pred_gridknn = gridknn.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_gridknn)

modelperformance(y_test, y_pred_gridknn)

Accuracy Score on this model is 0.85


#### SVM Model

In [79]:
from sklearn.svm import SVC

svc_model = SVC()

In [None]:
param_svc ={
    'C' : [0.01,0.2,1],
    'kernel' : ['linear','rbf']
}

In [80]:
gridsvc = GridSearchCV(svc_model, param_svc)

In [81]:
gridsvc.fit(X_train, y_train)

In [82]:
print(gridsvc.best_estimator_, gridsvc.best_params_)

SVC(C=0.01, kernel='linear') {'C': 0.01, 'kernel': 'linear'}


In [None]:
y_pred_svc = gridsvc.predict(X_test)

In [85]:
print(accuracy_score(y_test, y_pred_svc))

modelperformance(y_test, y_pred_svc)

0.87
Accuracy Score on this model is 0.87


##### DT

In [86]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()

In [87]:
param_dt = {
    'criterion' :           ['gini','entropy'],
    'splitter'  :           ['best'],
    'max_depth' :           [None,3,5,7,10,30],
    'min_samples_split' :   [2,5,10],
    'min_samples_leaf'  :   [1,2,4]
}

In [88]:
grid_dt = GridSearchCV(dt_model, param_dt)

In [89]:
grid_dt.fit(X_train, y_train)

In [90]:
print(grid_dt.best_estimator_,grid_dt.best_params_)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=2) {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}


In [91]:
y_pred_griddt = grid_dt.predict(X_test)

In [92]:
modelperformance(y_test, y_pred_griddt)

Accuracy Score on this model is 0.8633333333333333
