In [88]:
import pandas as pd 
import numpy as np

In [89]:
import warnings 
warnings.filterwarnings("ignore")

In [90]:
df = pd.read_csv('churn.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       1000 non-null   int64  
 1   Age              1000 non-null   int64  
 2   Gender           1000 non-null   object 
 3   Tenure           1000 non-null   int64  
 4   MonthlyCharges   1000 non-null   float64
 5   ContractType     1000 non-null   object 
 6   InternetService  703 non-null    object 
 7   TotalCharges     1000 non-null   float64
 8   TechSupport      1000 non-null   object 
 9   Churn            1000 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 78.3+ KB


Unnamed: 0,CustomerID,Age,Gender,Tenure,MonthlyCharges,ContractType,InternetService,TotalCharges,TechSupport,Churn
0,1,49,Male,4,88.35,Month-to-Month,Fiber Optic,353.4,Yes,Yes
1,2,43,Male,0,36.67,Month-to-Month,Fiber Optic,0.0,Yes,Yes
2,3,51,Female,2,63.79,Month-to-Month,Fiber Optic,127.58,No,Yes
3,4,60,Female,8,102.34,One-Year,DSL,818.72,Yes,Yes
4,5,42,Male,32,69.01,Month-to-Month,,2208.32,No,Yes


### **Data Prep**

In [91]:
X = df[['Age','Gender','Tenure','MonthlyCharges']]
y = df[['Churn']]

X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,Male,4,88.35
1,43,Male,0,36.67
2,51,Female,2,63.79
3,60,Female,8,102.34
4,42,Male,32,69.01
...,...,...,...,...
995,42,Male,41,37.14
996,62,Male,9,80.93
997,51,Female,15,111.72
998,39,Male,68,65.67


In [92]:
# Transform target columns 
y['Churn'] = y['Churn'].apply(lambda x: 1 if x == "Yes" else 0)
y

Unnamed: 0,Churn
0,1
1,1
2,1
3,1
4,1
...,...
995,1
996,1
997,1
998,1


In [93]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 4) (300, 4) (700, 1) (300, 1)


#### Notes

after:
1. define the feature that be use
2. split X, y or feature and target
3. transform the y column

then:
1. we put column transformer
2. we use pipeline

#### **Column Transformer**

1. from sklearn.compose import ColumnTransformer
2. from sklearn.preprocessing import FunctionTransformer
3. Put the functiontransformer(manipulate) into variable 
4. make it into pipeline and put the pipeline into variable

In [94]:
# Transform gender column

from sklearn.preprocessing import FunctionTransformer

def gender_transform(x):
    return np.where(x == 'Female', 1, 0).astype(int)

gender_transformer = FunctionTransformer(gender_transform, validate=False)

In [95]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


In [96]:
from sklearn.compose import ColumnTransformer

dataprep = ColumnTransformer(
    transformers=[
        ('gender',gender_transformer,['Gender']),
        ('scaler',scaler,['Age','Tenure','MonthlyCharges'])
    ]
)


In [97]:
dataprep

#### Important Notes 

If we're using scaler in a X_test before import it to file, that might be a data leakage

In [98]:
import joblib 

joblib.dump(dataprep, 'pipeline.pkl')

['pipeline.pkl']

### **Modelling**

#### *Define model success metrics*

In [99]:
from sklearn.metrics import accuracy_score

def modelperformance(test, pred):
    print("Accuracy Score on this model is {}".format(accuracy_score(test,pred)))

#### Import tuning

In [100]:
from sklearn.model_selection import GridSearchCV 

##### logistic regression

In [101]:
# 1. Import model
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

In [102]:
# 2. Put preprocess & model into 1 pipeline
from sklearn.pipeline import Pipeline

pipe_log = Pipeline(
    [
        ('dataprep', dataprep),
        ('log_model',log_model)
    ]
)

pipe_log

In [103]:
# 3. Fit into data

pipe_log.fit(X_train, y_train)

In [104]:
y_pred_log = pipe_log.predict(X_test)

In [105]:
print( accuracy_score(y_test, y_pred_log) )

modelperformance(y_test, y_pred_log)

0.8733333333333333
Accuracy Score on this model is 0.8733333333333333


#### KNN classifier

In [110]:
# 1. Import model & define param
from sklearn.neighbors import KNeighborsClassifier

knnmodel = KNeighborsClassifier()

param_knn = {
    "knnmodel__n_neighbors" : [3,5,7,9],
    "knnmodel__weights"      : ['uniform','distance']
}

In [111]:
# 2. Put preprocess & model into 1 pipeline

from sklearn.pipeline import Pipeline 

pipe_knn = Pipeline([
    ('dataprep', dataprep),
    ('knnmodel', knnmodel)
])

pipe_knn

In [112]:
# 3. Set the hyperparameter tune

grid_knn = GridSearchCV(pipe_knn, param_knn, cv=5)

In [None]:
# 4. fit into model
grid_knn.fit(X_train,y_train)

In [117]:
print(grid_knn.best_params_,grid_knn.best_estimator_)

{'knnmodel__n_neighbors': 3, 'knnmodel__weights': 'uniform'} Pipeline(steps=[('dataprep',
                 ColumnTransformer(transformers=[('gender',
                                                  FunctionTransformer(func=<function gender_transform at 0x00000175B80FA160>),
                                                  ['Gender']),
                                                 ('scaler', StandardScaler(),
                                                  ['Age', 'Tenure',
                                                   'MonthlyCharges'])])),
                ('knnmodel', KNeighborsClassifier(n_neighbors=3))])


In [115]:
y_pred_gridknn = grid_knn.predict(X_test)

In [116]:
accuracy_score(y_test, y_pred_gridknn)

modelperformance(y_test, y_pred_gridknn)

Accuracy Score on this model is 0.8533333333333334


#### SVM Model

In [None]:
from sklearn.svm import SVC

svc_model = SVC()

In [None]:
param_svc ={
    'C' : [0.01,0.2,1],
    'kernel' : ['linear','rbf']
}

In [None]:
gridsvc = GridSearchCV(svc_model, param_svc)

In [None]:
gridsvc.fit(X_train, y_train)

In [None]:
print(gridsvc.best_estimator_, gridsvc.best_params_)

SVC(C=0.01, kernel='linear') {'C': 0.01, 'kernel': 'linear'}


In [None]:
y_pred_svc = gridsvc.predict(X_test)

In [None]:
print(accuracy_score(y_test, y_pred_svc))

modelperformance(y_test, y_pred_svc)

0.87
Accuracy Score on this model is 0.87


##### DT

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()

In [None]:
param_dt = {
    'criterion' :           ['gini','entropy'],
    'splitter'  :           ['best'],
    'max_depth' :           [None,3,5,7,10,30],
    'min_samples_split' :   [2,5,10],
    'min_samples_leaf'  :   [1,2,4]
}

In [None]:
grid_dt = GridSearchCV(dt_model, param_dt)

In [None]:
grid_dt.fit(X_train, y_train)

In [None]:
print(grid_dt.best_estimator_,grid_dt.best_params_)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=2) {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}


In [None]:
y_pred_griddt = grid_dt.predict(X_test)

In [None]:
modelperformance(y_test, y_pred_griddt)

Accuracy Score on this model is 0.8633333333333333


In [None]:
import joblib 

joblib.dump(log_model, 'model.pkl')

['model.pkl']