# Load dependencies

In [259]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.datasets import load_iris

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE	#create smote object
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import plot_tree

# Load file

In [102]:
churnData=pd.read_csv(r'Data/DATA_Customer-Churn.csv')

# User defined functions

In [280]:
def d_nan_to_mean(dataframe,columns):
    integers_with_nan=columns
    df=dataframe.copy()
    #Loop through all columns in list, for each get the mean and fillna with mean
    for j in integers_with_nan:
        temp_mean=np.mean(df[j])
        df[j]=df[j].fillna(temp_mean)
    
    return df
churnData_cleaned=d_nan_to_mean(churnData,["TotalCharges"])



def ca_prep_xy(dataframe,y_name,size_val=.3):
    
    #Prep X-Y Split
    y=dataframe[y_name]
    x=dataframe.drop(labels=y_name,axis=1)

    #Create train-test data using 'size_val' percent of test data
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=size_val,random_state=123)

    #Create transformer to normalize data
    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)

    numerical_columns = numerical_columns_selector(x_train)
    categorical_columns = categorical_columns_selector(x_train)

    categorical_preprocessor = OneHotEncoder(drop='first',handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()

    transformer = ColumnTransformer([('cat', categorical_preprocessor, categorical_columns),
                                      ('num', numerical_preprocessor, numerical_columns)],sparse_threshold=0)

    
    #Normalize/scale data
    x_train_scaled=transformer.fit_transform(x_train)
    x_test_scaled=transformer.transform(x_test)
    feature_names=transformer.get_feature_names_out()
    x_train_scaled=pd.DataFrame(data=x_train_scaled,columns=feature_names)
    x_test_scaled=pd.DataFrame(data=x_test_scaled,columns=feature_names)
    
    with open("transformer.pickle", "wb") as f:
        pickle.dump(transformer,f)
    
    return x_train,y_train,x_train_scaled,x_test_scaled,y_test,x_test



def resize_random_sample(dataframe,y_name,downsize=False):
    valuecount_rank=dataframe.groupby([y_name])[y_name].count().sort_values(ascending=False)
    max_len=valuecount_rank.values[0]
    min_len=valuecount_rank.values[-1]
    
    new_val_list=[]
    #If downsize is true
    if downsize==True:
        for i in valuecount_rank.index:
            if len(dataframe[dataframe[y_name]==i])!=min_len:
                temp_list=dataframe[dataframe[y_name]==i].sample(min_len)
                new_val_list.append(temp_list)
            else:
                new_val_list.append(dataframe[dataframe[y_name]==i])
    
    #If downsize is true
    if downsize==False:
        for i in valuecount_rank.index:
            if len(dataframe[dataframe[y_name]==i])!=max_len:
                temp_list=dataframe[dataframe[y_name]==i].sample(max_len, replace=True)
                new_val_list.append(temp_list)
            else:
                new_val_list.append(dataframe[dataframe[y_name]==i])
                
    new_df=pd.DataFrame()
    new_df=pd.concat(new_val_list,axis=0)
    new_df = new_df.sample(frac=1)
    return new_df


def evaluate_classification_model(y_train, y_pred_train, y_test, y_pred_test):
    performance_df = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train),
                                         precision_score(y_train, y_pred_train),
                                         recall_score(y_train, y_pred_train)],
                               'Test': [accuracy_score(y_test, y_pred_test),
                                        precision_score(y_test, y_pred_test),
                                        recall_score(y_test, y_pred_test)]})
    
    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': y_train, 'Predicted': y_pred_train})
    df_test  = pd.DataFrame({'Real': y_test,  'Predicted': y_pred_test})

    return performance_df, df_train, df_test


def conduct_knn(database,y_name,n_neighbors=5,show_plot=False):

    x_train,y_train,x_train_scaled,x_test_scaled,y_test,x_test = ca_prep_xy(database,y_name)

    model_knn = KNeighborsClassifier(n_neighbors=n_neighbors,weights='uniform')
    model_knn.fit(x_train_scaled, y_train)
    y_pred=model_knn.predict(x_test_scaled)
    y_pred_train=model_knn.predict(x_train_scaled)

    error_metrics_df,y_train_vs_predicted,y_test_vs_predicted=evaluate_classification_model(y_train, y_pred_train,y_test, y_pred)
    if show_plot==True:
        fig, ax = plt.subplots(1,2, figsize=(14,8))

        plot_confusion_matrix(model_knn,x_train_scaled,y_train,ax=ax[0], values_format = 'd')
        ax[0].title.set_text("Train Set")

        plot_confusion_matrix(model_knn,x_test_scaled,y_test,ax=ax[1],values_format = 'd')
        ax[1].title.set_text("Test Set")
    
    return model_knn, error_metrics_df

def conduct_decision_tree(database,y_name,max_depth=5,show_plot=False):
    x_train,y_train,x_train_scaled,x_test_scaled,y_test,x_test = ca_prep_xy(database,y_name)

    model_DT = DecisionTreeClassifier(max_depth=max_depth)
    model_DT.fit(x_train, y_train)

    y_pred_train = model_DT.predict(x_train)
    y_pred_test = model_DT.predict(x_test)

    error_metrics_df,y_train_vs_predicted,y_test_vs_predicted=evaluate_classification_model(y_train, y_pred_train,y_test, y_pred_test)
    
    if show_plot==True:
        fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (34,20))
        plot_tree(model,filled = True, rounded=True,feature_names=X.columns)
        plt.show() 
    
    return model_DT, error_metrics_df

def conduct_smote(dataframe,y_name):
    smote = SMOTE()
    x=dataframe.drop(y_name,axis=1).copy()
    y=dataframe[y_name].copy()
    x_sm, y_sm = smote.fit_resample(x, y)
    sm_total=pd.concat([x_sm,y_sm],axis=1)
    return sm_total

def conduct_tomek_links(dataframe,y_name):
    x=dataframe.drop(y_name,axis=1).copy()
    y=dataframe[y_name].copy()
    tl = TomekLinks('majority')
    x_tl, y_tl = tl.fit_resample(x, y)
    tl_total=pd.concat([x_tl,y_tl],axis=1)
    print ("Original value counts:")
    print(y.value_counts())
    print()
    print("New value counts:")
    print(y_tl.value_counts())
    return tl_total

def cross_validate_models(model_list):
    for i in model_list:
        scores=cross_val_score(i, x_train, y_train, cv=5)
        print(i)
        print(scores)
        print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
        print()

# Check datatypes

In [103]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

## Fix TotalCharges to make it numeric

In [104]:
churnData['TotalCharges']=pd.to_numeric(churnData['TotalCharges'],errors='coerce')

In [105]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

# Check missing values

In [106]:
dataframe=churnData.copy()
count_missing_val=dataframe.isnull().sum()
count_all_val=len(dataframe)
missing_ratio=(count_missing_val/count_all_val)*100
data={'column':count_missing_val.index,"# missing values":count_missing_val.values,"% missing values":missing_ratio}
count_missing_values_and_percent=pd.DataFrame(data)
count_missing_values_and_percent.reset_index(drop=True,inplace=True)

count_missing_values_and_percent.loc[count_missing_values_and_percent['% missing values']>0]

Unnamed: 0,column,# missing values,% missing values
14,TotalCharges,11,0.16


## Fix nan values

In [108]:
churn_key_features=churnData_cleaned[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges','Churn']].copy()

In [109]:
churn_key_features.describe()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7043.0
mean,32.37,0.16,64.76,2283.3
std,24.56,0.37,30.09,2265.0
min,0.0,0.0,18.25,18.8
25%,9.0,0.0,35.5,402.23
50%,29.0,0.0,70.35,1400.55
75%,55.0,0.0,89.85,3786.6
max,72.0,1.0,118.75,8684.8


## Adjust Churn to make it binary

In [110]:
churn_key_features['Churn']=churn_key_features['Churn'].map(dict(Yes=1,No=0))
churn_key_features['Churn'].unique()

array([0, 1], dtype=int64)

# Conduct KNN Classifier

In [257]:
y_name="Churn"
knn_m,err_knn=conduct_knn(churn_key_features,y_name,7)
err_knn

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.83,0.76
1,Precision,0.72,0.6
2,Recall,0.58,0.46


# Conduct Decision Tree Classifier

In [256]:
dt_m,err_dt=conduct_decision_tree(churn_key_features,y_name,3)
err_dt

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.79,0.77
1,Precision,0.66,0.63
2,Recall,0.41,0.39


# Check cross validation of models

In [263]:
model_list=[knn_m,dt_m]
cross_validate_models(model_list)

KNeighborsClassifier(n_neighbors=7)
[0.77047913 0.74806801 0.75193199 0.75560712 0.7733952 ]
0.76 accuracy with a standard deviation of 0.01

DecisionTreeClassifier(max_depth=3)
[0.74729521 0.73338485 0.75734158 0.75096674 0.77184841]
0.75 accuracy with a standard deviation of 0.01



# Manage imbalances

## Check for imbalance

In [264]:
churn_key_features['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

### Downsize using random sample

In [265]:
downsized=resize_random_sample(churn_key_features,y_name,downsize=True)

In [266]:
downsized['Churn'].value_counts()

1    1869
0    1869
Name: Churn, dtype: int64

### Upsize using random sample replace

In [267]:
upsized=resize_random_sample(churn_key_features,y_name)

In [268]:
upsized['Churn'].value_counts()

1    5174
0    5174
Name: Churn, dtype: int64

### Upsize using  SMOTE

In [269]:
smote_df=conduct_smote(churn_key_features,y_name)

In [279]:
smote_df['Churn'].value_counts()

0    5174
1    5174
Name: Churn, dtype: int64

### Downsize using TomekLinks

In [281]:
tomeklinks=conduct_tomek_links(churn_key_features,y_name)

Original value counts:
0    5174
1    1869
Name: Churn, dtype: int64

New value counts:
0    4620
1    1869
Name: Churn, dtype: int64


In [282]:
tomeklinks_resized=resize_random_sample(tomeklinks,y_name)

In [283]:
tomeklinks_resized['Churn'].value_counts()

0    4620
1    4620
Name: Churn, dtype: int64

# Comparison of models

In [290]:
data_list=[downsized,upsized,smote_df,tomeklinks_resized]
data_names=['Downsized','Upsized','Smote','Tomeklinks_upresized']
x=0
for i in data_list:
    m,r=conduct_knn(i,y_name,7)
    m2,r2=conduct_decision_tree(i,y_name,3)
    print(data_names[x])
    print('------')
    print (m)
    print (r)
    print(m2)
    print (r2)
    print()
    print()
    print()
    x+=1

Downsized
------
KNeighborsClassifier(n_neighbors=7)
  Error_metric  Train  Test
0     Accuracy   0.78  0.71
1    Precision   0.77  0.73
2       Recall   0.78  0.72
DecisionTreeClassifier(max_depth=3)
  Error_metric  Train  Test
0     Accuracy   0.73  0.71
1    Precision   0.71  0.70
2       Recall   0.77  0.76



Upsized
------
KNeighborsClassifier(n_neighbors=7)
  Error_metric  Train  Test
0     Accuracy   0.81  0.76
1    Precision   0.78  0.73
2       Recall   0.87  0.82
DecisionTreeClassifier(max_depth=3)
  Error_metric  Train  Test
0     Accuracy   0.72  0.72
1    Precision   0.69  0.69
2       Recall   0.80  0.81



Smote
------
KNeighborsClassifier(n_neighbors=7)
  Error_metric  Train  Test
0     Accuracy   0.82  0.76
1    Precision   0.80  0.74
2       Recall   0.85  0.80
DecisionTreeClassifier(max_depth=3)
  Error_metric  Train  Test
0     Accuracy   0.75  0.73
1    Precision   0.70  0.68
2       Recall   0.87  0.86



Tomeklinks_upresized
------
KNeighborsClassifier(n_neighbo