In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# Import metrics for performance evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [5]:
# Load the raw churn data set.
df = pd.read_csv('Telco-Customer-Churn.csv')

# Entries of TotalCharges that cannot be parsed as numbers become NaN
# (errors='coerce') and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Normalise column names: lower case, spaces replaced with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select text columns by dtype family rather than comparing against 'object'.
# pandas versions that infer a dedicated string dtype would make the
# `== 'object'` check select nothing; the values would then not be normalised
# and the `== 'yes'` comparison below could silently give an all-zero target.
string_columns = [col for col, dtype in df.dtypes.items()
                  if pd.api.types.is_string_dtype(dtype)]

# Apply the same normalisation to the values of every text column.
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Encode the target as an integer: 1 where churn is 'yes', 0 otherwise.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [6]:
# Hold out 20% of the rows as a test set, then take 33% of the remaining rows
# as a validation set. Both splits use fixed seeds.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Move the target out of each feature frame: `pop` returns the column and
# removes it from the frame in one step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [7]:
# Columns passed to the vectorizer as categorical features.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Columns passed to the vectorizer as numerical features.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]


In [8]:
# Turn each training row into a {column: value} record for the vectorizer.
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only (`fit` returns the fitted
# vectorizer itself), then encode those records as a dense matrix.
dv = DictVectorizer(sparse=False).fit(train_dict)
X_train = dv.transform(train_dict)

In [9]:
## Candidate classifiers to compare on the validation split
log = LogisticRegression(solver='liblinear', random_state=1)
svc = SVC(kernel='sigmoid', gamma=1.0)
# A decision tree permutes the features at every split, so without a fixed
# seed, splits with identical scores can be chosen differently between runs.
dtc = DecisionTreeClassifier(max_depth=5, random_state=1)
knc = KNeighborsClassifier()
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=2)
# NOTE(review): learning_rate=1 is unusually high for boosting — confirm it is intentional.
xgb = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=1, objective='binary:logistic')


In [10]:
# Display name -> estimator. Insertion order is the order in which the models
# are iterated later on.
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('Decision Tree', dtc),
    ('Logistic Regression', log),
    ('Random Forest', rfc),
    ('Xgboost', xgb),
])

In [11]:
# Encode the validation rows with the already-fitted vectorizer `dv`
# (transform only, no re-fitting).
feature_columns = categorical + numerical
val_dict = df_val[feature_columns].to_dict(orient='records')

# X_val holds the validation inputs; y_val holds the matching labels.
X_val = dv.transform(val_dict)

In [12]:
def train_classifier(clf,X_train,y_train,X_val,y_val):
    """Fit ``clf`` on the training data and score its validation predictions.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training inputs and labels passed to ``clf.fit``.
    X_val, y_val : validation inputs passed to ``clf.predict`` and the
        labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed on the validation data.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    # zero_division=0 is passed to every ratio metric, not just precision, so
    # an undefined score is reported as 0 without a warning in each case.
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall = recall_score(y_val, y_pred, zero_division=0)
    score = f1_score(y_val, y_pred, zero_division=0)

    return accuracy, precision, recall, score
    

In [13]:
# Validation accuracy of every classifier, in the iteration order of `clfs`.
accuracy_scores = []

for name, clf in clfs.items():
    current_accuracy, precision, recall, score = train_classifier(clf, X_train, y_train, X_val, y_val)

    # Record the accuracy so the list declared above is actually populated.
    accuracy_scores.append(current_accuracy)

    print("for ",name)
    print("\t accuracy - ",current_accuracy)
    print("\t Precision score - ",precision)
    print("\t Recall Score - ",recall)
    print("\t f1 score - ",score)

