In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

In [2]:
# Load the training set from disk and preview its first rows
df = pd.read_csv(filepath_or_buffer="./data/train.csv")

df.head(n=5)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no


In [3]:
# Distinct category values present in the loaded frame; preprocess() uses them
# to fix the set and order of the one-hot columns it produces.
unique_states, unique_area_codes = (
    df[column].unique() for column in ('state', 'area_code')
)

def preprocess(df, states=None, area_codes=None):
    """Encode the categorical columns of a customer frame as numbers.

    'state' and 'area_code' are replaced by one-hot columns (one column per
    known category, appended at the end of the frame), and the 'yes'/'no'
    columns 'international_plan' and 'voice_mail_plan' are mapped to 1/0.
    All other columns are passed through unchanged.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'state', 'area_code',
        'international_plan' and 'voice_mail_plan'.
    states : sequence, optional
        Category values (and column order) for the 'state' one-hot block.
        Defaults to the notebook-level ``unique_states``.
    area_codes : sequence, optional
        Category values (and column order) for the 'area_code' one-hot block.
        Defaults to the notebook-level ``unique_area_codes``.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``. A value that is not among the given
        categories yields an all-zero one-hot row for that block; a plan
        value other than 'yes'/'no' becomes NaN.
    """
    if states is None:
        states = unique_states
    if area_codes is None:
        area_codes = unique_area_codes

    # Fixing the categories up front guarantees the same one-hot columns, in
    # the same order, no matter which values happen to occur in `df`.
    for column, categories in (('state', states), ('area_code', area_codes)):
        # Build the categorical series on the side instead of assigning it
        # back into `df`, so the caller's frame is never mutated.
        encoded = df[column].astype(pd.CategoricalDtype(categories=categories))
        one_hot = pd.get_dummies(encoded)
        # drop() and join() both return new frames.
        df = df.drop(column, axis=1).join(one_hot)

    # Change no/yes to 0/1 (df is already a fresh frame at this point).
    yes_no = {'yes': 1, 'no': 0}
    df['international_plan'] = df['international_plan'].map(yes_no)
    df['voice_mail_plan'] = df['voice_mail_plan'].map(yes_no)

    return df

In [4]:
# Run the raw frame through preprocess() and preview the encoded result
preprocessed_df = preprocess(df=df)

preprocessed_df.head(n=5)

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,...,NE,KS,TN,IL,PA,CT,ND,area_code_415,area_code_408,area_code_510
0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0,0,0,0,0,0,0,1,0,0
1,137,0,0,0,243.4,114,41.38,121.2,110,10.3,...,0,0,0,0,0,0,0,1,0,0
2,84,1,0,0,299.4,71,50.9,61.9,88,5.26,...,0,0,0,0,0,0,0,0,1,0
3,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0,0,0,0,0,0,0,1,0,0
4,121,0,1,24,218.2,88,37.09,348.5,108,29.62,...,0,0,0,0,0,0,0,0,0,1


In [5]:
# Separate data
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same partition (and the same scores below).
train, test = train_test_split(preprocessed_df, test_size=0.2, random_state=42)

# Features are every column except the 'churn' label; the label is kept as a
# single-column DataFrame.
x_train = train.drop(columns='churn')
y_train = train[['churn']]

x_test = test.drop(columns='churn')
y_test = test[['churn']]

### Training simple Sklearn classifiers

In [15]:
# K-Nearest Neighbors classifier
# Fit with scikit-learn's default hyperparameters; the single-column label
# frame is flattened to a 1-D array before fitting.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train.values.ravel())

# Score on the held-out split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth goes first.
y_preds = knn.predict(x_test)
knn_acc = accuracy_score(y_test, y_preds)
print(f"Accuracy : {knn_acc}")

Accuracy : 0.9117647058823529


In [14]:
# Random Forest classifier
# Default hyperparameters; random_state pins the bootstrap sampling and
# feature selection so the fitted model and its score are reproducible.
rnd_forest = RandomForestClassifier(random_state=42)
rnd_forest.fit(x_train, y_train.values.ravel())

# Score on the held-out split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth goes first.
y_preds = rnd_forest.predict(x_test)
rnd_forest_acc = accuracy_score(y_test, y_preds)
print(f"Accuracy : {rnd_forest_acc}")

Accuracy : 0.9494117647058824


In [8]:
# Multi-Layer Perceptron
# Default hyperparameters; random_state pins the weight initialisation and
# batch shuffling so the fitted model and its score are reproducible.
# NOTE(review): no feature scaling is visible before this cell - consider
# standardising the inputs before training; confirm against the data pipeline.
mlp = MLPClassifier(random_state=42)
mlp.fit(x_train, y_train.values.ravel())

# Score on the held-out split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth goes first.
y_preds = mlp.predict(x_test)
mlp_acc = accuracy_score(y_test, y_preds)
print(f"Accuracy : {mlp_acc}")

Accuracy : 0.8870588235294118


In [9]:
# AdaBoost classifier
# Default hyperparameters; random_state makes the fitted ensemble and its
# score reproducible across runs.
ada = AdaBoostClassifier(random_state=42)
ada.fit(x_train, y_train.values.ravel())

# Score on the held-out split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground truth goes first.
y_preds = ada.predict(x_test)
ada_acc = accuracy_score(y_test, y_preds)
print(f"Accuracy : {ada_acc}")

Accuracy : 0.9094117647058824


In [16]:
# Persist the fitted random forest (the model kept as "best") with joblib
filename = "model.joblib"
joblib.dump(value=rnd_forest, filename=filename)

['model.joblib']

In [17]:
# Read the persisted estimator back from the path it was dumped to
model = joblib.load(filename=filename)