In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/churn_data.csv')

In [3]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
#we need to convert our yes/no and true/false into 0's and 1's

churn_dict = {False: 0, True: 1}
yes_no_dict = {'no': 0, 'yes': 1}
df['churn'].replace(churn_dict, inplace=True)
df['international plan'].replace(yes_no_dict, inplace=True)
df['voice mail plan'].replace(yes_no_dict, inplace=True)

In [5]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [6]:
#pop off uneeded columns

df = df.drop('phone number', axis=1)
df = df.drop('area code', axis=1)

df

Unnamed: 0,state,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,0
1,OH,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
2,NJ,137,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,0,1,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,0
3329,WV,68,0,0,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,0
3330,RI,28,0,0,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,0
3331,CT,184,1,0,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,0


In [7]:
#lets check our target variables value count for balance

df.churn.value_counts()

#looks like we have a large imbalance, this is something we can fix using SMOTE

0    2850
1     483
Name: churn, dtype: int64

In [8]:
#let's now prepare our data for the train_test_split

X = df.drop('churn', axis=1)
y = df.churn

In [9]:
#we must import the proper packages to perform train_test_split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021, test_size=0.20)

In [10]:
#lets create a function that will OneHotEncode our categorical features for us

from sklearn.preprocessing import OneHotEncoder

def ohe2021(X, column_name):
    
    #grab categorical columns
    X_cat = X[column_name]

    #grab numeric columns
    X_num = X.drop(column_name, axis=1)
    
    #let's now OneHotEncode
    ohe = OneHotEncoder(sparse=False, drop='first')
    X_cat_oh = pd.DataFrame(ohe.fit_transform(X_cat), index= X_cat.index, columns=ohe.get_feature_names(X_cat.columns))
    
    #combine X_num with X_cat now that its OneHotEncoded
    X = X_num.join(X_cat_oh)
    
    return X

In [11]:
X_train_oh = ohe2021(X_train, ['state'])

In [12]:
#we can now see that our training dataframe has been OneHotEncoded

X_train_oh.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,53,0,1,18,146.8,107,24.96,310.0,84,26.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1764,77,0,0,0,245.2,87,41.68,254.1,83,21.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1733,130,0,0,0,124.3,70,21.13,270.7,99,23.01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3168,201,0,0,0,225.9,110,38.4,299.1,86,25.42,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562,99,0,1,28,200.7,88,34.12,264.2,116,22.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
#let's now create a function that can scale our data, remember that we only scale on our TRAINING data

from sklearn.preprocessing import StandardScaler

def scale2021(X):
    
    ss = StandardScaler()
    
    X_scaled = pd.DataFrame(ss.fit_transform(X))
    
    X_scaled.index = X.index
    
    X_scaled.columns = X.columns
    
    return X_scaled

X_train_oh_sc = scale2021(X_train_oh)

In [14]:
#we can see that we have successfully scaled our OneHotEncoded training data

X_train_oh_sc.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,-1.215926,-0.328029,1.620867,0.71995,-0.594966,0.32627,-0.594598,2.150149,-0.806009,2.150108,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1764,-0.613634,-0.328029,-0.616954,-0.591007,1.193361,-0.660331,1.192879,1.045323,-0.856365,1.04563,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1733,0.716426,-0.328029,-0.616954,-0.591007,-1.003882,-1.498942,-1.00405,1.373411,-0.050659,1.373485,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,7.464811,-0.151736,-0.185839,-0.153025
3168,2.498205,-0.328029,-0.616954,-0.591007,0.842602,0.474261,0.842226,1.934718,-0.705295,1.933863,...,-0.131031,-0.126515,6.765496,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
562,-0.061534,-0.328029,1.620867,1.448259,0.384616,-0.611001,0.384666,1.244943,0.805404,1.245598,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,6.5349


In [15]:
#now let's try to do something about the class imbalance using SMOTE

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=2021)

X_train_oh_sc_resamp, y_train_resamp = sm.fit_resample(X_train_oh_sc, y_train)

In [16]:
X_train_oh_sc_resamp.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,-1.215926,-0.328029,1.620867,0.71995,-0.594966,0.32627,-0.594598,2.150149,-0.806009,2.150108,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1,-0.613634,-0.328029,-0.616954,-0.591007,1.193361,-0.660331,1.192879,1.045323,-0.856365,1.04563,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
2,0.716426,-0.328029,-0.616954,-0.591007,-1.003882,-1.498942,-1.00405,1.373411,-0.050659,1.373485,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,7.464811,-0.151736,-0.185839,-0.153025
3,2.498205,-0.328029,-0.616954,-0.591007,0.842602,0.474261,0.842226,1.934718,-0.705295,1.933863,...,-0.131031,-0.126515,6.765496,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
4,-0.061534,-0.328029,1.620867,1.448259,0.384616,-0.611001,0.384666,1.244943,0.805404,1.245598,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,6.5349


In [17]:
#lets check to see if we balanced our classes

df.churn.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [19]:
y_train_resamp.value_counts()

1    2270
0    2270
Name: churn, dtype: int64

In [26]:
#the first model we're going to checkout is a LogisticRegression

from sklearn.linear_model import LogisticRegression

log1 = LogisticRegression(random_state=2021)
log1.fit(X_train_oh_sc_resamp, y_train_resamp)
log1.score(X_train_oh_sc_resamp, y_train_resamp)

#score gives us our accuracy, comparing it to what it fit on in preparation for unseen data

0.798237885462555

In [27]:
#the next thing we will do is check our f1 score using cross_val_score

from sklearn.model_selection import cross_val_score

all_features_cross_val_score = cross_val_score(log1, X_train_oh_sc_resamp, y_train_resamp, cv=3)
all_features_cross_val_score

#f1 score gives us a balance between recall and accuracy

array([0.78269485, 0.79643093, 0.79048249])

In [36]:
coef_log1 = {}
for coef, feat in zip(log1.coef_[0,:],X_train_oh_sc_resamp.columns):
    coef_log1[feat] = coef
    
log1_dict = coef_log1

log1_dict

{'account length': 0.05465803660831661,
 'international plan': 0.8643214342535465,
 'voice mail plan': -1.0711561540621855,
 'number vmail messages': 0.5833956886122196,
 'total day minutes': 0.4340366106210175,
 'total day calls': 0.13459785202522626,
 'total day charge': 0.4364428169395637,
 'total eve minutes': 0.2441232588375694,
 'total eve calls': 0.04343982996400333,
 'total eve charge': 0.19552916020418074,
 'total night minutes': 0.061636974815759776,
 'total night calls': -0.0330775687848651,
 'total night charge': 0.120504044650966,
 'total intl minutes': 0.08417780777778124,
 'total intl calls': -0.20176042838853372,
 'total intl charge': 0.11572075198029565,
 'customer service calls': 0.9224756204837488,
 'state_AL': -0.19695803241769763,
 'state_AR': -0.11870998456433077,
 'state_AZ': -0.21820736138398855,
 'state_CA': -0.007653722948586534,
 'state_CO': -0.09376945413983571,
 'state_CT': -0.06862379551398427,
 'state_DC': -0.035934708182815016,
 'state_DE': -0.1159125763

In [28]:
log2 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.5)
log2.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(log2, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [0.78137384 0.79841375 0.79180436]


In [29]:
#next model we want to checkout is ExtraTrees

from sklearn.ensemble import ExtraTreesClassifier

etc1 = ExtraTreesClassifier()
etc1.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(etc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [0.94980185 0.94844679 0.96100463]


In [30]:
#let's checkout KNN

from sklearn.neighbors import KNeighborsClassifier

knc1 = KNeighborsClassifier()
knc1.fit = (X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(knc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [0.82892999 0.83542631 0.8559154 ]


In [31]:
#now let's try svm vector

from sklearn import svm

svr1 = svm.SVR()
svr1.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(svr1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [-0.0634439   0.57372578  0.        ]


In [37]:
#not so good, lets take a look at RandomForest

from sklearn.ensemble import RandomForestClassifier

rfc1 = RandomForestClassifier(random_state=2021)
svr1.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(rfc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [0.90356671 0.93588896 0.93258427]


In [38]:
#let's checkout how DecisionTree performs

from sklearn.tree import DecisionTreeClassifier

dtc1 = DecisionTreeClassifier(random_state=2021)
svr1.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(dtc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78269485 0.79643093 0.79048249]
New: [0.87318362 0.91275611 0.90680767]
