In [1]:
# Imports
import numpy as np
import pandas as pd

In [2]:
# Read in data
df = pd.read_csv("../data/churn_data.csv")
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
# drop phone number (for now we are keeping area code but may drop it later)
df.pop('phone number')
df

Unnamed: 0,state,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,no,yes,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,no,no,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,no,no,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,yes,no,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [4]:
df.columns

Index(['state', 'account length', 'area code', 'international plan',
       'voice mail plan', 'number vmail messages', 'total day minutes',
       'total day calls', 'total day charge', 'total eve minutes',
       'total eve calls', 'total eve charge', 'total night minutes',
       'total night calls', 'total night charge', 'total intl minutes',
       'total intl calls', 'total intl charge', 'customer service calls',
       'churn'],
      dtype='object')

In [5]:
# Per Sam, if I run this code again down below it will return an error, so need to rerun Kernal for all
# clean data
# convert binaries
churn_dict = {False: 0, True: 1}
yes_no_dict = {'no': 0, 'yes': 1}
df['churn'].replace(churn_dict, inplace=True)
df['international plan'].replace(yes_no_dict, inplace=True)
df['voice mail plan'].replace(yes_no_dict, inplace=True)

In [6]:
# Target/split off target
X = df.drop('churn', axis=1)
y = df.churn

In [7]:
# Target Balance
# checking value count (class imbalance)- there is a very large class inbalance. We will need to smote it
df.churn.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [8]:
df.head()

Unnamed: 0,state,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [9]:
from sklearn.model_selection import train_test_split
# Train Test Split
# Don't touch test test data until the end
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 2021)

In [10]:
# function that Sam built to one hot encode
from sklearn.preprocessing import OneHotEncoder

# This code will onehotencode the categorical columns and return the dataframe
ohe_column_list = ['state']
def ohe2021(X, columns_list):  
    # One hot encode
    # get dataframe of categroical columns to oneHotEncode
    X_cat = X[ohe_column_list]
    X_cat.head()
    # get dataframe of numeric columns to oneHotEncode
    X_num = X.drop(ohe_column_list, axis=1)
    X_num.head()
    # Finally, One Hot Encode
    # from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder(sparse=False, drop='first')
    X_cat_oh = pd.DataFrame(ohe.fit_transform(X_cat), index= X_cat.index, columns=ohe.get_feature_names(X_cat.columns))
    X_cat_oh.head()
    # sandwich OHE df with numerical dataframes
    X = X_num.join(X_cat_oh)
    return X
X_train_oh = ohe2021(X_train, ohe_column_list)
X_train_oh.head()

Unnamed: 0,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,53,408,0,1,18,146.8,107,24.96,310.0,84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1764,77,415,0,0,0,245.2,87,41.68,254.1,83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1733,130,510,0,0,0,124.3,70,21.13,270.7,99,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3168,201,415,0,0,0,225.9,110,38.4,299.1,86,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562,99,408,0,1,28,200.7,88,34.12,264.2,116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# # Scale * ONLY ON TRAINING DATA
from sklearn.preprocessing import StandardScaler

# scale data
def scale2021(X):
    ss = StandardScaler()
    X_scaled = pd.DataFrame(ss.fit_transform(X))
    X_scaled.index = X.index
    X_scaled.columns = X.columns
    return X_scaled
X_train_oh_sc = scale2021(X_train_oh)
X_train_oh_sc.head()

Unnamed: 0,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,-1.215926,-0.685167,-0.328029,1.620867,0.71995,-0.594966,0.32627,-0.594598,2.150149,-0.806009,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1764,-0.613634,-0.519135,-0.328029,-0.616954,-0.591007,1.193361,-0.660331,1.192879,1.045323,-0.856365,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1733,0.716426,1.734149,-0.328029,-0.616954,-0.591007,-1.003882,-1.498942,-1.00405,1.373411,-0.050659,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,7.464811,-0.151736,-0.185839,-0.153025
3168,2.498205,-0.519135,-0.328029,-0.616954,-0.591007,0.842602,0.474261,0.842226,1.934718,-0.705295,...,-0.131031,-0.126515,6.765496,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
562,-0.061534,-0.685167,-0.328029,1.620867,1.448259,0.384616,-0.611001,0.384666,1.244943,0.805404,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,6.5349


In [12]:
#Scale then Smote!
from imblearn.over_sampling import SMOTE

In [13]:
X_train_oh_sc_resamp, y_train_resamp = SMOTE().fit_resample(X_train_oh_sc, y_train)

In [14]:
# decided to drop area code as it only has 3 and serves no purpose
X_train_oh_sc.drop('area code', axis=1)

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,-1.215926,-0.328029,1.620867,0.719950,-0.594966,0.326270,-0.594598,2.150149,-0.806009,2.150108,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1764,-0.613634,-0.328029,-0.616954,-0.591007,1.193361,-0.660331,1.192879,1.045323,-0.856365,1.045630,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1733,0.716426,-0.328029,-0.616954,-0.591007,-1.003882,-1.498942,-1.004050,1.373411,-0.050659,1.373485,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,7.464811,-0.151736,-0.185839,-0.153025
3168,2.498205,-0.328029,-0.616954,-0.591007,0.842602,0.474261,0.842226,1.934718,-0.705295,1.933863,...,-0.131031,-0.126515,6.765496,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
562,-0.061534,-0.328029,1.620867,1.448259,0.384616,-0.611001,0.384666,1.244943,0.805404,1.245598,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,6.534900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,-0.262298,-0.328029,1.620867,1.375428,0.455494,-0.216360,0.455224,-1.288843,-0.352799,-1.288890,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
2669,1.092858,-0.328029,-0.616954,-0.591007,-0.911194,-0.167030,-0.911041,-0.312485,0.050054,-0.312298,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,5.380990,-0.153025
1152,-2.144459,-0.328029,-0.616954,-0.591007,0.544547,-0.561671,0.545026,-0.561516,-0.755652,-0.561096,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
2105,-1.542167,-0.328029,-0.616954,-0.591007,1.144291,-0.906981,1.144772,0.626320,-0.151372,0.627090,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025


In [15]:
y_train_resamp.value_counts()

1    2270
0    2270
Name: churn, dtype: int64

In [21]:
from sklearn.linear_model import LogisticRegression
log1 = LogisticRegression(random_state=2021)
log1.fit(X_train_oh_sc_resamp, y_train_resamp)
log1.score(X_train_oh_sc_resamp, y_train_resamp)

0.7980176211453744

In [37]:
coef_log1 = {}
for coef, feat in zip(log1.coef_[0,:],X_train_oh_sc_resamp.columns):
    coef_log1[feat] = coef
log1_dict = coef_log1
log1_dict

{'account length': 0.09474452088355663,
 'area code': -0.1697165623246269,
 'international plan': 0.9148900425670053,
 'voice mail plan': -1.1321823741607828,
 'number vmail messages': 0.7023115264905694,
 'total day minutes': 0.4351626694641925,
 'total day calls': 0.10434211608918909,
 'total day charge': 0.4353381153174092,
 'total eve minutes': 0.2414170147424499,
 'total eve calls': 0.08220615837350097,
 'total eve charge': 0.23249649847305098,
 'total night minutes': 0.07707763938897984,
 'total night calls': 0.0017020695107625603,
 'total night charge': 0.0813909142913082,
 'total intl minutes': 0.1125635225338777,
 'total intl calls': -0.1886314586597836,
 'total intl charge': 0.11471818624907681,
 'customer service calls': 0.9101920207886857,
 'state_AL': -0.20138037136493803,
 'state_AR': -0.0859665186233421,
 'state_AZ': -0.1594468156008643,
 'state_CA': 0.031668741173352256,
 'state_CO': -0.12541973102471396,
 'state_CT': -0.04826252109263929,
 'state_DC': -0.00379190318697

In [22]:
from sklearn.model_selection import cross_val_score

all_features_cross_val_score = cross_val_score(log1, X_train_oh_sc_resamp, y_train_resamp, cv=3)
all_features_cross_val_score

array([0.78005284, 0.78453404, 0.79444812])

In [25]:
log2 = LogisticRegression(class_weight='balanced', penalty='l1', C=0.5, solver="liblinear", )
log2.fit(X_train_oh_sc_resamp, y_train_resamp)
print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(log2, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [0.78203435 0.78783873 0.79510905]


In [38]:
coef_log2 = {}
for coef, feat in zip(log2.coef_[0,:],X_train_oh_sc_resamp.columns):
    coef_log2[feat] = coef
log2_dict = coef_log2
log2_dict

{'account length': 0.09146552968729987,
 'area code': -0.16736391393240002,
 'international plan': 0.9045075970173109,
 'voice mail plan': -1.0484023631705326,
 'number vmail messages': 0.6205898017772018,
 'total day minutes': 0.8344754220724346,
 'total day calls': 0.09363737168114908,
 'total day charge': 0.024343950553433934,
 'total eve minutes': 0.06895773915479587,
 'total eve calls': 0.07804624794869724,
 'total eve charge': 0.3926109928355026,
 'total night minutes': 0.01950317815897334,
 'total night calls': 0.0,
 'total night charge': 0.13058688447297395,
 'total intl minutes': 0.0058613173541366735,
 'total intl calls': -0.18698215687893055,
 'total intl charge': 0.21439895980263116,
 'customer service calls': 0.9021912944301521,
 'state_AL': -0.1453562900524216,
 'state_AR': -0.04052323809884208,
 'state_AZ': -0.10908088805948693,
 'state_CA': 0.06356818353327723,
 'state_CO': -0.07734639822333429,
 'state_CT': 0.0,
 'state_DC': 0.0376771110598194,
 'state_DE': -0.05559500

In [26]:
from sklearn.ensemble import ExtraTreesClassifier
etc1 = ExtraTreesClassifier()
etc1.fit(X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(etc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [0.94517834 0.9623265  0.97554527]


In [27]:
from sklearn.neighbors import KNeighborsClassifier

knc1 = KNeighborsClassifier()
knc1.fit = (X_train_oh_sc_resamp, y_train_resamp)


print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(knc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [0.83025099 0.83212161 0.85723728]


In [29]:
from sklearn import svm

svm1 = svm.SVR()
svm1.fit(X_train_oh_sc_resamp, y_train_resamp)


print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(svm1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [-0.0099387   0.60988416  0.        ]


In [30]:
from sklearn.ensemble import RandomForestClassifier

rfc1 = RandomForestClassifier()
rfc1.fit = (X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(rfc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [0.92140026 0.94910773 0.95042961]


In [31]:
from sklearn.tree import DecisionTreeClassifier
dtc1 = DecisionTreeClassifier()
dtc1.fit = (X_train_oh_sc_resamp, y_train_resamp)

print("Old:", all_features_cross_val_score)
print("New:", cross_val_score(dtc1, X_train_oh_sc_resamp, y_train_resamp, cv=3))

Old: [0.78005284 0.78453404 0.79444812]
New: [0.88639366 0.90548579 0.9107733 ]


In [None]:
# smote, our data inside of the pipeline and run cross val on that pipeline
# scale train, then go back and scale the test data that includes our training data
# basically we need a smoted and non smoted sample although both are scaled. Is it smote
# then scale or scale then smote?

# Fitting, DON'T FIT ON TEST SET
# train on up sample when smoting, don't oversample on both parts of the cross val

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
sm = SMOTE()
ss = StandardScaler()
ohe = OneHotEncoder()
log = LogisticRegression()

In [49]:
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
preprocessing = make_column_transformer((OneHotEncoder(), make_column_selector(dtype_include=object)),
                                       (StandardScaler(), make_column_selector(dtype_include=np.number),
                                        SMOTE()))
#preprocessing = make_column_transformer((OneHotEncoder(), make_column_selector(dtype_include=object)),
                                                # StandardScaler(),
                                                # SMOTE(),
                                                # LogisticRegression())
preprocessing
preprocessing.fit_transform(X_train)

<2666x69 sparse matrix of type '<class 'numpy.float64'>'
	with 50654 stored elements in Compressed Sparse Row format>

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
dt_pipeline = make_pipeline(preprocessing, DecisionTreeClassifier(random_state=2021))
rf_pipeline = make_pipeline(preprocessing, RandomForestClassifier(random_state=2021))

In [54]:
cross_val_score(dt_pipeline, X_train, y_train)

array([0.89700375, 0.89305816, 0.9043152 , 0.90994371, 0.92307692])

In [48]:
scores = cross_val_score(smoted_scaled_pipeline, X_train, y_train, cv=3)
scores

Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/lib/python3.7/site-packages/imblearn/pipeline.py", line 277, in fit
    Xt, yt, fit_params = self._fit(X, y, **fit_params)
  File "/Applications/anaconda3/lib/python3.7/site-packages/imblearn/pipeline.py", line 233, in _fit
    **fit_params_steps[name]
  File "/Applications/anaconda3/lib/python3.7/site-packages/joblib/memory.py", line 355, in __call__
    return self.func(*args, **kwargs)
  File "/Applications/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/Applications/anaconda3/lib/python3.7/site-packages/sklearn/base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/Applications/anaconda3/li

array([nan, nan, nan])

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.pairplot(df[numeric_columns]);

In [None]:
# fig, ax = plt.subplots()

# fig.suptitle("Dummy Model")

# plot_confusion_matrix(dummy_model, X_train, y_train, ax=ax, cmap="plasma");

In [None]:
# import numpy as np; np.random.seed(0)
# import seaborn as sns
# ax = sns.heatmap(X_resamp_scaled)