In [56]:
# Imports

# Data tools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit learn processing tools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

# scitkit learn models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# scikit learn metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve

# Imbalanced Learn
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline

# misc
from pprint import pprint

In [2]:
# Set options
pd.set_option('display.max_columns', None)
%matplotlib inline

In [3]:
# Binary confusion matrices are consturcted as follows:
# [[tn, fp],
#  [fn, tp]]

In [4]:
def print_scores(estimator, estimator_title, xtr, xtst, ytr, ytst):
    print('{} Score train:'.format(estimator_title), estimator.score(xtr, ytr))
    print('{} Score test:'.format(estimator_title), estimator.score(xtst, ytst))
    print(classification_report(ytst, estimator.predict(xtst)))

In [5]:
# Read in data set
tc = pd.read_csv('../data/processed/telecom_churn_cleaned.csv')
tc.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CA,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510,churn
0,128,0.0,1.0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,107,0.0,1.0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,137,0.0,0.0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,84,1.0,0.0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,75,1.0,0.0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [6]:
tc.describe()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CA,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510,churn
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,0.09691,0.276628,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856,0.015602,0.024002,0.016502,0.019202,0.010201,0.019802,0.022202,0.016202,0.018302,0.018902,0.016202,0.015902,0.013201,0.021902,0.017402,0.021302,0.021002,0.017702,0.015302,0.019502,0.021002,0.018602,0.021902,0.025203,0.018902,0.019502,0.020402,0.020402,0.018602,0.018302,0.016802,0.020402,0.018602,0.019802,0.024902,0.023402,0.018302,0.023402,0.013501,0.019502,0.018002,0.018002,0.015902,0.021602,0.021602,0.023102,0.021902,0.019802,0.023402,0.031803,0.023102,0.251425,0.49655,0.252025,0.144914
std,39.822106,0.295879,0.447398,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491,0.123947,0.153079,0.127414,0.137255,0.100499,0.13934,0.147363,0.126269,0.134061,0.136199,0.126269,0.125114,0.114153,0.146386,0.130782,0.144411,0.143413,0.131885,0.122768,0.138302,0.143413,0.135134,0.146386,0.156763,0.136199,0.138302,0.141392,0.141392,0.135134,0.134061,0.128547,0.141392,0.135134,0.13934,0.155851,0.1512,0.134061,0.1512,0.115426,0.138302,0.132978,0.132978,0.125114,0.145402,0.145402,0.150251,0.146386,0.13934,0.1512,0.175502,0.150251,0.433897,0.500063,0.434241,0.352067
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,74.0,0.0,0.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,101.0,0.0,0.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,127.0,0.0,1.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,243.0,1.0,1.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
tc = tc.drop('CA', axis=1)
tc.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510,churn
0,128,0.0,1.0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,107,0.0,1.0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,137,0.0,0.0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,84,1.0,0.0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,75,1.0,0.0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [8]:
# Set up train/test split, create some extra copies to investigate scaling
y_orig = tc['churn'].values.copy()
X_orig = tc.drop('churn', axis=1).copy()
X, X_holdout, y, y_holdout = train_test_split(X_orig, y_orig, test_size=0.2,
                                                          random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                          random_state=42, shuffle=True)
X_train.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510
42,34,0.0,0.0,0,124.8,82,21.22,282.2,98,23.99,311.5,78,14.02,10.0,4,2.7,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1166,72,0.0,0.0,0,118.2,106,20.09,167.2,136,14.21,214.2,106,9.64,12.2,3,3.29,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1895,86,0.0,1.0,21,197.9,99,33.64,165.6,100,14.08,208.0,120,9.36,10.1,9,2.73,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1083,82,1.0,0.0,0,208.8,101,35.5,213.7,87,18.16,175.1,86,7.88,12.4,6,3.35,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
129,131,0.0,1.0,36,214.2,115,36.41,161.7,117,13.74,264.7,102,11.91,9.5,4,2.57,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### Steps to test for each model
1. StandardScaler fit/predict
2. Variations on hyperparameters
3. Balanced dataset fit predict (Random/Smote/Adyson)
4. Variations on hypterparameters with balanced data

In [9]:
steps = [('scaler', StandardScaler()),
         ('LR', LogisticRegression(solver='lbfgs', max_iter=1000))]

pipeline = Pipeline(steps)

parameters = {'LR__C' : [12000, 11000, 10000, 9000, 8000]}

cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy Train: {}".format(cv.score(X_test, y_test)))
print("Accuracy Test: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy Train: 0.8670411985018727
Accuracy Test: 0.8670411985018727
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       456
           1       0.62      0.23      0.34        78

    accuracy                           0.87       534
   macro avg       0.75      0.60      0.63       534
weighted avg       0.84      0.87      0.84       534

Tuned Model Parameters: {'LR__C': 10000}


#### Logistic Regression continually gets the best score at C=10000. Using C=10000 for further testing.  
#### Next step is try over sampling with Logistic Regression.

In [39]:
# Use Random Oversamping to see if score improves
steps = [('OR', RandomOverSampler(sampling_strategy='minority',random_state=42)),
         ('scaler', StandardScaler()),
         ('LR', LogisticRegression(solver='lbfgs', C=10000, max_iter=1000))]

lr_pipe = Pipeline(steps)

cross_val_score(lr_pipe, X_train, y_train, cv=5, n_jobs=-1)

# lr_pipe.fit(X, y)
# y_pred = lr_pipe.predict(X_test)

# # Compute and print metrics
# print("Accuracy Train: {}".format(lr_pipe.score(X_test, y_test)))
# print("Accuracy Test: {}".format(lr_pipe.score(X_test, y_test)))
# print(classification_report(y_test, y_pred))

array([0.76580796, 0.76112412, 0.75175644, 0.75117371, 0.75058824])

#### Random Oversampling decreased score. Trying ADASYN.

In [40]:
# Use Adasyn Oversamping to see if score improves
steps = [('ADASYN', ADASYN(sampling_strategy='minority', random_state=42)),
         ('scaler', StandardScaler()),
         ('LR', LogisticRegression(solver='lbfgs', C=10000, max_iter=1000))]

lr_pipe = Pipeline(steps)
cross_val_score(lr_pipe, X_train, y_train, cv=5, n_jobs=-1)

# lr_pipe.fit(X, y)
# y_pred = lr_pipe.predict(X_test)

# # Compute and print metrics
# print("Accuracy Train: {}".format(lr_pipe.score(X_test, y_test)))
# print("Accuracy Test: {}".format(lr_pipe.score(X_test, y_test)))
# print(classification_report(y_test, y_pred))

array([0.77283372, 0.74238876, 0.73067916, 0.72769953, 0.74352941])

#### ADASYN performed worse on average than Random Oversampling.  
#### Trying SMOTE.

In [41]:
# Use SMOTE Oversamping to see if score improves
steps = [('SMOTE', SMOTE(sampling_strategy='minority', random_state=42)),
         ('scaler', StandardScaler()),
         ('LR', LogisticRegression(solver='lbfgs', C=10000, max_iter=1000))]

lr_pipe = Pipeline(steps)
cross_val_score(lr_pipe, X_train, y_train, cv=5, n_jobs=-1)

# lr_pipe.fit(X, y)
# y_pred = lr_pipe.predict(X_test)

# # Compute and print metrics
# print("Accuracy Train: {}".format(lr_pipe.score(X_test, y_test)))
# print("Accuracy Test: {}".format(lr_pipe.score(X_test, y_test)))
# print(classification_report(y_test, y_pred))

array([0.79156909, 0.7587822 , 0.75175644, 0.75117371, 0.75764706])

#### SMOTE comes in between Random Oversampling and ADASYN.  
#### Trying SMOTENC

In [34]:
# This will create a list of indexes for feature (column) in the dataframe that is categorical.
X.AK.unique().shape[0]

categoricals = []
for idx, val in enumerate(X.columns):
    if X[val].unique().shape[0] == 2:
        categoricals.append(idx)
# categoricals

In [42]:
# Use SMOTE Oversamping to see if score improves
steps = [('SMOTENC', SMOTENC(categoricals, sampling_strategy='minority', random_state=42)),
         ('scaler', StandardScaler()),
         ('LR', LogisticRegression(solver='lbfgs', C=10000, max_iter=1000))]

lr_pipe = Pipeline(steps)
cross_val_score(lr_pipe, X_train, y_train, cv=5, n_jobs=-1)

# lr_pipe.fit(X, y)
# y_pred = lr_pipe.predict(X_test)

# # Compute and print metrics
# print("Accuracy Train: {}".format(lr_pipe.score(X_test, y_test)))
# print("Accuracy Test: {}".format(lr_pipe.score(X_test, y_test)))
# print(classification_report(y_test, y_pred))

array([0.8618267 , 0.8618267 , 0.83138173, 0.85915493, 0.86352941])

### Performance of Logistic Regression is best without oversampling.  
### Start testing with Naive Bayes

In [43]:
nb = GaussianNB()
_ = nb.fit(X_train, y_train)

In [48]:
steps = [('scaler', StandardScaler()),
         ('NB', GaussianNB())]

pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = pipeline.predict(X_test)

# Compute and print metrics
print("Accuracy Train: {}".format(pipeline.score(X_test, y_test)))
print("Accuracy Test: {}".format(pipeline.score(X_test, y_test)))
print(classification_report(y_test, y_pred))

Accuracy Train: 0.5955056179775281
Accuracy Test: 0.5955056179775281
              precision    recall  f1-score   support

           0       0.90      0.59      0.72       456
           1       0.20      0.60      0.30        78

    accuracy                           0.60       534
   macro avg       0.55      0.60      0.51       534
weighted avg       0.80      0.60      0.65       534



In [49]:
# Use Random Oversamping to see if score improves
steps = [('OR', RandomOverSampler(sampling_strategy='minority',random_state=42)),
         ('scaler', StandardScaler()),
         ('NB', GaussianNB())]

nb_pipe = Pipeline(steps)

cross_val_score(nb_pipe, X_train, y_train, cv=5, n_jobs=-1)

array([0.52459016, 0.56674473, 0.44730679, 0.33333333, 0.52235294])

In [16]:
def get_categoricals(frame):
    categoricals = []
    for idx, val in enumerate(frame.columns):
        if frame[val].nunique() == 2:
            categoricals.append(idx)
    return categoricals
# get_categoricals(X_train)

In [89]:
def get_discrete_cont(frame, labels=True):
    discrete_cont = []
    for idx, val in enumerate(frame.columns):
        if frame[val].nunique() > 50:
            if labels:
                discrete_cont.append(val)
            else:
                discrete_cont.append(idx)
    return discrete_cont
# get_discrete_cont(X_train, True)

In [105]:
def apply_cut(frame, cuts=25, labels=False):
    new_frame = frame.copy()
    discrete_cont = get_discrete_cont(frame, labels=True)
    for val in discrete_cont:
        new_frame[val] = pd.qcut(new_frame[val], cuts, labels=labels)
    return new_frame
# apply_qcut(X_train).head(3)

In [106]:
C = apply_cut(X)
C_train, C_test, d_train, d_test = train_test_split(C, y)
C.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510
817,24,0.0,0.0,0,1,7,1,5,0,5,22,20,22,2,6,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1373,14,0.0,0.0,0,2,14,2,11,17,11,14,8,13,0,4,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
679,6,1.0,0.0,0,19,3,19,24,17,24,13,14,13,6,9,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
56,21,0.0,0.0,0,4,11,4,8,0,8,2,23,2,4,2,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1993,8,0.0,0.0,0,18,9,18,22,2,22,14,17,14,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [107]:
steps = [('NB', MultinomialNB())]

pipeline = Pipeline(steps)
pipeline.fit(C_train, d_train)

# Predict the labels of the test set: y_pred
d_pred = pipeline.predict(C_test)

# Compute and print metrics
print("Accuracy Train: {}".format(pipeline.score(C_test, d_test)))
print("Accuracy Test: {}".format(pipeline.score(C_test, d_test)))
print(classification_report(d_test, d_pred))

Accuracy Train: 0.5817091454272864
Accuracy Test: 0.5817091454272864
              precision    recall  f1-score   support

           0       0.92      0.57      0.70       576
           1       0.20      0.68      0.31        91

    accuracy                           0.58       667
   macro avg       0.56      0.62      0.50       667
weighted avg       0.82      0.58      0.65       667



In [100]:
Q.head(10)

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,AK,AL,AR,AZ,CO,CT,DC,DE,FL,GA,HI,IA,ID,IL,IN,KS,KY,LA,MA,MD,ME,MI,MN,MO,MS,MT,NC,ND,NE,NH,NJ,NM,NV,NY,OH,OK,OR,PA,RI,SC,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY,408,415,510
817,9,0.0,0.0,0,0,3,0,2,0,2,9,8,9,0,6,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1373,5,0.0,0.0,0,1,5,1,4,6,4,5,3,5,0,4,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
679,2,1.0,0.0,0,7,1,7,9,7,9,5,5,5,2,9,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
56,8,0.0,0.0,0,1,4,1,3,0,3,1,9,1,1,2,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1993,3,0.0,0.0,0,7,3,7,9,1,9,5,6,5,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1818,2,0.0,0.0,0,2,9,2,4,7,4,9,9,9,7,3,7,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2248,9,0.0,1.0,20,8,5,8,5,7,5,9,9,9,8,3,8,5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2899,2,0.0,1.0,36,5,7,5,8,1,8,6,9,6,0,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
468,3,0.0,1.0,32,0,9,0,2,8,2,8,5,8,6,5,6,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2110,5,0.0,1.0,25,2,4,2,3,1,3,3,3,3,4,6,4,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
