In [1]:
import warnings

import pandas as pd

import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier as KNN

from sklearn.linear_model import LogisticRegression as LR

from sklearn.ensemble import RandomForestClassifier as RF

from sklearn.model_selection import KFold

In [2]:
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
#read files
churn_df = pd.read_csv('churn.csv')
print(churn_df[:5])

  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  \
0    KS             128        415  382-4657         no        yes   
1    OH             107        415  371-7191         no        yes   
2    NJ             137        415  358-1921         no         no   
3    OH              84        408  375-9999        yes         no   
4    OK              75        415  330-6626        yes         no   

   VMail Message  Day Mins  Day Calls  Day Charge   ...    Eve Calls  \
0             25     265.1        110       45.07   ...           99   
1             26     161.6        123       27.47   ...          103   
2              0     243.4        114       41.38   ...          110   
3              0     299.4         71       50.90   ...           88   
4              0     166.7        113       28.34   ...          122   

   Eve Charge  Night Mins  Night Calls  Night Charge  Intl Mins  Intl Calls  \
0       16.78       244.7           91         11.01       10.0    

In [6]:
#data cleaning
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']

df = churn_df.drop(to_drop, axis=1)

print(df[:5])

   Account Length Int'l Plan VMail Plan  VMail Message  Day Mins  Day Calls  \
0             128         no        yes             25     265.1        110   
1             107         no        yes             26     161.6        123   
2             137         no         no              0     243.4        114   
3              84        yes         no              0     299.4         71   
4              75        yes         no              0     166.7        113   

   Day Charge  Eve Mins  Eve Calls  Eve Charge  Night Mins  Night Calls  \
0       45.07     197.4         99       16.78       244.7           91   
1       27.47     195.5        103       16.62       254.4          103   
2       41.38     121.2        110       10.30       162.6          104   
3       50.90      61.9         88        5.26       196.9           89   
4       28.34     148.3        122       12.61       186.9          121   

   Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls  
0   

In [9]:
label = churn_df['Churn?']

y = np.where(label=='True.',1,0)

yes_no_cols = ["Int'l Plan","VMail Plan"]

df[yes_no_cols] = df[yes_no_cols] == 'yes'

features = df.columns

X = df.values.astype(np.float)

print(X[:5])
print(features)

[[128.     0.     0.    25.   265.1  110.    45.07 197.4   99.    16.78
  244.7   91.    11.01  10.     3.     2.7    1.  ]
 [107.     0.     0.    26.   161.6  123.    27.47 195.5  103.    16.62
  254.4  103.    11.45  13.7    3.     3.7    1.  ]
 [137.     0.     0.     0.   243.4  114.    41.38 121.2  110.    10.3
  162.6  104.     7.32  12.2    5.     3.29   0.  ]
 [ 84.     0.     0.     0.   299.4   71.    50.9   61.9   88.     5.26
  196.9   89.     8.86   6.6    7.     1.78   2.  ]
 [ 75.     0.     0.     0.   166.7  113.    28.34 148.3  122.    12.61
  186.9  121.     8.41  10.1    3.     2.73   3.  ]]
Index(['Account Length', 'Int'l Plan', 'VMail Plan', 'VMail Message',
       'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls',
       'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins',
       'Intl Calls', 'Intl Charge', 'CustServ Calls'],
      dtype='object')


In [10]:
#z_score method
scaler = StandardScaler()

X = scaler.fit_transform(X)

print(X[:5])

[[ 0.67648946  0.          0.          1.23488274  1.56676695  0.47664315
   1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
   0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
 [ 0.14906505  0.          0.          1.30794844 -0.33373776  1.12450284
  -0.33401297 -0.10808036  0.14486653 -0.10754944  1.05857074  0.14782467
   1.05938994  1.24048169 -0.60119509  1.2411686  -0.42793202]
 [ 0.9025285   0.          0.         -0.59175986  1.16830364  0.67598459
   1.168464   -1.57338336  0.49627857 -1.57389963 -0.75686906  0.19893459
  -0.75557074  0.70312091  0.21153386  0.69715637 -1.1882185 ]
 [-0.42859027  0.          0.         -0.59175986  2.19659605 -1.46693591
   2.19675881 -2.74286476 -0.60815927 -2.7432675  -0.07855114 -0.5677142
  -0.07880574 -1.30302599  1.02426282 -1.30640087  0.33235445]
 [-0.6546293   0.          0.         -0.59175986 -0.24008971  0.62614923
  -0.24004065 -1.03893233  1.0986992  -1.03793936 -0.27631146  1.06780322
  -0.2765

In [17]:
# Training
def train_cv(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
        return y_pred

def accuracy(y_ture, y_pred):

    return np.mean(y_ture==y_pred)

In [18]:
print("logistic regression:")
print("%.3f"% accuracy(y,train_cv(X, y, LR)))  
print("Random Forest:")
print("%.3f"% accuracy(y,train_cv(X, y, RF))) 
print("k-nearst-neighbors:")
print("%.3f"% accuracy(y,train_cv(X, y, KNN))) 

logistic regression:
0.972
Random Forest:
0.983
k-nearst-neighbors:
0.980
