[Solution Lecture](https://youtu.be/V9QPBMvtrVs)

[View in Colaboratory](https://colab.research.google.com/github/schwaaweb/aimlds1_07-TheMachineLearningFramework/blob/master/W07_AS--RA--The_Machine_Learning_Framework_Solution.ipynb)

In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

train_url = 'https://raw.githubusercontent.com/ryanleeallred/churn-data/master/train.csv'
train = pd.read_csv(train_url)

# We're not even going ot mess with the test dataset until we have a ROC_AUC score that's good. 
test_url = 'https://raw.githubusercontent.com/ryanleeallred/churn-data/master/test.csv'
test = pd.read_csv(test_url)

print(train.shape)
train.head()



(2222, 22)


Unnamed: 0,id,state,account_length,area_code,phone,international_plan,vmail_plan,vmail_message,day_mins,day_calls,...,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn
0,3200,CT,100,510,416-1536,yes,no,0,107.2,98,...,122,7.38,156.2,117,7.03,9.7,4,2.62,1,0
1,2129,CT,146,408,380-3329,no,yes,23,149.6,96,...,124,20.38,293.5,135,13.21,7.4,4,2.0,2,0
2,598,AZ,84,415,341-2360,no,no,0,159.0,80,...,128,14.27,167.6,101,7.54,12.3,5,3.32,1,0
3,2507,VT,43,408,331-8713,no,no,0,135.8,125,...,88,13.87,229.8,106,10.34,12.6,?,3.4,0,0
4,1248,PA,101,415,368-2074,?,no,0,193.7,108,...,?,15.86,223.0,100,?,11.6,8,3.13,0,0


In [3]:
# Change "no", "yes", to corresponding 0, 1 values.

train["international_plan"].replace(('yes', 'no'), (1, 0), inplace=True)
train["vmail_plan"].replace(('yes', 'no'), (1, 0), inplace=True)

pd.options.display.max_columns = None

train.head()

Unnamed: 0,id,state,account_length,area_code,phone,international_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn
0,3200,CT,100,510,416-1536,1,0,0,107.2,98,18.22,86.8,122,7.38,156.2,117,7.03,9.7,4,2.62,1,0
1,2129,CT,146,408,380-3329,0,1,23,149.6,96,25.43,239.8,124,20.38,293.5,135,13.21,7.4,4,2.0,2,0
2,598,AZ,84,415,341-2360,0,0,0,159.0,80,27.03,167.9,128,14.27,167.6,101,7.54,12.3,5,3.32,1,0
3,2507,VT,43,408,331-8713,0,0,0,135.8,125,23.09,163.2,88,13.87,229.8,106,10.34,12.6,?,3.4,0,0
4,1248,PA,101,415,368-2074,?,0,0,193.7,108,32.93,186.6,?,15.86,223.0,100,?,11.6,8,3.13,0,0


In [4]:
# This will take all non-numeric values and force them to be NaNs will probably lose a small amount of data here.
train = train.apply(pd.to_numeric, errors='coerce')
print(train.dtypes)

id                      int64
state                 float64
account_length          int64
area_code             float64
phone                 float64
international_plan    float64
vmail_plan            float64
vmail_message           int64
day_mins              float64
day_calls               int64
day_charge            float64
eve_mins              float64
eve_calls             float64
eve_charge            float64
night_mins            float64
night_calls             int64
night_charge          float64
intl_mins             float64
intl_calls            float64
intl_charge           float64
custserv_calls          int64
churn                   int64
dtype: object


In [5]:
# Fill NaN values with mean column values

train.fillna(train.mean(), inplace=True)

# Count the number of NaN values in each column

print(train.isnull().sum())

id                       0
state                 2222
account_length           0
area_code                0
phone                 2222
international_plan       0
vmail_plan               0
vmail_message            0
day_mins                 0
day_calls                0
day_charge               0
eve_mins                 0
eve_calls                0
eve_charge               0
night_mins               0
night_calls              0
night_charge             0
intl_mins                0
intl_calls               0
intl_charge              0
custserv_calls           0
churn                    0
dtype: int64


In [6]:
# Drop dolumns that I don't want to deal with rn.
train = train.drop('area_code', axis=1)
train = train.drop('phone', axis=1)
train = train.drop('state', axis=1)

print(train.shape)
train.head(10)

(2222, 19)


Unnamed: 0,id,account_length,international_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn
0,3200,100,1.0,0.0,0,107.2,98,18.22,86.8,122.0,7.38,156.2,117,7.03,9.7,4.0,2.62,1,0
1,2129,146,0.0,1.0,23,149.6,96,25.43,239.8,124.0,20.38,293.5,135,13.21,7.4,4.0,2.0,2,0
2,598,84,0.0,0.0,0,159.0,80,27.03,167.9,128.0,14.27,167.6,101,7.54,12.3,5.0,3.32,1,0
3,2507,43,0.0,0.0,0,135.8,125,23.09,163.2,88.0,13.87,229.8,106,10.34,12.6,4.504244,3.4,0,0
4,1248,101,0.093617,0.0,0,193.7,108,32.93,186.6,99.797225,15.86,223.0,100,8.982913,11.6,8.0,3.13,0,0
5,2582,116,0.0,0.0,0,205.0,90,34.85,140.9,114.0,11.98,272.6,96,8.982913,7.5,4.0,2.03,2,0
6,2359,71,0.0,1.0,31,115.4,90,19.62,217.4,78.0,18.48,239.9,102,8.982913,13.1,4.0,3.54,1,0
7,2224,68,0.0,0.0,0,143.6,80,24.41,134.3,65.0,11.42,215.6,84,9.7,15.5,5.0,4.19,2,0
8,1661,92,0.0,1.0,38,242.2,96,41.17,159.7,144.0,13.57,210.0,108,9.45,8.9,1.0,2.4,1,0
9,2895,33,1.0,0.0,0,164.0,99,27.88,153.1,102.0,13.01,123.8,104,8.982913,6.4,4.0,1.73,0,0


In [13]:
# Generate independent and dependent vars
Y = train['churn']
X = train.drop(['churn'], axis=1)

X.head()

Unnamed: 0,id,account_length,international_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls
0,3200,100,1.0,0.0,0,107.2,98,18.22,86.8,122.0,7.38,156.2,117,7.03,9.7,4.0,2.62,1
1,2129,146,0.0,1.0,23,149.6,96,25.43,239.8,124.0,20.38,293.5,135,13.21,7.4,4.0,2.0,2
2,598,84,0.0,0.0,0,159.0,80,27.03,167.9,128.0,14.27,167.6,101,7.54,12.3,5.0,3.32,1
3,2507,43,0.0,0.0,0,135.8,125,23.09,163.2,88.0,13.87,229.8,106,10.34,12.6,4.504244,3.4,0
4,1248,101,0.093617,0.0,0,193.7,108,32.93,186.6,99.797225,15.86,223.0,100,8.982913,11.6,8.0,3.13,0


In [24]:
# Fit Random Forest Classifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

kfold = KFold(n_splits=3, random_state=42) 
scores = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='roc_auc')
print("Mean AUC Score - Random Forest: ", scores.mean())

Mean AUC Score - Random Forest:  0.8877240000187493


In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
cross_val = KFold(n_splits=3, random_state=42)
scores = cross_val_score(model, X, Y, cv=cross_val, scoring='roc_auc')
print("Mean AUC Score - Logistic Regression: ", scores.mean())

Mean AUC Score - Logistic Regression:  0.7914651797821616


In [26]:
### Decision Tree
from sklearn.tree import DecisionTreeClassifier
model2 = DecisionTreeClassifier()
cross_val = KFold(n_splits=3, random_state=42)
scores = cross_val_score(model2, X, Y, cv=cross_val, scoring='roc_auc')
print("Mean AUC Score - Decision Tree: ", scores.mean())

Mean AUC Score - Decision Tree:  0.7924412056446318


In [27]:
### Naive Bayes
from sklearn.naive_bayes import GaussianNB
model4 = GaussianNB()
cross_val = KFold(n_splits=3, random_state=42)
scores = cross_val_score(model4, X, Y, cv=cross_val, scoring='roc_auc')
print("Mean AUC Score - Gaussian Naive Bayes: ", scores.mean())

Mean AUC Score - Gaussian Naive Bayes:  0.8298956230587861


In [28]:
### K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
model5 = KNeighborsClassifier()
cross_val = KFold(n_splits=3, random_state=42)
scores = cross_val_score(model5, X, Y, cv=cross_val, scoring='roc_auc')
print("Mean AUC Score - K-Nearest Neighbors: ", scores.mean())

Mean AUC Score - K-Nearest Neighbors:  0.5987537195869482


In [29]:
### Support Vector Machine
from sklearn.svm import SVC
model1 = SVC()
cross_val = KFold(n_splits=3, random_state=42)
scores = cross_val_score(model, X, Y, cv=cross_val, scoring='roc_auc')
print("Mean AUC Score - Support Vector Machine: ", scores.mean())

Mean AUC Score - Support Vector Machine:  0.7914651797821616


In [32]:
#test

In [30]:
# Lets clean the test data now.

test["international_plan"].replace(('yes', 'no'), (1, 0), inplace=True)
test["vmail_plan"].replace(('yes', 'no'), (1, 0), inplace=True)

test = test.apply(pd.to_numeric, errors='coerce')

test.fillna(test.mean(), inplace=True)

test = test.drop('area_code', axis=1)
test = test.drop('phone', axis=1)
test = test.drop('state', axis=1)

test.head(20)

TypeError: Cannot compare types 'ndarray(dtype=float64)' and 'str'

In [33]:
# Fit to the whole train dataset

RandomForest = RandomForestClassifier()
RandomForest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
# and Predict on the test dataset

# Need to split test into X_test Y_test
Y_test = test['churn']
X_test = test.drop(['churn'], axis=1)

predicted_values = RandomForest.predict(X_test)

final_test = X_test
final_test['churn'] = 0
final_test['churn'] = predicted_values

final_test.head(30)

Unnamed: 0,id,account_length,international_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn
0,221,127,0.0,1.0,22,166.0,114,28.22,174.5,103.0,14.83,244.9,68,11.02,10.2,6.0,2.75,1,0
1,1313,100,0.0,0.0,0,235.8,130,40.09,176.0,100.865815,14.96,63.6,122,2.86,7.3,1.0,1.97,2,0
2,1635,66,0.0,0.0,0,154.0,133,26.18,198.9,121.0,16.91,151.9,100,6.84,9.5,3.0,2.57,4,0
3,289,89,0.0,0.0,0,303.9,95,51.66,260.9,114.0,22.18,312.1,89,14.04,5.3,4.487832,1.43,1,1
4,2416,113,0.108108,0.0,0,156.0,141,26.52,256.8,72.0,21.83,175.3,123,7.89,11.9,5.0,3.21,2,0
5,3112,115,0.108108,0.0,0,139.3,89,23.68,192.3,95.0,16.35,151.0,75,6.8,9.3,3.0,2.51,7,1
6,601,62,1.0,0.0,0,159.7,86,27.15,197.5,76.0,16.79,121.6,105,5.47,13.9,6.0,3.75,0,1
7,326,121,0.108108,1.0,35,68.7,95,11.68,209.2,69.0,17.78,197.4,42,8.88,11.4,4.0,3.08,1,0
8,2783,79,0.0,1.0,17,167.9,114,28.54,243.7,93.0,20.71,211.9,114,9.54,9.1,2.0,2.46,1,0
9,454,97,0.0,0.0,0,256.4,125,43.59,273.9,100.0,23.28,222.7,101,10.02,11.1,4.487832,3.0,1,1


In [None]:
# Export final_test to csv

from google.colab import files

final_test.to_csv("ra_randomforest.csv", index = False)

files.download('ra_randomforest.csv')