In [10]:
import warnings
warnings.simplefilter(action= 'ignore', category=FutureWarning)
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np 

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import recall_score, accuracy_score

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,  GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from itertools import product

s3 = boto3.resource('s3')
bucket_name = "rachaeld-data445"
bucket = s3.Bucket(bucket_name)

file_key = 'churn-bigml-80.csv'
file_key2 = 'churn-bigml-20.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

bucket_object2 = bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')

#reading the data-file
telecom_train = pd.read_csv(file_content_stream)
telecom_test = pd.read_csv(file_content_stream2)
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [11]:
telecom_train = pd.DataFrame(telecom_train)
telecom_test = pd.DataFrame(telecom_test)

In [12]:
#Using the numpy library, create a variable in telecom_train called Churn numb
#that takes the value of 1 when Churn = True and 0 when Churn = False.
telecom_train= telecom_train.assign(churn_numb= telecom_train['Churn'].astype(int))
telecom_test= telecom_test.assign(churn_numb= telecom_test['Churn'].astype(int))

#Change the International plan variable from a categorical variable to a numerical
#variable. That is, change Yes to 1 and No to 0 in both data-frames: telecom train
#and telecom test.
telecom_train['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['International_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Change the Voice mail plan variable from a categorical variable to a numerical
#variable. That is, change Yes to 1 and No to 0 in both data-frames: telecom train
#and telecom test.
#Changing Voice_mail_plan yes/no to 1/0
telecom_train['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)
telecom_test['Voice_mail_plan'].replace(['Yes', 'No'], [1,0], inplace= True)

#Create a new variable called: total charge as the sum of Total day charge,
#Total eve charge, Total night charge, and Total intl charge in both dataframes: telecom train and telecom test.
telecom_train= telecom_train.assign(total_charge= telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge']+ telecom_train['Total_intl_charge'])
telecom_test= telecom_test.assign(total_charge= telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge']+ telecom_test['Total_intl_charge'])
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,churn_numb,total_charge
0,LA,117,408,0,0,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False,0,73.32
1,IN,65,415,0,0,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True,1,54.2
2,NY,161,415,0,0,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True,1,92.29
3,SC,111,415,0,0,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False,0,41.05
4,HI,49,510,0,0,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False,0,49.6


In [13]:
#(5 points) In both data-frames telecom train and telecom test, only keep the following variables: Account length, International plan, Voice mail plan, total charge,
#Customer service calls, and Churn numb.
telecom_train = telecom_train.drop(columns = ['State', 'Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 
                                             'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls', 'Total_eve_charge', 'Total_night_minutes',
                                              'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge', 'Churn'], axis = 1)
telecom_test = telecom_test.drop(columns = ['State', 'Area_code', 'Number_vmail_messages', 'Total_day_minutes', 'Total_day_calls', 
                                             'Total_day_charge', 'Total_eve_minutes', 'Total_eve_calls', 'Total_eve_charge', 'Total_night_minutes',
                                             'Total_night_calls', 'Total_night_charge', 'Total_intl_minutes', 'Total_intl_calls', 'Total_intl_charge', 'Churn'], axis = 1)
telecom_train.head()

Unnamed: 0,Account_length,International_plan,Voice_mail_plan,Customer_service_calls,churn_numb,total_charge
0,128,0,1,1,0,75.56
1,107,0,1,1,0,59.24
2,137,0,0,0,0,62.29
3,84,1,0,2,0,66.8
4,75,1,0,3,0,52.09


In [7]:
#(20 points) Consider the telecom train dataset. Using Account length, International plan,
#Voice mail plan, total charge, and Customer service calls as the input variables,
#and Churn is the target variable. Do the following:

# input and target variables
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'Customer_service_calls', 'total_charge']]
Y = telecom_train['churn_numb']

#Split the data into train (80%) and test (20%) taking into account the proportion
#of 0s and 1s in the data. That is, if Y is the target variable, in train test split
#function, you need to add the extra argument stratify = Y

# splitting data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = .2, stratify = Y)

In [None]:
#Repeat steps (1)-(3) 1000 times. Compute the average importance of each of the variables across the 100 splits and the three models. After that, select the top 4 variables
#(the ones with top 4 average importance) as the predictor variables.
RF_res = list()
ADA_res = list()
GB_res = list()

for i in range(0,1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = .2, stratify = Y)
#(i) Fit a random forest model with 500 trees and depth equal to 3 to the train
#dataset. Extract the importance of variables.
    #building the model
    RF_md = RandomForestClassifier(n_estimators = 500).fit(X_train, Y_train)
    #extracting the importances
    RF_res.append(RF_md.feature_importances_)

#(ii) Fit an AdaBoost model with 500 trees, depth equal to 3, and learning rate
#equal to 0.01 to the train dataset. Extract the importance of variables.
    #building the model
    ADA_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = .01).fit(X_train, Y_train)
    #extracting the importances
    ADA_res.append(ADA_md.feature_importances_)

#(iii) Fit a gradient boosting model with 500 trees, depth equal to 3, and learning
#rate equal to 0.01 to the train dataset. Extract the importance of variables.
    #building the model 
    GB_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = .01).fit(X_train, Y_train)
    #extracting the importances
    GB_res.append(GB_md.feature_importances_)
    
    print('Iteration number:', i)


Iteration number: 0
Iteration number: 1
Iteration number: 2
Iteration number: 3
Iteration number: 4
Iteration number: 5
Iteration number: 6
Iteration number: 7
Iteration number: 8
Iteration number: 9
Iteration number: 10
Iteration number: 11
Iteration number: 12
Iteration number: 13
Iteration number: 14
Iteration number: 15
Iteration number: 16
Iteration number: 17
Iteration number: 18
Iteration number: 19
Iteration number: 20
Iteration number: 21
Iteration number: 22
Iteration number: 23
Iteration number: 24
Iteration number: 25
Iteration number: 26
Iteration number: 27
Iteration number: 28
Iteration number: 29
Iteration number: 30
Iteration number: 31
Iteration number: 32
Iteration number: 33
Iteration number: 34
Iteration number: 35
Iteration number: 36
Iteration number: 37
Iteration number: 38
Iteration number: 39
Iteration number: 40
Iteration number: 41
Iteration number: 42
Iteration number: 43
Iteration number: 44
Iteration number: 45
Iteration number: 46
Iteration number: 47
It

In [11]:
## Random Forest importance
rf_imp = pd.DataFrame(RF_res)
rf_imp.columns = [['Account_length', 'International_plan', 'Voice_mail_plan', 'Customer_service_calls', 'total_charge']]
rf_imp.apply(np.mean, axis = 0)

Account_length            0.145778
International_plan        0.079332
Voice_mail_plan           0.071425
Customer_service_calls    0.162868
total_charge              0.540598
dtype: float64

In [12]:
## ADA importance
print('ADA importance')
ada_imp = pd.DataFrame(ADA_res)
ada_imp.columns = [['Account_length', 'International_plan', 'Voice_mail_plan', 'Customer_service_calls', 'total_charge']]
ada_imp.apply(np.mean, axis = 0)

ADA importance


Account_length            0.301072
International_plan        0.248960
Voice_mail_plan           0.035027
Customer_service_calls    0.072636
total_charge              0.342305
dtype: float64

In [13]:
## Gradient boosting importance
print('ADA importance')
gb_imp = pd.DataFrame(GB_res)
gb_imp.columns = [['Account_length', 'International_plan', 'Voice_mail_plan', 'Customer_service_calls', 'total_charge']]
gb_imp.apply(np.mean, axis = 0)

ADA importance


Account_length            0.015232
International_plan        0.117347
Voice_mail_plan           0.132464
Customer_service_calls    0.163761
total_charge              0.571196
dtype: float64

In [14]:
### from the above we can see that RFs lowest is  
    ### ada boosts lowest is 
    ### gradients boostings lowest is
### therefore we would remove 
X = X.drop(columns = ['Voice_mail_plan' ], axis = 1)
#telecom_test = telecom_test.drop(columns = [ ], axis = 1)

In [12]:
#(e) (45 points) Consider the telecom train dataset. Using Churn as the target variable,
#and the remaining variables as the input variables. Do the following:
# input and target variables
X = telecom_train[['Account_length', 'International_plan', 'Customer_service_calls', 'total_charge']]
Y = telecom_train['churn_numb']

#Split the data into train (80%) and test (20%) taking into account the proportion
#of 0s and 1s in the data. That is, if Y is the target variable, in train test split
#function, you need to add the extra argument stratify = Y.

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = .2, stratify = Y)



In [20]:
#Using the train dataset, build random forest models with the following setting:
#n_tree = [100, 500, 1000, 1500, 2000] and depth = [3, 5, 7]. In order to create a data-frame that contains all the combinations of trees and
#depths, you can use the following code:

def expand_grid(RF_dictionary):
    return pd.DataFrame([row for row in product(*RF_dictionary.values())],
    columns = RF_dictionary.keys())

RF_dictionary = {'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7]}
RF_parameters = expand_grid(RF_dictionary)
RF_parameters['Recall']= np.nan
RF_parameters['Accuracy']= np.nan
RF_parameters

Unnamed: 0,n_tree,depth,Recall,Accuracy
0,100,3,,
1,100,5,,
2,100,7,,
3,500,3,,
4,500,5,,
5,500,7,,
6,1000,3,,
7,1000,5,,
8,1000,7,,
9,1500,3,,


In [21]:
#Using the train dataset, build AdaBoost models with the following setting:
#n_tree = [100, 500, 1000, 1500, 2000], depth = [3, 5, 7], and learning rate
#= [0.1, 0.01, 0.001]. In order to create a data-frame that contains all the
#combinations of trees, depths, and learning rates, you can use the following
#code:
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*ADA_dictionary.values())],
    columns = ADA_dictionary.keys())
ADA_dictionary = {'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]}
ADA_parameters = expand_grid(ADA_dictionary)
ADA_parameters['Recall']= np.nan
ADA_parameters['Accuracy']= np.nan
ADA_parameters

Unnamed: 0,n_tree,depth,learning_rate,Recall,Accuracy
0,100,3,0.1,,
1,100,3,0.01,,
2,100,3,0.001,,
3,100,5,0.1,,
4,100,5,0.01,,
5,100,5,0.001,,
6,100,7,0.1,,
7,100,7,0.01,,
8,100,7,0.001,,
9,500,3,0.1,,


In [22]:
# Using the train dataset, build gradient boosting models with the following setting: n_tree = [100, 500, 1000, 1500, 2000], depth = [3, 5, 7], and
#learning rate = [0.1, 0.01, 0.001]. In order to create a data-frame that
#contains all the combinations of trees, depths, and learning rates, you can use
#the following code:

def expand_grid(GB_dictionary):
    return pd.DataFrame([row for row in product(*GB_dictionary.values())],
    columns = GB_dictionary.keys())
GB_dictionary = {'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]}
GB_parameters = expand_grid(GB_dictionary)
GB_parameters['Recall']= np.nan
GB_parameters['Accuracy']= np.nan
GB_parameters

Unnamed: 0,n_tree,depth,learning_rate,Recall,Accuracy
0,100,3,0.1,,
1,100,3,0.01,,
2,100,3,0.001,,
3,100,5,0.1,,
4,100,5,0.01,,
5,100,5,0.001,,
6,100,7,0.1,,
7,100,7,0.01,,
8,100,7,0.001,,
9,500,3,0.1,,


In [None]:
import warnings
warnings.simplefilter(action= 'ignore', category=FutureWarning)

RF_recall= pd.DataFrame()
RF_accuracy= pd.DataFrame()

ADA_recall= pd.DataFrame()
ADA_accuracy= pd.DataFrame()

GB_recall= pd.DataFrame()
GB_accuracy= pd.DataFrame()

#Running loops to find best model
for i in range (0,100):

    #Splitting the Data
    X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size= 0.2, stratify= Y)
    

### RANDOM FOREST ###
    for k in range(len(RF_parameters)):
        #Buidling the model
        RF_md= RandomForestClassifier(n_estimators= RF_parameters['n_tree'][k], max_depth=RF_parameters['depth'][k]).fit(X_train, Y_train)
        #Predicting on the model
        RF_pred= RF_md.predict_proba(X_test)[:,1]
        #changing liklihoods to labels 
        RF_labels= np.where(RF_pred < .1, 0, 1)
        RF_recall.loc[i, k]= recall_score(Y_test, RF_labels)
        RF_accuracy.loc[i, k]= accuracy_score(Y_test, RF_labels)
        #print('RF iteration num', k)
        
### ADA BOOST ###
    for m in range(len(ADA_parameters)):
        #Building the model
        ADA_md= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= ADA_parameters['n_tree'][m]),
                                    n_estimators= ADA_parameters['depth'][m], learning_rate= ADA_parameters['learning_rate'][m]).fit(X_train, Y_train)
        #Predicting on the model
        ADA_pred= ADA_md.predict_proba(X_test)[:,1]
        #changing liklihoods to labels 
        ADA_labels= np.where(ADA_pred < .1, 0, 1)
        ADA_recall.loc[i,m]= recall_score(Y_test, ADA_labels)
        ADA_accuracy.loc[i,m]= accuracy_score(Y_test, ADA_labels)
        #print('ADA iteration num', m)
        
### GRADIENT BOOSTING ###
    for n in range(len(GB_parameters)):
        #Building the model
        GB_md= GradientBoostingClassifier(max_depth= GB_parameters['depth'][n], n_estimators= GB_parameters['n_tree'][n],
                                          learning_rate= GB_parameters['learning_rate'][n]).fit(X_train, Y_train)
        #Predicting on the model
        GB_pred= GB_md.predict_proba(X_test)[:,1]
        #changing liklihoods to labels 
        GB_labels= np.where(GB_pred < .1, 0, 1)
        GB_recall.loc[i,n] = recall_score(Y_test, GB_labels)
        GB_accuracy.loc[i,n] = accuracy_score(Y_test, GB_labels)
        #print('GB iteration num', n)
    print('whole loop iteration num:', i)


whole loop iteration num: 0
whole loop iteration num: 1
whole loop iteration num: 2
whole loop iteration num: 3
whole loop iteration num: 4
whole loop iteration num: 5
whole loop iteration num: 6
whole loop iteration num: 7
whole loop iteration num: 8
whole loop iteration num: 9
whole loop iteration num: 10
whole loop iteration num: 11
whole loop iteration num: 12
whole loop iteration num: 13
whole loop iteration num: 14
whole loop iteration num: 15
whole loop iteration num: 16
whole loop iteration num: 17
whole loop iteration num: 18
whole loop iteration num: 19
whole loop iteration num: 20
whole loop iteration num: 21
whole loop iteration num: 22
whole loop iteration num: 23
whole loop iteration num: 24
whole loop iteration num: 25
whole loop iteration num: 26
whole loop iteration num: 27
whole loop iteration num: 28
whole loop iteration num: 29
whole loop iteration num: 30
whole loop iteration num: 31
whole loop iteration num: 32
whole loop iteration num: 34
whole loop iteration num

In [52]:
RF_recall.describe()
##one with the highest- 0 
    ## would go with model 0, although the accuracy is not the highest we would be okay with this because we do not want to classify 
    ## someone that isn't going to cancel as a cancel, because that would mean losing a customer and has a hugher cost than misclassifying someone
    ## that is going to cancel as someone who isn't. 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.859359,0.858718,0.856795,0.858974,0.85859,0.857308,0.858974,0.858718,0.856923,0.858718,0.858718,0.857179,0.858718,0.858846,0.857179
std,0.029745,0.029774,0.030221,0.029664,0.029633,0.029476,0.029552,0.029663,0.029871,0.02983,0.029886,0.029832,0.02983,0.029914,0.029943
min,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051,0.782051
25%,0.846154,0.846154,0.833333,0.846154,0.846154,0.842949,0.846154,0.846154,0.842949,0.846154,0.846154,0.846154,0.846154,0.846154,0.846154
50%,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974,0.858974
75%,0.875,0.875,0.871795,0.875,0.875,0.871795,0.871795,0.875,0.871795,0.871795,0.875,0.871795,0.871795,0.875,0.871795
max,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077,0.923077


In [53]:
RF_accuracy.describe()
##one with the highest- 11, 0 is not to far off though. 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.872547,0.893165,0.897903,0.872566,0.894963,0.898408,0.872603,0.895037,0.898165,0.872566,0.895356,0.898371,0.872528,0.895225,0.898277
std,0.010623,0.011697,0.010196,0.010698,0.010002,0.010154,0.010695,0.010497,0.010159,0.010728,0.010189,0.010152,0.010701,0.010262,0.010149
min,0.842697,0.867041,0.874532,0.842697,0.868914,0.876404,0.842697,0.867041,0.878277,0.842697,0.868914,0.876404,0.842697,0.867041,0.878277
25%,0.867041,0.883895,0.890918,0.867041,0.88764,0.890918,0.867041,0.889045,0.891386,0.867041,0.889513,0.891386,0.867041,0.88764,0.890918
50%,0.872659,0.893258,0.898876,0.872659,0.895131,0.898876,0.872659,0.895131,0.898876,0.872659,0.895131,0.898876,0.872659,0.895131,0.898876
75%,0.878745,0.902622,0.904494,0.88015,0.900749,0.904494,0.878745,0.902622,0.904494,0.878745,0.901217,0.904494,0.878745,0.901217,0.904494
max,0.897004,0.925094,0.921348,0.897004,0.921348,0.923221,0.897004,0.921348,0.923221,0.897004,0.923221,0.923221,0.897004,0.923221,0.923221


In [55]:
ADA_recall.describe()
##one with the highest- 8 and 32 tied
    ## would use 32 beacause it has the better accuracy and the highest recall

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.619744,0.617564,0.62,0.620256,0.619615,0.617436,0.620256,0.619872,0.622179,0.617949,0.617308,0.621667,0.620769,0.619872,0.621026,0.618462,0.620128,0.618846,0.616923,0.617949,0.617564,0.617051,0.617692,0.619744,0.617692,0.620513,0.617692,0.617949,0.616923,0.620641,0.620256,0.619231,0.622179,0.619359,0.617308,0.618718,0.61859,0.618718,0.617179,0.618462,0.620513,0.61859,0.620897,0.618974,0.619103
std,0.054277,0.050796,0.052293,0.04993,0.050369,0.049994,0.051469,0.05486,0.053855,0.049536,0.051132,0.054225,0.053319,0.051423,0.053137,0.052848,0.051529,0.05399,0.051323,0.054728,0.050992,0.053396,0.050643,0.052977,0.053013,0.05643,0.050182,0.052212,0.052317,0.053689,0.053957,0.053516,0.052984,0.052646,0.052856,0.053426,0.051585,0.053457,0.054332,0.05228,0.05253,0.052033,0.053477,0.049573,0.054676
min,0.525641,0.525641,0.5,0.5,0.512821,0.525641,0.512821,0.487179,0.487179,0.5,0.5,0.512821,0.512821,0.512821,0.512821,0.474359,0.512821,0.5,0.525641,0.487179,0.5,0.487179,0.525641,0.512821,0.512821,0.474359,0.525641,0.5,0.5,0.512821,0.487179,0.461538,0.5,0.5,0.487179,0.512821,0.512821,0.487179,0.487179,0.5,0.5,0.538462,0.5,0.512821,0.5
25%,0.576923,0.576923,0.586538,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.586538,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923,0.576923
50%,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.628205,0.628205,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.628205,0.621795,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.615385,0.621795,0.615385,0.615385,0.615385,0.615385,0.615385,0.628205,0.615385,0.615385,0.615385,0.621795,0.615385,0.628205,0.621795,0.615385
75%,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.666667,0.653846,0.666667,0.653846,0.644231,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.644231,0.657051,0.653846,0.653846,0.653846,0.666667,0.666667,0.653846,0.653846,0.666667,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846,0.653846
max,0.769231,0.74359,0.74359,0.75641,0.769231,0.730769,0.74359,0.75641,0.75641,0.75641,0.74359,0.769231,0.769231,0.74359,0.75641,0.74359,0.730769,0.769231,0.74359,0.75641,0.74359,0.74359,0.769231,0.75641,0.769231,0.782051,0.75641,0.74359,0.75641,0.769231,0.769231,0.75641,0.75641,0.74359,0.74359,0.74359,0.75641,0.75641,0.74359,0.74359,0.75641,0.75641,0.74359,0.74359,0.75641


In [56]:
ADA_accuracy.describe()
##one with the highest-32
    #8 - .886404, 32- .886723

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.886498,0.886442,0.887004,0.886723,0.886423,0.886423,0.886554,0.886479,0.886404,0.886592,0.886124,0.886629,0.886742,0.886816,0.886873,0.886592,0.886798,0.886011,0.88633,0.88633,0.88588,0.886124,0.886124,0.886517,0.886255,0.88676,0.886479,0.88603,0.886142,0.887116,0.886348,0.885936,0.886723,0.886386,0.885805,0.887004,0.886217,0.886723,0.886086,0.886348,0.886367,0.886049,0.886536,0.88633,0.886685
std,0.011906,0.011111,0.011281,0.011475,0.011221,0.011487,0.011116,0.012298,0.011492,0.011633,0.011219,0.011833,0.011551,0.01095,0.011844,0.011421,0.011579,0.012498,0.011247,0.011548,0.011147,0.01127,0.011695,0.011386,0.011206,0.011982,0.011626,0.01117,0.011479,0.011436,0.011644,0.01152,0.011755,0.011374,0.011283,0.011321,0.010577,0.011755,0.011868,0.011426,0.011792,0.011168,0.011055,0.01084,0.010882
min,0.855805,0.863296,0.859551,0.863296,0.855805,0.857678,0.861423,0.861423,0.855805,0.861423,0.857678,0.861423,0.857678,0.857678,0.857678,0.861423,0.861423,0.857678,0.857678,0.857678,0.859551,0.859551,0.855805,0.861423,0.859551,0.861423,0.859551,0.861423,0.861423,0.857678,0.859551,0.855805,0.859551,0.861423,0.855805,0.857678,0.863296,0.861423,0.859551,0.859551,0.857678,0.859551,0.859551,0.857678,0.863296
25%,0.88015,0.878277,0.88015,0.878277,0.879682,0.88015,0.879682,0.878277,0.878277,0.88015,0.88015,0.88015,0.879682,0.88015,0.879682,0.879682,0.88015,0.878277,0.879682,0.878277,0.878277,0.878277,0.878277,0.877809,0.88015,0.878277,0.876404,0.878277,0.878277,0.88015,0.878277,0.878277,0.878277,0.878277,0.876404,0.88015,0.878277,0.88015,0.876404,0.88015,0.879682,0.879682,0.878277,0.88015,0.88015
50%,0.88764,0.885768,0.886704,0.88764,0.88764,0.88764,0.885768,0.88764,0.885768,0.885768,0.885768,0.88764,0.885768,0.88764,0.885768,0.885768,0.88764,0.885768,0.88764,0.885768,0.885768,0.886704,0.885768,0.885768,0.885768,0.885768,0.88764,0.885768,0.885768,0.88764,0.885768,0.885768,0.885768,0.885768,0.885768,0.88764,0.885768,0.885768,0.885768,0.885768,0.885768,0.885768,0.885768,0.885768,0.886704
75%,0.895131,0.895131,0.895131,0.895131,0.893258,0.893258,0.893258,0.895131,0.895131,0.895131,0.893258,0.893727,0.895131,0.893258,0.897004,0.895131,0.895131,0.895131,0.893258,0.893258,0.893727,0.893258,0.893258,0.895131,0.893258,0.895131,0.895131,0.893258,0.893258,0.895131,0.895131,0.893258,0.895131,0.895131,0.893258,0.893727,0.893727,0.895131,0.895131,0.893258,0.893727,0.893727,0.895131,0.895131,0.895131
max,0.911985,0.913858,0.913858,0.913858,0.913858,0.91573,0.917603,0.919476,0.913858,0.911985,0.923221,0.919476,0.910112,0.910112,0.917603,0.911985,0.919476,0.91573,0.910112,0.913858,0.91573,0.913858,0.910112,0.913858,0.913858,0.913858,0.913858,0.919476,0.917603,0.911985,0.913858,0.91573,0.921348,0.913858,0.91573,0.91573,0.910112,0.913858,0.911985,0.913858,0.919476,0.911985,0.917603,0.913858,0.919476


In [57]:
GB_recall.describe()
##one with the highest- (2,5,8), .857692 - 17 (if not looking at 1s)
    ##do not want to use 17 bc accuracy low, 2s accuracy low, 5s accuracy low, 8s accuracy low 
    ##1 has .848333, 20 has .849487

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.848333,0.847179,1.0,0.82641,0.852949,1.0,0.764487,0.842692,1.0,0.796154,0.848718,0.852821,0.715641,0.848205,0.867821,0.68141,0.813974,0.857692,0.744615,0.849487,0.847179,0.681923,0.83141,0.852949,0.656923,0.766026,0.842821,0.719872,0.847051,0.847179,0.669103,0.803846,0.852308,0.655128,0.73859,0.841795,0.703718,0.842179,0.847179,0.658846,0.781026,0.851667,0.656282,0.723333,0.839872
std,0.032344,0.030691,0.0,0.036281,0.030733,0.0,0.041218,0.034278,0.0,0.045828,0.031351,0.030789,0.049983,0.030909,0.036839,0.048602,0.037322,0.040192,0.050653,0.032167,0.030691,0.052362,0.036142,0.030733,0.04849,0.042943,0.034508,0.052288,0.033662,0.030691,0.053739,0.03961,0.030463,0.052355,0.047058,0.034296,0.050627,0.034125,0.030691,0.051698,0.044062,0.030992,0.047925,0.046927,0.034406
min,0.75641,0.769231,1.0,0.74359,0.782051,1.0,0.666667,0.75641,1.0,0.666667,0.769231,0.782051,0.564103,0.769231,0.782051,0.512821,0.717949,0.74359,0.628205,0.75641,0.769231,0.538462,0.730769,0.782051,0.5,0.653846,0.75641,0.602564,0.75641,0.769231,0.538462,0.705128,0.782051,0.5,0.615385,0.75641,0.589744,0.75641,0.769231,0.538462,0.666667,0.782051,0.487179,0.589744,0.74359
25%,0.830128,0.833333,1.0,0.794872,0.833333,1.0,0.730769,0.820513,1.0,0.769231,0.833333,0.833333,0.679487,0.833333,0.846154,0.653846,0.794872,0.833333,0.705128,0.833333,0.833333,0.653846,0.807692,0.833333,0.641026,0.740385,0.820513,0.679487,0.820513,0.833333,0.628205,0.778846,0.833333,0.628205,0.705128,0.820513,0.666667,0.820513,0.833333,0.628205,0.75641,0.833333,0.628205,0.692308,0.820513
50%,0.846154,0.846154,1.0,0.833333,0.858974,1.0,0.75641,0.846154,1.0,0.794872,0.846154,0.858974,0.717949,0.846154,0.871795,0.679487,0.814103,0.858974,0.74359,0.846154,0.846154,0.692308,0.833333,0.858974,0.653846,0.769231,0.846154,0.717949,0.846154,0.846154,0.666667,0.807692,0.852564,0.653846,0.74359,0.846154,0.705128,0.846154,0.846154,0.666667,0.782051,0.852564,0.653846,0.730769,0.839744
75%,0.871795,0.871795,1.0,0.846154,0.871795,1.0,0.794872,0.871795,1.0,0.820513,0.871795,0.871795,0.746795,0.871795,0.897436,0.708333,0.833333,0.884615,0.782051,0.871795,0.871795,0.717949,0.858974,0.871795,0.682692,0.794872,0.871795,0.75641,0.871795,0.871795,0.705128,0.833333,0.871795,0.692308,0.769231,0.862179,0.730769,0.858974,0.871795,0.692308,0.807692,0.871795,0.692308,0.75641,0.862179
max,0.923077,0.923077,1.0,0.923077,0.923077,1.0,0.871795,0.923077,1.0,0.884615,0.923077,0.923077,0.820513,0.923077,0.974359,0.794872,0.910256,0.948718,0.833333,0.923077,0.923077,0.794872,0.923077,0.923077,0.769231,0.858974,0.923077,0.833333,0.923077,0.923077,0.794872,0.897436,0.923077,0.769231,0.833333,0.923077,0.820513,0.910256,0.923077,0.794872,0.884615,0.923077,0.769231,0.820513,0.910256


In [54]:
GB_accuracy.describe()
##one with the highest, 1- .907640 - 20

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.899494,0.90764,0.1460674,0.890112,0.904494,0.1460674,0.890468,0.89824,0.1460674,0.884213,0.904326,0.878764,0.895318,0.901423,0.821292,0.902079,0.894419,0.791367,0.885375,0.900581,0.90764,0.899288,0.893352,0.904663,0.902491,0.892491,0.898184,0.889232,0.898258,0.907603,0.900787,0.888614,0.904401,0.902303,0.895861,0.899213,0.890843,0.895674,0.907547,0.900524,0.888539,0.90427,0.902734,0.898483,0.899419
std,0.010799,0.009556,1.952678e-16,0.011083,0.010222,1.952678e-16,0.011444,0.01199,1.952678e-16,0.011654,0.010125,0.015496,0.012044,0.009868,0.112,0.01144,0.010742,0.092444,0.012224,0.010312,0.009556,0.011834,0.011136,0.010316,0.010813,0.011017,0.012015,0.011893,0.010553,0.009634,0.011808,0.011782,0.00985,0.011504,0.011333,0.010375,0.012178,0.010473,0.009462,0.011278,0.011661,0.009697,0.011197,0.011303,0.010199
min,0.874532,0.88764,0.1460674,0.863296,0.874532,0.1460674,0.867041,0.85206,0.1460674,0.844569,0.878277,0.844569,0.865169,0.874532,0.299625,0.868914,0.872659,0.494382,0.842697,0.876404,0.88764,0.868914,0.870787,0.874532,0.88015,0.861423,0.85206,0.853933,0.870787,0.885768,0.874532,0.863296,0.88015,0.878277,0.874532,0.870787,0.853933,0.868914,0.88764,0.874532,0.863296,0.88015,0.876404,0.874532,0.876404
25%,0.891386,0.900749,0.1460674,0.883895,0.898876,0.1460674,0.882022,0.891386,0.1460674,0.876404,0.897004,0.868914,0.88764,0.894663,0.757491,0.895131,0.887172,0.742509,0.877809,0.893258,0.900749,0.893258,0.885768,0.898876,0.895131,0.883895,0.891386,0.88015,0.891386,0.900749,0.893258,0.88015,0.898408,0.893258,0.889045,0.89279,0.882022,0.88764,0.900749,0.89279,0.882022,0.898876,0.895131,0.891386,0.89279
50%,0.898876,0.906367,0.1460674,0.889513,0.904494,0.1460674,0.891386,0.89794,0.1460674,0.883895,0.904494,0.876404,0.895131,0.900749,0.881086,0.902622,0.895131,0.825843,0.883895,0.900749,0.906367,0.900749,0.893258,0.904494,0.904494,0.893258,0.898876,0.889513,0.898876,0.906367,0.902622,0.888577,0.904494,0.902622,0.896067,0.898876,0.891386,0.897004,0.906367,0.902622,0.88764,0.904494,0.904494,0.89794,0.899813
75%,0.906367,0.914326,0.1460674,0.897472,0.911985,0.1460674,0.897004,0.906835,0.1460674,0.893258,0.910112,0.885768,0.902622,0.906367,0.891386,0.908708,0.901217,0.859551,0.892322,0.906835,0.914326,0.908708,0.901217,0.911985,0.910112,0.900749,0.90824,0.897004,0.904494,0.914326,0.910112,0.895599,0.910112,0.910112,0.902622,0.906367,0.897004,0.902622,0.914326,0.90824,0.895131,0.910112,0.910112,0.906367,0.904494
max,0.925094,0.928839,0.1460674,0.917603,0.928839,0.1460674,0.919476,0.926966,0.1460674,0.911985,0.928839,0.928839,0.919476,0.926966,0.923221,0.930712,0.91573,0.906367,0.91573,0.926966,0.928839,0.928839,0.917603,0.928839,0.930712,0.91573,0.926966,0.917603,0.926966,0.928839,0.925094,0.917603,0.928839,0.928839,0.917603,0.923221,0.925094,0.917603,0.928839,0.925094,0.917603,0.928839,0.928839,0.923221,0.925094


In [15]:
#(35 points) Using the telecom train build three models: the best random forest model
#from part (f), the best AdaBoost model form part (f), and the best gradient boosting
#model form part (f). Using these to three models, predict the likelihood of Churn on the
#telecom test data-frame. After that, aggregate those likelihoods using the weighted
#average formula (use average recall of the models as weights). Using 10% as cutoff
#value, report the accuracy and recall of the aggregated predictions

# input and target variables
X = telecom_train[['Account_length', 'International_plan', 'Customer_service_calls', 'total_charge']]
Y = telecom_train['churn_numb']

#Splitting the Data
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, test_size= 0.2, stratify= Y)

#### BUILDING THE BEST MODELS ####
## RANDOM FOREST ##
BestRF= RandomForestClassifier(n_estimators= 100, max_depth= 3).fit(X_train, Y_train)
#Predicting on the model
BestRF_pred= BestRF.predict_proba(X_test)[:,1]

## ADA ##
BestADA= AdaBoostClassifier(base_estimator= DecisionTreeClassifier(max_depth= 5), n_estimators= 1500, learning_rate= .001).fit(X_train, Y_train)
#Predicting on the model
BestADA_pred= BestADA.predict_proba(X_test)[:,1]

## GRADIENT BOOST ##
BestGB= GradientBoostingClassifier(max_depth= 7, n_estimators= 500, learning_rate= .001).fit(X_train, Y_train)
#Predicting on the model
BestGB_pred=  BestGB.predict_proba(X_test)[:,1]

RF_oldrecall = 0.859359 
ADA_oldrecall = 0.622179
GB_oldrecall = 0.857692
Total_oldrecall = RF_oldrecall + ADA_oldrecall + GB_oldrecall


labels = (RF_oldrecall/Total_oldrecall)*(BestRF_pred) + (ADA_oldrecall/Total_oldrecall)*(BestADA_pred) + (GB_oldrecall/Total_oldrecall)*(BestGB_pred)
labels= np.where(labels< .1, 0, 1)
Recall= recall_score(Y_test, labels)
Accuracy= accuracy_score(Y_test, labels)
print('The new recall is:', Recall)
print('The new accuracy is:', Accuracy)

The new recall is: 0.8205128205128205
The new accuracy is: 0.8258426966292135
