In [5]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

#defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'rachaeld-data445'
bucket = s3.Bucket(bucket_name)

#defining the csv file
file_key = 'telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [6]:
##creating the frequenxy table of churn 
churn_data['Churn'].value_counts() / churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [7]:
##defining the input and target variables
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

##splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [8]:
Y_train.value_counts() / Y_train.shape[0]

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [9]:
Y_test.value_counts() / Y_test.shape[0]

0    0.854573
1    0.145427
Name: Churn, dtype: float64

In [10]:
### RANDOM FOREST
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

RF_pred = RF_md.predict_proba(X_test)[:,1]

##ROC-AUC
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

In [11]:
cutoffs = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.632559
1,0.000000,0.010309,0.632559
2,0.000000,0.350515,0.501541
3,0.001754,0.350515,0.499202
4,0.001754,0.443299,0.418131
...,...,...,...
123,0.978947,1.000000,0.055151
124,0.982456,1.000000,0.055128
125,0.994737,1.000000,0.054891
126,0.998246,1.000000,0.054864


In [13]:
cutoffs = cutoffs.drop(cutoffs.index[0], axis = 0)
cutoffs

Unnamed: 0,fpr,tpr,threshold
2,0.000000,0.350515,0.501541
3,0.001754,0.350515,0.499202
4,0.001754,0.443299,0.418131
5,0.007018,0.443299,0.411707
6,0.007018,0.463918,0.402706
...,...,...,...
123,0.978947,1.000000,0.055151
124,0.982456,1.000000,0.055128
125,0.994737,1.000000,0.054891
126,0.998246,1.000000,0.054864


In [15]:
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['fpr']**2 + (1-cutoffs['tpr'])**2)
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
2,0.000000,0.350515,0.501541,0.649485
3,0.001754,0.350515,0.499202,0.649487
4,0.001754,0.443299,0.418131,0.556704
5,0.007018,0.443299,0.411707,0.556745
6,0.007018,0.463918,0.402706,0.536128
...,...,...,...,...
123,0.978947,1.000000,0.055151,0.978947
124,0.982456,1.000000,0.055128,0.982456
125,0.994737,1.000000,0.054891,0.994737
126,0.998246,1.000000,0.054864,0.998246


In [17]:
cutoffs = cutoffs.sort_values(by = 'Euclidean_dist')
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
60,0.166667,0.865979,0.123582,0.213867
62,0.175439,0.876289,0.114734,0.214670
58,0.163158,0.855670,0.125357,0.217834
59,0.166667,0.855670,0.123728,0.220474
61,0.175439,0.865979,0.120457,0.220772
...,...,...,...,...
123,0.978947,1.000000,0.055151,0.978947
124,0.982456,1.000000,0.055128,0.982456
125,0.994737,1.000000,0.054891,0.994737
126,0.998246,1.000000,0.054864,0.998246


In [19]:
##changing liklihoods to labels
RF_labels = np.where(RF_pred < 0.213867, 0,1)

print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.96      0.87      0.91       570
           1       0.50      0.76      0.60        97

    accuracy                           0.85       667
   macro avg       0.73      0.82      0.76       667
weighted avg       0.89      0.85      0.87       667

