In [6]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/telecom_churn.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
churn_data = pd.read_csv(file_content_stream)
churn_data = churn_data.dropna()
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [7]:
# frequency table of churn
churn_data['Churn'].value_counts() / churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [9]:
# define inputs and targets
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [10]:
# Check proportion of 0s and 1s
Y_train.value_counts() / Y_train.shape[0]

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [11]:
Y_test.value_counts() / Y_test.shape[0]

0    0.854573
1    0.145427
Name: Churn, dtype: float64

In [12]:
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)
rf_pred = rf_md.predict_proba(X_test)[:, 1]

# ROC_AUC
fpr, tpr, threshold = roc_curve(Y_test, rf_pred)

In [13]:
cutoffs = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.601274
1,0.000000,0.010309,0.601274
2,0.000000,0.154639,0.567130
3,0.001754,0.154639,0.565962
4,0.001754,0.206186,0.560548
...,...,...,...
149,0.971930,1.000000,0.054072
150,0.975439,1.000000,0.054061
151,0.987719,1.000000,0.053989
152,0.991228,1.000000,0.053985


In [15]:
# drop first observation
cutoffs = cutoffs.drop(cutoffs.index[0], axis = 0)
cutoffs

Unnamed: 0,fpr,tpr,threshold
2,0.000000,0.154639,0.567130
3,0.001754,0.154639,0.565962
4,0.001754,0.206186,0.560548
5,0.003509,0.206186,0.559465
6,0.003509,0.247423,0.548212
...,...,...,...
149,0.971930,1.000000,0.054072
150,0.975439,1.000000,0.054061
151,0.987719,1.000000,0.053989
152,0.991228,1.000000,0.053985


In [16]:
# find euclidean distance from perfect model
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['fpr']**2 + (1 - cutoffs['tpr']**2))
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
2,0.000000,0.154639,0.567130,0.987971
3,0.001754,0.154639,0.565962,0.987973
4,0.001754,0.206186,0.560548,0.978514
5,0.003509,0.206186,0.559465,0.978519
6,0.003509,0.247423,0.548212,0.968914
...,...,...,...,...
149,0.971930,1.000000,0.054072,0.971930
150,0.975439,1.000000,0.054061,0.975439
151,0.987719,1.000000,0.053989,0.987719
152,0.991228,1.000000,0.053985,0.991228


In [18]:
cutoffs = cutoffs.sort_values(by = 'Euclidean_dist')
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
80,0.224561,0.896907,0.114093,0.495969
74,0.189474,0.886598,0.120583,0.499844
75,0.191228,0.886598,0.118093,0.500512
76,0.194737,0.886598,0.116607,0.501863
77,0.196491,0.886598,0.116058,0.502546
...,...,...,...,...
151,0.987719,1.000000,0.053989,0.987719
2,0.000000,0.154639,0.567130,0.987971
3,0.001754,0.154639,0.565962,0.987973
152,0.991228,1.000000,0.053985,0.991228


In [20]:
rf_labels = np.where(rf_pred < .138858, 0, 1)
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.98      0.82      0.89       570
           1       0.46      0.88      0.60        97

    accuracy                           0.83       667
   macro avg       0.72      0.85      0.75       667
weighted avg       0.90      0.83      0.85       667

