In [1]:
pip install imblearn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/telecom_churn.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
churn_data = pd.read_csv(file_content_stream)
churn_data = churn_data.dropna()
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [3]:
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'DayMins', 'MonthlyCharge']]
Y = churn_data['Churn']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [4]:
## Running Smote ## Sinthetic Data ##
X_smote, Y_smote = SMOTE().fit_resample(X_train, Y_train)

In [5]:
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_smote, Y_smote)
rf_pred = rf_md.predict_proba(X_test)[:,1]

## ROC
fpr, tpr, threshold = roc_curve(Y_test, rf_pred)

## Optimize cutoff
rf_cutoff = pd.DataFrame({'FPR': fpr,
                          'TPR': tpr,
                          'cutoff': threshold})

## Calculate distance to perfect model
rf_cutoff['Distance'] = np.sqrt(rf_cutoff['FPR']**2 + (1 - rf_cutoff['TPR'])**2)

## sort by distance
rf_cutoff = rf_cutoff.sort_values(by = 'Distance').reset_index(drop = True)

## create labels
rf_label = np.where(rf_pred < rf_cutoff['cutoff'][0], 0, 1)

print(classification_report(Y_test, rf_label))

              precision    recall  f1-score   support

           0       0.96      0.89      0.93       570
           1       0.57      0.80      0.66        97

    accuracy                           0.88       667
   macro avg       0.76      0.85      0.80       667
weighted avg       0.91      0.88      0.89       667



In [6]:
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = .01).fit(X_smote, Y_smote)
ada_pred = ada_md.predict_proba(X_test)[:,1]

## ROC
fpr, tpr, threshold = roc_curve(Y_test, ada_pred)

## Optimize cutoff
ada_cutoff = pd.DataFrame({'FPR': fpr,
                          'TPR': tpr,
                          'cutoff': threshold})

## Calculate distance to perfect model
ada_cutoff['Distance'] = np.sqrt(ada_cutoff['FPR']**2 + (1 - ada_cutoff['TPR'])**2)

## sort by distance
ada_cutoff = ada_cutoff.sort_values(by = 'Distance').reset_index(drop = True)

## create labels
ada_label = np.where(ada_pred < ada_cutoff['cutoff'][0], 0, 1)

print(classification_report(Y_test, ada_label))



              precision    recall  f1-score   support

           0       0.96      0.89      0.93       570
           1       0.56      0.79      0.66        97

    accuracy                           0.88       667
   macro avg       0.76      0.84      0.79       667
weighted avg       0.90      0.88      0.89       667



In [None]:
## Based on my results. Random Forest is slightly better ##