### Establish environment

In [1]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
from datetime import datetime

# Sagemaker/related Libraries
import sagemaker
from sagemaker import get_execution_role
import boto3
import io
from io import StringIO
from io import BytesIO
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac
import os
smclient = boto3.Session().client('sagemaker')
s3 = boto3.client('s3')

# ata Visualization 
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, precision_recall_curve, auc

  import scipy.sparse


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Matplotlib is building the font cache; this may take a moment.


In [2]:
# Let pandas render up to 200 columns and 500 rows before truncating the display.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 500

In [3]:
# SageMaker execution role and session for this notebook.
role = get_execution_role()
sagemaker_session = sagemaker.Session()

# S3 location of the input file: s3://<bucket>/<input_prefix>/<test_with_predictions>
bucket = 'user-churn'  # Replace with your S3 bucket name
input_prefix = 'user-churn-processed-data'  # Replace with the S3 prefix (folder) holding the input file
test_with_predictions = "test_with_predictions.csv"

# Figure size, presumably for histogram plots -- confirm against the plotting cells.
hist_figsize = (5, 3)

# Short string keys; judging by the names these identify the models being compared.
xgb, linear, stacked = "xgb", "linear", "stacked"

# String keys that look like metric / business-outcome names.
cost = "Net_Cost_Savings"
ROC_AUC = "ROC_AUC"
Prevented_Churn = "Prevented_Churn"

# Prediction column names, and the list of those columns to plot.
xgb_predictions = "xgb_predictions"
linear_predictions = "linear_predictions"
stacked_predictions = "stacked_predictions"
columns_to_plot = [xgb_predictions, linear_predictions, stacked_predictions]

In [4]:
# String labels for two more metrics (recall and PR AUC, going by the names).
Recall, PR_AUC = "Recall", "PR AUC"

In [5]:
def CSV_Reader(bucket, subfolder, source_file_name):
    '''Load a CSV file stored in S3 into a pandas DataFrame.

    The file is addressed as ``s3://<bucket>/<subfolder>/<source_file_name>``
    and that URI is passed straight to ``pandas.read_csv``.

    Arguments
    ---------
    bucket: Name of the S3 bucket that holds the data
    subfolder: Prefix (folder) inside the bucket containing the file
    source_file_name: File name of the CSV to read

    Return
    ---------
    pandas DataFrame with the contents of the file; the header row is
    inferred by pandas.
    '''
    s3_uri = f's3://{bucket}/{subfolder}/{source_file_name}'
    # low_memory=False makes pandas process the file as a whole instead of in
    # internal chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(s3_uri, header='infer', low_memory=False)

In [6]:
# Read the test-set-with-predictions CSV from S3 into a DataFrame.
churn_test = CSV_Reader(
    bucket=bucket,
    subfolder=input_prefix,
    source_file_name=test_with_predictions,
)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [7]:
# Print the frame's (rows, columns) shape, then show its column labels as the cell output.
print(churn_test.shape)
churn_test.columns

(1000, 35)


Index(['VideosWatched', 'CommentsMade', 'TimeSpent', 'AverageSessionDuration',
       'TotalSessions', 'ProfileCompleteness', 'VideoUploads', 'VideoShares',
       'LoginFrequency', 'AdInteractions', 'InAppPurchases', 'SupportTickets',
       'EngagementScore', 'Age_binned_encoded', 'SubscriptionStatus_encoded',
       'AccountType_encoded', 'NotificationsEnabled_encoded', 'Gender_Male',
       'Gender_Unknown', 'AppVersion_1.2', 'AppVersion_2.0', 'Country_Canada',
       'Country_US', 'DeviceType_Mobile', 'OS_iOS', 'ReferralSource_Organic',
       'ReferralSource_Social Media', 'FavoriteCategory_Meal Types',
       'FavoriteCategory_Recipes', 'DaysSinceLastLogin_binned_Last Week',
       'DaysSinceLastLogin_binned_Last Month',
       'DaysSinceLastLogin_binned_Last 3 Months',
       'DaysSinceLastLogin_binned_Last Year', 'xgb_predictions', 'Churn_num'],
      dtype='object')

In [16]:
# Number of missing values in each column.
churn_test.isna().sum()

VideosWatched                              0
CommentsMade                               0
TimeSpent                                  0
AverageSessionDuration                     0
TotalSessions                              0
ProfileCompleteness                        0
VideoUploads                               0
VideoShares                                0
LoginFrequency                             0
AdInteractions                             0
InAppPurchases                             0
SupportTickets                             0
EngagementScore                            0
Age_binned_encoded                         0
SubscriptionStatus_encoded                 0
AccountType_encoded                        0
NotificationsEnabled_encoded               0
Gender_Male                                0
Gender_Unknown                             0
AppVersion_1.2                             0
AppVersion_2.0                             0
Country_Canada                             0
Country_US

In [18]:
# Replace missing values in the Churn_num column with 0.
# NOTE(review): this makes any row without a label count as class 0 in every
# metric later computed from Churn_num -- confirm that is intended (as opposed
# to dropping unlabeled rows).
churn_test['Churn_num'] = churn_test['Churn_num'].fillna(0)

In [20]:
# Summarize each column's dtype and non-null count.
churn_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 35 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   VideosWatched                            1000 non-null   float64
 1   CommentsMade                             1000 non-null   float64
 2   TimeSpent                                1000 non-null   float64
 3   AverageSessionDuration                   1000 non-null   float64
 4   TotalSessions                            1000 non-null   float64
 5   ProfileCompleteness                      1000 non-null   float64
 6   VideoUploads                             1000 non-null   float64
 7   VideoShares                              1000 non-null   float64
 8   LoginFrequency                           1000 non-null   float64
 9   AdInteractions                           1000 non-null   float64
 10  InAppPurchases                           1000 non

In [23]:
# Inspect 20 randomly chosen rows. The fixed random_state makes the same rows
# come back on every run, so the displayed sample is reproducible.
churn_test.sample(20, random_state=42)

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,InAppPurchases,SupportTickets,EngagementScore,Age_binned_encoded,SubscriptionStatus_encoded,AccountType_encoded,NotificationsEnabled_encoded,Gender_Male,Gender_Unknown,AppVersion_1.2,AppVersion_2.0,Country_Canada,Country_US,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year,xgb_predictions,Churn_num
698,0.902471,0.709894,0.819895,1.443502,0.77228,-0.744253,0.240643,0.229148,0.20428,0.229404,-0.10283,0.149123,0.563264,1,0,0,1,0,0,1,0,0,1,1,1,0,1,0,1,0,0,0,1,0.752666,1.0
839,0.727546,-0.155346,1.274304,0.888712,0.612708,-0.988684,1.574625,1.333645,-1.030884,-0.337395,1.365472,-1.247519,0.767531,0,0,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,0.585934,1.0
954,0.727546,0.934728,0.749242,-0.346419,0.727589,-0.074353,0.794296,-2.506938,0.758972,1.331976,0.659752,0.149123,0.679942,5,0,0,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0.795396,1.0
414,0.116101,0.458552,0.8028,1.205469,-0.102743,-0.215788,-1.873668,-0.338641,1.201885,0.669048,1.066546,0.149123,0.170187,5,0,0,0,1,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,0.768379,1.0
593,-0.392681,-1.634484,0.239963,0.77514,-0.488967,1.158464,-1.873668,-1.138895,-1.665845,-1.136255,-1.091599,0.149123,-1.032914,0,0,0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0.713557,1.0
213,0.116101,-1.634484,-0.414206,-0.021979,1.066968,-0.005082,-1.873668,0.66956,0.20428,-0.337395,-2.080369,1.545765,-0.263165,2,1,0,1,0,1,1,0,0,1,0,1,1,0,0,0,0,0,1,0,0.554921,1.0
224,-0.255598,0.458552,-1.306281,-0.561584,-0.289341,-1.245445,-1.873668,-1.138895,-0.135956,0.669048,1.169517,0.149123,-0.288818,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,1,0,0,1,0,0.625858,1.0
111,1.066103,0.934728,0.239963,-0.561584,0.493019,-0.14458,-0.539686,1.333645,0.499005,1.595063,1.743347,0.966106,0.96572,5,0,1,1,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0.876985,0.0
932,0.440212,1.652741,-0.240783,1.314746,0.541497,0.579279,0.240643,0.66956,0.20428,0.669048,-0.10283,0.966106,0.576817,5,0,0,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,0,0.689152,1.0
72,1.293307,1.323792,1.307452,-1.898308,0.85962,0.700507,-1.873668,1.029403,1.5706,0.669048,1.261908,0.149123,1.625851,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0.473567,1.0


In [25]:
# Turn the XGBoost scores into hard 0/1 labels using a 0.5 cut-off.
# The comparison is vectorized over the whole column instead of looping in
# Python; a NaN score compares False and therefore maps to 0, as before.
churn_test['xgb_predictions_num'] = (churn_test['xgb_predictions'] >= 0.5).astype(int)

In [27]:
from sklearn.metrics import accuracy_score, classification_report

# Labels and thresholded XGBoost predictions used for the evaluation below.
xgb_y_true = churn_test['Churn_num']
xgb_y_pred = churn_test['xgb_predictions_num']

# Overall accuracy as a percentage, followed by per-class precision/recall/F1.
accuracy = accuracy_score(xgb_y_true, xgb_y_pred)
print(f"Accuracy: {accuracy * 100.0:.2f}%")
print(classification_report(xgb_y_true, xgb_y_pred))

Accuracy: 69.00%
              precision    recall  f1-score   support

         0.0       0.38      0.09      0.14       293
         1.0       0.71      0.94      0.81       707

    accuracy                           0.69      1000
   macro avg       0.55      0.51      0.48      1000
weighted avg       0.61      0.69      0.62      1000

