### Outline of notebook:

#### 3.1 Establish environment and define variables
#### 3.2 Perform K-Fold split and convert train datasets to protobuf format
#### 3.3 Tune Hyperparameters
#### 3.4 Build Models
#### 3.5 Deploy Models
#### 3.6 Make Batch Predictions on Test Data
#### 3.7 Export test data with predictions

### 3.1 Establish environment and define variables

In [1]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
current_date = datetime.now()

# Sagemaker/related Libraries
import boto3
import sagemaker
smclient = boto3.Session().client('sagemaker')
s3 = boto3.client('s3')
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac
from sagemaker.session import Session

# Machine Learning Libraries
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.inputs import TrainingInput
from sklearn.model_selection import KFold
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import image_uris

#Protobuf Libraries
import io
from io import StringIO
from io import BytesIO

  import scipy.sparse


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
role = get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = 'user-churn'
input_prefix = 'user-churn-processed-data'
k_folder = 'k'  # subdirectory for k-fold datasets

test_dataset = 'reduced_dimensions_data_test.csv'
train_dataset = "reduced_dimensions_data_train.csv"
xgb_full_train_proto_filename = "xgb_full_train_proto.data"   # final training dataset to train XGBoost model on optimized hyperparameters
train_proto_filename = "train_proto.data"   # final training dataset to train a Linear Learner model on optimized hyperparameters
validation_proto_filename = "validation_proto.data"   # inal validation dataset to train a Linear Learner model on optimized hyperparameters
test_with_predictions = "test_with_predictions.csv"  # exporting our test dataset once all our predictions have been made

feature_dim = 33  # The number of features we will use to train our linear learner model
linear_job_name = "churn-job-linear"  # job name for the linear learner model

**Defining filepaths that we will use later**

In [3]:
s3_train_proto_filepath = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, train_proto_filename)
hyperparam_output_filepath = "s3://{}/{}/hyperparam_output".format(bucket, input_prefix)
XGB_model_output_filepath = "s3://{}/{}/xgb_output".format(bucket, input_prefix)
Linear_model_output_filepath = "s3://{}/{}/linear_output".format(bucket, input_prefix)
linear_training_data_location = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, train_proto_filename)
linear_validation_data_location = 's3://{}/{}/{}/{}'.format(bucket, input_prefix, k_folder, validation_proto_filename)

In [4]:
def CSV_Reader(bucket, subfolder, source_file_name):
    
    '''The CSV_Reader() function takes in the names of the bucket, subfolder, and source file name, and desired dataframe name. 
    It first constructs the filepath, and then imports the file residing at this filepath, based on the title fed to the function.
    
    Arguments 
    --------- 
    bucket: Head S3 repository bucket
    subfolder: Subfolder containing the source data
    source_file_name: Name of source CSV data file 
    
    Return
    ---------
    Returns the source data in a pandas dataframe '''
    
    data_location = 's3://{}/{}/{}'.format(bucket, subfolder, source_file_name)  
    dataset = pd.read_csv(data_location, low_memory=False, header='infer')
    return dataset

**Reading in the train dataset**

In [5]:
data = CSV_Reader(bucket, input_prefix, train_dataset)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [6]:
print(data.shape)
data.sample(2)

(124585, 34)


Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,...,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year
111225,0.816517,0.458552,-0.376431,-0.838979,-0.102743,1.372868,0.794296,-0.338641,-0.135956,0.669048,...,True,False,False,True,True,False,False,True,False,False
60868,-2.787279,-0.54441,1.007827,0.106778,-0.778851,-1.072841,0.794296,-0.338641,0.20428,1.028264,...,True,True,True,False,False,True,False,False,False,True


### 3.2 Perform K-Fold split and convert train datasets to protobuf format

**Preparing a function that will convert a dataset into protobuf format and export to S3**

In [7]:
def Export_Processed_Protobuf(bucket, processed_data_folder, k_folder, local_file, S3_file_name):
    
    '''Exports a dataframe in protobuf format, and sends it to a specified S3 bucket location
    
    Arguments 
    --------- 
    bucket: A list of the columns (i.e. the 3 diagnosis columns) to be updated
    processed_data_folder: the relevant subfolder within the main bucket
    local_file_name: The name of the dataframe within the notebook
    S3_file_name: The name of the file upon export (with .data extension included)
    
    Return
    ---------
    Exports a protobuf file to a specified S3 location'''
    
# Here we seperate out the input and output values
    X_values = local_file.drop(columns='Churn_num').values
    y_values = local_file['Churn_num'].values
    
# Here we set up our code to transform the data    
    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, X_values.astype('float32'), y_values.astype('float32'))
    f.seek(0)
    
# Here we upload the data    
    boto3.Session().resource('s3').Bucket(bucket).Object('{}/{}/{}'.format(processed_data_folder, k_folder, S3_file_name)).upload_fileobj(f)
    training_recordIO_protobuf_location = 's3://{}/{}/{}/{}'.format(bucket, processed_data_folder, k_folder, S3_file_name)
    
    print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))

**Exporting the full train dataset to S3 in protobuf. This will be later used to train our final XGBoost model.**

In [8]:
Export_Processed_Protobuf(bucket, input_prefix, k_folder, data, xgb_full_train_proto_filename)

The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/xgb_full_train_proto.data


**Here, we create an 80/20 split on our training data for our final Linear Learner model.**

- XGBoost: Can be trained on a single large dataset without needing a separate validation set during training.
- Linear Learner: Requires a separate training and validation dataset (outside of the K-Fold process) to ensure proper tuning and evaluation, leading to better generalization and performance.

In [9]:
# Set the seed for reproducibility
np.random.seed(42)

# Generate random indices for train and test sets with an 80/20 split
train_size = int(0.8 * len(data))
train_indices = np.random.permutation(len(data))[:train_size]
test_indices = np.random.permutation(len(data))[train_size:]

# Create training and test sets using the selected indices
optimized_linear_train_set = data.iloc[train_indices]
optimized_linear_test_set = data.iloc[test_indices]

# Output the shapes of the datasets
print("Train Set Shape:", optimized_linear_train_set.shape)
print("Test Set Shape:", optimized_linear_test_set.shape)

Train Set Shape: (99668, 34)
Test Set Shape: (24917, 34)


**Here we export our final Linear Learner Train/Validation datasets**

In [10]:
Export_Processed_Protobuf(bucket, input_prefix, k_folder, optimized_linear_train_set, train_proto_filename)
Export_Processed_Protobuf(bucket, input_prefix, k_folder, optimized_linear_test_set, validation_proto_filename)

The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_proto.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/validation_proto.data


we perform 5-fold cross-validation, which will split our dataset into 5 different folds. Each fold will create distinct training and validation datasets, allowing us to train and validate our model on different subsets of the data. This approach helps in assessing the model’s performance more robustly by ensuring that every data point is used for both training and validation across the different folds.

In [11]:
# Number of folds for cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Generate train and validation datasets for each fold
for fold, (train_index, valid_index) in enumerate(kf.split(data), 1):
    train_data = data.iloc[train_index].copy()
    validation_data = data.iloc[valid_index].copy()
    
    # Store the data in variables dynamically (if needed)
    globals()[f'train_data_{fold}'] = train_data
    globals()[f'validation_data_{fold}'] = validation_data
    
    # Print sample to verify
    print(f"Sample from train_data_{fold}:")
    print(train_data.sample(1))

# Verify a sample from the first fold
train_data_1.sample(1)

Sample from train_data_1:
       VideosWatched  CommentsMade  TimeSpent  AverageSessionDuration  \
37768      -0.392681      0.934728   -1.01666               -1.229946   

       TotalSessions  ProfileCompleteness  VideoUploads  VideoShares  \
37768       0.210622            -0.988684      1.871292     1.029403   

       LoginFrequency  AdInteractions  ...  DeviceType_Mobile  OS_iOS  \
37768         0.20428        1.028264  ...               True    True   

       ReferralSource_Organic  ReferralSource_Social Media  \
37768                    True                        False   

       FavoriteCategory_Meal Types  FavoriteCategory_Recipes  \
37768                        False                      True   

       DaysSinceLastLogin_binned_Last Week  \
37768                                False   

       DaysSinceLastLogin_binned_Last Month  \
37768                                 False   

       DaysSinceLastLogin_binned_Last 3 Months  \
37768                                    Fa

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,...,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year
16120,0.727546,0.458552,-0.495474,2.156036,0.816283,1.425087,-0.539686,-2.506938,0.20428,0.669048,...,True,False,True,False,False,True,False,False,False,True


**The code below iterates over the different K-Fold datasets (5 training sets and 5 validation sets) and exports each one to the specified S3 bucket in Protobuf format. This process ensures that the datasets are prepared and stored for later use in hyperparameter tuning and model training.**

In [16]:
# Loop over each fold (1 to 5) to export training and validation datasets
for i in range(1, 6):
    train_dataset_name = f'train_data_{i}'
    train_dataset = globals()[train_dataset_name]
    train_proto_filename = f'{train_dataset_name}.data'
    Export_Processed_Protobuf(bucket, input_prefix, k_folder, train_dataset, train_proto_filename)
    
    validation_dataset_name = f'validation_data_{i}'
    validation_dataset = globals()[validation_dataset_name]
    validation_csv_filename = f'{validation_dataset_name}.data'
    Export_Processed_Protobuf(bucket, input_prefix, k_folder, validation_dataset, validation_csv_filename)

The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_data_1.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/validation_data_1.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_data_2.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/validation_data_2.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_data_3.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/validation_data_3.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_data_4.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/validation_data_4.data
The Pipe mode recordIO protobuf training data: s3://user-churn/user-churn-processed-data/k/train_data_5.data

**The below code makes a list of the different train/validation filepath pairings to be referenced in our hyperparameter tuning jobs**

In [14]:
train_filepath = []
validation_filepath = []

for i in range(1, 6):
    train_filepath.append(f's3://{bucket}/{input_prefix}/{k_folder}/train_data_{i}.data')
    validation_filepath.append(f's3://{bucket}/{input_prefix}/{k_folder}/validation_data_{i}.data')

filepaths_list = list(zip(train_filepath, validation_filepath))
print(filepaths_list)

[('s3://user-churn/user-churn-processed-data/k/train_data_1.data', 's3://user-churn/user-churn-processed-data/k/validation_data_1.data'), ('s3://user-churn/user-churn-processed-data/k/train_data_2.data', 's3://user-churn/user-churn-processed-data/k/validation_data_2.data'), ('s3://user-churn/user-churn-processed-data/k/train_data_3.data', 's3://user-churn/user-churn-processed-data/k/validation_data_3.data'), ('s3://user-churn/user-churn-processed-data/k/train_data_4.data', 's3://user-churn/user-churn-processed-data/k/validation_data_4.data'), ('s3://user-churn/user-churn-processed-data/k/train_data_5.data', 's3://user-churn/user-churn-processed-data/k/validation_data_5.data')]


### 3.3 Tune Hyperparameters

Here we determine the optimal hyperparameters for both our XGBoost and Linear Learner models.

#### 3.3.1 XGBoost Hyperparameters

Below we configure a range of hyperparameters, and related variables, for use in our XGBoost model

In [12]:
XGB_tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"  # This is the learning rate for XGBoost
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"  # This is the L1 regularization term on weights
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        },

      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 50,
      "MaxParallelTrainingJobs": 5
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:auc",  # the model's performance will be evaluated based on the "Area Under the Curve" (AUC) for the validation set.
      "Type": "Maximize"
    },
    "RandomSeed" : 123
  }

In [21]:
# sagemaker.image_uris.retrieve
training_image = sagemaker.image_uris.retrieve(framework='xgboost', region='us-east-1', version='1.0-1')

# Identifying the optimal hyperparameters, and specifying input/output file paths
XGB_training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "Pipe"
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "CompressionType": "None",
            "ContentType": "application/x-recordio-protobuf",  # Change content type to protobuf
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": filepaths_list[0][0]
                }
            }
        },
        {
            "ChannelName": "validation",
            "CompressionType": "None",
            "ContentType": "application/x-recordio-protobuf",  # Change content type to protobuf
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": filepaths_list[0][1]
                }
            }
        }
    ],
    "OutputDataConfig": {
        "S3OutputPath": hyperparam_output_filepath
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
        "eval_metric": "auc",
        "num_round": "50",
        "objective": "binary:logistic",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 43200
    }
}

XGB_tuning_job_name = 'customer-extrgboost2'

smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName=XGB_tuning_job_name,
                                           HyperParameterTuningJobConfig=XGB_tuning_job_config,
                                           TrainingJobDefinition=XGB_training_job_definition)

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:339712697475:hyper-parameter-tuning-job/customer-extrgboost2',
 'ResponseMetadata': {'RequestId': '602429e0-cbb3-4c0d-8f1a-5811cbcd8c3f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '602429e0-cbb3-4c0d-8f1a-5811cbcd8c3f',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '121',
   'date': 'Tue, 10 Sep 2024 21:48:08 GMT'},
  'RetryAttempts': 0}}

### 3.4 Training Optimized Models

In [23]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", 'us-east-1', "1.7-1")

# construct a SageMaker estimator that calls the xgboost-container
xgb_estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          #The optimal hyperparameters from tuning are brought into our new model
                                          hyperparameters=XGB_best_hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=XGB_model_output_filepath)

# define the data type and paths to the training and validation datasets
content_type = "application/x-recordio-protobuf"
train_input = TrainingInput(s3_train_proto_filepath, content_type=content_type)

# execute the XGBoost training job
xgb_estimator.fit({'train': train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-09-10-21-48-55-969


2024-09-10 21:48:57 Starting - Starting the training job...
2024-09-10 21:49:12 Starting - Preparing the instances for training...
2024-09-10 21:49:47 Downloading - Downloading the training image......
2024-09-10 21:50:33 Training - Training image download completed. Training in progress.[34m[2024-09-10 21:50:54.009 ip-10-2-227-86.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-09-10 21:50:54.032 ip-10-2-227-86.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-09-10:21:50:54:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-09-10:21:50:54:INFO] Failed to parse hyperparameter _tuning_objective_metric value validation:auc to Json.[0m
[34mReturning the value itself[0m
[34m[2024-09-10:21:50:54:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2024-09-10:21:50:54:INFO] Failed to parse hyperparameter objective value bi

### 3.5 Deploy Optimized Model

In [24]:
xgb_deployed_predictor = xgb_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-09-10-21-53-37-728
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-09-10-21-53-37-728
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-09-10-21-53-37-728


--------!

### 3.6 Make Predictions

In [59]:
churn_test = CSV_Reader(bucket, input_prefix, test_dataset)

In [60]:
churn_test.shape

(31147, 34)

In [61]:
churn_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31147 entries, 0 to 31146
Data columns (total 34 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   VideosWatched                            31147 non-null  float64
 1   CommentsMade                             31147 non-null  float64
 2   TimeSpent                                31147 non-null  float64
 3   AverageSessionDuration                   31147 non-null  float64
 4   TotalSessions                            31147 non-null  float64
 5   ProfileCompleteness                      31147 non-null  float64
 6   VideoUploads                             31147 non-null  float64
 7   VideoShares                              31147 non-null  float64
 8   LoginFrequency                           31147 non-null  float64
 9   AdInteractions                           31147 non-null  float64
 10  InAppPurchases                           31147

In [62]:
# 500 rows and 200 columns will be presented
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)

In [66]:
churn_test.head()

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,InAppPurchases,SupportTickets,EngagementScore,Churn_num,Age_binned_encoded,SubscriptionStatus_encoded,AccountType_encoded,NotificationsEnabled_encoded,Gender_Male,Gender_Unknown,AppVersion_1.2,AppVersion_2.0,Country_Canada,Country_US,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year
0,0.727546,1.323792,1.352574,0.712909,0.393525,0.517565,0.794296,1.829657,1.5706,-1.136255,0.851972,-1.247519,1.203505,0,5,0,0,1,1,0,0,1,0,1,1,1,1,0,1,0,0,0,0,1
1,-1.029866,-1.634484,-0.453823,0.321943,-1.465147,1.320108,-1.873668,-1.138895,-0.53837,1.028264,-0.631848,0.966106,-1.325679,1,0,0,1,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0
2,-0.854941,-1.634484,-0.687422,-1.229946,-0.666674,-0.905897,-1.873668,-2.506938,1.393933,0.669048,-0.631848,-1.247519,-1.205249,0,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,1,0,0,0,1
3,0.116101,-0.155346,-0.585789,0.990305,-0.630238,0.130695,0.240643,-0.338641,0.20428,-1.136255,0.779932,0.149123,-0.398455,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0,1,0,1,0,0
4,-1.420702,0.458552,0.129064,-0.170617,-0.488967,-0.905897,0.794296,-1.138895,0.20428,-1.136255,-2.080369,0.149123,-0.892468,1,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1


In [64]:
columns_to_convert = [
    'Gender_Male', 'Gender_Unknown', 'AppVersion_1.2', 'AppVersion_2.0', 
    'Country_Canada', 'Country_US', 'DeviceType_Mobile', 'OS_iOS', 
    'ReferralSource_Organic', 'ReferralSource_Social Media', 
    'FavoriteCategory_Meal Types', 'FavoriteCategory_Recipes', 
    'DaysSinceLastLogin_binned_Last Week', 'DaysSinceLastLogin_binned_Last Month', 
    'DaysSinceLastLogin_binned_Last 3 Months', 'DaysSinceLastLogin_binned_Last Year'
]

for column in columns_to_convert:
    churn_test[column] = churn_test[column].astype(int)

In [65]:
churn_test.head()

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,InAppPurchases,SupportTickets,EngagementScore,Churn_num,Age_binned_encoded,SubscriptionStatus_encoded,AccountType_encoded,NotificationsEnabled_encoded,Gender_Male,Gender_Unknown,AppVersion_1.2,AppVersion_2.0,Country_Canada,Country_US,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year
0,0.727546,1.323792,1.352574,0.712909,0.393525,0.517565,0.794296,1.829657,1.5706,-1.136255,0.851972,-1.247519,1.203505,0,5,0,0,1,1,0,0,1,0,1,1,1,1,0,1,0,0,0,0,1
1,-1.029866,-1.634484,-0.453823,0.321943,-1.465147,1.320108,-1.873668,-1.138895,-0.53837,1.028264,-0.631848,0.966106,-1.325679,1,0,0,1,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0
2,-0.854941,-1.634484,-0.687422,-1.229946,-0.666674,-0.905897,-1.873668,-2.506938,1.393933,0.669048,-0.631848,-1.247519,-1.205249,0,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,1,0,0,0,1
3,0.116101,-0.155346,-0.585789,0.990305,-0.630238,0.130695,0.240643,-0.338641,0.20428,-1.136255,0.779932,0.149123,-0.398455,1,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0,1,0,1,0,0
4,-1.420702,0.458552,0.129064,-0.170617,-0.488967,-0.905897,0.794296,-1.138895,0.20428,-1.136255,-2.080369,0.149123,-0.892468,1,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1


In [71]:
test_X = churn_test.drop(columns='Churn_num')
test_y = churn_test['Churn_num']

In [72]:
smaller_test_X = test_X.head(1000)
smaller_test_y = test_y.head(1000)

In [73]:
xgb_deployed_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_deployed_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [76]:
xgb_predictions = []
xgb_results = xgb_deployed_predictor.predict(smaller_test_X)
xgb_predictions += [r['score'] for r in xgb_results['predictions']]

print(len(xgb_predictions))

1000


In [77]:
smaller_test_X['xgb_predictions'] = xgb_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smaller_test_X['xgb_predictions'] = xgb_predictions


In [81]:
smaller_test_X['Churn_num'] = churn_test['Churn_num'][0:999]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smaller_test_X['Churn_num'] = churn_test['Churn_num'][0:999]


In [82]:
smaller_test_X.head()

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,InAppPurchases,SupportTickets,EngagementScore,Age_binned_encoded,SubscriptionStatus_encoded,AccountType_encoded,NotificationsEnabled_encoded,Gender_Male,Gender_Unknown,AppVersion_1.2,AppVersion_2.0,Country_Canada,Country_US,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year,xgb_predictions,Churn_num
0,0.727546,1.323792,1.352574,0.712909,0.393525,0.517565,0.794296,1.829657,1.5706,-1.136255,0.851972,-1.247519,1.203505,5,0,0,1,1,0,0,1,0,1,1,1,1,0,1,0,0,0,0,1,0.463832,0.0
1,-1.029866,-1.634484,-0.453823,0.321943,-1.465147,1.320108,-1.873668,-1.138895,-0.53837,1.028264,-0.631848,0.966106,-1.325679,0,0,1,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0.678847,1.0
2,-0.854941,-1.634484,-0.687422,-1.229946,-0.666674,-0.905897,-1.873668,-2.506938,1.393933,0.669048,-0.631848,-1.247519,-1.205249,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,1,0,0,0,1,0.8431,0.0
3,0.116101,-0.155346,-0.585789,0.990305,-0.630238,0.130695,0.240643,-0.338641,0.20428,-1.136255,0.779932,0.149123,-0.398455,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0,1,0,1,0,0,0.689988,1.0
4,-1.420702,0.458552,0.129064,-0.170617,-0.488967,-0.905897,0.794296,-1.138895,0.20428,-1.136255,-2.080369,0.149123,-0.892468,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0.759735,1.0


In [85]:
smaller_test = smaller_test_X.copy()

In [86]:
smaller_test.head()

Unnamed: 0,VideosWatched,CommentsMade,TimeSpent,AverageSessionDuration,TotalSessions,ProfileCompleteness,VideoUploads,VideoShares,LoginFrequency,AdInteractions,InAppPurchases,SupportTickets,EngagementScore,Age_binned_encoded,SubscriptionStatus_encoded,AccountType_encoded,NotificationsEnabled_encoded,Gender_Male,Gender_Unknown,AppVersion_1.2,AppVersion_2.0,Country_Canada,Country_US,DeviceType_Mobile,OS_iOS,ReferralSource_Organic,ReferralSource_Social Media,FavoriteCategory_Meal Types,FavoriteCategory_Recipes,DaysSinceLastLogin_binned_Last Week,DaysSinceLastLogin_binned_Last Month,DaysSinceLastLogin_binned_Last 3 Months,DaysSinceLastLogin_binned_Last Year,xgb_predictions,Churn_num
0,0.727546,1.323792,1.352574,0.712909,0.393525,0.517565,0.794296,1.829657,1.5706,-1.136255,0.851972,-1.247519,1.203505,5,0,0,1,1,0,0,1,0,1,1,1,1,0,1,0,0,0,0,1,0.463832,0.0
1,-1.029866,-1.634484,-0.453823,0.321943,-1.465147,1.320108,-1.873668,-1.138895,-0.53837,1.028264,-0.631848,0.966106,-1.325679,0,0,1,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0.678847,1.0
2,-0.854941,-1.634484,-0.687422,-1.229946,-0.666674,-0.905897,-1.873668,-2.506938,1.393933,0.669048,-0.631848,-1.247519,-1.205249,0,0,0,1,1,0,1,0,0,1,1,1,0,1,0,1,0,0,0,1,0.8431,0.0
3,0.116101,-0.155346,-0.585789,0.990305,-0.630238,0.130695,0.240643,-0.338641,0.20428,-1.136255,0.779932,0.149123,-0.398455,0,0,0,0,1,0,1,0,0,1,1,1,0,1,0,1,0,1,0,0,0.689988,1.0
4,-1.420702,0.458552,0.129064,-0.170617,-0.488967,-0.905897,0.794296,-1.138895,0.20428,-1.136255,-2.080369,0.149123,-0.892468,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0.759735,1.0


### 3.7 Export test data with predictions

In [83]:
def Export_Processed_CSV(bucket, processed_data_folder, local_file_name, S3_file_name, header_presence):
        
    '''Exports a dataframe into CSV format, and sends to a specified S3 bucket location
    
    Arguments 
    --------- 
    bucket: A list of the columns (i.e. the 3 diagnosis columns) to be updated
    processed_data_folder: the relevant subfolder within the main bucket
    local_file_name: The name of the dataframe within the notebook
    S3_file_name: The name of the file uppn export (with .csv extension included)
    header_presence: whether or not a header will be present within the exported csv
    
    Return
    ---------
    Exports a csv file to a specified S3 location'''
    
    local_file_name.to_csv(S3_file_name, index=False, header=header_presence)
    boto3.Session().resource('s3').Bucket(bucket).Object('{}/{}'.format(processed_data_folder, S3_file_name)).upload_file(S3_file_name)

In [84]:
test_with_predictions = "test_with_predictions.csv"

In [87]:
Export_Processed_CSV(bucket, input_prefix, smaller_test, test_with_predictions, True)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Please continue to the fourth script in this repository: 4. Evaluation_on_Test_Data