In [None]:
import sagemaker
# IAM role that SageMaker jobs launched from this notebook will assume.
role = sagemaker.get_execution_role()

# Session wrapper plus its default S3 bucket, used for all data and model artifacts below.
session = sagemaker.Session()
bucket = session.default_bucket()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Copy the synthetic churn dataset from the `sagemaker-sample-files` bucket
# into the notebook's working directory as `churn.txt`.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt ./churn.txt

In [None]:
# Print the first lines of the raw file to check its layout before parsing it.
!head churn.txt

# Read data from CSV

In [None]:
# Parse the downloaded file with read_csv's defaults.
churn_df = pd.read_csv(filepath_or_buffer='./churn.txt')

# Display the first five rows as the cell output.
churn_df.head(n=5)

# Dataset description

**State** – The US state in which the customer resides, indicated by a two-letter abbreviation; for example, OH or NJ

**Account Length** – The number of days that this account has been active

**Area Code** – The three-digit area code of the customer’s phone number

**Phone** – The remaining seven-digit phone number

**Int’l Plan** – Whether the customer has an international calling plan (yes/no)

**VMail Plan** – Whether the customer has a voice mail feature (yes/no)

**VMail Message** – The average number of voice mail messages per month

**Day Mins** – The total number of calling minutes used during the day

**Day Calls** – The total number of calls placed during the day

**Day Charge** – The billed cost of daytime calls

**Eve Mins**, **Eve Calls**, **Eve Charge** – The minutes used, number of calls placed, and billed cost for evening calls

**Night Mins**, **Night Calls**, **Night Charge** – The minutes used, number of calls placed, and billed cost for nighttime calls

**Intl Mins**, **Intl Calls**, **Intl Charge** – The minutes used, number of calls placed, and billed cost for international calls

**CustServ Calls** – The number of calls placed to customer service

**Churn?** – Whether the customer left the service (true/false)

# Data preprocessing

Remove `Phone` column

In [None]:
# Drop the `Phone` column.
# `errors='ignore'` keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been removed.
churn_df = churn_df.drop(columns='Phone', errors='ignore')

Convert `Area Code` from `int` to `object` dtype so that it is treated as a non-numeric (categorical) field rather than as a number

In [None]:
# Recast `Area Code` to object dtype so that it is no longer a numeric column;
# every other column keeps its dtype.
churn_df = churn_df.astype({'Area Code': object})

Convert result variable from `True.`/`False.` to `1`/`0`

In [None]:
# Encode the target explicitly instead of treating "anything that is not
# 'False.'" as churn: unexpected or missing labels now fail loudly rather than
# silently becoming 1. The 0/1 entries make the cell idempotent, so re-running
# it after the column is already encoded leaves the values unchanged.
churn_label_map = {'False.': 0, 'True.': 1, 0: 0, 1: 1}
churn_encoded = churn_df['Churn?'].map(churn_label_map)

if churn_encoded.isna().any():
    unexpected_labels = churn_df.loc[churn_encoded.isna(), 'Churn?'].unique()
    raise ValueError(f'Unexpected values in `Churn?`: {unexpected_labels!r}')

churn_df['Churn?'] = churn_encoded.astype(int)

Move `Churn?` column to the beginning of the table

In [None]:
# Reorder the columns so that `Churn?` comes first and all remaining columns
# keep their relative order.
# NOTE(review): presumably needed because the training CSV is written without a
# header and the algorithm reads the label from the first column; confirm
# against the SageMaker XGBoost input-format documentation.
churn_df = churn_df[['Churn?', *churn_df.columns.drop('Churn?')]]

In [None]:
# Check the result of the clean-up steps so far: `Churn?` should be the first column.
churn_df.head(n=5)

Use one-hot encoding for non-numeric fields.

In [None]:
# One-hot encode every non-numeric column.
# `dtype=int` pins the indicator columns to 0/1 integers. Since pandas 2.0 the
# default is `bool`, which `to_csv` would serialise as the text `True`/`False`
# in the header-less training files written later in this notebook.
churn_df = pd.get_dummies(churn_df, dtype=int)

In [None]:
# Inspect the encoded frame: the non-numeric fields are now indicator columns.
churn_df.head(n=5)

# Data splitting

Randomly shuffle the dataset

In [None]:
# Sampling the whole frame (frac=1.0) without replacement is a full shuffle;
# the fixed seed makes the row order, and therefore the splits, repeatable.
churn_df_shuffled = churn_df.sample(frac=1.0, random_state=42)

Split dataset into train/validation/test datasets

In [None]:
# 60% / 20% / 20% split of the shuffled rows into train / validation / test.
# Positional slicing replaces `np.split`, which on a DataFrame goes through
# `DataFrame.swapaxes` (deprecated since pandas 2.1). The boundaries are the
# same two cut points the original passed to `np.split`.
churn_df_len = len(churn_df_shuffled)
train_end = int(0.6 * churn_df_len)
validate_end = int(0.8 * churn_df_len)

churn_df_train = churn_df_shuffled.iloc[:train_end]
churn_df_validate = churn_df_shuffled.iloc[train_end:validate_end]
churn_df_test = churn_df_shuffled.iloc[validate_end:]

Write datasets as CSV files.

We want to write a file without row names (`index=False`) and without a CSV header (`header=False`)

In [None]:
# Write the train and validation splits as bare CSV: values only, with no
# header row and no index column.
for split_df, csv_path in (
    (churn_df_train, 'churn_train.csv'),
    (churn_df_validate, 'churn_validate.csv'),
):
    split_df.to_csv(csv_path, header=False, index=False)

Copy result files to S3

In [None]:
# Upload both CSV files to the root of the session's default bucket.
# `{bucket}` is filled in by IPython from the Python variable `bucket`.
!aws s3 cp churn_train.csv s3://{bucket}/churn_train.csv
!aws s3 cp churn_validate.csv s3://{bucket}/churn_validate.csv

Create inputs for the estimator

In [None]:
from sagemaker.inputs import TrainingInput

# S3 locations of the two CSV files uploaded in the previous step.
train_s3_uri = f's3://{bucket}/churn_train.csv'
validate_s3_uri = f's3://{bucket}/churn_validate.csv'

# Describe each file as a CSV input channel for the training job.
s3_input_train = TrainingInput(s3_data=train_s3_uri, content_type='csv')
s3_input_validate = TrainingInput(s3_data=validate_s3_uri, content_type='csv')

Retrieve the container image for the XGBoost algorithm

In [None]:
# Look up the XGBoost 1.5-1 container image URI for the region this session runs in.
xgb_image = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='1.5-1',
)

Train a model

In [None]:
# Estimator for the XGBoost container: one ml.m5.large instance, with output
# written under `output/` in the session's default bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/output',
    sagemaker_session=session,
)

# Binary classification with a logistic objective: 100 boosting rounds, trees
# of depth at most 5.
xgb_hyperparameters = {
    'max_depth': 5,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Start the training job with the train and validation channels defined earlier.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validate,
}
xgb.fit(inputs=training_channels)