# Deciding whether to escalate a customer support issue

## Part 1: Load and examine the data

In [3]:
data_bucket = '' #we should use SageMaker default s3 bucket
subfolder = 'ch04'
dataset = 'inbound.csv'

In [None]:
import pandas as pd                               
import boto3
import sagemaker
from sklearn.model_selection import train_test_split
import json
import nltk
import csv

nltk.download('punkt')

role = sagemaker.get_execution_role()
sess = sagemaker.session.Session()
data_bucket = sess.default_bucket()

print(f'SageMaker exectuion role: {role},  default s3 bucket: {data_bucket}')

In [5]:
%%time
df = pd.read_csv(f'./{dataset}')
display(df.head())

Unnamed: 0,tweet_id,author_id,created_at,in_reply_to,text,escalate
0,2,115712,Tue Oct 31 22:11:45 +0000 2017,sprintcare,@sprintcare and how do you propose we do that,False
1,3,115712,Tue Oct 31 22:08:27 +0000 2017,sprintcare,@sprintcare I have sent several private messag...,True
2,5,115712,Tue Oct 31 21:49:35 +0000 2017,sprintcare,@sprintcare I did.,False
3,16,115713,Tue Oct 31 20:00:43 +0000 2017,sprintcare,@sprintcare Since I signed up with you....Sinc...,False
4,22,115716,Tue Oct 31 22:16:48 +0000 2017,Ask_Spectrum,@Ask_Spectrum Would you like me to email you a...,False


CPU times: user 1.68 s, sys: 136 ms, total: 1.82 s
Wall time: 1.84 s


In [6]:
print(f'Number of rows in dataset: {df.shape[0]}')
print(df['escalate'].value_counts())

Number of rows in dataset: 520793
False    417800
True     102993
Name: escalate, dtype: int64


## Part 2: Get the data into the right shape

In [7]:
train_df, val_df, _, _ = train_test_split(df, df['escalate'], test_size=0.2, random_state=0)
print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

416634 rows in training data
104159 rows in validation data


In [8]:
%%time

def preprocess(df):
    all_rows = df.values.tolist()
    transformed_rows = list(map(transform_instance, all_rows))
    transformed_df = pd.DataFrame(transformed_rows)
    return transformed_df

def transform_instance(row):
    cur_row = []
    label = "__label__1" if row[5] == True else "__label__0" # Prefix 0 or 1 from sentiment
    cur_row.append(label)
    cur_row.extend(nltk.word_tokenize(row[4].lower()))
    return ' '.join(cur_row)

transformed_validation_rows = preprocess(val_df)
display(transformed_validation_rows.head())

Unnamed: 0,0
0,__label__1 @ 115990 no joke ... this is one of...
1,__label__0 @ amazonhelp primeira camada ... ht...
2,__label__1 @ microsofthelps my mistake
3,__label__1 @ 770932 @ americanair they notorio...
4,__label__1 @ amazonhelp neither man seems to k...


CPU times: user 32.1 s, sys: 76.6 ms, total: 32.2 s
Wall time: 32.5 s


In [12]:
s3_validation_data = f's3://{data_bucket}/{subfolder}/processed/validation.csv'

transformed_validation_rows.to_csv(
        f's3://{data_bucket}/{subfolder}/processed/validation.csv',header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^')



In [13]:
%%time
transformed_train_rows = preprocess(train_df)
display(transformed_train_rows.head())

s3_train_data = f's3://{data_bucket}/{subfolder}/processed/train.csv'

transformed_train_rows.to_csv(
        f's3://{data_bucket}/{subfolder}/processed/train.csv',header=False, index=False, quoting=csv.QUOTE_NONE, sep='|', escapechar='^')


Unnamed: 0,0
0,__label__0 @ amazonhelp et en plus se faire en...
1,__label__1 @ morrisons @ 641226 standard reply...
2,__label__1 @ idea_cares @ 1936 @ 116590 this i...
3,__label__0 @ askamex yes i did weeks ago and n...
4,__label__0 @ amazonhelp i do n't want your stu...


CPU times: user 2min 11s, sys: 455 ms, total: 2min 11s
Wall time: 2min 14s


## Part 3: Create training and validation datasets

In [14]:
%%time

train_data = sagemaker.inputs.TrainingInput(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 21 µs


## Part 4: Train the model

In [15]:
s3_output_location = f's3://{data_bucket}/{subfolder}/output'

sess = sagemaker.Session()

container = sagemaker.image_uris.retrieve(
            region=boto3.Session().region_name, 
            framework= "blazingtext", 
            version="1")

estimator = sagemaker.estimator.Estimator(container,
                                         role, 
                                         instance_count=1, 
                                         instance_type='ml.c5.4xlarge',
                                         max_run = 600,
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

estimator.set_hyperparameters(mode="supervised",
                            epochs=10,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

estimator.fit({'train': train_data, 'validation': validation_data})

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


2022-02-08 08:47:41 Starting - Starting the training job...
2022-02-08 08:48:05 Starting - Launching requested ML instancesProfilerReport-1644310061: InProgress
......
2022-02-08 08:49:05 Starting - Preparing the instances for training.........
2022-02-08 08:50:39 Downloading - Downloading input data
2022-02-08 08:50:39 Training - Downloading the training image...
2022-02-08 08:51:06 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[02/08/2022 08:51:01 INFO 140473464378752] nvidia-smi took: 0.025217056274414062 secs to identify 0 gpus[0m
[34m[02/08/2022 08:51:01 INFO 140473464378752] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[02/08/2022 08:51:01 INFO 140473464378752] Processing /opt/ml/input/data/train/train.csv . File size: 43.80211067199707 MB[0m
[34m[02/08/2022 08:51:01 INFO 140473464378752] Processing /opt/ml/input/data/validation/validat

## Part 5: Host the Model

In [16]:
endpoint_name = 'customer-support-ch04'
try:
    predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)
    predictor.endpoint_context()
    sess.delete_endpoint(endpoint_name)
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
except:
    print(f'endpoint: {endpoint_name} not exits')
    pass

endpoint: customer-support-ch04 not exits


In [39]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

text_classifier = estimator.deploy(initial_instance_count = 1,
                                instance_type = 'ml.m5.large',
                                endpoint_name=endpoint_name,
                                serializer=JSONSerializer()
                                )

-----!

## Test the Model

In [41]:
tweet = "I'm not angry!"

tokenized_tweet = [' '.join(nltk.word_tokenize(tweet))]
payload = {"instances" : tokenized_tweet}
response = text_classifier.predict(data=payload)
escalate = pd.read_json(response)
escalate

Unnamed: 0,label,prob
0,[__label__1],[0.5290865302085871]


In [42]:
runtime= boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                       ContentType='application/json',
                                       Body=json.dumps(payload))
print(response)
print("ResponseMetadata:", response["ResponseMetadata"])
print("Body:", response['Body'].read().decode())

{'ResponseMetadata': {'RequestId': 'b3234185-dccd-456d-bf78-91786670af20', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b3234185-dccd-456d-bf78-91786670af20', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 08 Feb 2022 10:04:58 GMT', 'content-type': 'application/json', 'content-length': '57'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f083c15fcf8>}
ResponseMetadata: {'RequestId': 'b3234185-dccd-456d-bf78-91786670af20', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b3234185-dccd-456d-bf78-91786670af20', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 08 Feb 2022 10:04:58 GMT', 'content-type': 'application/json', 'content-length': '57'}, 'RetryAttempts': 0}
Body: [{"label": ["__label__1"], "prob": [0.5290865302085876]}]


## Remove the Endpoint (optional)

Comment out this cell to remove the endpoint if you want the endpoint to exist after "run all"

In [43]:
# Remove the Endpoint (optional)
# Comment out this cell to remove the endpoint if you want the endpoint to exist after "run all"

sess.delete_endpoint(endpoint_name)