## IMDB Sentiment Analysis Data

### Download Data

In [1]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

mkdir: cannot create directory ‘../data’: File exists
--2020-07-14 16:18:47--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2020-07-14 16:18:52 (17.7 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [13]:
import os
import glob

def read_data(filepath = '../data/aclImdb'):
    """Load the review texts and their sentiment labels from disk.

    The directory is expected to be laid out as
    ``<filepath>/<train|test>/<pos|neg>/*.txt``, one review per file.

    Args:
        filepath: Root directory of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple of nested dicts indexed as
        ``[split][sentiment]``. ``data`` holds lists of review strings and
        ``labels`` holds parallel lists of ints (1 for 'pos', 0 for 'neg').
        A missing or empty directory yields empty lists.
    """
    data = {}
    labels = {}

    for data_type in ['train','test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos','neg']:
            label = 1 if sentiment == 'pos' else 0
            path = os.path.join(filepath, data_type, sentiment, '*.txt')
            # glob returns files in arbitrary, filesystem-dependent order;
            # sorting keeps the loaded order reproducible across runs and machines.
            files = sorted(glob.glob(path))

            reviews = []
            for f in files:
                # Decode explicitly as UTF-8 instead of relying on the
                # platform's default encoding.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            labels[data_type][sentiment] = [label] * len(reviews)
    return data, labels

In [14]:
data,labels = read_data()

In [17]:
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [21]:
from sklearn.utils import shuffle

def prepare_data(data, labels, random_state=None):
    """Merge the positive and negative reviews of each split and shuffle them.

    Args:
        data: Nested dict ``data[split][sentiment]`` of review lists, with
            splits 'train'/'test' and sentiments 'pos'/'neg'.
        labels: Nested dict with the same layout holding the labels.
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle``
            so the ordering can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``[data_train, data_test, labels_train, labels_test]``; within each
        split the reviews and labels are shuffled with the same permutation.
    """
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']

    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle each split on its own. A single shuffle() call over all four
    # lists requires them to have identical lengths, which only holds when
    # the train and test splits happen to be the same size.
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    return [data_train, data_test, labels_train, labels_test]

In [22]:
train_X, test_X, train_y, test_y = prepare_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


### Preprocessing Text Data

In [23]:
#Trying out TEXTHERO : https://texthero.org/docs/getting-started
!pip install texthero

Collecting texthero
  Downloading texthero-1.0.9-py3-none-any.whl (25 kB)
Collecting gensim>=3.6.0
  Downloading gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 10.8 MB/s eta 0:00:01
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.1.1-py2.py3-none-any.whl (238 kB)
[K     |████████████████████████████████| 238 kB 43.2 MB/s eta 0:00:01
[?25hCollecting wordcloud>=1.5.0
  Downloading wordcloud-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (364 kB)
[K     |████████████████████████████████| 364 kB 58.8 MB/s eta 0:00:01
Collecting spacy>=2.2.2
  Downloading spacy-2.3.2-cp36-cp36m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 43.9 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 45.2 MB/s eta 0:00:01
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.2-cp36-cp36m-manylinux1_x86_64.whl (19 kB)

In [46]:
import re

# Punctuation that is removed outright (nothing is inserted in its place).
# Raw strings avoid the invalid-escape-sequence warnings that "\." etc. trigger.
REPLACE_NO_SPACE = re.compile(r'[.;:!?,"()\[\]\']')
# HTML line breaks (one or more in a row), hyphens and slashes separate words,
# so they are replaced by a single space.
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/>)+|-|/")

def review_to_words(review):
    """Normalise a raw review into a lowercase, punctuation-free string.

    Args:
        review: Raw review text, possibly containing ``<br />`` tags.

    Returns:
        The lowercased text with punctuation deleted and with ``<br />``
        runs, '-' and '/' replaced by a space. Whitespace is not collapsed.
    """
    words = REPLACE_NO_SPACE.sub("", review.lower())
    # Run after punctuation removal; the <br /> pattern contains none of the
    # stripped characters, so the tags are still intact at this point.
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words

In [47]:
review_to_words(train_X[100])


'the second alternate gundam universe tale g gundam being the first gundam wing is yet another different view into the gundam verse the familiar elements are found but gundam wing is actually different then its counterparts the biggest being the gundams are nothing more than terrorists combating one lone organization in truth the series doesnt really become a show about war until episode 7 but in truth the real conflict the eve wars dont happen until the later episodes  the greatest positives of this series are its characters all the main characters are fleshed out throughout the 49 episode run and you can really sympathize with each of the roles their put in another great plus is the fantastic character and mecha design of the series the designs put some of its other gundam counterparts to shame  one of the biggest criticism of this series is how many die hard uc fans claim rip off of the original uc saga why gundam wing gets this rap when the more apparent uc clone of gundam seed is 

In [52]:
cache_dir = os.path.join("../cache", "preprocessed_data")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

import pickle


def preprocess_data(data_train,data_test, labels_train,labels_test,cache_file="preprocessed_data.pkl" ):
    """Run review_to_words over both splits, using an on-disk cache when available.

    Args:
        data_train: Iterable of raw training reviews.
        data_test: Iterable of raw test reviews.
        labels_train: Labels for the training reviews (stored alongside the text).
        labels_test: Labels for the test reviews (stored alongside the text).
        cache_file: File name inside the module-level ``cache_dir``; pass
            ``None`` to disable caching entirely.

    Returns:
        ``(words_train, words_test, labels_train, labels_test)``.

    Note:
        The cache is keyed only by file name. When a readable cache exists its
        contents are returned and ALL four arguments are ignored, so delete the
        file (or pass a different ``cache_file``) after changing the inputs.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If caching is enabled, try to read from the cache first
    cache_data = None
    if cache_path is not None:
        try:
            # NOTE(security): pickle can execute arbitrary code on load; only
            # read cache files that this notebook wrote itself.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, fall through and compute
        except Exception as err:
            # Stay best-effort (recompute below) but report the problem
            # instead of hiding it; KeyboardInterrupt is no longer swallowed.
            print("Ignoring unreadable cache file {}: {!r}".format(cache_file, err))
            cache_data = None

    if cache_data is not None:
        # Unpack data loaded from cache file
        return (cache_data['words_train'], cache_data['words_test'],
                cache_data['labels_train'], cache_data['labels_test'])

    words_train = [review_to_words(review) for review in data_train]
    words_test = [review_to_words(review) for review in data_test]

    # Write to cache file for future runs
    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train, labels_test
    

In [53]:
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Wrote preprocessed data to cache file: preprocessed_data.pkl


### Extract Bag-of-Words features

In [54]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# prefer the standalone joblib package and keep the old import as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def extract_BoW_features(words_train, words_test, vocabulary_size=5000,
                         cache_dir=cache_dir, cache_file="bow_features.pkl"):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words.

    Args:
        words_train: Training documents, passed to ``CountVectorizer.fit_transform``.
        words_test: Test documents, transformed with the vocabulary fitted on
            the training documents.
        vocabulary_size: Maximum number of features kept by the vectorizer.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name; pass ``None`` to disable caching.

    Returns:
        ``(features_train, features_test, vocabulary)`` where the features are
        dense arrays and ``vocabulary`` is the vectorizer's ``vocabulary_``
        mapping.

    Note:
        The cache is keyed only by file name. When a readable cache exists its
        contents are returned and the document arguments and
        ``vocabulary_size`` are ignored.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If caching is enabled, try to read from the cache first
    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, fall through and compute
        except Exception as err:
            # Stay best-effort (recompute below) but report the problem
            # instead of hiding it; KeyboardInterrupt is no longer swallowed.
            print("Ignoring unreadable cache file {}: {!r}".format(cache_file, err))
            cache_data = None

    if cache_data is not None:
        # Unpack data loaded from cache file
        return (cache_data['features_train'], cache_data['features_test'],
                cache_data['vocabulary'])

    # Cache is missing, so do the heavy lifting.
    # The vectorizer is created with default preprocessing and tokenisation;
    # only the vocabulary size is limited.
    vectorizer = CountVectorizer(max_features=vocabulary_size)

    # .toarray() converts the sparse result into a dense array. Dense arrays
    # are simple to slice and export, but they take more memory than the
    # sparse matrices they come from.
    features_train = vectorizer.fit_transform(words_train).toarray()

    # Apply the same vectorizer to transform the test documents (ignore unknown words)
    features_test = vectorizer.transform(words_test).toarray()

    vocabulary = vectorizer.vocabulary_

    # Write to cache file for future runs (store vocabulary as well)
    if cache_path is not None:
        cache_data = dict(features_train=features_train, features_test=features_test,
                          vocabulary=vocabulary)
        with open(cache_path, "wb") as f:
            joblib.dump(cache_data, f)
        print("Wrote features to cache file:", cache_file)

    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary





In [55]:
train_X, test_X, vocabulary = extract_BoW_features(train_X, test_X)
len(train_X[100])

Wrote features to cache file: bow_features.pkl


5000

### Upload data to S3

In [56]:
import pandas as pd

# The training data was shuffled earlier, so a positional split is sufficient:
# the first 10 000 reviews form the validation set and the rest stay as training data.
# Each right-hand side is evaluated in full before any name is rebound, so both
# slices are taken from the original, unsplit arrays.
val_X, train_X = pd.DataFrame(train_X[:10000]), pd.DataFrame(train_X[10000:])
val_y, train_y = pd.DataFrame(train_y[:10000]), pd.DataFrame(train_y[10000:])

The __XGBoost classifier__ that we will be using requires the dataset to be written to a file and stored in Amazon S3. To do this, we start by splitting the training dataset into two parts: the data we will train the model with and a validation set. We then write those datasets to local files and upload the files to S3. In addition, we write the test set to a file and upload it to S3 so that we can use SageMaker's Batch Transform functionality to test our model once we've fit it.
The documentation for the XGBoost algorithm in SageMaker requires that the training and validation datasets contain no header row or index column, and that the label be the first column of each sample.

In [57]:
# Directory the CSV files are written to (and later uploaded from).
data_dir = '../'
# Test set: features only, written without header or index.
pd.DataFrame(test_X).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# Validation and training sets: label column first, followed by the feature columns,
# again without header or index.
pd.concat([val_y, val_X], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([train_y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [62]:
import boto3

def upload_to_s3(file_to_upload, file_in_s3, bucket='exercisedatasj',
                 prefix='sagemaker/sentiment_analysis'):
    """Upload a local file to S3 and print the resulting S3 URI.

    Args:
        file_to_upload: Path of the local file.
        file_in_s3: Object name to use under ``prefix``.
        bucket: Target bucket; defaults to the bucket this notebook has used so far.
        prefix: Key prefix inside the bucket; an empty prefix puts the object
            at the bucket root.
    """
    # Join only the non-empty parts so an empty prefix does not produce a
    # key with a leading '/'.
    key = '/'.join(part for part in (prefix.strip('/'), file_in_s3) if part)

    url = 's3://{}/{}'.format(bucket, key)
    boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_file(file_to_upload)
    print('Done writing to {}'.format(url))

In [64]:
upload_to_s3(os.path.join(data_dir, 'test.csv'),'test.csv')
upload_to_s3(os.path.join(data_dir, 'validation.csv'),'validation.csv')
upload_to_s3(os.path.join(data_dir, 'train.csv'),'train.csv')

Done writing to s3://exercisedatasj/sagemaker/sentiment_analysis/test.csv
Done writing to s3://exercisedatasj/sagemaker/sentiment_analysis/validation.csv
Done writing to s3://exercisedatasj/sagemaker/sentiment_analysis/train.csv


### Creating XGBoost Model 

In [67]:
import sagemaker
from sagemaker import get_execution_role

session = sagemaker.Session() # Store the current SageMaker session

# Our current execution role is required when creating the model as the training
# and inference code will need to access the model artifacts.
role = get_execution_role()

In [68]:
# We need to retrieve the location of the container which is provided by Amazon for using XGBoost.
# As a matter of convenience, the training and inference code both use the same container.
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
	get_image_uri(region, 'xgboost', '1.0-1').


In [69]:
# S3 key prefix for the model artifacts. `prefix` is otherwise only a local
# variable inside upload_to_s3, so on a fresh kernel the output_path below
# would raise a NameError.
# NOTE(review): value chosen to match the location the datasets were uploaded to — confirm.
prefix = 'sagemaker/sentiment_analysis'

# First we create a SageMaker estimator object for our model.
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

# And then set the algorithm specific parameters.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)



In [72]:
s3_input_train = sagemaker.s3_input(s3_data='s3://exercisedatasj/sagemaker/sentiment_analysis/train.csv', content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://exercisedatasj/sagemaker/sentiment_analysis/validation.csv', content_type='csv')



In [73]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2020-07-14 17:30:07 Starting - Starting the training job...
2020-07-14 17:30:10 Starting - Launching requested ML instances......
2020-07-14 17:31:29 Starting - Preparing the instances for training......
2020-07-14 17:32:28 Downloading - Downloading input data...
2020-07-14 17:32:43 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-07-14:17:33:10:INFO] Running standalone xgboost training.[0m
[34m[2020-07-14:17:33:10:INFO] File size need to be processed in the node: 238.5mb. Available memory size in the node: 8506.81mb[0m
[34m[2020-07-14:17:33:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:33:10] S3DistributionType set as FullyReplicated[0m
[34m[17:33:12] 15000x5000 matrix with 75000000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-14:17:33:12:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:33:12] S3DistributionType set as FullyReplicated[0m
[34m[17:33:13] 10000x5000 m

### Testing the model with Batch Transformation

In [84]:
test_X = train_X = val_X = train_y = val_y = None

In [74]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')



In [75]:
xgb_transformer.transform('s3://exercisedatasj/sagemaker/sentiment_analysis/test.csv', content_type='text/csv', split_type='Line')

__Now the transform job has executed and the result, the estimated sentiment of each review, has been saved on S3. Since we would rather work on this file locally we can perform a bit of notebook magic to copy the file to the data_dir__.

In [86]:
# !aws s3 cp --recursive $xgb_transformer.output_path $data_dir

In [87]:
xgb_transformer.output_path

's3://sagemaker-us-east-1-577153973786/xgboost-2020-07-14-17-37-22-327'

In [96]:
# Build the S3 URI with an explicit '/': os.path.join uses the operating
# system's path separator, which is not a valid URI separator on every platform.
predictions = pd.read_csv('{}/{}'.format(xgb_transformer.output_path.rstrip('/'), 'test.csv.out'), header=None)

In [97]:
predictions

Unnamed: 0,0
0,0.011025
1,0.054424
2,0.861829
3,0.178202
4,0.125888
...,...
24995,0.460890
24996,0.947942
24997,0.798385
24998,0.614617


In [98]:
predictions = [round(num) for num in predictions.squeeze().values]

In [99]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

0.8584

### Deploying the model

In [100]:
xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')



-------------!

In [None]:
import boto3
runtime = boto3.Session().client('sagemaker-runtime')

In [101]:
xgb_predictor.endpoint

'xgboost-2020-07-14-17-30-07-049'

In [102]:
test_review = "Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs!"

In [103]:
test_words = review_to_words(test_review)
print(test_words)

nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies totally devoid of any heart or heat a romantic comedy that has zero romantic chemestry and zero laughs


In [107]:
def bow_encoding(words, vocabulary):
    """Encode a whitespace-separated string as a Bag-of-Words count vector.

    Args:
        words: Text whose tokens are obtained with ``str.split()``.
        vocabulary: Mapping from word to its position in the output vector.

    Returns:
        A list of ``len(vocabulary)`` ints in which position ``vocabulary[w]``
        holds the number of times ``w`` occurs in ``words``. Tokens that are
        not in the vocabulary are ignored.
    """
    counts = [0] * len(vocabulary)
    # Look up the slot of every known token and bump it once per occurrence.
    known_slots = (vocabulary[token] for token in words.split() if token in vocabulary)
    for slot in known_slots:
        counts[slot] += 1
    return counts

test_bow = bow_encoding(test_words, vocabulary)

In [108]:
len(test_bow)

5000

In [109]:
runtime = boto3.Session().client('sagemaker-runtime')

response = runtime.invoke_endpoint(EndpointName = xgb_predictor.endpoint, # The name of the endpoint we created
                                       ContentType = 'text/csv',                     # The data format that is expected
                                       Body = ','.join([str(val) for val in test_bow]).encode('utf-8'))

In [110]:
response = response['Body'].read().decode('utf-8')
print(response)

0.399025201797


In [111]:
xgb_predictor.delete_endpoint()