# Amazon Reviews - Sentiment Model Experimentation

### Imports

In [1]:
import boto3
import sagemaker
import sys
import os
import re
import numpy as np
import pandas as pd
import subprocess
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import gzip
from io import BytesIO
import zipfile
import random
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.metrics import classification_report
import nltk

### Configs and Global Vars

In [28]:
# Notebook-wide settings read by the setup and manifest cells.
configs = dict(
    aws_region='us-east-1',
    bucket_name='demos-amazon-reviews',
    # Only set a prefix when the files are kept in a "folder" inside the bucket.
    prefix='preprocessed_reviews',
    index_key='review_date_str',
    file_extension='.parquet',
)

# Shared state dict; setup_env() stores the role, session and S3 handles in it.
global_vars = dict()

### Environment Setup

Setting up the environment involves ensuring that the correct session and IAM role are configured. We also need to ensure that the correct region and bucket are available.

In [29]:
def setup_env(configs, global_vars):
    """Create the SageMaker session, IAM role and S3 handles for the notebook.

    The bucket named in ``configs['bucket_name']`` is created (private ACL)
    when it does not exist yet; otherwise it is reused as-is.

    Args:
        configs: dict with at least 'aws_region' and 'bucket_name'.
        global_vars: dict that is updated in place with the keys
            'role', 'sess', 's3' and 's3_bucket'.

    Returns:
        The same ``global_vars`` dict, after the update.
    """
    sess = sagemaker.Session()
    role = get_execution_role()

    aws_region = configs['aws_region']
    bucket_name = configs['bucket_name']

    # Bind the resource to the configured region so that bucket creation
    # happens where the config says it should.
    s3 = boto3.resource('s3', region_name=aws_region)
    s3_bucket = s3.Bucket(bucket_name)

    # A creation_date of None is treated as "the bucket does not exist yet".
    if s3_bucket.creation_date is None:
        print('Creating S3 bucket {}.'.format(bucket_name))
        create_kwargs = {'ACL': 'private', 'Bucket': bucket_name}
        # us-east-1 is the S3 default and must not be sent as a
        # LocationConstraint; every other region has to be named explicitly.
        if aws_region != 'us-east-1':
            create_kwargs['CreateBucketConfiguration'] = {
                'LocationConstraint': aws_region
            }
        s3.create_bucket(**create_kwargs)
    else:
        print('Bucket already exists')

    global_vars['role'] = role
    global_vars['sess'] = sess
    global_vars['s3'] = s3
    global_vars['s3_bucket'] = s3_bucket

    return global_vars

# Run the setup and keep the returned dict, which now holds 'role', 'sess', 's3' and 's3_bucket'.
global_vars = setup_env(configs, global_vars)

Bucket already exists


### Create Data Manifest

In [None]:
def create_dataset_manifest(configs, global_vars, progress_interval=100000):
    """Build a manifest of the data files stored in the S3 bucket.

    Every object in ``global_vars['s3_bucket']`` is listed, and each key that
    ends with ``configs['file_extension']`` gets one manifest entry.

    Args:
        configs: dict with 'file_extension' (required) and 'prefix'
            (optional; a missing or empty prefix means nothing is stripped).
        global_vars: dict whose 's3_bucket' entry provides ``objects.all()``,
            yielding items that have a ``key`` attribute.
        progress_interval: print a progress line each time this many matching
            files have been collected. A falsy value disables progress output.

    Returns:
        A list of dicts with the keys 'idx' (0-based position in the
        manifest), 'path' (key with the leading prefix removed) and
        'path_with_prefix' (the full object key).
    """
    bucket = global_vars['s3_bucket']
    file_format = configs['file_extension']
    prefix = configs.get('prefix') or ''

    manifest = []
    # NOTE: the whole bucket is listed, not only keys under the prefix; keys
    # that do not start with the prefix keep their full path.
    for obj in bucket.objects.all():
        path = obj.key
        # Match on the suffix: a substring test would also accept keys such
        # as 'part.parquet.crc'.
        if not path.endswith(file_format):
            continue

        # Strip the prefix only where it leads the key; str.replace would
        # also remove any later occurrence inside the file name.
        relative_path = path[len(prefix):] if path.startswith(prefix) else path
        manifest.append({
            'idx': len(manifest),
            'path': relative_path,
            'path_with_prefix': path,
        })

        # Report only right after a match so each milestone prints once and
        # non-matching objects never trigger a "Processed 0 files" line.
        if progress_interval and len(manifest) % progress_interval == 0:
            print('Processed {} files'.format(len(manifest)))

    print('Training Dataset Size {}'.format(len(manifest)))
    return manifest
            
manifest = create_dataset_manifest(configs, global_vars)   # list of dicts with 'idx', 'path' and 'path_with_prefix'
    

Processed 100000 files
