In [1]:
# Install the pinned SageMaker SDK version.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install sagemaker==1.72.0

Collecting sagemaker==1.72.0
  Downloading sagemaker-1.72.0.tar.gz (297 kB)
[K     |████████████████████████████████| 297 kB 6.4 MB/s eta 0:00:01
Collecting smdebug-rulesconfig==0.1.4
  Downloading smdebug_rulesconfig-0.1.4-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-1.72.0-py2.py3-none-any.whl size=386358 sha256=32b96b8dc96b6a5b82333208c59bb7bd20ad0234cf42c89e5563633b5e7ad303
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c3/58/70/85faf4437568bfaa4c419937569ba1fe54d44c5db42406bbd7
Successfully built sagemaker
Installing collected packages: smdebug-rulesconfig, sagemaker
  Attempting uninstall: smdebug-rulesconfig
    Found existing installation: smdebug-rulesconfig 1.0.1
    Uninstalling smdebug-rulesconfig-1.0.1:
      Successfully uninstalled smdebug-rulesconfig-1.0.1
  Attempting uninstall: sagemaker
    Found existing install

In [2]:
# Download the IMDB review archive and unpack it under ../data.
# `-p` keeps this cell re-runnable: plain mkdir errors out once ../data exists.
%mkdir -p ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

--2021-05-27 16:15:07--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2021-05-27 16:15:11 (21.0 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



### Data Preparation

In [9]:
# Import the modules needed to locate and read the review files
import os
import glob

def read_imdb_data(dir='../data/aclImdb'):
    """Load the raw reviews from disk, grouped by split and sentiment.

    Looks for ``<dir>/<split>/<sentiment>/*.txt`` with ``split`` in
    ``('train', 'test')`` and ``sentiment`` in ``('pos', 'neg')``.

    Args:
        dir: Root directory of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple of nested dicts keyed first by split and
        then by sentiment. ``data[split][sentiment]`` is a list holding the
        full text of every matching file; ``labels[split][sentiment]`` is a
        list of the same length holding ``1`` for 'pos' and ``0`` for 'neg'.
        A missing or empty folder yields two empty lists.
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            label = 1 if sentiment == 'pos' else 0
            pattern = os.path.join(dir, data_type, sentiment, '*.txt')
            # glob returns paths in arbitrary, filesystem-dependent order;
            # sorting makes repeated runs see the reviews in the same order.
            files = sorted(glob.glob(pattern))

            reviews = []
            for path in files:
                # Decode explicitly instead of using the locale default, so the
                # result does not depend on the machine the notebook runs on.
                # NOTE(review): assumes the review files are UTF-8 - confirm.
                with open(path, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            # One label per review, so the two lists always have equal length.
            labels[data_type][sentiment] = [label] * len(reviews)

    return data, labels

In [10]:
# Load the raw reviews and report how many were found per split and sentiment.
data, labels = read_imdb_data()
# Four counts are passed, so the template needs four placeholders
# (the last one was missing, which silently dropped the test/neg count).
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
       len(data['train']['pos']), len(data['train']['neg']),
       len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / neg


In [11]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Merge the per-sentiment reviews into shuffled train and test sets.

    Args:
        data: Nested dict ``data[split][sentiment]`` -> list of reviews, with
            splits 'train'/'test' and sentiments 'pos'/'neg'.
        labels: Dict with the same layout holding one label per review.
        random_state: Forwarded to ``sklearn.utils.shuffle`` for both splits.
            Pass an int to get the same ordering on every run; the default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``(data_train, data_test, labels_train, labels_test)``. Within each
        split the reviews and labels are shuffled together, so position ``i``
        in the data list still matches position ``i`` in the label list.
    """
    # Positive reviews first, then negative; the shuffle below mixes them.
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Passing both lists to one shuffle call applies the same permutation to each.
    data_train, labels_train = shuffle(data_train, labels_train,
                                       random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test,
                                     random_state=random_state)

    return data_train, data_test, labels_train, labels_test

In [12]:
# Flatten the nested dicts into shuffled train/test lists, then report their sizes.
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDB reviews (combined): train = {}, test = {}".format(
    *(len(split) for split in (train_X, test_X))))

IMDB reviews (combined): train = 25000, test = 25000


In [13]:
# Display one raw review from the shuffled training set (still contains HTML such as <br />).
train_X[100]

'The quote I used for my summary occurs about halfway through THE GOOD EARTH, as a captain of a Chinese revolutionary army (played by Philip Ahn) apologizes to a mob for not having time to shoot MORE of the looters among them, as his unit has just been called back to the front lines. Of course, the next looter about to be found out and shot is the main character of the film, the former kitchen slave girl O-Lan (for whose portrayal Luise Rainer, now 99-years-old, won her second consecutive best actress Oscar).<br /><br />The next scene finds O-Lan dutifully delivering her bag of looted jewels to her under-appreciative husband, farmer Wang Lung (Paul Muni), setting in motion that classic dichotomy of a man\'s upward financial mobility being the direct inverse of his moral decline.<br /><br />For a movie dealing with subject matter including slavery, false accusations, misogyny, starvation, home invasion, eating family pets, mental retardation, infanticide, exploited refugees, riots, civi

### Data Processing

In [14]:
import nltk 
# Download the NLTK stopword corpus that `nltk.corpus.stopwords` reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): wildcard import; the visible cells only use PorterStemmer from it.
# Consider `from nltk.stem.porter import PorterStemmer` once the rest of the
# notebook is confirmed not to rely on other names pulled in here.
from nltk.stem.porter import *
# Module-level Porter stemmer instance.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [22]:
import re
from bs4 import BeautifulSoup

def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def review_to_words(review):
    """Convert a raw review string into a list of stemmed, lower-case tokens.

    Processing steps, in order:
      1. Strip HTML markup with BeautifulSoup's 'html.parser'.
      2. Lower-case the text and replace every character that is not an ASCII
         letter or digit with a space.
      3. Split on whitespace.
      4. Drop English stopwords (NLTK list).
      5. Reduce each remaining word to its Porter stem.

    Args:
        review: The raw review text, possibly containing HTML.

    Returns:
        A list of stemmed tokens in their original order (duplicates kept).
    """
    text = BeautifulSoup(review, 'html.parser').get_text()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Hoisted out of the per-word loop: the stopword list used to be re-read
    # and a new stemmer built for every single word.
    stop_words = _english_stopwords()
    porter = PorterStemmer()

    return [porter.stem(w) for w in text.split() if w not in stop_words]

In [25]:
# Sanity check: tokenize the training review at index 100 and display the resulting stems.
review_to_words(train_X[100])

['quot',
 'use',
 'summari',
 'occur',
 'halfway',
 'good',
 'earth',
 'captain',
 'chines',
 'revolutionari',
 'armi',
 'play',
 'philip',
 'ahn',
 'apolog',
 'mob',
 'time',
 'shoot',
 'looter',
 'among',
 'unit',
 'call',
 'back',
 'front',
 'line',
 'cours',
 'next',
 'looter',
 'found',
 'shot',
 'main',
 'charact',
 'film',
 'former',
 'kitchen',
 'slave',
 'girl',
 'lan',
 'whose',
 'portray',
 'luis',
 'rainer',
 '99',
 'year',
 'old',
 'second',
 'consecut',
 'best',
 'actress',
 'oscar',
 'next',
 'scene',
 'find',
 'lan',
 'duti',
 'deliv',
 'bag',
 'loot',
 'jewel',
 'appreci',
 'husband',
 'farmer',
 'wang',
 'lung',
 'paul',
 'muni',
 'set',
 'motion',
 'classic',
 'dichotomi',
 'man',
 'upward',
 'financi',
 'mobil',
 'direct',
 'invers',
 'moral',
 'declin',
 'movi',
 'deal',
 'subject',
 'matter',
 'includ',
 'slaveri',
 'fals',
 'accus',
 'misogyni',
 'starvat',
 'home',
 'invas',
 'eat',
 'famili',
 'pet',
 'mental',
 'retard',
 'infanticid',
 'exploit',
 'refuge',
 

In [28]:
import pickle

# Directory for cached intermediate results; created here if it does not exist yet.
cache_dir = os.path.join("../cache", "sentiment_analysis")
os.makedirs(cache_dir, exist_ok=True)

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: Iterable of raw training reviews.
        data_test: Iterable of raw test reviews.
        labels_train: Labels for ``data_train``.
        labels_test: Labels for ``data_test``.
        cache_dir: Directory holding the cache file.
        cache_file: File name of the pickle cache inside ``cache_dir``; pass
            ``None`` to disable both reading and writing the cache.

    Returns:
        ``(words_train, words_test, labels_train, labels_test)``. On a cache
        hit all four values come from the cache file and the arguments are
        ignored. Otherwise the word lists are produced by ``review_to_words``
        and the labels are returned unchanged.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        try:
            # NOTE(review): pickle.load can execute arbitrary code - only point
            # this at cache files produced by this notebook.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet: fall through and compute
        except Exception as err:
            # Stay best-effort for an unreadable or corrupt cache, but say so
            # instead of hiding it (and let KeyboardInterrupt propagate).
            print("Ignoring unreadable cache file {}: {}".format(cache_file, err))
            cache_data = None

    # Cache hit: return the stored values. Previously this path fell into the
    # write branch and raised NameError on the undefined `words_train`.
    if cache_data is not None:
        return (cache_data["words_train"], cache_data["words_test"],
                cache_data["labels_train"], cache_data["labels_test"])

    words_train = [review_to_words(review) for review in data_train]
    words_test = [review_to_words(review) for review in data_test]

    # Only write when caching is enabled. Previously `cache_file=None` ended up
    # subscripting `None` in the misplaced else branch.
    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache files:", cache_file)

    return words_train, words_test, labels_train, labels_test

In [None]:
# Preprocess data
# Argument order must follow preprocess_data's signature:
# (data_train, data_test, labels_train, labels_test). The labels were previously
# passed in the data_test slot.
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

### Extract Bag-of-Words features

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib

def extract_bow_features(words_train, words_test, vocabulary_size=5000, 
                         cache_dir = cache_dir, cache_file="bow_features.pkl"):
    
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass
        
    if cache_data is None:
        vectorizer =  CountVectorizer(max_features= vocabulary_size, 
                                     preprocessor = lambda x:x, tokenizer = lambda x:x)
        
        features_train = vectorizer.fit_transform(words_train).toarray()
        
        features_test = vectorizer.transform(words_test).toarray()