## Download the IMDB Dataset

In [1]:
#Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
   print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

## Preprocessing: Capturing Word Correlation in Input Data

In [3]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the','cat','sat']
x = onehots[sentence[0]] + \
    onehots[sentence[1]] + \
    onehots[sentence[2]]

print("Sent Encoding:" + str(x))

Sent Encoding:[1 1 0 1]


## Predicting Movie Reviews

In [4]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [5]:
tokens

[{'',
  '\n',
  '.',
  'a',
  'about',
  'adults',
  'age',
  'all',
  'and',
  'as',
  'at',
  'believe',
  'bromwell',
  'burn',
  'can',
  'cartoon',
  'classic',
  'closer',
  'comedy',
  'down',
  'episode',
  'expect',
  'far',
  'fetched',
  'financially',
  'here',
  'high',
  'i',
  'immediately',
  'in',
  'insightful',
  'inspector',
  'is',
  'isn',
  'it',
  'knew',
  'lead',
  'life',
  'line',
  'm',
  'many',
  'me',
  'much',
  'my',
  'of',
  'one',
  'other',
  'pathetic',
  'pettiness',
  'pity',
  'pomp',
  'profession',
  'programs',
  'ran',
  'reality',
  'recalled',
  'remind',
  'repeatedly',
  'right',
  's',
  'sack',
  'same',
  'satire',
  'saw',
  'school',
  'schools',
  'scramble',
  'see',
  'situation',
  'some',
  'student',
  'students',
  'such',
  'survive',
  't',
  'teachers',
  'teaching',
  'than',
  'that',
  'the',
  'their',
  'think',
  'through',
  'time',
  'to',
  'tried',
  'welcome',
  'what',
  'when',
  'which',
  'who',
  'whole',


In [6]:
vocab

['conceptions',
 'gruner',
 'introducing',
 'evil',
 'hughes',
 'galico',
 'taz',
 'toulouse',
 'felini',
 'purchased',
 'devalued',
 'ivresse',
 'euripides',
 'repository',
 'haggis',
 'routemaster',
 'imitating',
 'brakeman',
 'gutteridge',
 'ineffably',
 'comedian',
 'ingredient',
 'ohtsji',
 'foley',
 'kaige',
 'kerby',
 'clit',
 'badjatyas',
 'mikels',
 'haid',
 'moved',
 'intolerable',
 'provided',
 'dessicated',
 'guides',
 'thriving',
 'crammed',
 'ska',
 'acclimation',
 'caterwauling',
 'splainin',
 'ingenuos',
 'lurie',
 'put',
 'douanier',
 'prescient',
 'warming',
 'lukes',
 'synthesized',
 'billiard',
 'neorealism',
 'ponytails',
 'parking',
 'mistuharu',
 'mackichan',
 'stethoscope',
 'downes',
 'eurotrash',
 'myron',
 'unread',
 'retaliated',
 'luby',
 'taylor',
 'oless',
 'atmosphre',
 'mediocrity',
 'minorly',
 'dominated',
 'solomon',
 'contrasted',
 'guantanamera',
 'adherent',
 'strangelove',
 'cheepnis',
 'arteries',
 'finisham',
 'cozies',
 'xvii',
 'enraptured',


In [7]:
word2index

{'conceptions': 0,
 'gruner': 1,
 'introducing': 2,
 'evil': 3,
 'hughes': 4,
 'galico': 5,
 'taz': 6,
 'toulouse': 7,
 'felini': 8,
 'purchased': 9,
 'devalued': 10,
 'ivresse': 11,
 'euripides': 12,
 'repository': 13,
 'haggis': 14,
 'routemaster': 15,
 'imitating': 16,
 'brakeman': 17,
 'gutteridge': 18,
 'ineffably': 19,
 'comedian': 20,
 'ingredient': 21,
 'ohtsji': 22,
 'foley': 23,
 'kaige': 24,
 'kerby': 25,
 'clit': 26,
 'badjatyas': 27,
 'mikels': 28,
 'haid': 29,
 'moved': 30,
 'intolerable': 31,
 'provided': 32,
 'dessicated': 33,
 'guides': 34,
 'thriving': 35,
 'crammed': 36,
 'ska': 37,
 'acclimation': 38,
 'caterwauling': 39,
 'splainin': 40,
 'ingenuos': 41,
 'lurie': 42,
 'put': 43,
 'douanier': 44,
 'prescient': 45,
 'warming': 46,
 'lukes': 47,
 'synthesized': 48,
 'billiard': 49,
 'neorealism': 50,
 'ponytails': 51,
 'parking': 52,
 'mistuharu': 53,
 'mackichan': 54,
 'stethoscope': 55,
 'downes': 56,
 'eurotrash': 57,
 'myron': 58,
 'unread': 59,
 'retaliated': 60

In [8]:
input_dataset

[[67586,
  19458,
  40965,
  1541,
  44039,
  38411,
  13333,
  22045,
  12832,
  15410,
  39474,
  58427,
  55869,
  31298,
  51269,
  27719,
  30282,
  18003,
  59475,
  40538,
  68195,
  16484,
  22631,
  62056,
  34408,
  19561,
  41581,
  57464,
  36473,
  45197,
  10893,
  4769,
  62640,
  69296,
  25782,
  19141,
  32967,
  31437,
  38608,
  25296,
  54995,
  13528,
  69853,
  63711,
  3326,
  60678,
  22281,
  2841,
  35613,
  53023,
  36138,
  18740,
  55098,
  23870,
  62789,
  33609,
  1871,
  70486,
  38240,
  54112,
  55143,
  44908,
  39277,
  22389,
  62849,
  44420,
  56710,
  45452,
  19340,
  58253,
  72602,
  64410,
  66465,
  10660,
  48556,
  33718,
  51128,
  65980,
  33725,
  50111,
  40898,
  35780,
  20434,
  73684,
  55768,
  31193,
  23515,
  58336,
  32229,
  60902,
  18920,
  53228,
  50159],
 [40965,
  73739,
  22547,
  24083,
  73755,
  23068,
  69167,
  14387,
  56374,
  63544,
  13890,
  47173,
  26191,
  16989,
  6749,
  62559,
  26211,
  36963,
  1494

In [9]:
target_dataset

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,


In [18]:
import numpy as np 
np.random.seed(1) 

def sigmoid(x): 
    return 1/(1 + np.exp(-x)) 

alpha, iterations = (0.01, 2) 
hidden_size = 100 
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1 
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1 
correct,total = (0,0) 
for iter in range(iterations): # train on first 24,000 
    for i in range(len(input_dataset)-1000):
        x,y = (input_dataset[i],target_dataset[i]) 
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid 
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2)) # linear + softmax 
        layer_2_delta = layer_2 - y # compare pred with truth 
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #backprop 
        weights_0_1[x] -= layer_1_delta * alpha 
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha 
        if(np.abs(layer_2_delta) < 0.5): 
            correct += 1 
        total += 1 
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset))) 
            sys.stdout.write('\rIter:'+str(iter)+' Progress:'+progress[2:4]  +'.'+progress[4:6]  +'% Training Accuracy:' + str(correct/float(total)) + '%') 
    print() 
correct,total = (0,0) 
for i in range(len(input_dataset)-1000,len(input_dataset)): 
    x = input_dataset[i] 
    y = target_dataset[i] 
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) 
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    if(np.abs(layer_2 - y) < 0.5): 
        correct += 1 
    total += 1 
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:95.99% Training Accuracy:0.831%129220508545%%
Iter:1 Progress:95.99% Training Accuracy:0.8654166666666666%
Test Accuracy:0.849
