In [1]:
# import click
# import logging
# from pathlib import Path
# from dotenv import find_dotenv, load_dotenv
import os
import sys
import pandas as pd
import numpy as np
# import tensorflow as tf
from tensorflow import keras
# from tensorflow.python.keras.callbacks import TensorBoard
# tensorboard --logdir ./ --host=127.0.0.1
# from time import time

# print("Tensorflow version:", tf.__version__)

### Load IMDB dataset 

In [2]:
SEQUENCE_LENGTH = 200
VACAB_FEATURES = 100
VOCAB_SIZE = 60064 # 3575 # 3443

train_tokens_file = '../data/processed/train/reviews_21122_tokens_200_sentiment.npy'  # Check SEQUENCE_LENGTH flag
# Load the file
train_array = np.load(train_tokens_file)

In [3]:
review_text_array = train_array

# Shuffle the array
np.random.shuffle(review_text_array)

# Prepare tokenizer
t = keras.preprocessing.text.Tokenizer()
t.fit_on_texts(review_text_array[0:, 0])
vocab_size = len(t.word_index) + 1
text_seq = t.texts_to_sequences(review_text_array[0:, 0])
print('Vocabulary size : {}'.format(vocab_size))
print('Sequence length : {}'.format(SEQUENCE_LENGTH))

assert(len(t.word_index) + 1 <= VOCAB_SIZE)
assert(max([len(iner_list) for iner_list in text_seq]) == SEQUENCE_LENGTH)

Vocabulary size : 60064
Sequence length : 200


In [4]:
print('IMDB Word dict: ', '==============', sep='\n')
print(t.word_index)

IMDB Word dict: 


### Load GloVe embeddings

In [5]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../data/raw/glove.6B.100d.txt',encoding="utf8")
print('Loading GloVe in to memory ...','='*30, sep='\n' )
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loading GloVe in to memory ...
Loaded 400000 word vectors.


In [6]:
print('Embedding feature lenght : {}'.format(embeddings_index.get('the').shape), '='*33,sep = '\n')
print('Sample embedding for \'the\' :',embeddings_index.get('pro'), sep='\n')

Embedding feature lenght : (100,)
Sample embedding for 'the' :
[-0.23321   0.19875  -0.21014  -0.52006   0.014643  1.0865    0.35926
 -0.53367  -0.25686   0.4887   -0.069825  0.02606   0.55488  -0.4796
 -0.45381   0.39645   0.035548 -1.0047   -0.19075   0.45009   0.65632
  0.09321   0.039597  0.10317   0.29774  -0.25584  -0.38396  -0.75715
  0.99461   0.99504   0.22618   1.1416    0.064549  0.23267   0.78807
 -0.20461  -0.023498  0.61369  -0.70191  -0.29458  -0.60298  -0.22445
  1.0954   -0.46363  -0.31758  -0.89995   0.22988  -0.39046  -0.055351
 -0.52759  -0.33558   0.1659   -0.22073   0.088225  0.50555  -1.6934
  0.52847   0.60238   1.6912    0.34935  -0.77398  -0.53206  -0.59611
 -0.037936  0.22795   0.15998   0.5961    0.046729  0.08823   0.51839
  1.0067   -0.23595  -0.86539   0.037259 -0.86479  -0.34324   0.39889
  0.528    -0.31003  -0.56212   1.2702   -0.56297  -0.14321  -0.34887
 -0.36167   0.010414 -0.18539   0.23723   0.51651  -0.31101   0.1728
  0.37043  -0.80387   0.99567

### Cross match GloVe and IMDB

In [7]:
glove_embedding_dict = dict()
words_not_found_in_glove = []
for count, key in enumerate(t.word_index.keys()):
#     print('Key:', key)
    if key in embeddings_index.keys():
#         print(embeddings_index[key])
        glove_embedding_dict[key]=embeddings_index[key]
    else:
        words_not_found_in_glove.append(key)
        glove_embedding_dict[key] = np.random.uniform(low=-1.0, high=1.0, size=(100,))
        
    
    
#     if count > 1:
#         break

print('New Vocabulary size : ', len(glove_embedding_dict.keys()))
print('Words not found : ', len(words_not_found_in_glove))

New Vocabulary size :  60063
Words not found :  9658


In [8]:
import random
print('Sample words Not-Found in GloVe : ',
      '='*len('Sample words Not-Found in GloVe : '),
      random.sample(words_not_found_in_glove, k=100), sep='\n')

Sample words Not-Found in GloVe : 
['personnaly', 'lespart', 'liasons', 'contraversy', 'unwatch', '08th', 'leway', 'bhopali', 'montossé', 'nickolodeon', 'mentirosos', 'c3p0', 'bloodfeast', 'ferroukhi', 'chosson', 'snoozefest', 'americian', 'iaac', 'prabhats', 'horor', 'kevetch', 'bumpuses', 'misrepresentative', 'cartwrightbride', 'astronaust', 'disant', 'nandjiwarra', 'fyall', 'wrightman', 'darkend', 'redresses', 'tuous', 'rescueman', 'tombes', 'funnybones', 'stinkingly', 'correl', 'menaikkan', 'sebastians', 'wodehousian', 'southerrners', 'laughometer', 'unsynchronised', 'tzc', 'deeeeeep', 'kamerdaschaft', '7ish', 'ahista', 'dilettantish', 'gallactica', 'enormeous', 'beatliest', 'slackly', 'smyrner', 'einstien', 'obcession', 'charachter', 'thied', 'dishonours', 'flawlessness', 'zucovic', 'thuggees', 'picutres', 'glanse', 'enjoythe', 'holdall', 'mondrians', 'valientes', 'technerds', 'motived', 'zues', 'raechel', 'weinbauer', 'críticos', 'amitji', 'navuoo', 'anyhoo', 'twomarlowe', 'meski

In [9]:
msg = 'One Sample input :'
print(msg,'='*len(msg), review_text_array[0:1, 0], sep='\n')

One Sample input :
['lot talk torture days movie good person makes bad decision kindness becomes vulnerable two psychotic women kicks assault know point something wife child somewhere great feelings guilt fear times could acted movie seems somebody joke suppose wake manson murders bit fixation likes two nevertheless would someone make film like appeals except sadism conclusion totally unsatisfying could remedied obvious plot twist oh well another hour half life']


In [10]:
# sample=review_text_array[0:1, 0][0].split()
msg = 'Distribution of Non-GloVe-Words in samples :'
print(msg, '='*len(msg), sep='\n')
samples = review_text_array[0:len(review_text_array), 0]
for sample in samples:
    sample_arr = sample.split()

    sample_odds = [x for x in sample_arr if x in words_not_found_in_glove]
    if len(sample_odds)> 0:
        print(sample_odds)


Distribution of Non-GloVe-Words in samples :
['objectiveness']
['cadavra']
['mustve', 'dimeco']
['profster']
['poiré', 'chazel', 'bujeau', 'poiré']
['smooshed']
['postmodernistic']
['swiching']
['poésy']
['koslo']
['yosimite']
['lightpost']
['hundstage', 'hundstage', 'hundstage', 'nachtgestalten']
['redresses']
['troyepolsky', 'rostotsky']
['budgeter']
['30pm', '30pm']
['picturisations', 'swiztertland']
['crocky', 'crocky', 'crocky', 'speach', 'crocky']
['weihenmeyer', 'tenberken', 'summitting', 'unsubtly']
['odyssée']
['tarrinno']
['yasutake']
['polemize']
['terribleness', 'wertmueller']
['prollific', 'musicly']
['demonous']
['moonlanding']
['homere']
['pelicule', 'luckely']
['cuttingly']
['twoddle']
['ondricek', 'samotari']
['expositionary', 'overbaked']
['decieve']
['19k']
['1100ad']
['bartend']
['overpraise']
['grusomely', 'steveday', 'ropey']
['ericco', 'dismals']
['pianiste']
['dammes', 'actionscenes']
['miswrote', 'misfilmed', 'unredeemably']
['orbitting']
['assult', 'flinstone'

['bonejack']
['coulardeau']
['beullar']
['hogbottom']
['loosly', 'worls', 'worsel']
['everytown']
['dispersement']
['imho']
['bucketfuls']
['hamliton']
['trevethyn']
['emporer']
['chalonte']
['hornophobic', 'hornomania']
['stepmotherhood']
['definetly']
['temperment']
['painfull']
['manfish', 'manfish']
['benecio']
['looneys', 'gazooks', 'looneys']
['zappruder']
['hatefully', 'hatefully']
['mcreedy', 'trymane']
['karmically']
['materializer']
['overexplanation']
['sneedeker', 'hollyweed']
['lillihamer']
['hoffbrauhaus']
['andrewjlau', 'unmysterious']
['astoundlingly']
['klemper', 'klemper', 'klemper', 'gnatpole']
['grannys']
['zerneck', 'sertner']
['rubbishy']
['harlins', 'aciton']
['geeeeeetttttttt', 'itttttttt']
['infantilize']
['tilse']
['ohtherwise', 'lugia', 'lugia', 'deoxys']
['heorine']
['koersk', 'thas']
['fuckwood']
['oedpius', 'omfg']
['1980ies']
['immortel']
['dieing', 'chearator']
['uncalculatedly']
['apparenly', 'hipocracy', 'rienforcation', 'steryotypes', 'dissapointment'

['macliammóir']
['heronimo']
['nyugen', 'nyugens', 'ashame', 'nyugen', 'offsuit']
['darkheart']
['soliti', 'ignoti']
['nietzche']
['thomp']
['kôji', 'tamiyo', 'kusakari', 'dansu', 'tamiyo', 'kusakari', '06th', 'dansu', 'comigo']
['disastor']
['harmann']
['conchatta']
['risibly', 'migenes', 'migenes', 'bowdlerised']
['schintzy']
['solino']
['schlockmeister', 'hollwyood', 'hollywoodize', 'connivers']
['frizzyhead']
['soetman']
['valientes', 'simuladores', 'szifron', 'valientes', 'valientes', 'szifron']
['beatlemaniac']
['medichlorians', 'winterbolt', 'winterbolt']
['hobbitt', 'loveably']
['cource', 'deosnt']
['hahahaha', 'gooooooodddd']
['konkana', 'mumabi']
['buchfellner', 'buchfellner']
['protée']
['macliammóir']
['bugaloo']
['zelah']
['aintry']
['gorehounds']
['shwartzeneger']
['arganauts']
['huêt', 'hensema', 'wagter', 'spoorloos']
['fullmoondirect', 'cheezy']
['ghosties']
['demonicus']
['oftenly']
['hahahah']
['ejames6342']
['archiev']
['dansu']
['elinore']
['yetians', 'macrabe']
['

['headtripping', 'oiks']
['tawnyteel']
['mechenosets', 'mechenosets']
['lartigau']
['spectular', '9as', 'interwhined']
['depalmas']
['scaryt', 'knoks']
['corniest']
['tribilistic', 'dearz', 'meduim']
['toliet', 'electricuted']
['3rds']
['belaney']
['sovjet']
['dessicated']
['baaaaaaaaaaaaaad']
['gonnabe']
['bravi']
['absurdness']
['duchovney', 'duchovney', 'wounderfull', 'recomend']
['willona']
['jansens']
['wowsers', 'wowser', 'gadgetmobile', 'gadgetmobile']
['eggbert', 'dosn']
['fleapit']
['ashame']
['gombell']
['vipco']
['anisio', 'anisio', 'anisio']
['deadful']
['expcept']
['streetfighters']
['meiks']
['rosselinni']
['definatly']
['guetary']
['haurs']
['govida']
['anansa', 'anansa', 'anansa']
['burlesk', 'babified']
['plasticness', 'anyday']
['shawnham', 'snidley']
['polarisdib']
['stargaard']
['quetin', 'taratino', 'quetin', 'phoormola']
['smithonites', 'whedonettes']
['jariwala']
['escapistic']
['oopps']
['watcheable']
['bfgw']
['fightfest']
['hundstage', 'unignorable']
['dialoug

['toooooo']
['minimalistically']
['m4tv', 'boyum']
['serlingesq']
['xica', 'xica']
['ohhhhh', 'rofl']
['irréversible']
['dissabordinate']
['twomarlowe', 'gambleing']
['esoterically']
['alexs']
['unban']
['vaxham']
['svenon']
['bleepesque']
['maratonci', 'trce', 'pocasni', 'spijun', 'otac', 'sluzbenom']
['schombing', 'saboto']
['horseshit']
['draaaaaaaawl']
['muscats', 'milfune', 'kusugi', 'mayedas']
['krocodylus', 'vertido', 'malecio', 'amayao']
['colagrande']
['critised']
['frownland', 'distributer']
['ossessione', 'ossessione', 'ossessione', 'electrifyingly', 'ossessione', 'ossessione']
['beffe']
['ewaste', '480m', 'haoren', 'baichwal', 'pencier']
['physcological', 'bodycount', 'icant', 'malefique']
['amusedly']
['unamusing']
['isoyc', 'ipoyg', 'isoyc', 'ipoyg', 'isoyc', 'ipoyg', 'zarchi']
['unentertaining', 'unsuspensful']
['orignal']
['coholic']
['thesinger']
['parinda', 'parinda', 'parinda']
['mattter', 'btas', 'twoface']
['goebels', 'goebels']
['contradictive']
['discplines']
['r

['crispies']
['hadnt']
['entertaingly', 'waaaaaaaaaaay', 'becuz']
['nunchuks']
['farcial']
['cloudkicker']
['outlooking']
['crummier']
['19thc']
['hellbreeder']
['possesor']
['tarentino', 'tarentino']
['nativetex4u']
['diry']
['videothek', 'hollandish', 'undertitles', 'horrormovies', 'trelkovski', 'weired', 'choule', 'halluzinations', 'shizophrenic']
['unambitiously']
['splatterfest', 'filmaker', 'dahmers']
['tarded']
['unisols', 'unisols', 'unisol', 'unisols', 'unisol', 'unisol', 'forgetaboutit']
['carlas', 'emmanuell']
['natassia', 'malthe']
['wips']
['kacia', 'dibler']
['burkley']
['romanticising', 'cringeworthy']
['hardworker', 'dialouge']
['suspenser', 'menephta']
['ladened', 'mochcinno']
['genorisity']
['hoechlin']
['stepmotherhood']
['aniversy', 'poppens', 'aniversy']
['morolla']
['historicaly']
['consigliori', 'campily']
['wowzers']
['podges']
['wantabedde']
['jedna', 'netlaska']
['shoveller']
['branaugh', 'branaughs']
['anyhoo']
['impertubable']
['jusenkkyo', 'kodachi', 'happo

['tk427']
['michlle', 'favourtie']
['uneffective']
['colagrande']
['koyla', 'koyla', 'haara', 'challiya']
['zappati', 'zappati', 'alselmo']
['completist']
['ring2', 'jaysun']
['gailard']
['trannies']
['tt0059080', 'gammera']
['unkiddy']
['gr88']
['raimy']
['christmass', 'dateing', 'dinosuar']
['pumkinhead', 'mansquito']
['yutte', 'stensgaard']
['tromaville']
['zealnd']
['hhaha']
['simpathetic', 'clownified']
['toped']
['mainstrain']
['ibéria']
['fictively']
['hornophobia', 'rychard', 'pzazz']
['bliep', 'pasé']
['marolla']
['cahulawassee']
['miscarrage', 'miscarrage', 'busness', 'disapears', 'playgroud', 'traped']
['poiré', 'chazel', 'bujeau', 'poiré']
['dissappointed']
['futureistic']
['poice']
['mechanik', 'puposelessly']
['gurdebeke']
['definetely']
['necroborgs', 'splattery']
['horibble']
['attilla']
['unmystied']
['kinnepolis']
['traditionaled']
['corkymeter']
['pulasky']
['brigante']
['charlia']
['catogoricaly', 'supposibly', 'indendoes', 'aplogise']
['sierck', 'hadly', 'holmann']

['00001']
['spellbounding']
['nutzo']
['blanzee']
['indefensibly']
['oragami']
['ebing']
['letch']
['rissole', 'victoriain']
['brooksophile', 'undistilled']
['unassured']
['calomari', 'sullesteian']
['churlishly', 'mejding']
['330am', 'umiak', 'skers', 'geograpically']
['abyssmal']
['baichwal']
['salutory', 'accessability']
['screenin']
['inversed', 'inversed']
['bashki', 'bashki']
['eventless']
['familys', 'definetly', 'recomend', 'appelonia']
['masacism']
['kapture']
['univeral', 'sudser', 'portrayl']
['assery']
['magestic', 'correl']
['mclaghlan']
['part1']
['gitwisters']
['didactically']
['imho']
['discustingly', 'masterbates']
['matkondar', 'bhajpai']
['zzzzzzzzzzzzz']
['showerman']
['plaggy']
['codependence']
['plonked']
['dinocrap', 'ahahahahahhahahahahahahahahahhahahahahahahah', 'homoeric', 'dinocrap']
['h3ll', 'daym']
['plumpish', 'worlde']
['laemlee']
['fillum']
['mustan']
['corsaut']
['cringeworthy', 'vaut']
['ursla']
['intriquing']
['unibomber']
['kerching', 'rudeboy']
['jo

['lipsync', 'carrys']
['zombiez']
['everytown', 'everytown', 'everytown']
['pffeifer']
['maléfique']
['enviormentally']
['dudettes']
['uncinematic']
['jist']
['wardh', 'whotta']
['suwkowa']
['decivilization']
['eeeeeeeek']
['okona']
['creepies', 'macgavin']
['unacted', 'jox', 'beauticin']
['kicha', 'acin']
['gaslit']
['crappiest']
['huzoor']
['roedel', 'geurilla', 'roedel', 'roedel', 'roedel']
['cadilac']
['szifrón', 'valientes', 'szifron']
['cardiotoxic', 'dermatonecrotic']
['kopins']
['riiiight']
['allmighty']
['konkana']
['1hour', 'chracter', 'algrant']
['roadwarrior', 'venantino', 'venantini', 'dardano', 'venantini', 'viventi']
['iffr']
['everyones']
['watsoever']
['verbosely', 'zabalza', 'unatmospherically']
['blowjobs']
['dmd2222']
['armaggeddon']
['swordfights']
['guillespe', 'guillespe']
['dandys']
['ekeing']
['schüte', 'schüte']
['engletine']
['consigliare']
['gruntled']
['screweyes']
['kaabee', 'shoufukutei', 'tsurube', 'wittiness', 'yuunagi']
['goosebump']
['clicheish', 'ram

['jox', 'midsts']
['reaally']
['putain', 'amoureuses', 'véronika', 'pornographe']
['cack']
['karva', 'friers', 'karva']
['ironists', 'dvdtalk', '3199']
['bromidic', 'nercessian', 'ferroukhi']
['89or']
['rossitto']
['reportary']
['ob101']
['payaso', 'plinplin']
['frownland', 'frownland']
['hideos']
['pyrokinetics', 'unbeknownest']
['throuout']
['offon', 'offon', 'imho']
['coccio', 'werent']
['soninha', 'waldomiro', 'soninha']
['kapture']
['desparte']
['lunohod', 'lunohod', '1tv']
['convolute']
['blathered']
['smuttishness']
['innaccurate']
['pissible', 'ciggy', 'upchucking', 'garloupis']
['everlovin', 'lemondrop']
['deforrest', 'lickerish']
['lazerov']
['kaddiddlehopper']
['azjazz']
['kirckland']
['prate']
['despict']
['trekkish', 'technerds']
['unflaunting']
['allsuperb', 'admarible', 'cinemaphotography', 'juvenille']
['rutkay', 'actings']
['waterdance', 'sisabled']
['definetely']
['uped']
['kabbalism']
['noltie', 'wimpiest']
['stensvold']
['animes', 'acedmy']
['frakkin']
['bi1']
['flu

['acomplication']
['nickolodeon']
['lanquage']
['grinchy']
['dipsh']
['lazerov', 'lazerov', 'lazerov']
['lv2', 'lv2', 'lv2', 'lv1', 'lv1', 'lv1']
['tatsuhito', 'shihito', 'tatsuhito', 'tatsuhito', 'tatsuhito', 'tatsuhito', 'tatsuhito', 'tatsuhito', 'tatsuhito', 'polarisdib']
['wounder']
['appollonia', 'kotero', 'sexshooter']
['cheorgraphed']
['axellent']
['morolla']
['hateable', 'unloveable']
['subotsky', 'pavillions']
['gymkata', 'gymakta']
['fwwm']
['daraar', 'daraar']
['gwizdo', 'gwizdo', 'gwizdo', 'gwizdo']
['malefique', 'malefique']
['puertorican', 'leguzaimo', 'puertorican']
['mingozzi', 'gorehounds', 'bolkin']
['becase']
['metalstorm']
['pedofile']
['stoumen']
['mrudul']
['unscary', 'crododile']
['spaghettis']
['ropey']
['dumann', 'dumann']
['gnashingly']
['semetary']
['hoast', 'hoast']
['soid']
['unentertaining', 'suckingly', 'morisette']
['communistophobia']
['vulvas', 'schlongs']
['dumbland', 'dumbland']
['synopsize', 'dorkish']
['monstervision', 'bargin']
['entwisle']
['acha

['distributers', 'reguritated', 'traumatising', 'innappropriate', 'igniminiously']
['freakiest', 'shits', 'loooong']
['rentalrack']
['bogdonavich', 'bogdonavich', 'bogdonavich']
['pergado']
['perfomances']
['hulchul', 'paagal']
['desica']
['foxs']
['kvell']
['aquawhite']
['kennyhotz']
['convida', 'dançar']
['sofcore']
['jerkwad', 'snottiness']
['franclisco']
['jacquouille', 'poiré', 'frenegonde', 'poiré']
['chirila']
['ingor']
['ariauna', 'ariauna', 'ariauna', 'nough']
['footmats', 'foresay', 'boulange']
['hautefeuille']
['pransky', 'strombel']
['superceeds']
['bombastically', 'counterpointing']
['trivilized']
['meercat']
['definetly']
['gratituous']
['masiela', 'masiela']
['atually']
['steenky']
['mockney']
['oldish', 'dobermann', 'dobermann']
['guetary', 'musn']
['sluttish']
['mfer', 'dessicated', 'poseiden']
['halmark', 'halmark']
['nosbusch', 'nosbusch']
['leonidus', '10yr', 'kapoors']
['operish', 'caucasions']
['caprican']
['terrorises']
['happing']
['dwelves', 'konkana', 'konkana

['pantalino', 'pantalino']
['megessey']
['oppinion']
['suares']
['jox']
['wwwaaaaayyyyy', 'crissakes']
['ahah']
['tybor']
['fidani', 'fidani', 'ingenuos', 'demential', 'comportaments', 'fugace']
['hassie']
['christianty']
['libertini']
['campest', 'cringeworthy']
['thoes', 'hymilayan']
['averback']
['larryjoe76']
['denigh']
['sacrine']
['awstruck']
['seagals', 'seagals', 'aikidoist']
['charasmatic', 'redeaming']
['cavernously', 'f117']
['purile', 'adle', 'moovie', 'moovies', 'moocow']
['othewise']
['woooooosaaaaaah', 'twentyfive']
['consumingly', 'dorcey']
['ahista', 'ahista']
['offworlders']
['vanhook']
['waalkes']
['3lbs', 'neurlogical']
['unformulaic']
['veiwing']
['daneliuc', 'professionist']
['cheadles']
['deprave']
['hundstage', 'hundstage']
['fastward', 'johntopping', '20perr', '20widow']
['omirus', 'iliada', 'odysseia', 'omirus', 'omirus']
['dorkiest']
['prigs']
['klaveno', 'klaveno']
['sebastiaans']
['shoudln', 'recomend']
['memoral']
['ummmph', 'ummmph', 'twasn']
['soutendijk

['enrapt']
['trods']
['magsel']
['puffinstuff', 'witchie', 'lidsville', 'puffinstuff', 'puffinstuff']
['orwelll']
['vanhook']
['screecher']
['taly']
['bastardised']
['midgetorgy']
['rmb4']
['fallafel']
['tchiness']
['cussword']
['spoily']
['aditiya']
['cheoreography', 'vaibhavi', 'haara', 'hinglish', 'maare']
['disreguarded']
['discombobulation']
['zardkuh', 'irankian']
['stupifyingly', '12383499143743701']
['brethern', 'chrissakes']
['grinchmas']
['roobi']
['genuises']
['benis']
['brewskies']
['muckerji', 'melandez', 'muckerji']
['fastforwarding', 'gangmembers']
['maclaglen']
['jamrom4']
['characatures']
['tritely']
['coercible']
['rusell']
['pusses', 'ewwww', 'robitussen']
['waisting']
['grewing']
['gueule']
['fratboy']
['danelia', 'danelia']
['sloppish']
['ragdolls']
['duologue']
['abkani', 'carnby', 'abkani', 'cedrac']
['comedygenre', 'honkong']
['schlockmeister']
['leway']
['aquires']
['miriad', 'offcourse', 'whih', 'itelf']
['s1m0ne']
['supersentimentality']
['livington']
['execr

['thalmus', 'rasulala', 'supertank']
['gingerman', 'vanning', 'boobage']
['quida']
['sleazes']
['cheekboned']
['corniest', 'ewashen']
['chauffers']
['rosza']
['necroborg', 'scums', 'necroborgs', 'necroborg']
['guietary', 'gershwyn']
['apeing']
['thoes', 'inescapeable']
['cineastic', 'undestand', 'homour']
['siriaque', 'umbopa']
['splatterfest']
['braselle', 'eschelons', 'panged', 'braselle']
['guptil']
['doophus']
['diazes', 'mendezes']
['tushies', 'tushies']
['salkow', 'salkow', 'salkow']
['nandjiwarra', 'amagula', 'mulkurul', 'countlessly']
['incovenient']
['timetraveling']
['marylee', 'aventurera']
['definatly']
['egdy']
['unbelivebly', 'catylast', 'oppurunity']
['antecedently', 'chetas']
['mulleted']
['brosan']
['aranoa', 'aranoa', 'aranoa']
['hokeyness']
['xplosiv']
['subtlties']
['spacecamp', 'spacecamp', 'spacecamp']
['ladiesman']
['talkiest']
['hysterion', 'afest', 'wetters', 'yawneroony']
['frakken']
['tupinambas', 'frenches', 'tupiniquins', 'arduíno', 'colassanti', 'tupinambá

['deteste', 'dillemma', 'basiclly', 'harvery']
['merr']
['elsehere']
['megahy']
['durokov', 'cameraderie']
['heero']
['rumblefish']
['hokiest', 'unmistakeably', 'severeid']
['chrecter', 'jumpedtheshark', 'scrappys', 'scoobys', 'intriguded']
['symbolisms', 'pandemoniums']
['austrailian', 'baffeling']
['cbtl', 'cbtl']
['tgmb', 'criminey', 'groovay', 'pornostalgia']
['javo']
['pakeezah']
['shoenumber']
['theieves']
['sexploits']
['sevalas']
['splatterish']
['60ties', 'flowes', 'slowely']
['yummm']
['lilleheie']
['tlog']
['denemark']
['columbos']
['topness']
['dukesofhazzard']
['semisubmerged']
['preumably', 'jusassic']
['poisen', 'airphone', 'sporks']
['dillute']
['dond']
['lockstock']
['obtrusively']
['woodmobile', 'flakiest']
['fillums', 'summersisle']
['becall']
['foxhunt']
['cinepoem', 'polarisdib']
['defintly']
['himmelen']
['aaaaaaah']
['nuddie']
['unziker', 'antevleva', 'palassio', 'giusstissia']
['emergance']
['magnavision']
['civl']
['idiotize']
['talliban', 'torenstra', 'hilbran

['pianiste', 'pianiste']
['mundae', 'mundae']
['bigv']
['psilcybe', 'everybodys']
['undeservingly']
['scalese']
['marylee', 'marylee']
['pathedic']
['expresssions']
['reluctantpopstar']
['obcession', 'obcession', 'obcession', 'espectator', 'thourough', 'excelent', 'obcession']
['crappest']
['imho']
['farcelike', 'huggie']
['adgth']
['varshi']
['sumpthin']
['phainomena', 'chainguns', 'ambidexterous', 'clyve', 'excelent']
['unfortuntly', 'naieve', 'fowarded']
['hitchcok']
['deffinately']
['molie']
['bussinessmen']
['zomcom']
['paxtons']
['brotherwood', 'decaunes', 'artagnan']
['suneil']
['heatbeats']
['lettich']
['gailard']
['ftagn', 'sototh']
['paperhouse']
['mankinds', 'madnes', 'appologise']
['pleasently']
['merendino']
['aprox']
['dudettes']
['batarda']
['relevation']
['kiddifying']
['mouthings']
['maricarmen', 'ferpecto', 'hongos']
['shakesspeare']
['apidistra']
['nuteral', 'delivere', 'roofthooft', 'upcomming', 'voogdt']
['100min', '30mins']
['6hours', '1h40', '6hours', '1h40', '10

['dukey']
['macmahone']
['kazakos', 'papamoschou', 'clytemnastrae', 'kazakos', 'papamoschou']
['0079', 'newtypes', 'izuruha', '0080', '0083', '0080']
['colagrande']
['mostey']
['aborigone', 'aboriginies']
['loooong']
['coer', 'misinforms']
['boringlane']
['jhene', 'lastewka', 'lastewka', 'jhene']
['luckly', 'encyclopidie']
['filmde']
['frenchfilm']
['schnaas']
['unfunniest', 'avoide', 'avoide']
['spt11']
['escreve']
['highen']
['romcom']
['postlewaite']
['gorefests']
['farrells', 'charictor']
['stirba']
['tbu', 'tbu']
['uder']
['fastforward', 'wheedon', 'finese']
['stynwyck', 'stynwyck', 'doozie']
['hammeresses', 'chappu']
['fartsy', 'bunuels', 'daneliucs', 'nicolaescus', 'saizescus', 'muresans', 'marinescus', 'margineanus', 'terribilisms']
['discribe', 'misirable', 'alredy']
['wanking']
['exce', 'deathline', 'deathline', 'wienstein']
['gozilla', 'toly']
['kopsa']
['clichè', 'clichès']
['nuyen']
['murpy']
['mcmurty']
['brokovich']
['giaconda', 'eroticize']
['definitley']
['mockinbird']

['yaara', 'yaara', 'yaara', 'yaara', 'yaara', 'yaara', 'yaara']
['weidstraughan']
['00am', 'honkytonks', 'satnitefever']
['classiness']
['afterstory']
['serges']
['pfeh', 'cluemaster', 'fastardization']
['hatian']
['extreamly']
['howz', 'sumthin']
['jaregard']
['carnosaurs', 'ungoriest']
['homicidally']
['lenghth']
['renying']
['argeninean']
['eraticate']
['recomeçar']
['hindersome', 'clomps', 'gariazzo']
['mochanian']
['crimedies']
['duologue']
['leguizano']
['abominator', 'evilmaker']
['cheesey', 'cheesey']
['gadg', 'gadg']
['muchly', 'imom', 'imom']
['qissi', 'meanacing']
['strauli', 'saire', 'ryecart']
['pornos']
['seens']
['highjinx']
['velankar', 'sadahiv', 'amrapurkar', 'nihlan', 'amrapurkar', 'amrapurkars', 'amrapurkar']
['reah', 'cicatillo']
['bianlian', 'bianlian', 'bianlian']
['blubbered']
['victis']
['miiko', 'yochobel']
['ariauna', 'ariauna']
['gokbakar', 'gokbakar', 'rutkay', 'gokbakar']
['duskfall']
['ah56a']
['boners']
['dysantry']
['1mln']
['herendous']
['genious', 'kn

['taime', 'thirbly', 'thirbly', 'nataile', 'taime', 'taime']
['experiental', 'manqué']
['crashingly']
['rustlings', 'overtops']
['athsma']
['imdbs']
['conelly']
['absolutly', 'phenominal', 'ocar']
['phedon']
['fanfictions', 'blubbers']
['disembowelments', 'unratedx', 'reallllllllly']
['caligary', 'superimpositions']
['pimpy']
['resolutive', 'botcher']
['pg13']
['decerebrate']
['simonsons', 'donnovan', 'desposal']
['heikkilä']
['mayleses']
['awkrawrd']
['okona']
['franticly', 'drifty']
['knuckleface']
['terrfic', 'slightyly']
['paxinou', 'paxinou', 'muckerji', 'conteras', 'auie']
['rebelious', 'nikelodean']
['1and', 'of5']
['auscrit', 'auscrit', 'sommersault', 'auscrit']
['homesetting', 'froing']
['bibbidi', 'bobbidi']
['dialoques', 'phonebooth']
['illogicalities']
['mousiness', 'marylee']
['rewinder']
['kumer', 'ragpal']
['mordrid', 'mordrid']
['formulative', 'exiter']
['pseud']
['rehumanization', 'demoninators']
['movieclips']
['fraggles', 'doozers', 'gorgs', 'fraggles', 'mokey', 'boo

['tessari', 'tessari', 'scenarists', 'gandus', 'verucci', 'actioneer', 'tessari', 'tessari']
['shitters', 'overgeneralizing']
['delancie']
['chockful']
['enprisoned', 'desides', 'momoselle', 'sades', 'hireing']
['joies']
['coslow', 'marahuana']
['mvc2', 'ahahahahahaaaaa']
['keneth', 'madoona']
['98minutes']
['supertank', 'hardbitten', 'fluegel', 'fluegel']
['manged', 'absoluter']
['suckiness']
['rotflmao', 'directer']
['completest']
['aimants']
['unscience', 'chemystry', 'extincted']
['tentatives']
['mofu']
['vandebrouck']
['vaudevillesque']
['undistinguishable']
['coverbox', 'videostore']
['hrshitta', 'shayaris', 'bhatts']
['padayappa', 'modail', 'nallae']
['brookmyre', 'parlablane']
['alittle', 'cthd']
['collete']
['shyt']
['comlex']
['grifting']
['todean']
['unentertaining', 'uninstructive']
['darkwolf', 'apalling']
['simialr']
['brettschnieder']
['bejebees']
['burnstyn', 'burnstyn']
['sequituurs']
['holdall']
['somnambulistic']
['hinglish', 'movieee']
['liquer']
['sketchlike', 'una

### Create an Embedding layer matrix

In [11]:

print('VACAB_FEATURES ', VACAB_FEATURES)  # 100
print('VOCAB_SIZE ', VOCAB_SIZE)  # 60064 # 3575 # 3443


VOCAB_SIZE = len(glove_embedding_dict.keys()) + 1 # 1 extra for zero sequence value
imdb_glove_embedding_matrix = np.zeros((VOCAB_SIZE, VACAB_FEATURES))

for index, (word, vector) in enumerate( glove_embedding_dict.items()):
    # print(vector)    
    # break
    if vector is not None:
        imdb_glove_embedding_matrix[index] = vector

print('Embedding matrix shape', imdb_glove_embedding_matrix.shape)

VACAB_FEATURES  100
VOCAB_SIZE  60064
Embedding matrix shape (60064, 100)


### Prepare input

In [12]:
msg = 'Sample Input before padding: '
print(msg, '='*len(msg), sep='\n' )
for s in text_seq[0:3]:
    print(s) 

Sample Input before padding: 
[67, 640, 1532, 359, 2, 6, 288, 69, 14, 2385, 8625, 399, 4771, 44, 3560, 247, 3179, 5168, 43, 132, 54, 249, 342, 995, 15, 1358, 3180, 1087, 95, 24, 637, 2, 102, 1449, 721, 1152, 3362, 8626, 1514, 122, 15402, 1023, 44, 2062, 9, 168, 25, 3, 5, 5092, 400, 11145, 1009, 327, 6086, 24, 36781, 487, 30, 850, 299, 13, 74, 379, 204, 39]
[341, 3089, 808, 4, 3207, 19, 133, 5659, 26, 171, 21727, 1480, 163, 8, 645, 6362, 8627, 163, 108, 37, 19, 120, 263, 8, 14466, 1087, 2294, 17, 5, 199, 2, 24, 2, 14467, 220, 559, 2767, 1642, 411, 2, 62, 921, 1940, 6805, 184, 793, 168, 8, 605, 103, 2793, 93, 2, 1318, 25, 2, 69, 98, 26, 69, 98, 1678, 1827, 174, 93, 2, 1019, 944, 2767, 1642, 93, 425, 1533, 425, 241, 60, 359, 57, 474, 462, 359, 1505, 14468, 1341, 1658, 3399, 359, 19, 1534, 26, 5169, 411, 228, 1534, 6, 3035, 2, 1079, 62, 4, 7122, 3811, 2, 364, 3400, 225, 148, 2]
[259, 498, 1028, 851, 2, 949, 1207, 2052, 9504, 1354, 13, 6633, 5992, 1, 1, 3, 564, 630, 774, 36782, 509, 9505, 5

In [13]:
# Padding Sequences
input_seq_padded = keras.preprocessing.sequence.pad_sequences(text_seq, maxlen=SEQUENCE_LENGTH)
msg = 'Sample Input After padding: '
print(msg, '='*len(msg), sep='\n' )
[print(s) for s in input_seq_padded[0:2]]
print('Input shape :', input_seq_padded.shape)

Sample Input After padding: 
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0    67   640  1532   359     2     6   288    69    14  2385
  8625   399  4771    44  3560   247  3179  5168    43   132    54   249
   342   995    15  13

In [14]:
# Input data type conversion
input_seq_casted = np.asarray(input_seq_padded, dtype=np.float)

# Output data type conversion
output = np.asarray(review_text_array[0:, 2], dtype=np.float)
# As the scores are in 1-10 we need to transform to 0-9 for categorical
# output -= 1
output_y = keras.utils.to_categorical(output, num_classes=2)

train_x = input_seq_casted
train_y = output_y

print('train_x shape ', train_x.shape)
print('train_y shape ', train_y.shape)


train_x shape  (21122, 200)
train_y shape  (21122, 2)


In [15]:
# 'VOCAB_SIZE must match the weight matrix dymention'
assert(imdb_glove_embedding_matrix.shape[0]==VOCAB_SIZE)
# 'VACAB_FEATURES must match the weight matrix dymention'
assert(imdb_glove_embedding_matrix.shape[1]==VACAB_FEATURES)

In [16]:
deep_inputs = keras.layers.Input(shape=(SEQUENCE_LENGTH, ))
embedding = keras.layers.Embedding(VOCAB_SIZE, VACAB_FEATURES, weights=[imdb_glove_embedding_matrix] , input_length=SEQUENCE_LENGTH, trainable=True)(deep_inputs)

In [17]:
dropout = keras.layers.Dropout(0.2)(embedding)
lstm_1 = keras.layers.LSTM(units=100, return_sequences=True)(dropout)  # batch_input_shape=[None, SEQUENCE_LENGTH, VACAB_FEATURES]
lstm_2 = keras.layers.LSTM(units=100, return_sequences=False)(lstm_1)  # batch_input_shape=[None, SEQUENCE_LENGTH, VACAB_FEATURES],
final_dense = keras.layers.Dense(2, activation='softmax')(lstm_2)
deep_model = keras.Model(inputs=deep_inputs, outputs=final_dense)
print(deep_model.summary())

# tensorboard = TensorBoard(log_dir='logs/{}'.format(time()))
deep_model.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

history = deep_model.fit(train_x, train_y, batch_size=100, 
                         epochs=9, 
                         validation_split=.2,
                         ) # callbacks=[tensorboard]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          6006400   
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 100)          80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202       
Total params: 6,167,402
Trainable params: 6,167,402
Non-trainable params: 0
_________________________________________________________________
