# Language Model Data Parsing
This notebook does some of the heavy lifting needed to transform our source data into something we can use for a language model. Since Amazon's review data for movies and TV is too big to keep in memory, we write everything to disk and then read it back on demand when we train our model later on.

We also create custom word vectors using the gensim library, so that the embeddings are tuned to our domain and every important vocabulary word has an associated word vector.

### Tokenize and Save Amazon and IMDB Reviews to Disk

In [None]:
import ast
import re, json, os

import numpy as np
import pandas as pd

In [58]:
def tokenize_review(review_text):
    """Split raw review text into lower-cased word and punctuation tokens.

    Characters other than ASCII letters, digits, spaces and ``. ? ! ' , - /``
    are dropped. Each of ``. , - / ! ?`` becomes its own token, a trailing
    ``'s`` is split off the word it follows, and the result is wrapped in
    ``<sos>`` / ``<eos>`` markers.

    Args:
        review_text (str): Raw review text.

    Returns:
        list[str]: Non-empty tokens, starting with ``'<sos>'`` and ending
        with ``'<eos>'``.
    """
    # Turn newlines/tabs into spaces *before* filtering: the character filter
    # below would otherwise delete them and fuse neighbouring words together.
    review_text = re.sub(r'\s+', ' ', review_text)
    # Inside the character class, ",-/" is a range covering , - . and /
    review_text = re.sub('[^A-Za-z0-9.?!\',-/ ]+', '', review_text)
    # Lower-case before splitting so that "'S" is treated exactly like "'s".
    review_text = review_text.lower()
    for mark in ('.', ',', '-', '/', '!', '?', '\'s'):
        review_text = review_text.replace(mark, ' ' + mark + ' ')
    tokens = ['<sos>'] + review_text.split(' ') + ['<eos>']
    return [token for token in tokens if len(token) > 0]

In [59]:
# Stream the raw Amazon reviews one line at a time (the file is too large to
# keep in memory) and write each review back out with token lists added.
# The output is opened with 'w' so that re-running this cell rewrites the file
# instead of appending a second copy of every review.
with open('data/amazon/Movies_and_TV_5.json', 'r', encoding='utf-8') as in_file:
    with open('data/amazon/movie_review_tokenized.json', 'w') as out_file:
        completed_reviews = 0
        for line in in_file:
            # Each input line is parsed as a Python literal (a dict per review).
            review = ast.literal_eval(line)
            review['reviewTextTokenized'] = tokenize_review(review.get('reviewText', ''))
            review['summaryTokenized'] = tokenize_review(review.get('summary', ''))
            # One JSON object per line.
            json.dump(review, out_file)
            out_file.write('\n')
            completed_reviews += 1
            if completed_reviews % 10000 == 0:
                print('Completed: {}'.format(completed_reviews))

Completed: 10000
Completed: 20000
Completed: 30000
Completed: 40000
Completed: 50000
Completed: 60000
Completed: 70000
Completed: 80000
Completed: 90000
Completed: 100000
Completed: 110000
Completed: 120000
Completed: 130000
Completed: 140000
Completed: 150000
Completed: 160000
Completed: 170000
Completed: 180000
Completed: 190000
Completed: 200000
Completed: 210000
Completed: 220000
Completed: 230000
Completed: 240000
Completed: 250000
Completed: 260000
Completed: 270000
Completed: 280000
Completed: 290000
Completed: 300000
Completed: 310000
Completed: 320000
Completed: 330000
Completed: 340000
Completed: 350000
Completed: 360000
Completed: 370000
Completed: 380000
Completed: 390000
Completed: 400000
Completed: 410000
Completed: 420000
Completed: 430000
Completed: 440000
Completed: 450000
Completed: 460000
Completed: 470000
Completed: 480000
Completed: 490000
Completed: 500000
Completed: 510000
Completed: 520000
Completed: 530000
Completed: 540000
Completed: 550000
Completed: 560000
C

In [17]:
# Root folders of the IMDB train and test splits, relative to the working directory.
TRAIN_DATA_FOLDER = 'data/aclImdb/train/'
TEST_DATA_FOLDER = 'data/aclImdb/test/'

In [21]:
def create_dataframe_from_files(data_folder, random_state=None):
    """Load the labelled reviews under ``data_folder`` into a shuffled DataFrame.

    Reads every file found in the ``pos`` and ``neg`` sub-folders of
    ``data_folder`` as UTF-8 text.

    Args:
        data_folder (str): Folder containing ``pos`` and ``neg`` sub-folders.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. Defaults to ``None`` (unseeded).

    Returns:
        pandas.DataFrame: Columns ``text`` (file contents), ``file`` (file
        name) and ``target`` (1 for ``pos``, 0 for ``neg``); rows are shuffled
        and re-indexed from 0.
    """
    examples = []
    for label_dir, target in (('pos', 1), ('neg', 0)):
        folder = os.path.join(data_folder, label_dir)
        for file_name in os.listdir(folder):
            # The context manager closes each file as soon as it has been read.
            with open(os.path.join(folder, file_name), 'r', encoding='utf-8') as review_file:
                examples.append((review_file.read(), file_name, target))
    df_tmp = pd.DataFrame(examples, columns=['text', 'file', 'target'])
    return df_tmp.sample(frac=1, random_state=random_state).reset_index(drop=True)
                
# Build the labelled train and test frames, then report their dimensions.
df_train, df_test = [
    create_dataframe_from_files(folder)
    for folder in (TRAIN_DATA_FOLDER, TEST_DATA_FOLDER)
]

print(df_train.shape, df_test.shape, sep='\n')

(25000, 3)
(25000, 3)


In [22]:
# Load the unlabelled ("unsup") training reviews into a shuffled, one-column frame.
unsup_folder = os.path.join(TRAIN_DATA_FOLDER, 'unsup')
examples = list()

for f in os.listdir(unsup_folder):
    # The context manager closes each file as soon as it has been read.
    with open(os.path.join(unsup_folder, f), 'r', encoding='utf-8') as _tmp:
        examples.append(_tmp.read())
df_unsup = pd.DataFrame(examples, columns=['text'])
df_unsup = df_unsup.sample(frac=1)
df_unsup = df_unsup.reset_index(drop=True)

In [23]:
# Row/column count of the unlabelled-review frame.
df_unsup.shape

(50000, 1)

In [24]:
# Pool the review text of the train, test and unlabelled frames, in that order.
imdb_text_data = [
    text
    for frame in (df_train, df_test, df_unsup)
    for text in frame['text'].tolist()
]

In [25]:
# Total number of IMDB review texts collected.
len(imdb_text_data)

100000

In [60]:
# Tokenise every IMDB review and write one JSON object per line.
# The file is opened with 'w' so that re-running this cell rewrites it instead
# of appending a second copy of every review.
with open('data/aclImdb/movie_text_tokenized.json', 'w') as out_file:
    completed_reviews = 0
    for line in imdb_text_data:
        review = {'reviewText': line}
        review['reviewTextTokenized'] = tokenize_review(review.get('reviewText', ''))
        json.dump(review, out_file)
        out_file.write('\n')
        completed_reviews += 1
        if completed_reviews % 10000 == 0:
            print('Completed: {}'.format(completed_reviews))

Completed: 10000
Completed: 20000
Completed: 30000
Completed: 40000
Completed: 50000
Completed: 60000
Completed: 70000
Completed: 80000
Completed: 90000
Completed: 100000


In [61]:
# Start the combined corpus file with the Amazon data: each review contributes
# two lines, its tokenised body and its tokenised summary.
# The input was written with json.dump, so it is parsed with json.loads.
# 'w' starts the corpus from scratch; the IMDB cell that follows appends to
# this file, so re-run it after re-running this cell.
with open('data/amazon/movie_review_tokenized.json', 'r') as in_file:
    with open('data/all_data.json', 'w') as out_file:
        completed_reviews = 0
        for line in in_file:
            review = json.loads(line)
            for key in ('reviewTextTokenized', 'summaryTokenized'):
                json.dump({'text': review.get(key, '')}, out_file)
                out_file.write('\n')
            completed_reviews += 1
            if completed_reviews % 10000 == 0:
                print('Completed: {}'.format(completed_reviews))

Completed: 10000
Completed: 20000
Completed: 30000
Completed: 40000
Completed: 50000
Completed: 60000
Completed: 70000
Completed: 80000
Completed: 90000
Completed: 100000
Completed: 110000
Completed: 120000
Completed: 130000
Completed: 140000
Completed: 150000
Completed: 160000
Completed: 170000
Completed: 180000
Completed: 190000
Completed: 200000
Completed: 210000
Completed: 220000
Completed: 230000
Completed: 240000
Completed: 250000
Completed: 260000
Completed: 270000
Completed: 280000
Completed: 290000
Completed: 300000
Completed: 310000
Completed: 320000
Completed: 330000
Completed: 340000
Completed: 350000
Completed: 360000
Completed: 370000
Completed: 380000
Completed: 390000
Completed: 400000
Completed: 410000
Completed: 420000
Completed: 430000
Completed: 440000
Completed: 450000
Completed: 460000
Completed: 470000
Completed: 480000
Completed: 490000
Completed: 500000
Completed: 510000
Completed: 520000
Completed: 530000
Completed: 540000
Completed: 550000
Completed: 560000
C

In [62]:
# Add the tokenised IMDB reviews to the combined corpus file, one line each.
# The input was written with json.dump, so it is parsed with json.loads.
# Append mode is intentional: this cell adds to the file already holding the
# Amazon data, so running it twice in a row duplicates the IMDB lines.
with open('data/aclImdb/movie_text_tokenized.json', 'r') as in_file:
    with open('data/all_data.json', 'a') as out_file:
        completed_reviews = 0
        for line in in_file:
            review = json.loads(line)
            tmp = {'text': review.get('reviewTextTokenized', '')}
            json.dump(tmp, out_file)
            out_file.write('\n')
            completed_reviews += 1
            if completed_reviews % 10000 == 0:
                print('Completed: {}'.format(completed_reviews))

Completed: 10000
Completed: 20000
Completed: 30000
Completed: 40000
Completed: 50000
Completed: 60000
Completed: 70000
Completed: 80000
Completed: 90000
Completed: 100000


In [63]:
!head -n 125 data/all_data.json

{"text": ["<sos>", "this", "is", "a", "charming", "version", "of", "the", "classic", "dicken", "'s", "tale", ".", "henry", "winkler", "makes", "a", "good", "showing", "as", "the", "scrooge", "character", ".", "even", "though", "you", "know", "what", "will", "happen", "this", "version", "has", "enough", "of", "a", "change", "to", "make", "it", "better", "that", "average", ".", "if", "you", "love", "a", "christmas", "carol", "in", "any", "version", ",", "then", "you", "will", "love", "this", ".", "<eos>"]}
{"text": ["<sos>", "good", "version", "of", "a", "classic", "<eos>"]}
{"text": ["<sos>", "it", "was", "good", "but", "not", "as", "emotionally", "moving", "as", "the", "the", "christmas", "carol", "by", "dickens", "i", "like", "christmas", "movies", "that", "make", "me", "sigh", "<eos>"]}
{"text": ["<sos>", "good", "but", "not", "as", "moving", "<eos>"]}
{"text": ["<sos>", "don't", "get", "me", "wrong", ",", "winkler", "is", "a", "wonderful", "character", "actor", "and", "i", "woul

### Create Word Vectors

In [64]:
from gensim.models import Word2Vec
import multiprocessing, json

In [65]:
class TokenizedSentences(object):
    """Re-iterable stream of token lists read from a JSON-lines file.

    Each non-blank line of ``filename`` is parsed as a JSON object and its
    ``text`` value is yielded (an empty list when the key is absent). The
    file is re-opened on every ``__iter__`` call, so the object can be
    iterated repeatedly without loading the whole file into memory.
    """

    def __init__(self, filename):
        # Only the path is stored; nothing is read until iteration starts.
        self.filename = filename

    def __iter__(self):
        # The context manager closes the file when iteration finishes.
        with open(self.filename, 'r') as in_file:
            for line in in_file:
                # Skip blank lines (e.g. a trailing newline) instead of
                # letting json.loads raise on them.
                if not line.strip():
                    continue
                review = json.loads(line)
                yield review.get('text', [])

In [66]:
# Leave one core free for the rest of the system, but never drop below one
# worker: cpu_count() - 1 is 0 on a single-core machine.
N_WORKERS = max(1, multiprocessing.cpu_count() - 1)
print('Number of workers: '+ str(N_WORKERS))

Number of workers: 11


In [67]:
# Iterable over the combined corpus file; nothing is read until it is iterated.
s = TokenizedSentences('data/all_data.json')

In [68]:
EMBED_SIZE = 192

# Training hyper-parameters, gathered in one place and passed to gensim below.
# NOTE(review): per the gensim docs a fixed seed only gives fully reproducible
# vectors with a single worker thread — confirm if exact reproducibility matters.
w2v_params = dict(
    size=EMBED_SIZE,     # dimensionality of the word vectors
    window=9,            # context window size
    min_count=4,         # ignore words seen fewer than 4 times
    sample=0.001,        # down-sampling threshold for frequent words
    seed=42,
    workers=N_WORKERS,
    sg=0,                # 0 selects CBOW rather than skip-gram
    hs=0,                # no hierarchical softmax ...
    negative=7,          # ... negative sampling with 7 noise words instead
    iter=15,             # passes over the corpus
)
w2v_model = Word2Vec(sentences=s, **w2v_params)

In [69]:
# Number of distinct tokens in the trained model's vocabulary.
len(w2v_model.wv.vocab)

250136

In [73]:
# Nearest neighbours of "movie". Queried through the model's KeyedVectors
# (`.wv`): the model-level `most_similar` is deprecated in gensim.
w2v_model.wv.most_similar(positive=['movie'])

[('film', 0.8554957509040833),
 ('flick', 0.7618805170059204),
 ('sequel', 0.544894814491272),
 ('movies', 0.542639434337616),
 ('it', 0.5421469807624817),
 ('storyline', 0.5206243395805359),
 ('anime', 0.5040236115455627),
 ('ppv', 0.5023794174194336),
 ('story', 0.4962218403816223),
 ('series', 0.47777867317199707)]

In [74]:
# Nearest neighbours of "television". Queried through the model's KeyedVectors
# (`.wv`): the model-level `most_similar` is deprecated in gensim.
w2v_model.wv.most_similar(positive=['television'])

[('tv', 0.9101788401603699),
 ('televison', 0.825933575630188),
 ('televsion', 0.7536742687225342),
 ('pbs', 0.7485703229904175),
 ('hbo', 0.719831645488739),
 ('abc', 0.6930111646652222),
 ('nbc', 0.687646746635437),
 ('network', 0.6467430591583252),
 ('tnt', 0.6417787075042725),
 ('showtime', 0.6372470855712891)]

In [70]:
# Analogy check: king - man + woman. Queried through the model's KeyedVectors
# (`.wv`): the model-level `most_similar` is deprecated in gensim.
w2v_model.wv.most_similar(positive=['king','woman'], negative=['man'])

[('queen', 0.6551686525344849),
 ('princess', 0.5688230991363525),
 ('bekassy', 0.5557050704956055),
 ('endelman', 0.5515807867050171),
 ('empress', 0.5449009537696838),
 ('churchett', 0.5369232296943665),
 ('prince', 0.5355309844017029),
 ('moyer', 0.5312000513076782),
 ('victoria', 0.5283535718917847),
 ('kingish', 0.5273270606994629)]

In [75]:
# Persist the trained model to disk.
w2v_model.save('data/w2v_192_language_model_tokens_')

### Create Integer to Word Map

In [None]:
import os, json, pickle
from collections import Counter

In [None]:
# Count how often each token occurs. Token lists are streamed from the
# combined corpus file on disk, so this cell does not rely on an in-memory
# `tokens` variable being left over from an earlier session.
# NOTE(review): assumes the counts should cover all of data/all_data.json — confirm.
vocab_count = Counter(
    token
    for sentence in TokenizedSentences('data/all_data.json')
    for token in sentence
)
print('Vocab length: {}'.format(len(vocab_count)))
vocab_count.most_common(25)

In [None]:
# Serialise the token-frequency counter using the newest pickle protocol.
with open('vocab_count.pkl', 'wb') as pkl_file:
    pickle.dump(vocab_count, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Map integer ids to words in descending frequency order (id 0 = most common token).
int2word = dict(enumerate(word for word, _count in vocab_count.most_common()))

In [None]:
# Serialise the id-to-word mapping using the newest pickle protocol.
with open('int2word.pkl', 'wb') as pkl_file:
    pickle.dump(int2word, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)