<a href="https://colab.research.google.com/github/serfsup/thinkful-final-capstone/blob/master/final_capstone_01_build_vocab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive/ so the dataset
# files stored on Drive can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# Use the absolute path so that re-running this cell (when the working
# directory has already been changed) still lands in the dataset folder.
%cd '/content/drive/My Drive/Colab Datasets'

/content/drive/My Drive/Colab Datasets


In [0]:
import json
import os
import re

import dask.array as da
from dask import dataframe as dd
import numpy as np
import pandas as pd
from tqdm import trange, tqdm

# Display preferences: render floats with three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [0]:
# Show which dataset files are present in the current working directory.
os.listdir('.')

['train_clean.tsv',
 'eval_clean.tsv',
 'holdout_clean.tsv',
 'text_col_names.csv',
 'X_train_clean.tsv',
 'dask-worker-space',
 'word_to_index.json',
 'index_to_word.json',
 'eval_uniform_length.tsv',
 'holdout_uniform_length.tsv',
 'train_numeric.tsv',
 'eval_numeric.tsv',
 'holdout_numeric.tsv',
 'train_uniform_length.tsv']

In [0]:
# Load the cleaned training split with dask and discard every row that has a
# missing value in any column.
train_clean = (
    dd.read_csv('train_clean.tsv', sep='\t')
    .dropna(how='any')
)

In [0]:
# Load the cleaned evaluation split with dask and preview its first rows to
# check the column layout.
eval_clean = dd.read_csv('eval_clean.tsv', sep='\t')
eval_clean.head()

Unnamed: 0,index,marketplace,product_category,helpful_votes,total_votes,vine,verified_purchase,review_body,helpful_rate,review_body_len,target
0,5568261,US,Video DVD,0,1,N,Y,Great movie.,0.0,2.0,5
1,400452,US,Music,0,2,N,N,This is a good CD and is for any nirvana fan. ...,0.0,46.0,5
2,92658,US,Music,2,5,N,N,The V.W. commercial was surprising and certain...,0.4,47.0,5
3,2896871,US,Digital_Ebook_Purchase,0,0,N,Y,Great for any age! Read before the movie comes...,-1.0,21.0,5
4,3228665,US,Mobile_Apps,0,0,N,Y,Love it. Awesome Free app. Lots of fame option...,-1.0,23.0,5


In [0]:
def make_lengths_uniform(df: 'dd.DataFrame', mean: float, std: float,
                         col: str = 'review_body', padding_tok: str = '<pad>'):
  """
  Pads or crops every review to the same number of whitespace-separated tokens.

  The target length is ``int(round(mean + 3 * std))`` tokens. Reviews with
  fewer tokens get `padding_tok` appended until they reach that length;
  longer reviews are cropped from the end. Tokens are always re-joined with
  single spaces, so the returned text never contains tabs, newlines or
  repeated spaces and can safely be written one review per line.

  Args:
    df: the dataset; any object whose ``iterrows()`` yields ``(index, row)``
      pairs where ``row`` holds `col` and a ``'target'`` entry
    mean: the mean review length (in tokens)
    std: the std of review length (in tokens)
    col: the base column to be set to uniform length
    padding_tok: str to add if the review is shorter than spec
  Returns:
    A list of ``(review, target)`` tuples, one per row and in row order,
    where ``review`` is the padded or cropped text.
  """
  spec = int(round(mean + (std * 3), 0))
  to_return = []
  for _idx, row in tqdm(df.iterrows()):
    # Split once; slicing crops reviews that are longer than spec.
    tokens = row[col].split()[:spec]
    # A non-positive multiplier yields an empty list, so reviews that are
    # already at spec are left unpadded.
    tokens.extend([padding_tok] * (spec - len(tokens)))
    to_return.append((' '.join(tokens), row['target']))
  return to_return

In [0]:
# Mean and standard deviation of the review length, passed as the `mean` and
# `std` arguments (presumably measured on the training split in an earlier
# step -- TODO confirm where these values come from).
REVIEW_LEN_MEAN = 68.885
REVIEW_LEN_STD = 136.525
_ = make_lengths_uniform(train_clean, REVIEW_LEN_MEAN, REVIEW_LEN_STD)

6024321it [10:55, 9192.76it/s]


In [0]:
# Sanity check: one (review, target) tuple must have been produced per row.
# NOTE: `_` is also IPython's "last output" name; it is used here as an
# ordinary variable holding the list returned by make_lengths_uniform.
assert len(_) == len(train_clean)

In [0]:
# Write one "<review>\t<label>" line per training example.
with open('./train_uniform_length.tsv', 'w') as out_file:
  out_file.writelines(
      f'{review}\t{label}\n' for review, label in tqdm(_)
  )

100%|██████████| 6024321/6024321 [01:26<00:00, 69905.20it/s]


In [0]:
# Count the lines that were written; should equal the number of training rows.
!wc -l < train_uniform_length.tsv

6024321


In [0]:
# Drop the reference to the in-memory list of padded reviews; the data now
# lives in train_uniform_length.tsv.
del _

In [0]:
# Read the padded training file back as a list of lines (each line keeps its
# trailing newline).
with open('./train_uniform_length.tsv', 'r') as tsv_file:
  data = list(tsv_file)

In [0]:
# Collect every distinct token that appears in the training reviews.
unique_words = set()
for raw_line in tqdm(data):
  # split() with no arguments already ignores leading/trailing whitespace.
  tokens = raw_line.split()
  # The last token on each line is the label; exclude it so it does not leak
  # into the vocabulary.
  unique_words.update(tokens[:-1])

In [0]:
# Drop the reference to the list of raw lines; only `unique_words` is needed
# from here on.
del data

In [0]:
# Number the vocabulary in sorted order. Iterating a set of strings directly
# gives an order that can differ between interpreter runs (string hashing is
# randomised), which would make the saved index files irreproducible.
word_to_index = {word: idx for idx, word in enumerate(sorted(unique_words))}
# The unknown-word token takes the next free index. len() equals the largest
# assigned index + 1 and, unlike max(), also works for an empty vocabulary.
unk_index = len(word_to_index)
word_to_index['<unk>'] = unk_index  # add unknown token for out of vocab words.

In [0]:
# Exactly one entry ('<unk>') must have been added on top of the vocabulary;
# this fails if '<unk>' already occurred as a token in the reviews.
assert len(unique_words) + 1 == len(word_to_index.keys())

In [0]:
# Reverse lookup (index -> word) derived from the forward mapping.
index_to_word = {idx: word for word, idx in word_to_index.items()}
# Equal sizes confirm that no two words were mapped to the same index.
assert len(index_to_word) == len(word_to_index)

In [0]:
# Persist the word -> index vocabulary as JSON.
with open('./word_to_index.json', 'w') as vocab_file:
  json.dump(word_to_index, vocab_file)

In [0]:
# Persist the index -> word lookup as JSON. JSON object keys are always
# strings, so the integer indices are written as strings and must be
# converted back with int() after loading.
with open('./index_to_word.json', 'w') as lookup_file:
  json.dump(index_to_word, lookup_file)