<a href="https://colab.research.google.com/github/serfsup/thinkful-final-capstone/blob/master/final_capstone_02_apply_vocab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
cd 'drive/My Drive/Colab Datasets'

/content/drive/My Drive/Colab Datasets


In [0]:
import json
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

# Display preferences.
pd.options.display.float_format = '{:.3f}'.format

In [0]:
os.listdir()

['train_clean.tsv',
 'eval_clean.tsv',
 'holdout_clean.tsv',
 'text_col_names.csv',
 'X_train_clean.tsv',
 'dask-worker-space',
 'eval_uniform_length.tsv',
 'holdout_uniform_length.tsv',
 'train_numeric.tsv',
 'eval_numeric.tsv',
 'holdout_numeric.tsv',
 'train_uniform_length.tsv',
 'word_to_index.json',
 'index_to_word.json']

In [0]:
eval_clean = pd.read_csv('eval_clean.tsv', sep='\t')
eval_clean.head()

Unnamed: 0,index,marketplace,product_category,helpful_votes,total_votes,vine,verified_purchase,review_body,helpful_rate,review_body_len,target
0,5568261,US,Video DVD,0,1,N,Y,Great movie.,0.0,2.0,5
1,400452,US,Music,0,2,N,N,This is a good CD and is for any nirvana fan. ...,0.0,46.0,5
2,92658,US,Music,2,5,N,N,The V.W. commercial was surprising and certain...,0.4,47.0,5
3,2896871,US,Digital_Ebook_Purchase,0,0,N,Y,Great for any age! Read before the movie comes...,-1.0,21.0,5
4,3228665,US,Mobile_Apps,0,0,N,Y,Love it. Awesome Free app. Lots of fame option...,-1.0,23.0,5


In [0]:
def make_lengths_uniform(df: pd.DataFrame, mean: float, std: float,
                         col: str = 'review_body', padding_tok: str = '<pad>'):
  """
  Makes the lengths of the reviews the same to help with embeddings

  Args:
    df: the dataset
    mean: the mean review length
    std: the std of review length
    col: the base column to be set to uniform length
    padding_token: str to add if the review is shorter than spec
  Returns:
    The string with padding added or cropped from end to spec
  """
  to_return = []
  spec = int(round(mean + (std * 3), 0))
  for _, row in tqdm(df.iterrows()):
    review = row[col]
    review_length = len(review.split())
    if review_length < spec:
      amount_to_add = int(spec - review_length)
      padding = f' {padding_tok}' * amount_to_add
      to_return.append((f'{review} {padding}', row['target']))
    elif review_length > spec:
      split = review.split()[:spec]
      to_return.append((' '.join(split), row['target']))
    else:
      to_return.append((review, row['target']))
  return to_return

In [0]:
eval_uniform_length = make_lengths_uniform(eval_clean, 68.885, 136.525)

1290927it [02:14, 9629.19it/s] 


In [0]:
assert len(eval_uniform_length) == len(eval_clean)

In [0]:
with open('./eval_uniform_length.tsv', 'w') as f:
  for data in tqdm(eval_uniform_length):
    review, label = data
    f.write(f'{review}\t{label}\n')

100%|██████████| 1290927/1290927 [00:14<00:00, 88027.65it/s] 


In [0]:
ls -lah

total 51G
drwx------ 2 root root 4.0K Nov 20 20:39 [0m[01;34mdask-worker-space[0m/
-rw------- 1 root root 529M Nov 16 01:46 eval_clean.tsv
-rw------- 1 root root 3.6G Nov 28 01:33 eval_numeric.tsv
-rw------- 1 root root 3.5G Dec  2 02:14 eval_uniform_length.tsv
-rw------- 1 root root 531M Nov 16 01:46 holdout_clean.tsv
-rw------- 1 root root 3.6G Nov 30 01:09 holdout_numeric.tsv
-rw------- 1 root root 3.5G Nov 26 03:43 holdout_uniform_length.tsv
-rw------- 1 root root 113M Dec  2 02:05 index_to_word.json
-rw------- 1 root root 127K Nov 16 01:48 text_col_names.csv
-rw------- 1 root root 2.5G Nov 16 01:46 train_clean.tsv
-rw------- 1 root root  17G Nov 27 11:42 train_numeric.tsv
-rw------- 1 root root  16G Dec  2 01:54 train_uniform_length.tsv
-rw------- 1 root root 104M Dec  2 02:04 word_to_index.json
-rw------- 1 root root  45M Nov 19 01:15 X_train_clean.tsv


In [0]:
eval_uniform_length[:5]

[('Great movie.  <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [0]:
holdout_clean = pd.read_csv('holdout_clean.tsv', sep='\t')
holdout_clean = holdout_clean.dropna(how='any')

In [0]:
holdout_uniform_length = make_lengths_uniform(holdout_clean, 68.885, 136.525)

1290926it [02:16, 9430.51it/s]


In [0]:
assert len(holdout_uniform_length) == len(holdout_clean)

In [0]:
with open('./holdout_uniform_length.tsv', 'w') as f:
  for data in tqdm(holdout_uniform_length):
    review, label = data
    f.write(f'{review}\t{label}\n')

100%|██████████| 1290926/1290926 [00:15<00:00, 85843.71it/s]


In [0]:
os.listdir()

['train_clean.tsv',
 'eval_clean.tsv',
 'holdout_clean.tsv',
 'text_col_names.csv',
 'X_train_clean.tsv',
 'dask-worker-space',
 'eval_uniform_length.tsv',
 'holdout_uniform_length.tsv',
 'train_numeric.tsv',
 'eval_numeric.tsv',
 'holdout_numeric.tsv',
 'train_uniform_length.tsv',
 'word_to_index.json',
 'index_to_word.json']

In [0]:
!head eval_uniform_length.tsv

Great movie.  <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p