In [53]:
# read data from text files
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [54]:
print(f"Reviews: {reviews[:500]}")

Reviews: bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which


In [55]:
print(f"Labels: {labels[:100]}")

Labels: positive
negative
positive
negative
positive
negative
positive
negative
positive
negative
positive
n


In [56]:
print(f"""
Size of Review dataset : {len(reviews)}
Size of labels dataset : {len(labels)}
""")


Size of Review dataset : 33678267
Size of labels dataset : 225000



### Data pre-processing

From the example of the reviews data above. For the processing steps, we'll want to take:
>* get rid of periods and extraneous punctuation.
* Deal with `\n`. 
* Then combined all the reviews back together.


In [57]:
from string import punctuation

#removing punctuation
reviews = reviews.lower()
text = ''.join([char for char in reviews if char not in punctuation])

# split by new lines and spaces
reviews_split = text.split('\n')
text = ' '.join(reviews_split)

# create a list of words and labels
words = text.split()
labels = labels.split()

In [58]:
words[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

In [59]:
labels[:10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [60]:
int_vocab = dict(enumerate(set(words)))
vocab_int = {int_vocab[i]: i for i in int_vocab}

reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_int[word] for word in review.split()])

In [61]:
print(f'''
Unique words: {len((vocab_int))}

Tokenized review: {reviews_ints[1]}''')


Unique words: 74072

Tokenized review: [14371, 4663, 19394, 40688, 32119, 65521, 44550, 64067, 70072, 19394, 2119, 3391, 43041, 54269, 19394, 55905, 10256, 43854, 34832, 19394, 67618, 45442, 4663, 5928, 64965, 19394, 38291, 1150, 56774, 34832, 46547, 26973, 1168, 51654, 22594, 70333, 11291, 48776, 53100, 59432, 4663, 26679, 63872, 29227, 33819, 26679, 19985, 5928, 48776, 2439, 24955, 54269, 46809, 6535, 20925, 31721, 32308, 26679, 30615, 67094, 70182, 471, 12371, 66911, 42406, 48776, 14828, 11994, 1764, 46547, 70182, 48776, 47970, 64187, 19957, 3521, 67616, 66010, 56693, 51839, 19394, 65972, 11221, 53371, 19394, 5671, 49821, 26679, 63872, 61974, 61747, 19347, 9086, 19265, 54269, 13164, 17489, 50145, 11291, 3344, 73189, 3020, 64313, 3344, 16234, 11838, 16921, 39397, 69032, 9644, 13925, 1764, 64128, 38718]


In [62]:
#reviews with zero length
review_lengths = Counter([len(x) for x in reviews_ints])
print(f"Zero-length reviews: {review_lengths[0]}")

Zero-length reviews: 1


In [63]:
#remove zero length review
zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews = [reviews_ints[i] for i in zero_idx]
labels = np.array([labels[i] for i in zero_idx])

In [65]:
# 1=positive, 0=negative label conversion
labels = np.array([1 if label == 'positive' else 0 for label in labels])

In [80]:
reviews[0:1]

[[293,
  44651,
  34832,
  19394,
  14792,
  64965,
  26679,
  5566,
  33651,
  48776,
  33194,
  24955,
  48329,
  13164,
  69006,
  36304,
  69477,
  6515,
  21367,
  60877,
  48329,
  58915,
  27567,
  57428,
  58321,
  48776,
  21656,
  4398,
  18942,
  71370,
  51839,
  72395,
  43854,
  293,
  44651,
  63872,
  37051,
  34832,
  47315,
  13174,
  51839,
  29214,
  61747,
  34832,
  58915,
  48776,
  61979,
  51839,
  14418,
  59153,
  48776,
  67215,
  38781,
  32119,
  13925,
  29391,
  49478,
  11544,
  67999,
  50877,
  58915,
  9858,
  48776,
  40327,
  4663,
  48776,
  2439,
  31219,
  39444,
  11933,
  71370,
  4663,
  48776,
  31843,
  20829,
  37590,
  39397,
  67999,
  38781,
  21945,
  20829,
  44485,
  48776,
  11588,
  58321,
  5189,
  19394,
  11760,
  2698,
  52089,
  51839,
  22458,
  55935,
  48776,
  6515,
  20829,
  62889,
  50818,
  33651,
  44651,
  19394,
  62648,
  50577,
  25811,
  20829,
  23003,
  56237,
  51839,
  2590,
  12559,
  4663,
  13954,
  58915,

In [68]:
labels

array([1, 0, 1, ..., 0, 1, 0])

In [79]:
import pandas as pd
import os

data = pd.DataFrame((reviews,labels), index=["Reviews", "Labels"]).T
data.head()

Unnamed: 0,Reviews,Labels
0,"[293, 44651, 34832, 19394, 14792, 64965, 26679...",1
1,"[14371, 4663, 19394, 40688, 32119, 65521, 4455...",0
2,"[37789, 3671, 67463, 48329, 30329, 64971, 2756...",1
3,"[40859, 3391, 48329, 19394, 6949, 8224, 15086,...",0
4,"[50407, 11495, 70083, 11291, 55273, 11599, 178...",1


In [77]:
#save data as cleaned.csv
data.to_csv('./data/cleaned_data.csv', index=False)