In [4]:
# read data from text files
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [5]:
print(f"Reviews: {reviews[:500]}")

Reviews: bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which


In [6]:
print(f"Labels: {labels[:100]}")

Labels: positive
negative
positive
negative
positive
negative
positive
negative
positive
negative
positive
n


In [7]:
print(f"""
Size of Review dataset : {len(reviews)}
Size of labels dataset : {len(labels)}
""")


Size of Review dataset : 33678267
Size of labels dataset : 225000



### Data pre-processing

From the example of the reviews data above. For the processing steps, we'll want to take:
>* get rid of periods and extraneous punctuation.
* Deal with `\n`. 
* Then combined all the reviews back together.


In [8]:
from string import punctuation

#removing punctuation
reviews = reviews.lower()
text = ''.join([char for char in reviews if char not in punctuation])

# split by new lines and spaces
reviews_split = text.split('\n')
text = ' '.join(reviews_split)

# create a list of words and labels
words = text.split()
labels = labels.split()

In [9]:
words[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

In [10]:
labels[:10]

['positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative']

In [11]:
int_vocab = dict(enumerate(set(words)))
vocab_int = {int_vocab[i]: i for i in int_vocab}

reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_int[word] for word in review.split()])

In [12]:
print(f'''
Unique words: {len((vocab_int))}

Tokenized review: {reviews_ints[1]}''')


Unique words: 74072

Tokenized review: [49996, 11927, 44533, 12599, 24241, 56281, 17855, 62979, 39587, 44533, 50770, 25044, 26092, 5444, 44533, 53477, 41010, 63441, 1882, 44533, 17704, 63052, 11927, 72843, 37548, 44533, 26503, 35212, 58835, 1882, 67510, 63626, 33794, 49440, 32881, 61707, 30520, 35524, 38247, 10535, 11927, 5007, 44230, 17995, 65462, 5007, 43635, 72843, 35524, 51274, 72306, 5444, 9007, 50214, 55510, 10486, 2268, 5007, 9162, 11335, 16053, 9948, 68940, 48181, 73798, 35524, 9603, 64821, 2368, 67510, 16053, 35524, 47294, 43506, 37387, 17291, 147, 15497, 48149, 10937, 44533, 51872, 56586, 6986, 44533, 20548, 6025, 5007, 44230, 17942, 47684, 70887, 30533, 5279, 5444, 15843, 5710, 58734, 30520, 71885, 41608, 56015, 64220, 71885, 50801, 64936, 18811, 47113, 17099, 52700, 36040, 2368, 71203, 21747]


In [14]:
#reviews with zero length
from collections import Counter
import numpy as np

review_lengths = Counter([len(x) for x in reviews_ints])
print(f"Zero-length reviews: {review_lengths[0]}")

Zero-length reviews: 1


In [15]:
#remove zero length review
zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews = [reviews_ints[i] for i in zero_idx]
labels = np.array([labels[i] for i in zero_idx])

In [16]:
#padding

def pad_feature(token_reviews, seq_length):
    features = np.zeros((len(token_reviews),seq_length), dtype=int)
    for i, row in enumerate(token_reviews):
        features[i,-len(row):]=np.array(row)[:seq_length]
    return features

In [17]:
seq_length = 512
features = pad_feature(reviews, seq_length)

In [18]:
# 1=positive, 0=negative label conversion
labels = np.array([1 if label == 'positive' else 0 for label in labels])

In [19]:
import pandas as pd
import os

data = pd.DataFrame((reviews,labels), index=["Reviews", "Labels"]).T
data.head()

Unnamed: 0,Reviews,Labels
0,"[46131, 59410, 1882, 44533, 31357, 37548, 5007...",1
1,"[49996, 11927, 44533, 12599, 24241, 56281, 178...",0
2,"[55026, 64890, 40864, 57773, 50752, 42051, 213...",1
3,"[4269, 25044, 57773, 44533, 65703, 30532, 4841...",0
4,"[20883, 9517, 49689, 30520, 43144, 58060, 1282...",1


In [20]:
#save data as cleaned.csv
data.to_csv('./data/cleaned_data.csv', index=False)

In [21]:
np.save("data/dict/int_vocab.npy",int_vocab)
np.save("data/dict/vocab_int.npy",vocab_int)

In [22]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
        :prams: reviews_ints: An array of tokenized words
        :prams: seq_length: length of resultant reviews
        :return: features: reviews_ints with seq_length feature
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [23]:
seq_length = 2046

#pad reviews
features = pad_features(reviews, seq_length=seq_length)

In [24]:
#split dataset in train val and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)

print(f"""Feature Shapes:
Train set:      {X_train.shape}
Validation set: {X_val.shape}
Test set:       {X_test.shape} """)

Feature Shapes:
Train set:      (12250, 2046)
Validation set: (5250, 2046)
Test set:       (7500, 2046) 


In [25]:
pd.DataFrame(y_train,columns=["y"]).join(pd.DataFrame(X_train)).to_csv('data/train.csv', index=False)
pd.DataFrame(y_test,columns=["y"]).join(pd.DataFrame(X_test)).to_csv('data/test.csv', index=False)
pd.DataFrame(y_val,columns=["y"]).join(pd.DataFrame(X_val)).to_csv('data/val.csv', index=False)