In [1]:
import pandas as pd
import numpy as np

In [39]:
# Load the cleaned dataset. Each 'Reviews' cell is text: comma-separated
# integers wrapped in one delimiter character on each side
# (presumably "[12, 7, 3]" -- confirm against the cleaning step).
data = pd.read_csv('./data/cleaned_data.csv')


def _parse_token_ids(serialized):
    """Convert one serialized review string into a 1-D integer array.

    The first and last characters (the enclosing delimiters) are dropped and
    the remainder is split on commas. Blank tokens are skipped, so an empty
    review such as "[]" yields an empty array instead of raising
    ValueError from int('').

    :param serialized: string of comma-separated ints with one delimiter
                       character on each side
    :return: numpy integer array of the parsed values (possibly empty)
    """
    tokens = serialized[1:-1].split(',')
    # dtype is pinned so that an empty review is still an integer array
    return np.array([int(tok) for tok in tokens if tok.strip()], dtype=int)


reviews = [_parse_token_ids(text) for text in data['Reviews']]
labels = list(data['Labels'].apply(int))

In [40]:
# Sanity check: display the parsed integer array of the second review (index 1)
reviews[1]

array([14371,  4663, 19394, 40688, 32119, 65521, 44550, 64067, 70072,
       19394,  2119,  3391, 43041, 54269, 19394, 55905, 10256, 43854,
       34832, 19394, 67618, 45442,  4663,  5928, 64965, 19394, 38291,
        1150, 56774, 34832, 46547, 26973,  1168, 51654, 22594, 70333,
       11291, 48776, 53100, 59432,  4663, 26679, 63872, 29227, 33819,
       26679, 19985,  5928, 48776,  2439, 24955, 54269, 46809,  6535,
       20925, 31721, 32308, 26679, 30615, 67094, 70182,   471, 12371,
       66911, 42406, 48776, 14828, 11994,  1764, 46547, 70182, 48776,
       47970, 64187, 19957,  3521, 67616, 66010, 56693, 51839, 19394,
       65972, 11221, 53371, 19394,  5671, 49821, 26679, 63872, 61974,
       61747, 19347,  9086, 19265, 54269, 13164, 17489, 50145, 11291,
        3344, 73189,  3020, 64313,  3344, 16234, 11838, 16921, 39397,
       69032,  9644, 13925,  1764, 64128, 38718])

In [41]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of reviews_ints, where each review is left-padded with 0's
        or truncated to the input seq_length.

        Reviews shorter than seq_length are right-aligned (zeros in front);
        longer reviews keep only their first seq_length tokens. An empty
        review produces an all-zero row.

        :param reviews_ints: sequence of tokenized reviews, each a sequence of ints
        :param seq_length: length of every resulting row
        :return: features: int array of shape (len(reviews_ints), seq_length)
    '''

    # getting the correct rows x cols shape, pre-filled with the padding value 0
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, truncate it to seq_length and write it right-aligned
    for i, row in enumerate(reviews_ints):
        kept = np.asarray(row)[:seq_length]
        if kept.size == 0:
            # `-0:` would select the whole row and the assignment of a
            # zero-length array would fail to broadcast; the row stays all zeros
            continue
        features[i, -kept.size:] = kept

    return features

In [42]:
# Fixed row width that every review is padded or truncated to
seq_length = 2046

# Build the padded feature matrix, one row per review
features = pad_features(reviews, seq_length)


In [43]:
# Split dataset into train, validation and test sets
from sklearn.model_selection import train_test_split

# Fixed seed so that a fresh "Restart & Run All" reproduces the same partition
SPLIT_SEED = 42

# First hold out 30% of all samples for testing, then 30% of the remainder
# for validation.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=SPLIT_SEED
)

print(f"""Feature Shapes:
Train set:      {X_train.shape}
Validation set: {X_val.shape}
Test set:       {X_test.shape} """)

Feature Shapes:
Train set:      (12250, 2046)
Validation set: (5250, 2046)
Test set:       (7500, 2046) 


In [70]:
# Write each split to CSV: the label goes in a leading "y" column, followed by
# one column per feature position. Row indices are not written.
split_outputs = [
    (y_train, X_train, 'data/train.csv'),
    (y_test, X_test, 'data/test.csv'),
    (y_val, X_val, 'data/val.csv'),
]
for targets, inputs, out_path in split_outputs:
    label_frame = pd.DataFrame(targets, columns=["y"])
    feature_frame = pd.DataFrame(inputs)
    label_frame.join(feature_frame).to_csv(out_path, index=False)