In [1]:
import numpy as np
import pandas as pd
import collections
import re

from argparse import Namespace

In [2]:
args = Namespace(
    raw_train_dataset = "../data/yelp/raw_train.csv",
    raw_test_dataset = "../data/yelp/raw_test.csv",
    train_prop = 0.7,
    val_prop = 0.3,
    processed_data = "../data/yelp/processed_data.csv",
    seed=1337
)

In [3]:
train_data = pd.read_csv(args.raw_train_dataset, header=None, names=['rating', 'review'])
train_data = train_data[~pd.isnull(train_data.review)]
test_data = pd.read_csv(args.raw_test_dataset, header=None, names=['rating', 'review'])
test_data = test_data[~pd.isnull(test_data.review)]

In [4]:
train_data.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [5]:
test_data.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


In [6]:
# unique ratings
set(train_data.rating)

{1, 2}

In [7]:
# Split train dataset by rating (Stratified Split)
by_rating = collections.defaultdict(list)
for _, row in train_data.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [8]:
by_rating[1][0]


{'rating': 1,
 'review': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}

In [9]:
# create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_prop * n_total)
    n_val = int(args.val_prop * n_total)
    
    for item in item_list[:n_train]:
        item['split'] = 'train'
        
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    final_list.extend(item_list)

In [10]:
for _, row in test_data.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [11]:
final_data = pd.DataFrame(final_list)

In [12]:
final_data.split.value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [13]:
final_data.review.head()

0    The entrance was the #1 impressive thing about...
1    I'm a Mclover, and I had no problem\nwith the ...
2    Less than good here, not terrible, but I see n...
3    I don't know if I can ever bring myself to go ...
4    Food was OK/Good but the service was terrible....
Name: review, dtype: object

In [14]:
final_data[pd.isnull(final_data.review)]

Unnamed: 0,rating,review,split


In [15]:
def preprocess_data(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    
    return text

final_data.review = final_data.review.apply(preprocess_data)

In [16]:
final_data['rating'] = final_data.rating.apply({1: 'negative', 2: 'positive'}.get)

In [17]:
final_data.head()

Unnamed: 0,rating,review,split
0,negative,the entrance was the impressive thing about th...,train
1,negative,"i m a mclover , and i had no problem nwith the...",train
2,negative,"less than good here , not terrible , but i see...",train
3,negative,i don t know if i can ever bring myself to go ...,train
4,negative,food was ok good but the service was terrible ...,train


In [18]:
final_data.to_csv(args.processed_data, index=False)