In [88]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [89]:
import os
os.getcwd()

'/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/chapters/chapter_3'

In [90]:
args = Namespace(
    raw_train_dataset_csv="spam_train.csv",
    raw_test_dataset_csv="spam_test.csv",
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="spam_test_with_splits_lite.csv",
    seed=1337
)

In [91]:
# Read raw data
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [92]:
train_reviews['rating']

0       Email Type
1                2
2                1
3                2
4                2
           ...    
5009             2
5010             2
5011             2
5012             1
5013             2
Name: rating, Length: 5014, dtype: object

In [93]:
# making the subset equal across the review classes
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
review_subset = []

for _, item_list in sorted(by_rating.items()):

    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])

review_subset = pd.DataFrame(review_subset)

In [94]:
review_subset.head()

Unnamed: 0,rating,review
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [95]:
train_reviews.rating.value_counts()

2             4338
1              675
Email Type       1
Name: rating, dtype: int64

In [96]:
review_subset.rating.value_counts()

2    433
1     67
Name: rating, dtype: int64

In [97]:
# Unique classes


In [98]:
# Splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [99]:
# Create split data
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [100]:
# Write split data to file
final_reviews = pd.DataFrame(final_list)

In [101]:
#result1 = final_reviews[rating] 

In [102]:
# Preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [106]:
#def classify_row(row):
    

#final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)
final_reviews['rating'] = final_reviews['rating'].apply(lambda x: 'positive' if x== 1 else 'negative')

In [107]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,free entry in a wkly comp to win fa cup final ...,train
1,negative,hmv bonus special pounds of genuine hmv vouche...,train
2,negative,please call our customer service representativ...,train
3,negative,congratulations ur awarded of cd vouchers or g...,train
4,negative,themob check out our newest selection of conte...,train


In [108]:
final_reviews.to_csv(args.output_munged_csv, index=False)