In [422]:
#Let's start by importing the most important libraries. We will primarily use Pandas and Numpy\
#, some modules from NLTK, and re for regular expressions
import pandas as pd;
import numpy as np;
import re;
import time;
from nltk.corpus import stopwords;
import json;
import sys;
import pickle;

In [249]:
#Fill in the data location, which has been obtained from https://www.kaggle.com/bittlingmayer/amazonreviews/data

data_location = 'input/amazon_review_full_csv/';

In [250]:
#Load in the train and test sets. No header.
data_train = pd.read_csv(data_location + 'train.csv', header=None);

In [251]:
data_test = pd.read_csv(data_location + 'test.csv', header=None);

In [252]:
#Update the headers with the right column names
data_train.columns = ['rating', 'subject', 'review'];
data_test.columns = ['rating', 'subject', 'review'];

In [254]:
print(len(data_train));
print(len(data_test));

3000000
650000


In [386]:
#There are a lot of rows in both the train and test sets, so to make processing easier, we will 
#use a small subset of the rows to verify that the pipeline works without errors.

#We can do the final training on an AWS server.

#First, we need to get a subset of indices. The sampling dataset
#will be divided into the following ratio: 75% TRAIN, 20% DEV, 5% TEST

np.random.seed(1024);

total_samples = int(0.06 * len(data_train))
rand_indices = np.random.choice(len(data_train), total_samples, replace=False);

train_split_index = int(0.75 * total_samples);
dev_split_index   = int(0.95 * total_samples);

data_sample_train = data_train.iloc[rand_indices[:train_split_index]];
data_sample_dev   = data_train.iloc[rand_indices[train_split_index:dev_split_index]];
data_sample_test  = data_train.iloc[rand_indices[dev_split_index:]];

sample_ratio = len(data_train) / len(data_sample_train);
print("How much of the data we are using: " + str(100.0 / sample_ratio) + '%')

How much of the data we are using: 4.5%


In [387]:
#Next step is cleaning. We will go through different steps, and in the end combine them into a single function. Because we
#are only writing the functions to build the pipeline, I will use a small subset of the data sample to verify it.

data_subsample = data_sample_train.iloc[0:100];

In [388]:
#First step: Remove all items that have a Neutral (3 star) rating

data_sub_filtered = data_subsample[data_subsample.rating != 3];
rows_removed = len(data_subsample) - len(data_sub_filtered);

print('Removing ' + str(100.0 * rows_removed / len(data_subsample)) + '% of rows');

Removing 22.0% of rows


In [389]:
#Second step: Binary ratings. 0 for {1, 2} and 1 for {4, 5}

data_sub_filtered.loc[data_sub_filtered.rating <= 2, 'rating'] = 0;
data_sub_filtered.loc[data_sub_filtered.rating >= 4, 'rating'] = 1;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [390]:
#Third step: Remove all NaNs.
#We have enough rows in our dataset that we can safely remove all rows with NaNs and still have enough data.

rows_before = len(data_sub_filtered);
data_sub_filtered = data_sub_filtered.dropna();
print('Removed ' + str(100*(len(data_sub_filtered)-rows_before) / len(data_sub_filtered)) + '% of rows');

Removed 0.0% of rows


In [391]:
#I don't understand the warning above, but it looks like the data was properly processed
#Llet's look at a few rows of the data so far
data_sub_filtered.iloc[0:5]

Unnamed: 0,rating,subject,review
893521,0,Lousy,This is a lousy movie and does not follow the ...
2810525,0,failed on induction cooktop,I bought it to use on my new Kitchenaid induct...
1363307,1,Heart Warming,This was a great movie for the whole family. E...
2999390,0,Maddening.,"About the region problem, the best solution is..."
2325074,1,A great soundtrack!,the waynes world soundtrack is hella tight! th...


In [392]:
#Fourth step: Remove all symbols such as ! % & * @ #

#To do this, I will use a regular expression that only keeps alphanumerics and punctuation symbols.
pattern_to_find = "[^a-zA-Z0-9' \.,!\?]";
pattern_to_repl = "";

start = time.time();

for row in data_sub_filtered.index:
    data_sub_filtered.loc[row, 'subject'] = re.sub(pattern_to_find, pattern_to_repl, data_sub_filtered.loc[row, 'subject']).lower();
    
    data_sub_filtered.loc[row, 'review'] = re.sub(pattern_to_find, pattern_to_repl, data_sub_filtered.loc[row, 'review']).lower();
    
print('Total time taken: ' + str(time.time() - start) + 's');

Total time taken: 0.07255768775939941s


In [393]:
#Fifth step: Remove alll urls from the text

#ref: https://stackoverflow.com/a/6883094/1843486

start = time.time();
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+';

for row in data_sub_filtered.index:

    data_sub_filtered.loc[row, 'subject'] = re.sub(url_regex, '', data_sub_filtered.loc[row, 'subject']);
    data_sub_filtered.loc[row, 'review'] = re.sub(url_regex, '', data_sub_filtered.loc[row, 'review']);
    
print('Total time taken: ' + str(time.time() - start) + 's');

Total time taken: 0.07422900199890137s


In [394]:
#Sixth step: Expand all contractions. This will reduce disambiguation between similar phrases such as I'll and I will.
#ref: https://stackoverflow.com/a/19794953/1843486

#I placed all contraction mappings in a text json file, which we can load in
#and look at a few values to see what we have
contractions = json.load(open('contractions.json', 'r'));
for pos, key in enumerate(contractions.keys()):
    print(key, ' -> ', contractions[key]);
    
    if pos > 5:
        break;

ain't  ->  is not
aren't  ->  are not
can't  ->  cannot
can't've  ->  cannot have
'cause  ->  because
could've  ->  could have
couldn't  ->  could not


In [395]:
#This is a little tricky, and not completely perfect. There are two types of contractions I will be looking at
#of the form < is'nt > and of the form < apple's >. The first can be mapped from the json dict,
#but the second form cannot because apple is a proper noun. For all instances of the second, I will just expand it to < apple is >
start = time.time();

apos_regex = "'[a-z]+|[a-z]+'[a-z]+|[a-z]+'";

#A function allows us to iterate over quickly rather than having a complicated lambda expression
def expand(word):
    if "'" in word:
        if word in contractions:
            return contractions[word]
        if word.endswith("'s"):
            return word[:-2] + " is"
        else:
            return word;
    else:
        return word;

for row in data_sub_filtered.index:
    data_sub_filtered.loc[row, 'subject'] = ' '.join(([expand(word) for word in data_sub_filtered.loc[row, 'subject'].split()]));
    
    data_sub_filtered.loc[row, 'review'] =  ' '.join(([expand(word) for word in data_sub_filtered.loc[row,  'review'].split()]));


print('Total time taken: ' + str(time.time() - start) + 's');

Total time taken: 0.07723617553710938s


In [396]:
#Seventh step: Remove all stopwords such as and but if so then

eng_stopwords = set(stopwords.words("english"));

start = time.time();

for row in data_sub_filtered.index:
    text_subj = data_sub_filtered.loc[row, 'subject'].split();
    text_revi = data_sub_filtered.loc[row, 'review'].split();
    
    data_sub_filtered.loc[row, 'subject'] = ' '.join([word for word in text_subj if word not in eng_stopwords]);
    data_sub_filtered.loc[row, 'review']  = ' '.join([word for word in text_revi if word not in eng_stopwords]);

print('Total time taken: ' + str(time.time() - start) + 's');

Total time taken: 0.07076811790466309s


In [397]:
#Now that all of the different processes have been completed, it is time to combine them all into a single function.

#The process_sentence method will process a single sentence \
#and return a version without URLs, symbols, contractions and stopwords.
def process_sentence(sentence):
    
    #Remove special symbols and lowercase everything
    alphanum = re.sub(pattern_to_find, pattern_to_repl, sentence).lower();
    
    #Remove URLs
    nourls = re.sub(url_regex, '', alphanum);
    
    #Expand all contractions
    noapos = ' '.join(([expand(word) for word in nourls.split()]));
    
    #Remove stopwords
    bigwords = ' '.join([word for word in noapos.split() if word not in eng_stopwords]);
    
    return bigwords;

In [418]:
#The clean_data method will take in a dataframe, filter out neutral rows, binarize ratings, remove na's, and process rows.
def clean_data(dframe):
    start = time.time();
    
    dframe = dframe[dframe.rating != 3]
    
    dframe.loc[dframe.rating <= 2, 'rating'] = 0;
    dframe.loc[dframe.rating >= 4, 'rating'] = 1;

    dframe = dframe.dropna();
    
    for pos, row in enumerate(dframe.index):
        dframe.loc[row, 'subject'] = process_sentence(dframe.loc[row, 'subject']);

        dframe.loc[row,  'review'] = process_sentence(dframe.loc[row,  'review']);
        
        if pos % 1000 == 0 and pos > 0:
            time_so_far = (time.time() - start)/60;
            time_eta = time_so_far * (len(dframe) / pos) - time_so_far;
            sys.stdout.write("\rCompleted " + str(pos) + " / " + str(len(dframe)) + " in " + str(time_so_far) + "m eta: " + str(time_eta) + 'm');
           
    print('\n')
    print('Total time taken: ' + str(time.time() - start) + 's');
    
    return dframe;

In [407]:
data_sub_processed = clean_data(data_subsample)

Completed 0 / 78 in 0.002055493990580241mTotal time taken: 0.19357585906982422s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [408]:
#sanity check
data_sub_processed.equals(data_sub_filtered)

True

In [421]:
data_train_processed = clean_data(data_sample_train);

Completed 107000 / 107991 in 35.250710145632425m eta: 0.32648087620862043m


Total time taken: 2127.7107167243958s


In [419]:
data_dev_processed = clean_data(data_sample_dev);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Completed 28000 / 28859 in 2.6152098536491395m eta: 0.0802309022958787mm

Total time taken: 159.50025272369385s


In [420]:
data_test_processed = clean_data(data_sample_test);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Completed 7000 / 7150 in 0.20186723868052164m eta: 0.004325726543154024m

Total time taken: 12.259939908981323s


In [424]:
pickle.dump(data_train_processed, open("picklefiles/data_train_processed.pkl", 'wb'));
pickle.dump(data_train_processed, open("picklefiles/data_dev_processed.pkl", 'wb'))
pickle.dump(data_train_processed, open("picklefiles/data_test_processed.pkl", 'wb'))

In [None]:
#Now that we have our data, the next step is to obtain features. I will be training a 
#convolutional neural network, for which we need vectorized input. One way to do this
#is to define each word with a Word2Vec model.

#If you read my previous post on Word2Vec, the model is a shallow neural network trained
#to learn word embeddings. We will be learning features with the Word2Vec model first,
#and then using the features generated from it as inputs to the convolutional neural network.