In [40]:
#Basic Libraries
import numpy as np
import pandas as pd

In [41]:
#load the train and test data
df_train = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/train.csv')
df_test = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/test.csv')
df_train.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [42]:
#the columns are: polarity (1 for negative and 2 for positive), title (review heading), and text (review body)
len(df_train.columns)

3

In [43]:
#the data does not have column title, so we need to assign titles to the columns
df_train.columns = ['polarity', 'title', 'text']
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


#### Step 1: After loading and exploring the data, it is time to preprocess it (that includes lowercasing, removing punctuation, and tokenizing)

In [44]:
#convert the review title and text to lowercase
df_train['text'] = df_train['text'].str.lower()

In [45]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,i'm reading a lot of reviews saying that this ...
1,2,Amazing!,this soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,i truly like this soundtrack and i enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","if you've played the game, you know how divine..."
4,2,an absolute masterpiece,i am quite sure any of you actually taking the...


In [46]:
#remove punctuations
import string
df_train['text'] = df_train['text'].str.translate(str.maketrans('', '', string.punctuation))
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,im reading a lot of reviews saying that this i...
1,2,Amazing!,this soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,i truly like this soundtrack and i enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...",if youve played the game you know how divine t...
4,2,an absolute masterpiece,i am quite sure any of you actually taking the...


In [47]:
#tokenizing (breaking down text into words or tokens)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#tokenize the text column
df_train['text'] = df_train['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /Users/smirghor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, a, lot, of, reviews, saying, tha..."
1,2,Amazing!,"[this, soundtrack, is, my, favorite, music, of..."
2,2,Excellent Soundtrack,"[i, truly, like, this, soundtrack, and, i, enj..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[if, youve, played, the, game, you, know, how,..."
4,2,an absolute masterpiece,"[i, am, quite, sure, any, of, you, actually, t..."


In [49]:
#removing stop words such as 'the', 'and', 'is'
from nltk.corpus import stopwords
nltk.download('stopwords')

#remove stop words
stop_words = set(stopwords.words('english'))

df_train['text'] = df_train['text'].apply(lambda x: [word for word in x if word not in stop_words])
df_train.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, lot, reviews, saying, best, game..."
1,2,Amazing!,"[soundtrack, favorite, music, time, hands, int..."
2,2,Excellent Soundtrack,"[truly, like, soundtrack, enjoy, video, game, ..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[youve, played, game, know, divine, music, eve..."
4,2,an absolute masterpiece,"[quite, sure, actually, taking, time, read, pl..."


In [50]:
#stemming words to their base form , Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Lemmatize the words
df_train['text'] = df_train['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df_train.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, lot, review, saying, best, game,..."
1,2,Amazing!,"[soundtrack, favorite, music, time, hand, inte..."
2,2,Excellent Soundtrack,"[truly, like, soundtrack, enjoy, video, game, ..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[youve, played, game, know, divine, music, eve..."
4,2,an absolute masterpiece,"[quite, sure, actually, taking, time, read, pl..."


In [1]:
#Done with preprocessing, and now need to save the file
#df_train.to_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews.csv', index=False)

#### Step 2: Now is time to convert text to numerical features. Options are: Bag of word (BoW), Term Frequency-Inverse Document Frequency (TF-IDF), and Word Embeddings (Optional for Deep Learning)