In [1]:
#Basic Libraries
import numpy as np
import pandas as pd


In [4]:
#load the train and test data
df_train = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/train.csv')
df_train.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [5]:
#the columns are: polarity (1 for negative and 2 for positive), title (review heading), and text (review body)
len(df_train.columns)

3

In [6]:
#the data does not have column title, so we need to assign titles to the columns
df_train.columns = ['polarity', 'title', 'text']
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


#### Step 1: After loading and exploring the data, it is time to preprocess it (that includes lowercasing, removing punctuation, and tokenizing)

In [7]:
#convert the review title and text to lowercase
df_train['text'] = df_train['text'].str.lower()

In [8]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,i'm reading a lot of reviews saying that this ...
1,2,Amazing!,this soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,i truly like this soundtrack and i enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","if you've played the game, you know how divine..."
4,2,an absolute masterpiece,i am quite sure any of you actually taking the...


In [9]:
#remove punctuations
import string
df_train['text'] = df_train['text'].str.translate(str.maketrans('', '', string.punctuation))
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,im reading a lot of reviews saying that this i...
1,2,Amazing!,this soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,i truly like this soundtrack and i enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...",if youve played the game you know how divine t...
4,2,an absolute masterpiece,i am quite sure any of you actually taking the...


In [10]:
#tokenizing (breaking down text into words or tokens)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

#tokenize the text column
df_train['text'] = df_train['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /Users/smirghor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
df_train.head()

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, a, lot, of, reviews, saying, tha..."
1,2,Amazing!,"[this, soundtrack, is, my, favorite, music, of..."
2,2,Excellent Soundtrack,"[i, truly, like, this, soundtrack, and, i, enj..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[if, youve, played, the, game, you, know, how,..."
4,2,an absolute masterpiece,"[i, am, quite, sure, any, of, you, actually, t..."


In [12]:
#removing stop words such as 'the', 'and', 'is'
from nltk.corpus import stopwords
nltk.download('stopwords')

#remove stop words
stop_words = set(stopwords.words('english'))

df_train['text'] = df_train['text'].apply(lambda x: [word for word in x if word not in stop_words])
df_train.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, lot, reviews, saying, best, game..."
1,2,Amazing!,"[soundtrack, favorite, music, time, hands, int..."
2,2,Excellent Soundtrack,"[truly, like, soundtrack, enjoy, video, game, ..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[youve, played, game, know, divine, music, eve..."
4,2,an absolute masterpiece,"[quite, sure, actually, taking, time, read, pl..."


In [13]:
#stemming words to their base form , Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Lemmatize the words
df_train['text'] = df_train['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df_train.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"[im, reading, lot, review, saying, best, game,..."
1,2,Amazing!,"[soundtrack, favorite, music, time, hand, inte..."
2,2,Excellent Soundtrack,"[truly, like, soundtrack, enjoy, video, game, ..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","[youve, played, game, know, divine, music, eve..."
4,2,an absolute masterpiece,"[quite, sure, actually, taking, time, read, pl..."


In [44]:
#Done with preprocessing, and now need to save the file
df_train.to_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_train.csv', index=False)

In [52]:
df_test = pd.read_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/test.csv')
# Ensure columns are properly named
df_test.columns = ['polarity', 'title', 'text']

df_test.head()

Unnamed: 0,polarity,title,text
0,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
1,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
2,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
3,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
4,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...


In [53]:
#All these steps on the test data

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize objects
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))



# Lowercase the text
df_test['text'] = df_test['text'].str.lower()

# Remove punctuation
df_test['text'] = df_test['text'].str.translate(str.maketrans('', '', string.punctuation))

# Tokenize the text
df_test['text'] = df_test['text'].apply(word_tokenize)
# Remove stop words
df_test['text'] = df_test['text'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatize the words
df_test['text'] = df_test['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Inspect the result

df_test.head()

[nltk_data] Downloading package punkt to /Users/smirghor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/smirghor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,polarity,title,text
0,2,One of the best game music soundtracks - for a...,"[despite, fact, played, small, portion, game, ..."
1,1,Batteries died within a year ...,"[bought, charger, jul, 2003, worked, ok, desig..."
2,2,"works fine, but Maha Energy is better","[check, maha, energy, website, powerex, mhc204..."
3,2,Great for the non-audiophile,"[reviewed, quite, bit, combo, player, hesitant..."
4,1,DVD Player crapped out after one year,"[also, began, incorrect, disc, problem, ive, r..."


In [58]:
#save the test file for future use
df_test.to_csv('/Users/smirghor/Library/Mobile Documents/com~apple~CloudDocs/Personal/Machine Learning Projects/data.nosync/amazon_review_polarity_csv/preprocessed_reviews_test.csv',index=False )

Unnamed: 0,polarity,title,text
0,2,The best soundtrack ever to anything.,"['im', 'reading', 'lot', 'review', 'saying', '..."
1,2,Amazing!,"['soundtrack', 'favorite', 'music', 'time', 'h..."
2,2,Excellent Soundtrack,"['truly', 'like', 'soundtrack', 'enjoy', 'vide..."
3,2,"Remember, Pull Your Jaw Off The Floor After He...","['youve', 'played', 'game', 'know', 'divine', ..."
4,2,an absolute masterpiece,"['quite', 'sure', 'actually', 'taking', 'time'..."


Unnamed: 0,polarity,title,text
0,2,One of the best game music soundtracks - for a...,"['despite', 'fact', 'played', 'small', 'portio..."
1,1,Batteries died within a year ...,"['bought', 'charger', 'jul', '2003', 'worked',..."
2,2,"works fine, but Maha Energy is better","['check', 'maha', 'energy', 'website', 'powere..."
3,2,Great for the non-audiophile,"['reviewed', 'quite', 'bit', 'combo', 'player'..."
4,1,DVD Player crapped out after one year,"['also', 'began', 'incorrect', 'disc', 'proble..."


In [5]:
#Let's start with the TF-IDF model that is the advanced verion of BoW for the train data
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer with built-in stop words removal
vectorizer = TfidfVectorizer()

# Fit and transform the train data
X_train = vectorizer.fit_transform(df_train_preprocessed['text'].apply(lambda x: ' '.join(x)))

# Transform the test data (do not fit again)
X_test = vectorizer.transform(df_test_preprocessed['text'].apply(lambda x: ' '.join(x)))

ValueError: empty vocabulary; perhaps the documents only contain stop words

#### Step 3: Separate Features and Labels

In [4]:
y_train = df_train_preprocessed['polarity']
y_test = df_test_preprocessed['polarity']

NameError: name 'df_train_preprocessed' is not defined

#### Step 4: Train and evaluate model

In [5]:
#start with a simple model such as logistic regression to just classify the reviews and negative (1) or positive (2)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#initialze the model
model = LogisticRegression()

#train the model
model.fit(X_train, y_train)

#make predictions on the test set
y_pred = model.predict(X_test)

#evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

#print a detailed classification report
print(classification_report(y_test, y_pred))

NameError: name 'X_train' is not defined