## Libraries

In [1]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import contractions

In [3]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

## Read Data

In [6]:
from google.colab import drive
drive.mount("/content/drive")
df=pd.read_table('/content/drive/My Drive/Colab Notebooks/amazon_reviews_us_Beauty_v1_00.tsv', on_bad_lines='skip', usecols=['review_body', 'star_rating'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
def avg_char_count():
  df['review_body_character_count'] = df['review_body'].str.len()
  return str(df['review_body_character_count'].mean())

In [8]:
df['review_body']=df['review_body'].apply(str)

In [9]:
df1 = df.query("star_rating == '1' | star_rating == '2'").sample(n=20000)
df1 = df1.assign(rating_class=1)

In [10]:
df2=df.query("star_rating == '3'").sample(n=20000)
df2 = df2.assign(rating_class=2)

In [11]:
df3=df.query("star_rating == '4' | star_rating == '5'").sample(n=20000)
df3 = df3.assign(rating_class=3)

In [12]:
df = pd.concat([df1[['star_rating', 'review_body', 'rating_class']], df2[['star_rating', 'review_body', 'rating_class']], df3[['star_rating', 'review_body', 'rating_class']]])

## Data Cleaning

In [13]:
before_cleaning_count = avg_char_count()

In [14]:
df['review_body'] = df['review_body'].str.lower()

In [15]:
def cleaning_html_url(string):
  string = re.sub('<.*?>','',string)
  string = re.sub('https?://\S+|www\.\S+','', string)
  return string
df['review_body']=df['review_body'].apply(cleaning_html_url)

In [16]:
def cleaning_non_alpha_spaces(string):
  string = re.sub(r'[^a-z0-9\s]', '', string)
  string = re.sub('\n', '', string)
  string = re.sub(r'\s+', ' ', string)
  string = re.sub('\w*\d\w*', '', string)
  return string
df['review_body']=df['review_body'].apply(cleaning_non_alpha_spaces)

In [17]:
df['review_body']=df['review_body'].str.strip()

In [18]:
df['review_body']=df['review_body'].apply(lambda s : contractions.fix(s))

In [19]:
after_cleaning_count = avg_char_count()

In [20]:
print(before_cleaning_count + ", " + after_cleaning_count)

189.75916666666666, 181.91503333333333


## Data Preprocessing

In [21]:
before_preprocessing_count = avg_char_count()

In [22]:
def stop_words(text):
  tokens = word_tokenize(text.lower())
  english_stopwords = stopwords.words('english')
  tokens_wo_stopwords = [t for t in tokens if t not in english_stopwords]
  return " ".join(tokens_wo_stopwords)
  
df['review_body'] = df['review_body'].apply(stop_words)


In [23]:
lemmatizer = WordNetLemmatizer()

In [24]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [25]:
def lemmatize_sentence(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence)) 
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [26]:
df['review_body'] = df['review_body'].apply(lemmatize_sentence)

In [27]:
after_perprocessing_count = avg_char_count()

In [28]:
print(before_preprocessing_count + ", " + after_perprocessing_count)

181.91503333333333, 106.35263333333333


## TF-IDF Feature Extraction

In [29]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review_body'])

In [30]:
vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaa', ..., 'zyloo', 'zz', 'zzzquil'], dtype=object)

In [31]:
X.shape

(60000, 35208)

In [32]:
Y = df['rating_class']

In [33]:
def printMatrix(matrix):
  print(str(matrix['1']['precision']) + ", " + str(matrix['1']['recall']) + ", " + str(matrix['1']['f1-score']))
  print(str(matrix['2']['precision']) + ", " + str(matrix['2']['recall']) + ", " + str(matrix['2']['f1-score']))
  print(str(matrix['3']['precision']) + ", " + str(matrix['3']['recall']) + ", " + str(matrix['3']['f1-score']))
  print(str(matrix['macro avg']['precision']) + ", " + str(matrix['macro avg']['recall']) + ", " + str(matrix['macro avg']['f1-score']))

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

## Perceptron

In [35]:
perceptron = Perceptron(n_jobs = -1, max_iter = 10000, random_state = 10)
fit_model = perceptron.fit(X_train,Y_train)
Y_pred = perceptron.predict(X_test)

In [36]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

0.5956229793583686, 0.6043401463537724, 0.5999498997995992
0.49376776732998034, 0.5625311410064773, 0.525911261208804
0.7087492660011744, 0.6000497141436739, 0.6498855835240275
0.5993800042298411, 0.5889736671679745, 0.5919155815108103


## SVM

In [37]:
svmClassifier = svm.LinearSVC(C = 0.01, multi_class="ovr", random_state = 10)
svmClassifier.fit(X_train, Y_train)
Y_pred = svmClassifier.predict(X_test)

In [38]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

0.6424043715846994, 0.7416098914963412, 0.6884516280159287
0.618469358938153, 0.5455904334828101, 0.5797485109199205
0.7518022657054583, 0.7258264976385782, 0.7385860629821677
0.6708919987427703, 0.6710089408725765, 0.6689287339726723


## Logistic Regression

In [39]:
logisticRegressor = LogisticRegression(C = 0.4, solver = 'sag', random_state = 10)
logisticRegressor.fit(X_train, Y_train)
Y_pred = logisticRegressor.predict(X_test)

In [40]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

0.6738197424892703, 0.7130961392884179, 0.6929018021331371
0.6035060582624387, 0.583208769307424, 0.5931838337767643
0.7619047619047619, 0.7437235893611732, 0.7527044025157232
0.6797435208854904, 0.680009499319005, 0.6795966794752083


## Multinomial Naive Bayes

In [41]:
NBClassifier = MultinomialNB()
NBClassifier.fit(X_train, Y_train)
Y_pred = NBClassifier.predict(X_test)

In [42]:
printMatrix(metrics.classification_report(Y_test, Y_pred, output_dict=True))

0.694369247428262, 0.6472369417108251, 0.6699751861042184
0.5667796610169491, 0.624813153961136, 0.5943832207607537
0.7505797474877609, 0.7240865026099925, 0.7370951417004048
0.6705762186443239, 0.6653788660939846, 0.6671511828551256
