In [1]:
import pandas as pd
import numpy as np
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def delete_redundant_cols(df, cols):
    
    for col in cols:
        del df[col]
    return df

In [4]:
def preprocess_review_text(review):
    
    review = review.lower()
    
    review = re.sub(r"http\S+|www\S+|https\S+","", review, flags=re.MULTILINE)
    
    review = review.translate(str.maketrans("","", string.punctuation))
    
    review = re.sub(r'\@\w+|\#', "",review)
    
    review_tokens = word_tokenize(review)
    filtered_words = [word for word in review_tokens if word not in stop_words]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(lemma_words)

preprocess_review_text("Hi there, How are you preparing for your exams?")

'hi prepar exam'

In [5]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector


In [6]:
df=pd.read_csv("datasets/amazon_fine_food_review.csv", encoding='latin-1')
len(df["Text"])

568454

In [7]:
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [8]:
review_length = 400    
df_filtered = df[df["Text"].map(len) < review_length] 

In [9]:
len(df_filtered["Text"])

360149

In [10]:
redundant_cols=['Id', 'ProductId', 'UserId', 'ProfileName', 'Time']

df_filtered2=delete_redundant_cols(df_filtered, redundant_cols)

In [11]:
df_filtered2.columns

Index(['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Summary',
       'Text'],
      dtype='object')

In [12]:
df_filtered2["Score"].unique()

array([5, 1, 2, 4, 3], dtype=int64)

In [13]:
def int_to_string(sentiment):
    if sentiment == 1:
        return "Highly Negative"
    elif sentiment == 2:
        return "Somewhat Negative"
    elif sentiment == 3:
        return "Neutral"
    elif sentiment == 4:
        return "Somewhat Positive"
    else:
        return "Highly Positive"

In [14]:
df_filtered2.Text = df_filtered2["Text"].apply(preprocess_review_text)

In [15]:
df_filtered2.Text

0         bought sever vital can dog food product found ...
1         product arriv label jumbo salt peanutsth peanu...
3         look secret ingredi robitussin believ found go...
4         great taffi great price wide assort yummi taff...
6         saltwat taffi great flavor soft chewi candi in...
                                ...                        
568449    great sesam chickenthi good good restur eaten ...
568450    im disappoint flavor chocol note especi weak m...
568451    star small give 1015 one train session tri tra...
568452    best treat train reward dog good groom low cal...
568453    satisfi product advertis use cereal raw vinega...
Name: Text, Length: 360149, dtype: object

In [16]:
df_filtered2

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text
0,1,1,5,Good Quality Dog Food,bought sever vital can dog food product found ...
1,0,0,1,Not as Advertised,product arriv label jumbo salt peanutsth peanu...
3,3,3,2,Cough Medicine,look secret ingredi robitussin believ found go...
4,0,0,5,Great taffy,great taffi great price wide assort yummi taff...
6,0,0,5,Great! Just as good as the expensive brands!,saltwat taffi great flavor soft chewi candi in...
...,...,...,...,...,...
568449,0,0,5,Will not do without,great sesam chickenthi good good restur eaten ...
568450,0,0,2,disappointed,im disappoint flavor chocol note especi weak m...
568451,2,2,5,Perfect for our maltipoo,star small give 1015 one train session tri tra...
568452,1,1,5,Favorite Training and reward treat,best treat train reward dog good groom low cal...


In [17]:
df_filtered2.iloc[:, 2]

0         5
1         1
3         2
4         5
6         5
         ..
568449    5
568450    2
568451    5
568452    5
568453    5
Name: Score, Length: 360149, dtype: int64

In [18]:
tf_vector = get_feature_vector(np.array(df_filtered2.iloc[:, 4]).ravel())
X = tf_vector.transform(np.array(df_filtered2.iloc[:, 4]).ravel())
y = np.array(df_filtered2.iloc[:, 2]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [19]:
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.7648202137998056


In [20]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.6875190892683604
