Download Reviews dataset from https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews?resource=download

In [1]:
import pandas as pd
df = pd.read_csv('Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Classifying Tweets

In [2]:
# assign reviews with score > 3 as positive sentiment
# score < 3 negative sentiment
# remove score = 3
df = df[df['Score'] != 3]
df['sentiment'] = df['Score'].apply(lambda rating : +1 if rating > 3 else -1)

Data Cleaning

In [3]:
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"'))
    return final
df['Text'] = df['Text'].apply(remove_punctuation)
df = df.dropna(subset=['Summary'])
df['Summary'] = df['Summary'].apply(remove_punctuation)

Split the Dataframe

In [4]:
dfNew = df[['Summary','sentiment']]
dfNew.head(2)

Unnamed: 0,Summary,sentiment
0,Good Quality Dog Food,1
1,Not as Advertised,-1


In [5]:
import numpy as np

# random split train and test data
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

Create a bag of words

In [6]:
# count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Summary'])
test_matrix = vectorizer.transform(test['Summary'])

Import Logistic Regression

In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

Split target and independent variables

In [8]:
X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']

Fit model on data

In [None]:
lr.fit(X_train,y_train)

Make Predictions

In [10]:
predictions = lr.predict(X_test)

Testing

In [11]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)

array([[11602,  2327],
       [ 5861, 92128]])

In [12]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

          -1       0.66      0.83      0.74     13929
           1       0.98      0.94      0.96     97989

    accuracy                           0.93    111918
   macro avg       0.82      0.89      0.85    111918
weighted avg       0.94      0.93      0.93    111918



Save the vectorizer and model

In [13]:
import pickle

pickle.dump(vectorizer, open('vector.pkl', 'wb'))
pickle.dump(lr, open('model.pkl', 'wb'))