In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1-Imports

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , LSTM , Embedding
from keras.models import Sequential
from keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

import pickle
import plotly.express as px

# 2-Reading the Dataframe

In [8]:
# Load the reviews CSV into a DataFrame.
REVIEWS_CSV = '../input/amazon-fine-food-reviews/Reviews.csv'
data = pd.read_csv(REVIEWS_CSV)

# 3-Exploratory Data Analysis (EDA)

In [9]:
# Print (rows, columns), then preview the first five rows.
print(data.shape)
data.head(n=5)

In [10]:
# Histogram of the `Score` column, built as one chained plotly expression.
fig = (
    px.histogram(data, x="Score")
    .update_traces(
        marker_color="turquoise",
        marker_line_color='rgb(8,48,107)',
        marker_line_width=1.5,
    )
    .update_layout(title_text='Product Score')
)
fig.show()

## Check null values


In [11]:
# Number of missing entries per column, shown as a one-column frame.
data.isnull().sum().rename('missing values').to_frame()

## Removing nulls

In [12]:
# Drop every row that has at least one missing value and report how many were removed.
total_rows = len(data)
data.dropna(axis=0, how='any', inplace=True)
remaining_rows = len(data)

removed_rows = total_rows - remaining_rows
print("No. of rows removed :", removed_rows)

## Removing Duplication

In [13]:
# Rows sharing both Score and Text count as duplicates; only the first occurrence is kept.
a = len(data)
data.drop_duplicates(subset=['Score', 'Text'], keep='first', inplace=True)
b = len(data)

print("No. of rows removed :", a - b)

## Removing outliers (rows where HelpfulnessNumerator exceeds HelpfulnessDenominator)

In [14]:
a = len(data)

# Drop rows whose HelpfulnessNumerator is larger than HelpfulnessDenominator
# (presumably impossible vote counts - confirm against the dataset description).
is_invalid = data["HelpfulnessNumerator"] > data["HelpfulnessDenominator"]
idx = data.loc[is_invalid].index
data.drop(index=idx, inplace=True)

b = len(data)
print("No. of rows removed :", a - b)

## Create target column


In [15]:
## `Score` > 3 : "Positive" 
## `Score` == 3 : "Neutral"
## `Score` < 3 : "Negative"

def create_target(x):
    """Map a numeric score to a sentiment label.

    Scores above 3 are "Positive", scores below 3 are "Negative", and
    anything else (including exactly 3) is "Neutral".
    """
    if x > 3:
        return "Positive"
    if x < 3:
        return "Negative"
    return "Neutral"

# Derive the sentiment label of every review from its score.
data['target'] = data['Score'].apply(create_target)

# Spot-check the mapping on five random rows.
data[['Score', 'target']].sample(n=5)

# 4-Pre-processing

## Handling imbalance


In [16]:
# Bar chart of how many reviews fall into each target label.
fig, ax = plt.subplots(figsize=(16, 6))

vc = data['target'].value_counts()
vc.plot(kind='bar', ax=ax, color="lightblue", fontsize=14)
ax.set_title("Label vs Count", fontsize=15)
plt.show()

## Downsampling

In [17]:
# Balance the classes: keep every Neutral review (29770 rows according to the
# original author's note) and draw a fixed-size random sample from each of the
# Positive and Negative classes.
SAMPLE_SIZE = 50000
# Fixed seed so that re-running the notebook selects the same rows; without it
# every run trains and evaluates on a different subset.
SAMPLE_SEED = 1

neutral = data.loc[data.target == "Neutral"]

positive = data.loc[data.target == "Positive"].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

negative = data.loc[data.target == "Negative"].sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

data = pd.concat([positive, negative, neutral])
data.shape

In [18]:
# Same label-count bar chart, redrawn after the sampling step.
fig, ax = plt.subplots(figsize=(16, 6))

vc = data['target'].value_counts()
vc.plot(kind='bar', ax=ax, color="lightblue", fontsize=14)
ax.set_title("Label vs Count", fontsize=15)
plt.show()

## Stopwords

In [19]:
total_stopwords = set(stopwords.words('english'))

# Negation words (no, not, don't, ...) are taken out of the stop-word set so
# they stay in the review text.
# Match whole negation words or the "n't" contraction only: a substring test
# such as `'no' in word` would also exempt unrelated stop words that merely
# contain "no" (e.g. "now").
NEGATION_WORDS = {"no", "nor", "not"}
negative_stop_words = {word for word in total_stopwords
                       if word in NEGATION_WORDS or "n't" in word}

final_stopwords = total_stopwords - negative_stop_words

# "one" is additionally treated as a stop word.
final_stopwords.add("one")
print(final_stopwords)

## Stemming

In [20]:
# Shared helpers used by the review preprocessor.
stemmer = PorterStemmer()
print(stemmer)

# Non-greedy match of anything between '<' and '>'.
HTMLTAGS = re.compile('<.*?>')

# Translation table mapping every punctuation character to None (i.e. delete it).
table = str.maketrans('', '', string.punctuation)
print(table)

# Translation table mapping every digit to None.
remove_digits = str.maketrans(dict.fromkeys(string.digits))
print(remove_digits)

# One or more consecutive whitespace characters.
MULTIPLE_WHITESPACE = re.compile(r"\s+")

## Clean Reviews

In [21]:
def preprocessor(review):
    """Clean one raw review string and return it as space-joined stemmed tokens.

    Steps, in order: strip HTML tags, delete punctuation, delete digits,
    lower-case, collapse whitespace, drop words found in `final_stopwords`,
    and stem the remaining words with `stemmer`.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review; empty if no word survives the filtering.
    """
    # Replace HTML tags with a space rather than nothing, so the words on
    # either side of a tag such as "<br />" are not glued into one token.
    # The extra spaces are collapsed by the whitespace step below.
    review = HTMLTAGS.sub(' ', review)

    # remove punctuation
    review = review.translate(table)

    # remove digits
    review = review.translate(remove_digits)

    # lower case all letters
    review = review.lower()

    # replace multiple white spaces with single space
    review = MULTIPLE_WHITESPACE.sub(" ", review).strip()

    # remove stop words
    words = [word for word in review.split()
             if word not in final_stopwords]

    # stemming
    return ' '.join(stemmer.stem(word) for word in words)

In [22]:
# Show the third review as it looks before cleaning.
print("Before pre-processing : ")
data['Text'].iloc[2]

In [23]:
# Overwrite the review text with its preprocessed form, then show the same sample review.
data['Text'] = data['Text'].apply(preprocessor)
print("After pre-processing : ")
data['Text'].iloc[2]

## WordCloud

In [24]:
def generate_wcloud(text):
    """Render a word cloud for `text` on a white background.

    The words in wordcloud's `STOPWORDS` are excluded from the cloud. The
    figure is drawn without axes, and the value returned by `plt.show()` is
    passed back to the caller.
    """
    cloud = WordCloud(background_color='white', stopwords=set(STOPWORDS))
    cloud.generate(text)

    plt.figure(figsize=(15, 7))
    plt.axis('off')
    plt.imshow(cloud, interpolation='bilinear')

    shown = plt.show()
    return shown

In [25]:
# Word cloud built from all reviews labelled Positive.
pos = data.loc[data.target == "Positive", "Text"]
text = " ".join(pos.astype(str))

generate_wcloud(text)

In [26]:
# Word cloud built from all reviews labelled Negative.
neg = data.loc[data.target == "Negative", "Text"]
text = " ".join(neg.astype(str))

generate_wcloud(text)

In [27]:
# Word cloud built from all reviews labelled Neutral.
nat = data.loc[data.target == "Neutral", "Text"]
text = " ".join(nat.astype(str))

generate_wcloud(text)

# 5-Modeling

In [28]:
# Features are the review texts; the target is the sentiment label.
X, y = data['Text'], data['target']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [29]:
# Shapes of the training and test feature sets.
(X_train.shape, X_test.shape)

## Vectorization

#### Bag of Words Vectorizer


In [30]:
# Bag-of-words counts limited to 10000 features; the vocabulary is fitted on
# the training texts only, then reused to encode the test texts.
bow_vectorizer = CountVectorizer(max_features=10000)
bow_X_train = bow_vectorizer.fit_transform(X_train)
bow_X_test = bow_vectorizer.transform(X_test)

#### TF-IDF

In [31]:
# TF-IDF features limited to 10000 features; fitted on the training texts
# only, then reused to encode the test texts.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_X_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

## Label Encoder


In [32]:
# Fit the encoder on the training labels, then encode both splits with it.
labelEncoder = LabelEncoder()
labelEncoder.fit(y_train)

y_train = labelEncoder.transform(y_train)
y_test = labelEncoder.transform(y_test)

# Position in this list == encoded class index.
labels = labelEncoder.classes_.tolist()
print(labels)

## Training

In [33]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit `model` on the training split and print train and test accuracy.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)`
        Estimator to train; the object returned by `fit` is used for prediction.
    trainX, trainY
        Training features and their labels.
    testX, testY
        Held-out features and their labels.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # training
    model = model.fit(trainX, trainY)

    # predictions
    y_preds_train = model.predict(trainX)
    y_preds_test = model.predict(testX)

    # evaluation - score against the labels passed in as arguments, so the
    # function does not depend on notebook-level y_train / y_test variables.
    print()
    print(model)
    print(f"Train accuracy score : {accuracy_score(trainY, y_preds_train)}")
    print(f"Test accuracy score : {accuracy_score(testY, y_preds_test)}")

## Logistic Regression with bag of words

In [34]:
# Candidate C values noted by the author: [0.001, 0.01, 0.1, 0.3]; C=0.1 is used here.
log_model = LogisticRegression(C=0.1, max_iter=500, random_state=1)

# Train and score on the bag-of-words features.
train_and_eval(log_model, bow_X_train, y_train, bow_X_test, y_test)

## Logistic Regression with Tf-Idf


In [35]:
# Candidate C values noted by the author: [0.001, 0.01, 0.1, 1]; C=1 is used here.
log_model = LogisticRegression(C=1, max_iter=500, random_state=1)

# Train and score on the TF-IDF features.
train_and_eval(log_model, tfidf_X_train, y_train, tfidf_X_test, y_test)

# So TF-IDF gives the best accuracy

## Prediction on a review


In [36]:
# labels = ['Negative', 'Neutral', 'Positive']
def get_sentiment(review):
    """Return the predicted sentiment label for one raw review string.

    The review is cleaned with `preprocessor`, vectorized with
    `tfidf_vectorizer`, and classified by `log_model`; the predicted class
    index is then looked up in `labels`.
    """
    # preprocessing
    cleaned = preprocessor(review)
    # vectorization - the single review is wrapped in a list, so the result
    # has exactly one row and needs no reshape
    features = tfidf_vectorizer.transform([cleaned])
    # prediction - take the first (only) prediction explicitly; calling int()
    # on the whole prediction array is deprecated in recent NumPy releases
    class_index = int(log_model.predict(features)[0])
    return labels[class_index]

# Testing

In [37]:
# Positive review example.
review = "I like this product so much it is really good one!"
sentiment = get_sentiment(review)
print(f"This is a {sentiment} review!")

In [38]:
# Negative review example.
review = "so awful product that I really don't recommend it ever again"
sentiment = get_sentiment(review)
print(f"This is a {sentiment} review!")