In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
## Load the reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="Reviews.csv")

In [None]:
## Histogram of the raw review scores
import plotly.express as px

## update_layout / update_traces return the figure itself, so the calls can be chained
fig = (
    px.histogram(df, x="Score")
    .update_layout(title="Review Score")
    .update_traces(marker_line_width=1.5)
)
## Bare expression so the notebook renders the figure
fig

In [None]:
## Word cloud of the most frequent words across all review texts
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from wordcloud import WordCloud

## Build the stop-word set.
## The NLTK corpus reader is imported under an alias so that binding the name
## `stopwords` to a set no longer shadows it; the cell can be re-run without
## "AttributeError: 'set' object has no attribute 'words'".
try:
    english_stopwords = nltk_stopwords.words('english')
except LookupError:
    ## The corpus is not installed on this machine yet: fetch it once and retry
    nltk.download('stopwords')
    english_stopwords = nltk_stopwords.words('english')

stopwords = set(english_stopwords)
## "br" / "href" are presumably remnants of HTML markup in the reviews -- TODO confirm
stopwords.update(["br", "href"])

## Skip missing reviews: str.join raises TypeError on NaN (a float)
text_review = " ".join(df["Text"].dropna().astype(str))

## Generate the word cloud
wordcloud = WordCloud(stopwords=stopwords).generate(text_review)

## Render the cloud as an image and save it to disk
plt.imshow(wordcloud, interpolation="bilinear")
plt.savefig("wordcloud.png")

In [None]:
## Classify each review as positive or negative based on its score
## Score > 3 => Positive review = 1
## Score < 3 => Negative review = -1
## Rows with Score == 3 are removed

## The label is stored in a new column "Sentiment"
import numpy as np

## .copy() makes the filtered frame independent of the original, so adding a
## column below does not raise SettingWithCopyWarning
df = df[df["Score"] != 3].copy()

df["Sentiment"] = np.where(df["Score"] > 3, 1, -1)

## Independent copies: df_negative is modified by a later cell (dropna)
df_positive = df[df["Sentiment"] == 1].copy()
df_negative = df[df["Sentiment"] == -1].copy()

In [None]:
## Word cloud for positive sentiment

## "good" and "great" are added to the stop words as well.
## Note: this mutates the shared `stopwords` set, so every later word cloud
## built from it excludes these words too.
stopwords.update(["br", "href", "good", "great"])

## Skip missing summaries: str.join raises TypeError on NaN (a float)
pos_summary = " ".join(df_positive["Summary"].dropna().astype(str))

## Generate the word cloud
wordcloud = WordCloud(stopwords=stopwords).generate(pos_summary)

## Render the cloud as an image and save it to disk
plt.imshow(wordcloud, interpolation="bilinear")
plt.savefig("wordcloud-positive.png")

In [None]:
## Number of positive reviews with a missing Summary
df_positive["Summary"].isna().sum()

In [None]:
## Number of negative reviews with a missing Summary
df_negative["Summary"].isna().sum()

In [None]:
## Drop only the rows whose Summary is missing: a bare dropna() would also
## discard reviews that merely lack a value in some other column.
## Rebinding (instead of inplace=True) avoids mutating a frame that was
## obtained by filtering `df`, which triggers SettingWithCopyWarning.
df_negative = df_negative.dropna(subset=["Summary"])

In [None]:
## Re-check: missing Summary values remaining in the negative reviews
df_negative["Summary"].isna().sum()

In [None]:
## Word cloud for negative sentiment
neg_summary = " ".join(df_negative["Summary"])

## Generate the word cloud using the shared stop-word set
## (generate() returns the WordCloud instance itself)
wordcloud = WordCloud(stopwords=stopwords)
wordcloud = wordcloud.generate(neg_summary)

## Render the cloud as an image and save it to disk
plt.imshow(wordcloud, interpolation="bilinear")
plt.savefig(fname="wordcloud-negative.png")

In [None]:
## Plot how many reviews fall into each sentiment class

## assign() returns a new frame with the extra column instead of writing into
## `df`, which is a filtered selection (in-place column assignment on it
## raises SettingWithCopyWarning)
df = df.assign(
    Sentiment_Text=np.where(df["Sentiment"] == 1, "Positive", "Negative")
)
fig = px.histogram(df, x="Sentiment_Text")
fig.update_layout(title="Sentiment Score")

In [None]:
## Building the model
## Only "Summary" and "Sentiment" are used for the analysis

## Explicit copy: df1 is modified in later cells (dropna, punctuation removal),
## and doing that on a plain selection of `df` raises SettingWithCopyWarning
df1 = df[["Summary", "Sentiment"]].copy()
df1.head()

In [None]:
## Missing values per column of the modelling frame
df1.isna().sum()

In [None]:
## Remove rows with missing values. Rebinding the name instead of using
## inplace=True avoids mutating a frame derived from `df` (SettingWithCopyWarning)
df1 = df1.dropna()

In [None]:
## cleaning the data

## remove punctuations

import string

## Translation table that deletes every character in string.punctuation.
## Built once here rather than on every call, since the function is applied
## to each row of the Summary column.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punc(text: str) -> str:
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without ASCII punctuation; all other characters
        (including whitespace) are left unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)
## Strip punctuation from every summary, then preview the cleaned frame
df1["Summary"] = df1["Summary"].apply(func=remove_punc)
df1.head()

In [None]:
## Split the data into training (80%) and testing (20%) sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df1["Summary"],
    df1["Sentiment"],
    train_size=.8,
    ## Fixed seed: without it the split, and therefore every metric reported
    ## below, changes on each run of the notebook
    random_state=42,
)

In [None]:
## Turn the summaries into a bag-of-words representation.
## Logistic regression cannot work on raw text, so each summary is converted
## into a vector of word counts.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

## The vocabulary is learned from the training split only ...
x_train_matrix = vectorizer.fit_transform(x_train)

## ... and then reused, unchanged, to encode the test split
x_test_matrix = vectorizer.transform(x_test)

In [None]:
## Logistic regression
from sklearn.linear_model import LogisticRegression

## The default max_iter=100 is typically too low for the solver on a large,
## unscaled bag-of-words matrix: training stops early with a ConvergenceWarning
## and the coefficients are not fully fitted. Allow more iterations.
logit = LogisticRegression(max_iter=1000)
logit.fit(x_train_matrix, y_train)

## Predicted sentiment for the held-out test summaries
y_pred = logit.predict(x_test_matrix)

In [None]:
## Evaluate the predictions against the true test labels
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Confusion matrix of true vs. predicted labels
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

## Overall accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score", acc)

## Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print(report)