In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the dataset CSV is
# later read from a path underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=False)

In [None]:
#importing the required library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import nltk

In [None]:
# Load the reviews CSV from the mounted Drive and discard its first column.
df = (
    pd.read_csv('/content/gdrive/MyDrive/DataSet/Review.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[:1]))
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Combine the review title and the review body into a single text column.
# Missing titles/bodies are replaced by empty strings first: concatenating a
# string with NaN yields NaN, and a NaN (float) entry is not valid input for
# the later text-only steps (sentiment scoring, tokenizing, " ".join(...)).
df['Review_combine'] = df['Review_Title'].fillna('') + "." + df['Review'].fillna('')

# NLTK for sentiment analysis

In [None]:
# Fetch the resources needed by the tokenizer and the POS tagger used below.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Take the review with index label 5 as a worked example, split it into
# word tokens and attach a part-of-speech tag to each token.
ex = df['Review_combine'].loc[5]
token = nltk.word_tokenize(ex)
tagged = nltk.pos_tag(token)

In [None]:
# Fetch the resources needed by NLTK's named-entity chunker.
for nltk_resource in ('maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)

# Group the POS-tagged example tokens into named-entity chunks and print the tree.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

## 1. VADER Sentiment Scoring

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# Download the VADER lexicon resource before constructing the analyzer.
nltk.download('vader_lexicon')

# Shared analyzer instance; later cells call sia.polarity_scores(...) on it.
sia = SentimentIntensityAnalyzer()

In [None]:
# Score the single example review `ex` as a quick check of the analyzer.
sia.polarity_scores(ex)

In [None]:
# Run the polarity score on the entire dataset.
# `result` maps each row's index label to the score dict of its combined text.
result = {
    row_label: sia.polarity_scores(review_text)
    for row_label, review_text in df['Review_combine'].items()
}

In [None]:
# Turn the per-row score dicts into a dataframe (one row per review, one
# column per score) and attach it to the original dataframe by index.
vaders = pd.DataFrame(result).transpose()
df = df.merge(vaders, left_index=True, right_index=True)

# 2. Roberta Sentiment Analysis

In [None]:
# Install the `transformers` package before running this cell.
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face model id; both the tokenizer and the classifier are loaded from it.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the sequence-classification model.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated instead of failing inside the model call. The default of
        512 is presumably the input limit of this RoBERTa checkpoint --
        TODO confirm against ``model.config``.

    Returns
    -------
    dict
        Keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``
        holding the softmax of the first, second and third logit of the
        model output, respectively.
    """
    # Truncate explicitly: an over-long input otherwise makes the forward
    # pass raise, and the caller then skips that review entirely.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single example in the batch.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [None]:
# Score every combined review with the RoBERTa model, keyed by row index label.
# Rows whose scoring raises RuntimeError are reported and left out of `res`.
res = {}
for row_label, review_text in df['Review_combine'].items():
    try:
        res[row_label] = polarity_scores_roberta(review_text)
    except RuntimeError:
        print(f'Broke for id {row_label}')

In [None]:
# One row per scored review, one column per RoBERTa score.
roberta = pd.DataFrame(res).transpose()
# Index-on-index merge with the default join type (inner): reviews that have
# no entry in `res` are not carried over into `df`.
df = df.merge(roberta, left_index=True, right_index=True)

In [None]:
# Preview the frame after the score columns have been merged in.
df.head()

## 3. By Transformer Pipeline

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default, so
# the results do not change if the library's default model changes.
# NOTE(review): this is the checkpoint the library reports selecting for
# "sentiment-analysis" when none is given; it emits the POSITIVE / NEGATIVE
# labels that later cells filter on -- confirm for the installed version.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Classify every combined review with the sentiment pipeline, keyed by row
# index label. Each value is the pipeline's return value for that text.
# truncation=True is forwarded to the pipeline's tokenizer so that reviews
# longer than the model's input limit are cut rather than raising
# RuntimeError; a row that raises is reported and missing from `res`.
res = {}
for row_label, review_text in df['Review_combine'].items():
    try:
        pipeline_result = sent_pipeline(review_text, truncation=True)
        res[row_label] = pipeline_result
    except RuntimeError:
        print(f'Broke for id {row_label}')

In [None]:
# Unpack the pipeline output: each entry of `res` is a list whose first
# element is a dict with 'label' and 'score'.
l1 = [predictions[0]['label'] for predictions in res.values()]
l2 = [predictions[0]['score'] for predictions in res.values()]

# Attach the values by row index label rather than by position. A plain list
# assignment raises ValueError as soon as `res` has fewer entries than `df`
# (any row skipped while scoring); index alignment leaves NaN for such rows
# and keeps every other label on the correct review.
scored_rows = list(res.keys())
df['Label'] = pd.Series(l1, index=scored_rows, dtype=object)
df['Score'] = pd.Series(l2, index=scored_rows, dtype=float)

# 4. Working on Word Tree

4.1 Word Cloud for Positive Reviews

In [None]:
# Keep only the reviews labelled POSITIVE and concatenate their combined
# texts into one space-separated string.
df_pos = df.loc[df['Label'].eq('POSITIVE')]
total_text = " ".join(df_pos['Review_combine'])

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# stopwords.words(...) needs the 'stopwords' corpus to be present locally;
# download it here so the cell also works on a fresh runtime.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(total_text)

# Remove stop words case-insensitively: the stop-word list is lower-case, so
# comparing the raw token would let capitalised forms ("The", "I", ...) through.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

total_filtered_text = " ".join(filtered_sentence)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Use a dedicated name for WordCloud's stop-word set: binding it to
# `stopwords` would shadow the nltk.corpus `stopwords` module that other
# cells import under the same name.
wc_stopwords = set(STOPWORDS)

# Render the word cloud from the filtered text.
wordcloud = WordCloud(width=1000, height=500, stopwords=wc_stopwords).generate(total_filtered_text)
plt.figure(figsize=(20,15))
plt.imshow(wordcloud)
plt.axis("off")
# End on plt.show() so the cell displays the figure instead of echoing the
# return value of plt.axis(...).
plt.show()

In [None]:
# Print the 20 POSITIVE reviews with the highest pipeline score.
df_pos_sort = df_pos.sort_values(by="Score", ascending=False)
# The variable name says 10, but it holds the first 20 reviews.
top_10_pos_review = df_pos_sort['Review_combine'].iloc[:20]
for review_text in top_10_pos_review:
    print(review_text)

In [None]:
# Plotting Bar Graph for Word Frequency
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Keep this tokenizer under its own name: the global `tokenizer` is the
# Hugging Face tokenizer that polarity_scores_roberta() reads, and rebinding
# it here would break any later call to that function.
word_tokenizer = RegexpTokenizer(r'\w+')

# Keep only runs of word characters (drops punctuation-only tokens).
filter_list = word_tokenizer.tokenize(total_filtered_text)
total_filtered_text = " ".join(filter_list)

# Count the tokens directly; they contain no whitespace, so this equals
# counting total_filtered_text.split().
counter = Counter(filter_list)
top_words = counter.most_common(50)

# Reverse the ranking so the most frequent word is passed to barh last.
lan = [word for word, count in reversed(top_words)]
feq = [count for word, count in reversed(top_words)]

plt.figure(figsize=(15,25))
plt.barh(lan, feq)
plt.title('Frequency of Words appear in Reviews')
plt.xlabel('Frequency')
plt.show()

4.2 Word Cloud for Negative Reviews

In [None]:
# Keep only the reviews labelled NEGATIVE and concatenate their combined
# texts into one space-separated string.
df_neg = df.loc[df['Label'].eq('NEGATIVE')]
total_text = " ".join(df_neg['Review_combine'])

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# stopwords.words(...) needs the 'stopwords' corpus to be present locally;
# download it here so the cell also works on a fresh runtime.
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(total_text)

# Remove stop words case-insensitively: the stop-word list is lower-case, so
# comparing the raw token would let capitalised forms ("The", "I", ...) through.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

total_filtered_text = " ".join(filtered_sentence)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Use a dedicated name for WordCloud's stop-word set: binding it to
# `stopwords` would shadow the nltk.corpus `stopwords` module that other
# cells import under the same name.
wc_stopwords = set(STOPWORDS)

# Render the word cloud from the filtered text.
wordcloud = WordCloud(width=1000, height=500, stopwords=wc_stopwords).generate(total_filtered_text)
plt.figure(figsize=(20,15))
plt.imshow(wordcloud)
plt.axis("off")
# End on plt.show() so the cell displays the figure instead of echoing the
# return value of plt.axis(...).
plt.show()

In [None]:
# Print the 20 NEGATIVE reviews with the highest pipeline score.
df_neg_sort = df_neg.sort_values(by="Score", ascending=False)
# The variable name says 10, but it holds the first 20 reviews.
top_10_neg_review = df_neg_sort['Review_combine'].iloc[:20]
for review_text in top_10_neg_review:
    print(review_text)

In [None]:
# Plotting Bar Graph for Word Frequency
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Keep this tokenizer under its own name: the global `tokenizer` is the
# Hugging Face tokenizer that polarity_scores_roberta() reads, and rebinding
# it here would break any later call to that function.
word_tokenizer = RegexpTokenizer(r'\w+')

# Keep only runs of word characters (drops punctuation-only tokens).
filter_list = word_tokenizer.tokenize(total_filtered_text)
total_filtered_text = " ".join(filter_list)

# Count the tokens directly; they contain no whitespace, so this equals
# counting total_filtered_text.split().
counter = Counter(filter_list)
top_words = counter.most_common(50)

# Reverse the ranking so the most frequent word is passed to barh last.
lan = [word for word, count in reversed(top_words)]
feq = [count for word, count in reversed(top_words)]

plt.figure(figsize=(15,25))
plt.barh(lan, feq)
plt.title('Frequency of Words appear in Reviews')
plt.xlabel('Frequency')
plt.show()