In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

plt.style.use('ggplot')

import nltk

In [None]:
# Load the Udemy review workbook into a DataFrame.
# NOTE: the path is hard-coded to a Kaggle input location; adjust it when running elsewhere.
df = pd.read_excel('/kaggle/input/udemy-review/Udemy review.xlsx')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
print(df.shape)

In [None]:
# Column dtypes and non-null counts of the frame as loaded.
df.info()

## Cleaning Data

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Build a new frame without the review title column (the original `df` is left untouched).
df1 = df.drop(columns=['review_title'])

In [None]:
# Re-count missing values per column now that review_title is gone.
df1.isnull().sum()

In [None]:
# Column dtypes and non-null counts after dropping review_title.
df1.info()

In [None]:
# Keep only rows with no missing value in any remaining column.
# NOTE: this rebinds `df`, so every later cell sees the cleaned frame, not the raw load.
df = df1.dropna()

In [None]:
# Inspect dtypes and non-null counts of the cleaned frame.
df.info()

### Quick EDA

In [None]:
# Bar chart of how many reviews fall on each rating value, ordered by rating.
rating_counts = df['review_rating'].value_counts().sort_index()
ax = rating_counts.plot(kind='bar', title='Count of Reviews', figsize=(10, 5))
ax.set_xlabel('Review Stars')

## VADER Sentiment Scoring
This uses a "bag of words" approach:


1. Stop words are removed.

2. Each word is scored, and the scores are combined into a total score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# SentimentIntensityAnalyzer loads the VADER lexicon from the NLTK data directory and
# raises LookupError when it is missing. Fetch it only if it is not already installed,
# so the notebook also runs on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a single, clearly positive sentence.
sia.polarity_scores('The course was excellent')

In [None]:
#Run the polarity score on the entire dataset 
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['review_content']
    course_id = row['course_id']
    res[course_id] = sia.polarity_scores(text)

In [None]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index':'course_id'})
vaders = vaders.merge(df, how='left')

In [None]:
# Display the VADER scores alongside the review columns.
vaders

## Plot VADER results

In [None]:
# VADER compound score aggregated per review rating.
ax = sns.barplot(x='review_rating', y='compound', data=vaders)
ax.set(
    title='Compound Score by Udemy Review Rating',
    xlabel='Review Rating',
    ylabel='Compound Score',
)
plt.show()

In [None]:
# VADER positive score aggregated per review rating.
ax = sns.barplot(x='review_rating', y='pos', data=vaders)
ax.set(
    xlabel='Review Rating',
    ylabel='Positive Rating',
    title='Positive Score by Udemy Review Rating',
)
plt.show()

In [None]:
# VADER neutral score aggregated per review rating.
ax = sns.barplot(x='review_rating', y='neu', data=vaders)
ax.set(
    xlabel='Review Rating',
    ylabel='Neutral Rating',
    title='Neutral Score by Udemy Review Rating',
)
plt.show()

In [None]:
# VADER negative score aggregated per review rating.
# Labels spell "Negative" correctly and the title/`plt.show()` match the sibling plots.
ax = sns.barplot(data=vaders, x='review_rating', y='neg')
ax.set_xlabel('Review Rating')
ax.set_ylabel('Negative Rating')
ax.set_title('Negative Score by Udemy Review Rating')
plt.show()

## All in One Plot: Positive, Neutral and Negative Reviews

In [None]:
# Side-by-side comparison of the three VADER component scores by review rating.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# (score column, human-readable label) for each panel, left to right.
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for ax, (column, label) in zip(axs, panels):
    sns.barplot(data=vaders, x='review_rating', y=column, ax=ax)
    ax.set_xlabel('Review Rating')
    ax.set_ylabel(f'{label} Rating')
    ax.set_title(f'{label} Score by Udemy Review Rating')

plt.tight_layout()
plt.show()

## Roberta Pretrained Model

* Use a model trained on a large corpus of data
* Transformer model accounts for the words but also the context related to other words

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint identifier; the tokenizer and the classifier are loaded from the same one
# so that token ids match what the model expects.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the RoBERTa sentiment model.

    Args:
        example: Text to classify.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated. Defaults to 512.

    Returns:
        dict with keys ``roberta_neg``, ``roberta_neu`` and ``roberta_pos``,
        holding the softmax of the model's output logits at positions 0, 1
        and 2 respectively.
    """
    # Truncate explicitly: without a length cap, a review longer than the model's
    # position embeddings makes the forward pass raise RuntimeError.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single example in the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [None]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
        text = row['review_content']
        myid = row['course_id']
        roberta_result = polarity_scores_roberta(text)
        res[myid] = roberta_result

In [None]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index':'course_id'})
results_df = results_df.merge(df, how = 'left')
results_df

In [None]:
# Column dtypes and non-null counts of the RoBERTa results frame.
results_df.info()

## Plot RoBERTa Results

In [None]:
# RoBERTa neutral score aggregated per review rating.
ax = sns.barplot(x='review_rating', y='roberta_neu', data=results_df)
ax.set(
    xlabel='Review Rating',
    ylabel='Neutral Rating',
    title='Neutral Score by Udemy Review Rating',
)
plt.show()

In [None]:
# RoBERTa positive score aggregated per review rating.
ax = sns.barplot(x='review_rating', y='roberta_pos', data=results_df)
ax.set(
    xlabel='Review Rating',
    ylabel='Positive Rating',
    title='Positive Score by Udemy Review Rating',
)
plt.show()

In [None]:
# RoBERTa negative score aggregated per review rating ("Negative" spelled correctly in labels).
ax = sns.barplot(data=results_df, x='review_rating', y='roberta_neg')
ax.set_xlabel('Review Rating')
ax.set_ylabel('Negative Rating')
ax.set_title('Negative Score by Udemy Review Rating')
plt.show()

## All plots in One 

In [None]:
# Side-by-side comparison of the three RoBERTa scores by review rating.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# (score column, human-readable label) for each panel, left to right.
panels = [
    ('roberta_neu', 'Neutral'),
    ('roberta_pos', 'Positive'),
    ('roberta_neg', 'Negative'),
]
for ax, (column, label) in zip(axs, panels):
    sns.barplot(data=results_df, x='review_rating', y=column, ax=ax)
    ax.set_xlabel('Review Rating')
    ax.set_ylabel(f'{label} Rating')
    ax.set_title(f'{label} Score by Udemy Review Rating')

plt.tight_layout()
plt.show()

In [None]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['review_content']
        myid = row['course_id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id {myid}')

In [None]:
result_df = pd.DataFrame(res).T
result_df = result_df.reset_index().rename(columns={'index': 'course_id'})
result_df = result_df.merge(df, how='left')
result_df

## Compare Scores Between Models

In [None]:
# Pairwise comparison of VADER and RoBERTa scores, coloured by review rating.
# `result_df` (built from both models) is the frame that holds the `vader_*` columns;
# `results_df` only has the RoBERTa scores. The rating column in this dataset is
# `review_rating`.
sns.pairplot(data=result_df,
             vars=['vader_neg', 'vader_neu', 'vader_pos',
                   'roberta_neg', 'roberta_neu', 'roberta_pos'],
             hue='review_rating',
             palette='tab10')
plt.show()

## Word Cloud Representation

In [None]:
# Reload the workbook into a separate frame for the word-cloud section.
# NOTE(review): pandas/numpy are already imported in the first cell, and this is the same
# file that was read into `df` at the top of the notebook.
import pandas as pd
import numpy as np 

df_text = pd.read_excel('/kaggle/input/udemy-review/Udemy review.xlsx')
df_text  # bare expression in the middle of a cell: only a cell's last expression is displayed
df_text.info()

In [None]:
# Build a copy without the review title column and show it as the cell output.
df_text1 = df_text.drop(columns=['review_title'])
df_text1

In [None]:
# dropna() returns a new frame and leaves the original untouched, so the result has to
# be assigned for the rows to actually be removed before inspecting the frame.
df_text1 = df_text1.dropna()
df_text1.info()

In [None]:
# Select the review text column that feeds the word cloud.
text=df_text1['review_content']

In [None]:
# Drop any missing review texts, then display the remaining Series.
final_text=text.dropna()
final_text

In [None]:
# Concatenate all reviews into one corpus string, one review per line.
# Series.to_string() is a display formatter (subject to column padding and display
# options such as `display.max_colwidth`), so join the raw values instead to keep
# every review's full text.
f_str = "\n".join(final_text.astype(str))

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS, WordCloud

TXT_FILE = Path.cwd() / "wordcloud.txt"

# Write the review corpus (`f_str`) to disk first: no other cell creates this file, so
# reading it on a fresh run would otherwise fail with FileNotFoundError.
# NOTE: this overwrites any existing wordcloud.txt in the working directory.
TXT_FILE.write_text(f_str, encoding="utf-8")

# Read text (read_text opens and closes the file, so no handle is left open)
text = TXT_FILE.read_text(encoding="utf-8")
stopwords = STOPWORDS

wc = WordCloud(background_color="white",max_font_size=40, stopwords=stopwords, height=600, width=400)
wc.generate(text)

# store to file
wc.to_file("wordcloud_output.png")