In [None]:
# Import the libraries

import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt 
%matplotlib inline

import re

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.metrics import plot_confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tag import pos_tag, pos_tag_sents
from nltk.corpus import wordnet

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from pprint import pprint

## Data Exploration

In [None]:
df = pd.read_excel('programming_for_everybody.xlsx')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# Top 10 most common reviews

df['review'].value_counts().head(10)  

In [None]:
# duplicates seem to be from different learners

df[(df.review == 'good')]

In [None]:
# Most reviews have a good rating - the dataset is not balanced, there should be more positive than negative reviews

df['rating'].value_counts()

In [None]:
# Most people have given a rating of 5
# The slice sizes are computed from `df` instead of being typed in by hand, so the
# chart always matches the data currently loaded.

rating_slices = [
    (df['rating'] == 5).sum(),
    (df['rating'] == 4).sum(),
    df['rating'].isin([1, 2, 3]).sum(),   # low ratings are pooled into a single slice
]

plt.figure(figsize = (10, 7))
plt.pie(rating_slices, labels = ['5', '4', '1 - 3'], autopct = '%1.1f%%');
plt.axis('equal')
plt.show()

In [None]:
# Percentage of null values in each column (null count * 100 / number of rows)

df.isnull().sum() * 100 / df.shape[0]  

In [None]:
# Number of rows whose 'review' is missing

int(df['review'].isna().sum())


In [None]:
# Drop null values.
# NOTE: dropna() is called without `subset`, so a row is removed when ANY of its
# columns is null - not only when 'review' is null. The frame is modified in place.

df.dropna(inplace = True)


In [None]:
# Re-count the rows whose 'review' is missing (sanity check after the drop)

int(df['review'].isna().sum())


In [None]:
df.shape

In [None]:
# Add a 'length' column: number of characters in each review (after casting to str)

df['length'] = df['review'].astype(str).str.len()
df.head()

In [None]:
# The average length of a review is 77 characters - reviews are in general short.
# ('length' is len() of the review string, so it counts characters, not words.)

df['length'].describe()

In [None]:
# The longest reviews have a rating of 1-2. Shortest reviews have a rating of 3. 

px.box(df, x = "rating", y = "length")

In [None]:
# Derive a sentiment label from the star rating:
#   1 or 2 -> 'neg', 3 -> 'neu', anything else -> 'pos'

label = [
    'neg' if (rating == 1 or rating == 2) else ('neu' if rating == 3 else 'pos')
    for rating in df['rating']
]

In [None]:
df['label'] = label

In [None]:
df.head()

In [None]:
df['label'].value_counts()

In [None]:
# 98 % of the reviews are positive according to the rating
# The slice sizes are read from the 'label' column instead of being hard-coded, so the
# chart stays correct if the data or the labelling rule changes.

label_order = ['pos', 'neu', 'neg']
label_counts = df['label'].value_counts().reindex(label_order, fill_value = 0)

plt.figure(figsize = (10, 7))
plt.pie(label_counts.to_numpy(), labels = ['Positive', 'Neutral', 'Negative'], autopct = '%1.1f%%');
plt.axis('equal')
plt.show()

## VADER Sentiment analysis

Code borrowed from: https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664

In [None]:
df_vader = df.copy(deep = True)

In [None]:
# Create the vader sentiment calculator 

sid = SentimentIntensityAnalyzer()

In [None]:
# Score every review with VADER; each cell of 'scores' holds the dict returned by
# sid.polarity_scores for that review

df_vader['scores'] = df_vader['review'].apply(sid.polarity_scores)

df_vader.head()

In [None]:
# Pull the 'compound' entry out of each score dict into its own column

df_vader['compound'] = df_vader['scores'].map(lambda vader_scores: vader_scores['compound'])

df_vader.head()

In [None]:
# Label each review from its compound score:
#   below 0 -> 'neg', above 0 -> 'pos', exactly 0 (or not comparable) -> 'neu'

sentiment = [
    'neg' if compound < 0 else ('pos' if compound > 0 else 'neu')
    for compound in df_vader['compound']
]
        

In [None]:
df_vader['sentiment'] = sentiment

df_vader.head()

In [None]:
# VADER could predict the sentiment of the reviews with almost 88% accuracy
# (the rating-derived `label` list is used as the ground truth, `sentiment` as the prediction)

print(
    'The accuracy of the sentiment prediction is ',
    accuracy_score(y_true = label, y_pred = sentiment),
)

## Sentiment Analysis on sentence-level

In [None]:
df_sen = df.copy(deep = True)

In [None]:
# Split every review into a list of sentences.
# The pattern matches a whitespace character that follows '.' or '?', unless the text just
# before it looks like "x.y." (e.g. "e.g.") or a capital + lowercase letter + "." (e.g. "Mr.").
# Note that '!' is not treated as a sentence boundary by this pattern.
rule = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"

# Apply to the whole column. The previous `.loc[:10120, ...]` bound was tied to one specific
# dataset size: on a larger file every row past that label would silently get NaN as review.
df_sen['review'] = df_sen['review'].apply(lambda review_text: re.split(rule, review_text))

In [None]:
df_sen

In [None]:
df_sen = df_sen.explode("review")


In [None]:
df_sen

In [None]:
df_sen['review_index'] = df_sen.index

In [None]:
df_sen.columns

In [None]:
# Rearrange the columns

df_sen = df_sen[['review_index', 'date', 'review', 'rating', 'status', 'length', 'label']]

In [None]:
df_sen.head()

In [None]:
df_sen = df_sen.reset_index(drop = True)

In [None]:
df_sen

In [None]:
# Recompute 'length' so it holds the character count of each sentence, not of the full review

df_sen['length'] = df_sen['review'].astype(str).str.len()
df_sen.head()

In [None]:
# Score every sentence with VADER; each cell of 'scores' holds the dict returned by
# sid.polarity_scores for that sentence

df_sen['scores'] = df_sen['review'].apply(sid.polarity_scores)

df_sen.head()

In [None]:
# Pull the 'compound' entry out of each sentence's score dict into its own column

df_sen['compound'] = df_sen['scores'].map(lambda vader_scores: vader_scores['compound'])

df_sen.head()

In [None]:
# Label each sentence from its compound score:
#   below 0 -> 'neg', above 0 -> 'pos', exactly 0 (or not comparable) -> 'neu'
sentiment_1 = [
    'neg' if compound < 0 else ('pos' if compound > 0 else 'neu')
    for compound in df_sen['compound']
]

In [None]:
df_sen['sentiment'] = sentiment_1

df_sen.head()

In [None]:
df_sen['sentiment'].value_counts()

## Topic Modeling on sentence level

In [None]:
# Create a column with the original sentences

df_sen['original'] = df_sen['review']

## Preprocessing 

In [None]:
# Convert to lowercase (splitting and re-joining also collapses runs of whitespace)

df_sen['review'] = df_sen['review'].apply(
    lambda sentence: " ".join(word.lower() for word in sentence.split())
)
df_sen['review'].head()


In [None]:
# Remove punctuation: delete every character that is neither a word character nor whitespace.
# `regex=True` is required - since pandas 2.0 Series.str.replace treats the pattern as a
# literal string by default, in which case nothing would be removed. The pattern is a raw
# string so that \w and \s are not parsed as (invalid) Python string escapes.

df_sen['review'] = df_sen['review'].str.replace(r'[^\w\s]', '', regex = True)
df_sen['review'].head()

In [None]:
# Remove stopwords

stop = stopwords.words('english')
# Membership is tested once per word of every sentence; a set makes each test a hash
# lookup instead of a scan through the whole stopword list.
stop_lookup = set(stop)

df_sen['review'] = df_sen['review'].apply(
    lambda sentence: " ".join(word for word in sentence.split() if word not in stop_lookup)
)
df_sen['review'].head()

In [None]:
# Tokenization: turn each sentence string into a list of \w+ tokens

tokenizer = RegexpTokenizer(r'\w+')

df_sen['review'] = df_sen['review'].map(lambda sentence: tokenizer.tokenize(sentence.lower()))
df_sen['review'].head()

In [None]:
# Add part-of-speech tags: every token becomes a (token, tag) tuple.
# pos_tag_sents tags all token lists in a single call, so the tagger is set up once for the
# whole corpus instead of once per row as with `.apply(nltk.tag.pos_tag)`.

tagged_sentences = pos_tag_sents(df_sen['review'].tolist())

# Wrap in an object Series on the same index so each row receives its own list of tuples.
df_sen['review'] = pd.Series(tagged_sentences, index = df_sen.index, dtype = object)
df_sen['review'].head()

In [None]:
# Define function to convert parts of speech tags to wordnet’s format

def get_wordnet_pos(tag):
    """Map a part-of-speech tag string to the corresponding wordnet POS constant.

    Only the first character of `tag` is inspected:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag (including an empty string) falls back to wordnet.NOUN.
    """
    pos_by_first_letter = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_first_letter.get(tag[:1], wordnet.NOUN)

In [None]:
# Replace the tag in every (token, tag) pair with its wordnet equivalent

df_sen['review'] = df_sen['review'].apply(
    lambda tagged_tokens: [(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]
)
df_sen['review'].head()

In [None]:
# Lemmatize: each (token, wordnet_pos) pair is reduced to the token's lemma,
# so every row ends up as a plain list of lemma strings

lemmatizer = WordNetLemmatizer()

df_sen['review'] = df_sen['review'].map(
    lambda tagged_tokens: [lemmatizer.lemmatize(token, wn_pos) for token, wn_pos in tagged_tokens]
)
df_sen['review'].head()

In [None]:
df_sen['review']

## LDA Topic-Modeling

In [None]:
# Bag of words on the data set
# Build a gensim Dictionary from the token lists in df_sen['review']

dictionary = gensim.corpora.Dictionary(df_sen['review'])

# Preview: print the first 11 entries yielded by the dictionary
for shown, (entry_key, entry_value) in enumerate(dictionary.iteritems()):
    if shown > 10:
        break
    print(entry_key, entry_value)

In [None]:
# Gensim filter extremes

# Filter out tokens that appear in:
# fewer than 50 documents (no_below) or more than 50% of the documents (no_above = 0.5),
# then keep only the 100000 most frequent of the remaining tokens (keep_n)

dictionary.filter_extremes(no_below = 50, no_above = 0.5, keep_n = 100000)

In [None]:
# Gensim doc2bow
# Convert every token list in df_sen['review'] to its bag-of-words form with
# dictionary.doc2bow; bow_corpus holds one entry per row, in row order

bow_corpus = [dictionary.doc2bow(doc) for doc in df_sen['review']]


In [None]:
# TF-IDF

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to the whole corpus
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Preview: pretty-print the first transformed document only
for first_doc in corpus_tfidf:
    pprint(first_doc)
    break

In [None]:
# Run parallelized LDA using Tf-IDF

%timeit
lda_model = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 6, id2word = dictionary, passes = 10, workers = 4)

In [None]:
# Explore the words occurring in each topic and their relative weights
# (the loop variable is not called `id`: at notebook top level that would overwrite the
# builtin id() for every later cell)

for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {}\nWord: {}\n'.format(topic_id, topic_terms))

In [None]:
# Create new column with the dominant topic of each sentence:
# for every document keep the (topic, score) pair with the highest score

dominant_topics = [
    max(topic_scores, key = lambda pair: pair[1])
    for topic_scores in lda_model[corpus_tfidf]
]
df_sen['topic'] = dominant_topics

In [None]:
# Create two separate columns for the topic ID and topic score.
# The (id, score) tuples are expanded in one DataFrame construction instead of
# `.apply(pd.Series)`, which builds a Series per row (slow) and turns the integer
# topic id into a float. The index is passed explicitly so rows stay aligned with df_sen.

topic_parts = pd.DataFrame(df_sen['topic'].tolist(), index = df_sen.index)
df_sen[['topic_id', 'topic_score']] = topic_parts

In [None]:
df_sen.head()