# Sentiment Analysis and Evaluative Language

### Implementation for English Dataset

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.similarities import Similarity

import numpy as np
import pandas as pd
from scipy import spatial

import csv
import io
import glob
import spacy
import re
import string
import random
from random import sample
from itertools import chain
import pickle

from spacy.lang.en import English
import nltk

from collections import Counter
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.manifold import TSNE
from sklearn.cluster import AffinityPropagation

from textblob import TextBlob
from itertools import cycle

In [None]:
def tokenize_pipe(doc):
    """Return the lowercased text of every alphabetic token in ``doc``.

    ``doc`` is iterated token by token; tokens whose ``is_alpha`` attribute
    is false are skipped.
    """
    lowered = []
    for token in doc:
        if not token.is_alpha:
            continue
        lowered.append(token.text.lower())
    return lowered

def preprocess_pipe(texts):
    """Tokenize every text in ``texts`` and return one token list per text.

    The texts are streamed through the module-level spaCy pipeline ``nlp`` in
    batches of 8; each parsed document is reduced with ``tokenize_pipe``.
    """
    return [tokenize_pipe(parsed) for parsed in nlp.pipe(texts, batch_size=8)]

In [None]:
# Load the spaCy pipeline 'en_core_web_sm'; it is used as the global `nlp` by the cells below.
nlp = spacy.load('en_core_web_sm')
# Raise the pipeline's maximum accepted text length (presumably so that very long texts are not rejected — confirm).
nlp.max_length = 1000000000

#### Reviews: Data import and preprocessing

In [None]:
# Directory from which the review .txt files are globbed.
reviews_eng = r"C:\...\ENG_reviews"
# Directory from which the additional (comparison) .txt files are globbed.
additional_text = r"C:\...\ENG_corpus"

In [None]:
# Paths of all .txt files directly inside the two input directories.
# NOTE(review): glob does not promise a sorted order; later cells rely on
# `list_files` keeping the same order as the texts read from it.
list_files = glob.glob(reviews_eng + '/*.txt')
list_additional = glob.glob(additional_text + '/*.txt')

In [None]:
def _read_utf8(path):
    """Return the full contents of the UTF-8 encoded text file at ``path``."""
    with open(path, encoding="utf-8") as handle:
        return handle.read()


# One string per review file, in the order of `list_files`.
reviews_txt = [_read_utf8(path) for path in list_files]

# One string per comparison-corpus file, in the order of `list_additional`.
additional_txt = [_read_utf8(path) for path in list_additional]

In [None]:
# Normalise each review text: newlines become spaces, then everything is lowercased.
reviews_txt_clean = [text.replace('\n', ' ').lower() for text in reviews_txt]

#reviews = preprocess_pipe(reviews_txt_clean)

#with open(r'C:\...\ENG_reviews_tok.pkl', 'wb') as f:
#    pickle.dump(reviews, f)

In [None]:
# Load the cached result of `preprocess_pipe(reviews_txt_clean)` (see the
# commented-out dump above, which writes this same file).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open(r'C:\...\ENG_reviews_tok.pkl', 'rb') as f:
    reviews = pickle.load(f)

In [None]:
# Flatten the per-text token lists into a single list of tokens.
reviews_toks = list(chain.from_iterable(reviews))

In [None]:
# Relative frequency of each distinct review token: occurrence count divided
# by the total number of review tokens.
df_fdist_reviews = pd.DataFrame(reviews_toks, columns=['Freq'])
review_token_counts = df_fdist_reviews['Freq'].value_counts()
relfreq_reviews = review_token_counts.to_frame() / len(reviews_toks)

#### Comparative corpus: Data import and preprocessing

In [None]:
#additional_txt_clean = [word.replace('\n',' ') for word in additional_txt]
#additional_txt_clean = [word.lower() for word in additional_txt_clean]

#additional = preprocess_pipe(additional_txt_clean)

#with open(r'C:\...\ENG_additional_lemma.pkl', 'wb') as f:
#    pickle.dump(additional, f)

In [None]:
# Load the cached token lists of the comparison corpus.
# NOTE(review): the commented-out dump above writes 'ENG_additional_lemma.pkl',
# whereas this cell reads 'ENG_additional_tok.pkl' — confirm which file is intended.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open(r'C:\...\ENG_additional_tok.pkl', 'rb') as f:
    additional = pickle.load(f)

In [None]:
# Flatten the comparison corpus into one token list, keeping alphabetic tokens only.
additional_toks = [tok for tok in chain.from_iterable(additional) if tok.isalpha()]

In [None]:
# Relative frequency of each distinct comparison-corpus token: occurrence
# count divided by the total number of comparison-corpus tokens.
df_fdist_additional = pd.DataFrame(additional_toks, columns=['Freq'])
additional_token_counts = df_fdist_additional['Freq'].value_counts()
relfreq_additional = additional_token_counts.to_frame() / len(additional_toks)

#### Comparative frequency

In [None]:
# Split the review vocabulary by relative-frequency difference:
#   score = relative frequency in reviews - relative frequency in comparison corpus
# Tokens with score > 0 go to `in_reviews`, all others to `in_novels`;
# both lists hold [token, score] pairs.

# Pull the single frequency column out as plain dicts. This avoids calling
# float() on a one-element Series (deprecated in recent pandas) and replaces
# one DataFrame `.loc` lookup per token by a dict lookup.
reviews_freq = relfreq_reviews.iloc[:, 0].to_dict()
additional_freq = relfreq_additional.iloc[:, 0].to_dict()

in_novels = []
in_reviews = []
for row, freq_reviews in reviews_freq.items():
    # Tokens that never occur in the comparison corpus count as frequency 0.
    freq_additional = additional_freq.get(row, 0)
    score = freq_reviews - freq_additional
    if score > 0:
        in_reviews.append([row, score])
    else:
        in_novels.append([row, score])

In [None]:
# Number of tokens that are relatively more frequent in the reviews.
len(in_reviews)

In [None]:
# Keep only the token of each [token, score] pair.
in_reviews_words = [token for token, _score in in_reviews]

#### Word embedding

In [None]:
#all_toks = additional_toks + reviews_toks
#sents = additional + reviews
#model = Word2Vec(sentences = sents, vector_size=300, window=5, min_count=5, workers=4)

#with open(r'C:\...\ENG_reviews_model_FINAL.pkl', 'wb') as f:
#    pickle.dump(model, f)

In [None]:
# Load the pickled model (the commented-out cell above trains a Word2Vec
# model and dumps it to this same file).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
with open(r'C:\...\ENG_reviews_model_FINAL.pkl', 'rb') as f:
    model = pickle.load(f)

#### Evaluative words

In [None]:
# Collect, sentence by sentence, the text of every token tagged as adjective
# or noun. `reviews_pos` ends up as one token list per sentence across all
# review texts.
pos = ["ADJ","NOUN"]
reviews_pos = []
for doc in nlp.pipe(reviews_txt_clean, disable=["ner"]):
    sentence_toks = [[token.text for token in sent if token.pos_ in pos] for sent in doc.sents]
    # extend() appends in place; `reviews_pos = reviews_pos + sentence_toks`
    # copied the whole accumulated list once per document.
    reviews_pos.extend(sentence_toks)

In [None]:
# English stop-word list from NLTK (requires the NLTK 'stopwords' corpus to be available locally).
stopwords = nltk.corpus.stopwords.words("english")

In [None]:
# Drop stop words from every token list in `reviews_pos`.
# A set makes each membership test O(1) instead of scanning the stop-word list.
stopword_set = set(stopwords)
reviews_tok_clean = [[tok for tok in review if tok not in stopword_set] for review in reviews_pos]

In [None]:
# Flatten `reviews_tok_clean`, keeping only tokens that are relatively more
# frequent in the reviews than in the comparison corpus.
# NOTE: this rebinds `reviews_toks`, which earlier held the flat list of all
# review tokens; the relative-frequency cell must not be re-run after this one.
# A set gives O(1) membership tests; `in_reviews_words` is a list that can be large.
review_specific_words = set(in_reviews_words)
reviews_toks = [
    tok
    for review in reviews_tok_clean
    for tok in review
    if tok in review_specific_words
]

In [None]:
# (token, count) pairs for the 10000 most frequent remaining tokens ...
reviews_tok_clean_top = Counter(reviews_toks).most_common(10000)
# ... and just the tokens, most frequent first.
corpus_top = [token for token, _count in reviews_tok_clean_top]

In [None]:
# Seed words for positive evaluation.
# NOTE: this rebinds `pos`, which previously held the part-of-speech tags.
pos = ["excellent","admirable","estimable","exemplary",
       "invaluable","incomparable","superb","outstanding",
       "wonderful","perfect","superior","worthy","fine",
       "exceptional","skillful","masterful","extraordinary",
       "impressive","notable","noteworthy"]

In [None]:
# Number of positive seed words.
len(pos)

In [None]:
# Expand the positive seeds: for every seed word add the words of its two
# nearest neighbours in the embedding model.
add_pos = []
for seed in pos:
    add_pos.extend(neighbour for neighbour, _sim in model.wv.most_similar(seed, topn=2))

In [None]:
# Seed words for negative evaluation.
neg = ["terrible","grievous","hideous","ghastly",
       "disgusting","unfavourable","disagreeable","distasteful",
       "error","fault","unpleasant","imprudent","unlikely",
       "undesirable","unreasonable","absurd","offensive",
       "unsuitable","questionable","disconcerting"]

In [None]:
# Number of negative seed words.
len(neg)

In [None]:
# Expand the negative seeds: for every seed word add the words of its two
# nearest neighbours in the embedding model.
add_neg = []
for seed in neg:
    add_neg.extend(neighbour for neighbour, _sim in model.wv.most_similar(seed, topn=2))

In [None]:
# All candidate evaluative words: seeds plus their embedding neighbours.
val_words = pos + neg + add_pos + add_neg
# De-duplicate while keeping first-seen order. `list(set(...))` yields a
# different order in every interpreter session (string hash randomisation),
# which changes the row order of the clustering input and so makes the
# hard-coded cluster indices used further down unreliable between runs.
val_words = list(dict.fromkeys(val_words))

In [None]:
# Keep only the evaluative words that also occur among the top review tokens
# (words missing from `corpus_top` are silently dropped).
remaining_words = [word for word in val_words if word in corpus_top]

In [None]:
# Number of evaluative words that survive the filter against `corpus_top`.
len(remaining_words)

#### Review analysis / SentiArt

In [None]:
# Stack the embedding vector of every remaining word; row i of X belongs to remaining_words[i].
X = np.array([model.wv[word] for word in remaining_words])

In [None]:
# Cluster the rows of X with affinity propagation; the random state is fixed to 1.
af = AffinityPropagation(random_state=1).fit(X)

In [None]:
# Indices into X of the cluster centres, and the cluster label assigned to each row of X.
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

In [None]:
# The number of clusters equals the number of cluster centres found.
n_clusters_ = len(cluster_centers_indices)

print("Estimated number of clusters: %d" % n_clusters_)

In [None]:
# Build one space-separated string of member words per cluster; each word is
# prefixed with a space, so every non-empty entry starts with " ".
cluster_words = [""] * len(cluster_centers_indices)
for label, word in zip(labels, remaining_words):
    cluster_words[int(label)] += " " + word

In [None]:
# Plot the clustering: one colour per cluster, members as dots, the centre as
# a large circle, and a line from the centre to every member.
# NOTE(review): only columns 0 and 1 of X are used as plot coordinates. If the
# rows of X are high-dimensional embeddings, this shows just two raw
# dimensions rather than a projection — confirm this is intended.
plt.close("all")
plt.figure(1)
plt.clf()

# Colours repeat cyclically when there are more clusters than colour codes.
colors = cycle("bgrcmykbgrcmykbgrcmykbgrcmyk")
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k  # boolean mask selecting the rows of cluster k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + ".")
    plt.plot(
        cluster_center[0],
        cluster_center[1],
        "o",
        markerfacecolor=col,
        markeredgecolor="k",
        markersize=14,
        )
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()

In [None]:
# Display the member words of every cluster (the list index is the cluster number).
cluster_words

In [None]:
# Check indices and adapt if necessary: the cluster numbers 10, 2 and 11 are
# hard-coded and must match the clusters shown in `cluster_words` for this run.
pos_centroids = [X[cluster_centers_indices[10]],X[cluster_centers_indices[2]],X[cluster_centers_indices[11]]]

# The bare string literals below are expression statements with no effect on
# any variable. They presumably record the member words of the selected
# clusters (copied from `cluster_words`) — confirm.
' extraordinary unusual remarkable singular',
' startling marvellous wonderful astonishing'
' admirable excellent amiable'

In [None]:
# Check indices and adapt if necessary: the cluster number 3 is hard-coded and
# must match the cluster shown in `cluster_words` for this run.
neg_centroids = [X[cluster_centers_indices[3]]]

# Bare string literal with no effect on any variable; presumably records the
# member words of the selected cluster — confirm.
' ghastly horrible hideous terrible'

In [None]:
# Mapping of the model's vocabulary; used below to test whether a word has a vector.
vocab = model.wv.key_to_index

In [None]:
# Score every top review token that has an embedding:
#   score = mean cosine similarity to the positive centroids
#         - mean cosine similarity to the negative centroids
# `keys` and `similarity` are parallel lists (token, score); `all_pos` and
# `all_neg` collect every individual cosine similarity that was computed.
#
# FIX: the original assigned (`pos = ...`) instead of accumulating inside the
# centroid loops, so the "average" was only the similarity to the LAST
# centroid divided by the number of centroids. Separate names are used for
# the per-word scores so that the seed lists `pos` and `neg` are not clobbered.
keys = []
similarity = []
all_pos = []
all_neg = []
for word in corpus_top:
    if word not in vocab:
        continue
    vec = model.wv[word]
    vec_norm = np.linalg.norm(vec)

    pos_sims = [
        np.dot(centroid, vec) / (np.linalg.norm(centroid) * vec_norm)
        for centroid in pos_centroids
    ]
    neg_sims = [
        np.dot(centroid, vec) / (np.linalg.norm(centroid) * vec_norm)
        for centroid in neg_centroids
    ]
    all_pos.extend(pos_sims)
    all_neg.extend(neg_sims)

    pos_score = sum(pos_sims) / len(pos_centroids)
    neg_score = sum(neg_sims) / len(neg_centroids)

    sim = pos_score - neg_score
    similarity.append(sim)
    keys.append(word)

In [None]:
# Sentiment lexicon: token -> score, built from the parallel lists `keys` and `similarity`.
senti_dict = dict(zip(keys,similarity))

In [None]:
# Display the sentiment lexicon.
senti_dict

In [None]:
# Re-tokenize the reviews for scoring: `reviews_toks` becomes one entry per
# review, each a list of sentences, each sentence a list of alphabetic
# non-stop-word token texts.
# A set makes the per-token stop-word test O(1) instead of a list scan.
stopword_set = set(stopwords)
reviews_toks = []
for doc in nlp.pipe(reviews_txt_clean, disable=["ner"]):
    sentence_toks = [
        [token.text for token in sent
         if token.is_alpha and token.text not in stopword_set]
        for sent in doc.sents
    ]
    reviews_toks.append(sentence_toks)

In [None]:
# Lexicon-based score per review:
#   sentence score = mean `senti_dict` score of the sentence's words that are
#                    in the lexicon (0 if none are)
#   review score   = mean of its sentence scores (0 for a review without sentences)
# Changes from the original: the unused `sentence_count` counter is removed
# and the lexicon is queried directly instead of through `.keys()`.
reviews_sentiscores = []
for review in reviews_toks:
    sentiscores_review_level = []

    for sentence in review:
        sentiscores_sent_level = [senti_dict[word] for word in sentence if word in senti_dict]

        if sentiscores_sent_level:
            sentiscores_review_level.append(sum(sentiscores_sent_level)/len(sentiscores_sent_level))
        else:
            sentiscores_review_level.append(0)

    if sentiscores_review_level:
        reviews_sentiscores.append(sum(sentiscores_review_level)/len(sentiscores_review_level))
    else:
        reviews_sentiscores.append(0)

#### Comparison: Sentiment analysis with TextBlob

In [None]:
# TextBlob-based score per review:
#   word score     = TextBlob polarity of the word, multiplied by 1.5 when the
#                    word is in `senti_dict`
#   sentence score = mean word score over ALL words of the sentence
#   review score   = mean sentence score over all sentences of the review
# Empty sentences and empty reviews score 0.
#
# The weighted score depends only on the word, so it is cached per distinct
# word instead of building a TextBlob object for every token occurrence.
# The unused `sentence_count` counter of the original is removed.
weighted_polarity = {}

all_sentiment_scores = []
for review in reviews_toks:
    sentiscores_review_level = []

    for sentence in review:
        sentiscores_sent_level = []

        for word in sentence:
            if word not in weighted_polarity:
                score = TextBlob(word).sentiment.polarity
                if word in senti_dict:
                    score = score * 1.5
                weighted_polarity[word] = score
            sentiscores_sent_level.append(weighted_polarity[word])

        if len(sentence) != 0:
            sentiscores_review_level.append(sum(sentiscores_sent_level)/len(sentence))
        else:
            sentiscores_review_level.append(0)

    if len(review) != 0:
        all_sentiment_scores.append(sum(sentiscores_review_level)/len(review))
    else:
        all_sentiment_scores.append(0)

In [None]:
# Derive metadata from every review file name. The regular expressions below
# treat the name as <5-digit id>_<4 digits>_<journal>_<rest>.
from pathlib import Path

filenames = []
journals = []
text_ids = []
for file in list_files:
    # File name without directory and extension. The original regex
    # r".+reviews\\(.+).txt" only matched Windows-style backslash paths; on
    # any other path it left the full path in place and int() below failed.
    filename = Path(file).stem
    filenames.append(filename)
    journal = re.sub(r"\d{5}_\d{4}_(.+?)_.+", r"\1", filename)
    journals.append(journal)
    text_id = re.sub(r"(\d{5})_.+", r"\1", filename)
    # Raises ValueError if the file name does not start with a numeric id.
    text_ids.append(int(text_id))

In [None]:
# One row per review: both sentiment scores plus the metadata parsed from the
# file name. NOTE(review): this relies on all five lists being in the same
# order as `list_files` — confirm no cell was re-run with a different file order.
reviews_senti_df = pd.DataFrame(
    {'sentiscore_average': reviews_sentiscores,
     'sentiment_Textblob': all_sentiment_scores,
     'textfile': filenames,
     'journal': journals,
     'text_id': text_ids
})

In [None]:
# Write the per-review scores to a semicolon-separated UTF-8 CSV file.
reviews_senti_df.to_csv(r"C:\...\ENG_reviews_senti_FINAL.csv",
                        sep = ';', encoding = 'utf-8')

In [None]:
# Load the test set from a semicolon-separated CSV; the cells below use its
# 'textfile' and 'class' columns.
testset = pd.read_csv(r'C:\...\ENG_testset.csv', sep = ";")

In [None]:
# Attach the computed scores to the test set. The inner join on 'textfile'
# silently drops test rows that have no matching review.
eval_df = pd.merge(testset, reviews_senti_df, how='inner', on=['textfile'])

In [None]:
# Display the merged evaluation table.
eval_df

In [None]:
# Message fragment printed for each confusion-matrix cell.
_OUTCOME_TEXT = {
    "TP": ": TRUE POSITIVE: ",
    "FN": ": FALSE NEGATIVE: ",
    "TN": ": TRUE NEGATIVE: ",
    "FP": ": FALSE POSITIVE: ",
}


def _outcome(is_positive, score):
    """Return the confusion-matrix cell ('TP', 'FN', 'TN' or 'FP') for one review.

    A positive review counts as correct only when ``score > 0`` and a
    non-positive review only when ``score < 0``; a score of exactly 0 is
    therefore always counted as an error.
    """
    if is_positive:
        return "TP" if score > 0 else "FN"
    return "TN" if score < 0 else "FP"


def _empty_confusion_matrix():
    """Return a one-column ('count') frame indexed by TP, FP, TN, FN, all zero."""
    return pd.DataFrame([[0], [0], [0], [0]],
                        index = ['TP','FP','TN','FN'],
                        columns = ['count'])


confusion_matrix_sentiscore = _empty_confusion_matrix()
confusion_matrix_textblob = _empty_confusion_matrix()

# For every test row, classify first the lexicon score and then the TextBlob
# score against the gold label (class == 1 means positive), print the outcome
# and increment the matching counter.
for index, row in eval_df.iterrows():
    classified = row['class']
    text_id = row['text_id']
    scored = (
        (confusion_matrix_sentiscore, row['sentiscore_average']),
        (confusion_matrix_textblob, row['sentiment_Textblob']),
    )
    for matrix, score in scored:
        cell = _outcome(classified == 1, score)
        print(str(text_id) + _OUTCOME_TEXT[cell] + str(classified) + " : " + str(score))
        matrix.loc[cell] += 1


In [None]:
# Display the confusion counts of the lexicon-based score.
confusion_matrix_sentiscore

In [None]:
# Display the confusion counts of the TextBlob-based score.
confusion_matrix_textblob

In [None]:
# Recall, precision and accuracy of the lexicon-based score.
# Changes from the original:
#  * counts are read as scalars via .loc[row, 'count'] (int() on a
#    one-element Series is deprecated in recent pandas);
#  * accuracy divides by the actual number of evaluated rows instead of a
#    hard-coded 30, so it stays correct when the test set or the merge changes;
#  * a zero denominator yields NaN instead of raising ZeroDivisionError.
tp_senti = int(confusion_matrix_sentiscore.loc["TP", "count"])
tn_senti = int(confusion_matrix_sentiscore.loc["TN", "count"])
fn_senti = int(confusion_matrix_sentiscore.loc["FN", "count"])
fp_senti = int(confusion_matrix_sentiscore.loc["FP", "count"])
total_senti = tp_senti + tn_senti + fn_senti + fp_senti

recall_senti = tp_senti/(tp_senti + fn_senti) if (tp_senti + fn_senti) else float("nan")
precision_senti = tp_senti/(tp_senti + fp_senti) if (tp_senti + fp_senti) else float("nan")
accuracy_senti = (tp_senti + tn_senti) / total_senti if total_senti else float("nan")

print("Sentiscore:\n" + 
      "Recall: " + str(recall_senti) + "\n" +
      "Precision: " + str(precision_senti) + "\n" +
      "Accuracy: " + str(accuracy_senti))

In [None]:
# Recall, precision and accuracy of the TextBlob-based score.
# Changes from the original:
#  * counts are read as scalars via .loc[row, 'count'] (int() on a
#    one-element Series is deprecated in recent pandas);
#  * accuracy divides by the actual number of evaluated rows instead of a
#    hard-coded 30, so it stays correct when the test set or the merge changes;
#  * a zero denominator yields NaN instead of raising ZeroDivisionError.
tp_textblob = int(confusion_matrix_textblob.loc["TP", "count"])
tn_textblob = int(confusion_matrix_textblob.loc["TN", "count"])
fn_textblob = int(confusion_matrix_textblob.loc["FN", "count"])
fp_textblob = int(confusion_matrix_textblob.loc["FP", "count"])
total_textblob = tp_textblob + tn_textblob + fn_textblob + fp_textblob

recall_textblob = tp_textblob/(tp_textblob + fn_textblob) if (tp_textblob + fn_textblob) else float("nan")
precision_textblob = tp_textblob/(tp_textblob + fp_textblob) if (tp_textblob + fp_textblob) else float("nan")
accuracy_textblob = (tp_textblob + tn_textblob) / total_textblob if total_textblob else float("nan")

print("TextBlob:\n" + 
      "Recall: " + str(recall_textblob) + "\n" +
      "Precision: " + str(precision_textblob) + "\n" +
      "Accuracy: " + str(accuracy_textblob))

In [None]:
# Display the per-review score table.
reviews_senti_df

In [None]:
# Candidate positives: reviews with a TextBlob score above 0.
pos_textBlob = reviews_senti_df.loc[reviews_senti_df['sentiment_Textblob'] > 0]
# Candidate negatives: reviews with a lexicon score below 0.
neg_senti = reviews_senti_df.loc[reviews_senti_df['sentiscore_average'] < 0]

In [None]:
# Define 1st quartile (25th percentile) of each candidate group's score.
# quantile(0.25) is the value the original read as describe()[4]; that form
# indexed a label-indexed Series by position, which recent pandas deprecates,
# and it computed the full summary only to pick one entry.
threshold_textBlob = pos_textBlob['sentiment_Textblob'].quantile(0.25)
threshold_senti = neg_senti['sentiscore_average'].quantile(0.25)

In [None]:
# Keep positives scoring above half the first-quartile threshold ...
pos_textBlob = pos_textBlob.loc[pos_textBlob['sentiment_Textblob'] > threshold_textBlob/2]
# ... and negatives scoring below half the first-quartile threshold.
neg_senti = neg_senti.loc[neg_senti['sentiscore_average'] < threshold_senti/2]

In [None]:
# File names of the remaining positive and negative candidates.
pos_textBlob_reviews = list(pos_textBlob["textfile"])
neg_senti_reviews = list(neg_senti["textfile"])

In [None]:
# Keep only reviews selected by exactly one of the two criteria and label them.
# .copy() makes each result an independent frame, so the column assignments
# below do not write into a slice of `reviews_senti_df`
# (the original triggered pandas' SettingWithCopyWarning).

# Positive by TextBlob and not among the negative candidates; the lexicon
# score column is overwritten with 0 for these rows.
pos_textBlob_exclusive = pos_textBlob.loc[~pos_textBlob['textfile'].isin(neg_senti_reviews)].copy()
pos_textBlob_exclusive["classified"] = "positive"
pos_textBlob_exclusive['sentiscore_average'] = 0

# Negative by lexicon score and not among the positive candidates; the
# TextBlob score column is overwritten with 0 for these rows.
neg_senti_exclusive = neg_senti.loc[~neg_senti['textfile'].isin(pos_textBlob_reviews)].copy()
neg_senti_exclusive["classified"] = "negative"
neg_senti_exclusive['sentiment_Textblob'] = 0

In [None]:
# Every review that ended up in neither exclusive group is labelled "not_classified".
pos_textBlob_exclusive_reviews = list(pos_textBlob_exclusive["textfile"])
neg_senti_exclusive_reviews = list(neg_senti_exclusive["textfile"])
not_in = pos_textBlob_exclusive_reviews + neg_senti_exclusive_reviews

# .copy() gives an independent frame, so adding the column does not write into
# a slice of `reviews_senti_df` (avoids pandas' SettingWithCopyWarning).
not_classified = reviews_senti_df.loc[~reviews_senti_df['textfile'].isin(not_in)].copy()
not_classified["classified"] = "not_classified"

In [None]:
# The three disjoint groups, in the order they will be concatenated.
dfs = [pos_textBlob_exclusive,neg_senti_exclusive,not_classified]

In [None]:
# Number of reviews classified as positive.
len(pos_textBlob_exclusive)

In [None]:
# Number of reviews classified as negative.
len(neg_senti_exclusive)

In [None]:
# Stack the three groups into one table; the original row index of each frame is kept.
reviews_senti_classified = pd.concat(dfs)

In [None]:
# Write the classified reviews to a semicolon-separated UTF-8 CSV file.
reviews_senti_classified.to_csv(r"C:\...\ENG_reviews_senti_classified.csv",
                                sep = ';', encoding = 'utf-8')