# Aggregating Sentiment Score Results from `TextBlob`, `VADER` and LUIS.ai

In [66]:
import pandas as pd
import pickle
from pathlib import Path
import process_luis_response
import numpy as np

from UTILS import utils
from UTILS import feature_extraction

In [80]:
# Directories, relative to the current working directory, that hold the
# pickled frame (OUTPUT) and the individually pickled columns (OUTPUT/COLUMNS).
data_dir = Path.cwd() / 'OUTPUT'
column_dir = data_dir / 'COLUMNS'

In [55]:
# Columns displayed whenever a comment is compared against its sentiment scores.
relevant_columns = [
    'student_comment_apostrophe',
    'student_rating',
    'sentiment_textblob',
    'sentiment_nltk',
    'sentiment_luis',
]
# Raise the column display width so long comments are shown in full.
pd.options.display.max_colwidth = 1000

In [33]:
# Load the pickled object stored at OUTPUT/data_df_comments; the rest of the
# notebook uses it as the main DataFrame `df`.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
data_name = 'data_df_comments'
with data_dir.joinpath(data_name).open('rb') as infile:
    df = pickle.load(infile)

# Adding the LUIS.ai Sentiment Scores

In [39]:
# Load the LUIS.ai sentiment scores from the shared COLUMNS directory
# (`column_dir`), rather than rebuilding the same path by hand.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
sentiment_luis_name = 'sentiment_luis'
with open(column_dir.joinpath(sentiment_luis_name), 'rb') as infile:
    sentiment_luis = pickle.load(infile)

In [40]:
# Preview the first five LUIS.ai scores.
sentiment_luis.head()

index
17    0.998530
26    0.929806
27   -0.022393
30   -0.269620
40    0.963414
Name: response, dtype: float64

In [41]:
# The loaded Series is named 'response'; give it the column name it should
# carry once it is joined onto the main frame.
sentiment_luis = sentiment_luis.rename('sentiment_luis')

In [42]:
# Left-join on the index so every comment row is kept; rows without a LUIS.ai
# score get NaN. Any existing 'sentiment_luis' column is dropped first so the
# cell can be re-run (a second plain join raises on the overlapping column).
df = (
    df.drop(columns=['sentiment_luis'], errors='ignore')
      .join(sentiment_luis, how='left')
)

In [44]:
# Keep only the 'compound' element of each sentiment_nltk entry (a mapping of
# scores, not a JSON string). Entries that are already plain numbers are passed
# through unchanged, so re-running this cell does not raise on the floats
# produced by the first run.
df['sentiment_nltk'] = df['sentiment_nltk'].apply(
    lambda score: score if isinstance(score, (int, float)) else score['compound']
)

# Adding the LUIS.ai Intents

In [47]:
# Load the LUIS.ai intents from OUTPUT/luis_intent_pickle.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
intent_luis_name = 'luis_intent_pickle'
with data_dir.joinpath(intent_luis_name).open('rb') as infile:
    intent_luis = pickle.load(infile)

In [48]:
# Name the loaded object so the join in the next cell yields an `intent_luis`
# column (presumably a pandas Series -- TODO confirm).
intent_luis.name = 'intent_luis'

In [49]:
# Left-join the intents on the index, keeping every comment row. Any existing
# 'intent_luis' column is dropped first so the cell can be re-run (a second
# plain join raises on the overlapping column).
df = (
    df.drop(columns=['intent_luis'], errors='ignore')
      .join(intent_luis, how='left')
)

# Aggregation

The idea of aggregation is to mitigate the adverse effect of any one package producing highly inaccurate scores. The aggregation algorithm is simply:
1. ignore the score that is the odd one out based on distance
2. take the mean of the remaining scores

For example, in index `27`, the scores for `TextBlob`, `NLTK`, and LUIS.ai are 0.008333, 0.4411 and -0.022393 respectively. 

- The difference between the high and the middle score is 0.4411 - 0.008333 = 0.432767
- The difference between the middle and the low score is 0.008333 - (-0.022393) = 0.030726

The low and middle scores are closer to each other than the middle and high scores are. In this case the values to be averaged are the low and the middle scores, leading to a final sentiment value of (0.008333 + (-0.022393)) / 2 = -0.007, which is close to zero and therefore consistent with the neutral student rating of 3.

In [56]:
# Row 27 is the worked example from the text: extract its three sentiment scores.
scores_frame = df[relevant_columns]
sentiments = scores_frame.loc[27, 'sentiment_textblob':'sentiment_luis']
print(sentiments)

sentiment_textblob    0.00833333
sentiment_nltk            0.4411
sentiment_luis        -0.0223935
Name: 27, dtype: object


In [57]:
# Order the three scores from low to high, as the aggregation does.
sorted(sentiments.tolist())

[-0.0223934948, 0.008333333333333338, 0.4411]

# Aggregating the Sentiment Scores

In [21]:
def aggregate_sentiments(row):
    """Aggregate the three sentiment scores of one row, ignoring the odd one out.

    The TextBlob, NLTK and LUIS.ai scores are sorted and the mean of the two
    scores that lie closest together is returned; the score furthest from the
    other two is discarded. When both gaps are equal, the upper pair (middle
    and high score) is averaged.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'sentiment_textblob', 'sentiment_nltk' and
        'sentiment_luis'. Missing scores may be given as NaN or None.

    Returns
    -------
    float
        Mean of the two closest scores. If any score is missing, the mean of
        the scores that are present; NaN if all three are missing.
    """
    # Converting to a float array maps None to NaN, so missing values are
    # handled uniformly and never reach a comparison.
    scores = np.asarray(
        [row[column] for column in ('sentiment_textblob', 'sentiment_nltk', 'sentiment_luis')],
        dtype=float,
    )
    # Drop missing scores before sorting: NaN does not sort reliably.
    valid = np.sort(scores[~np.isnan(scores)])
    if valid.size == 0:
        # Nothing to average; return NaN without triggering a numpy warning.
        return np.nan
    if valid.size < 3:
        # No odd one out can be identified with fewer than three scores.
        return valid.mean()
    low, mid, high = valid
    if mid - low < high - mid:
        return (low + mid) / 2
    return (mid + high) / 2

In [67]:
# Apply the aggregation row by row.
# NOTE(review): this calls feature_extraction.naive_aggregate_sentiments, not the
# aggregate_sentiments function defined in this notebook -- confirm they agree.
df['sentiment_aggregated'] = df.apply(feature_extraction.naive_aggregate_sentiments, axis=1)

# High Student Ratings, Low `TextBlob` Sentiment Scores

Looking at the same five comments as those initially investigated in [Validating Out of the Box Algorithms (Sentiment Analysis)](https://rpubs.com/RRoger_Yu/Validating_Out_of_the_Box_Algorithms_Sentiment_Analysis):

In [24]:
# Five comments with a high student rating but a low TextBlob score.
high_rating_rows = [31715, 47396, 71394, 32063, 19418]
df.loc[high_rating_rows, relevant_columns + ['sentiment_aggregated']]

Unnamed: 0,student_comment_apostrophe,student_rating,sentiment_textblob,sentiment_nltk,sentiment_luis,sentiment_aggregated
31715,"yo simon is an mk, real shit maddest dog ive ever met",5.0,-0.3,-0.8126,-0.142343,-0.221171
47396,yeah not the tutors fault but the website was really slow i had to refresh a couple of times.,5.0,-0.3,0.3025,-0.063081,-0.18154
71394,"I thought the interaction between both of us was a bit slow, it took a while to load and send through messages.",5.0,-0.3,0.0,-0.195103,-0.247551
32063,The connection was slow and the photo I submitted went distorted after I submitted it.,5.0,-0.3,-0.4019,-0.033585,-0.35095
19418,Helped me slowly work through it,5.0,-0.3,0.0,-0.082189,-0.041095


The aggregation doesn't seem to have improved the scores.

# Low Student Ratings, High `TextBlob` Sentiment Scores

Looking at the same five comments as those initially investigated in [Validating Out of the Box Algorithms (Sentiment Analysis)](https://rpubs.com/RRoger_Yu/Validating_Out_of_the_Box_Algorithms_Sentiment_Analysis):

In [25]:
# Five comments with a low student rating but a high TextBlob score.
low_rating_rows = [78189, 29136, 79281, 47031, 36957]
df.loc[low_rating_rows, relevant_columns + ['sentiment_aggregated']]

Unnamed: 0,student_comment_apostrophe,student_rating,sentiment_textblob,sentiment_nltk,sentiment_luis,sentiment_aggregated
78189,best tutor in client,1.0,1.0,0.6369,0.994721,0.99736
29136,"Very happy with the service, and got all the answers I needed",1.0,1.0,0.6115,0.845618,0.922809
79281,Perfect,1.0,1.0,0.5719,0.965229,0.982615
47031,Corey was great!,1.0,1.0,0.6588,0.988572,0.994286
36957,Not the greatest service this time,1.0,1.0,-0.5216,0.752059,0.87603


LUIS.ai also incorrectly scores "Not the greatest service this time" as a very high 0.75. The result is still a high 0.88.

Let's look at other such cases to see if the aggregation has helped to mitigate some cases.

# Low `sentiment_nltk` and high `student_rating`

In [None]:
# Comments that NLTK scored negative although the student rated above 3.
query = 'sentiment_nltk < 0 and student_rating > 3'
display_columns = relevant_columns + ['sentiment_aggregated', 'intent_luis']
df[display_columns].query(query)

# High `sentiment_nltk` and low `student_rating`

In [62]:
# Comments that NLTK scored positive although the student rated below 3.
# Needs the 'sentiment_aggregated' column; the recorded KeyError comes from a
# run in which that column did not exist yet.
query = 'sentiment_nltk > 0 and student_rating < 3'
display_columns = relevant_columns + ['sentiment_aggregated', 'intent_luis']
df[display_columns].query(query)

KeyError: "['sentiment_aggregated'] not in index"

# Fixing the rating on `PRESSED_WRONG_BUTTON`

In [75]:
# Rows whose LUIS.ai intent is PRESSED_WRONG_BUTTON, split by the rating given.
query_1 = 'intent_luis == "PRESSED_WRONG_BUTTON" and student_rating == 1'
query_2 = 'intent_luis == "PRESSED_WRONG_BUTTON" and student_rating == 2'
index_pressed_wrong_button_1, index_pressed_wrong_button_2 = (
    df.query(wrong_button_query).index
    for wrong_button_query in (query_1, query_2)
)
# Show the comments behind the rating-1 cases.
df.loc[index_pressed_wrong_button_1, ['student_rating', 'student_comment']]

Unnamed: 0,student_rating,student_comment
33676,1.0,it was really nice. thankyou. by mistake i clicked not satisfied button
115493,1.0,"Sorry, I pressed wrong button. Im extremely happy. Thank you very much."
139815,1.0,"Sorry, I clicked the wrong button and now can't go back. I'm very happy with the review process.."
161106,1.0,I clickd the wrong button and can't go back... Very happy with feedback. Thank you
161237,1.0,Sorry I am Happy with the work pressed wrong button
164717,1.0,"Accidentally pressed 'Extremely Dissatisfied' button, I meant Extremely SATISFIED!"
165931,1.0,I don't appear to have any feedback something must have went wrong
176518,1.0,"sorry i pressed wrong button, i am impressed with your work"
186538,1.0,Oh sorry I was really happy I think I just pressed the wrong buttong
197160,1.0,extremly satisfied i pressed the wrong thing


In [79]:
# Number of wrong-button cases rated 1 and rated 2, one count per line.
print(
    len(index_pressed_wrong_button_1),
    len(index_pressed_wrong_button_2),
    sep='\n',
)

34
4


There are 34 cases in which students pressed the wrong button and rated 1, and 4 cases in which students pressed the wrong button and rated 2. Note that these are known cases. There are others who may have pressed the wrong button but did not leave a comment.

In [59]:
# Start the corrected rating as a copy of the original; only the wrong-button
# rows are overwritten afterwards.
df['student_rating_fixed'] = df['student_rating']

Change ratings of 2 to 4 and 1 to 5.

In [76]:
# Wrong-button ratings are flipped to the opposite end of the scale: 1 -> 5, 2 -> 4.
for wrong_button_index, corrected_rating in (
    (index_pressed_wrong_button_1, 5),
    (index_pressed_wrong_button_2, 4),
):
    df.loc[wrong_button_index, 'student_rating_fixed'] = corrected_rating

In [None]:
# Distribution of the original ratings.
df['student_rating'].value_counts()

In [None]:
# Distribution of the ratings after the wrong-button correction.
df['student_rating_fixed'].value_counts()

In [81]:
# Save the corrected ratings as their own column file under OUTPUT/COLUMNS.
utils.save_object(df['student_rating_fixed'], 'student_rating_fixed', column_dir)

Pickling to D:\OneDrive - UTS\36102 iLab 1 - Spring 2019\CODE\OUTPUT\COLUMNS\student_rating_fixed.


In [83]:
# Save the LUIS.ai intents as their own column file under OUTPUT/COLUMNS.
utils.save_object(df['intent_luis'], 'intent_luis', column_dir)

Pickling to D:\OneDrive - UTS\36102 iLab 1 - Spring 2019\CODE\OUTPUT\COLUMNS\intent_luis.


# Saving Outputs

## `sentiment_aggregated`

In [49]:
# Persist the aggregated scores to OUTPUT/sentiment_aggregated.
# The frame in this notebook is `df`; `data_df_comments` is never defined here.
# 'wb' replaces the file on each run. In append mode a re-run stacks a second
# pickle behind the first, and a single pickle.load returns only the first
# (stale) object.
with open(data_dir.joinpath('sentiment_aggregated'), 'wb') as outfile:
    pickle.dump(df.sentiment_aggregated, outfile)

## `student_rating_fixed`

In [50]:
# Persist the corrected ratings to OUTPUT/student_rating_fixed.
# The frame in this notebook is `df`; `data_df_comments` is never defined here.
# 'wb' replaces the file on each run. In append mode a re-run stacks a second
# pickle behind the first, and a single pickle.load returns only the first
# (stale) object.
# NOTE(review): the same column is also saved under OUTPUT/COLUMNS via
# utils.save_object -- confirm both copies are needed.
with open(data_dir.joinpath('student_rating_fixed'), 'wb') as outfile:
    pickle.dump(df.student_rating_fixed, outfile)