In [None]:
import re
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from pycorenlp import StanfordCoreNLP
import pandas as pd

# Address of the CoreNLP server this notebook talks to; the server must
# already be running before any annotation cell is executed.
CORENLP_URL = 'http://localhost:9000'
nlp = StanfordCoreNLP(CORENLP_URL)

In [None]:
import dill 

# Load the preprocessed messages. The context manager closes the file handle
# as soon as deserialisation finishes instead of leaving it to the garbage
# collector.
# NOTE: dill/pickle deserialisation can execute arbitrary code -- only load
# files produced by a trusted step of this pipeline.
with open('df_preprocessed.pkd', 'rb') as preprocessed_file:
    df = dill.load(preprocessed_file)

##  Sentiment Analysis  

This notebook aims at processing the text messages and using natural language processing techniques to evaluate the sentiment around each text message. 

In particular, this notebook uses the [Stanford CoreNLP API](https://stanfordnlp.github.io/CoreNLP/index.html) sentiment tool. The Sentiment Annotator of CoreNLP implements [Socher et al](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)'s sentiment model, which attaches a binary tree to each sentence. Each node of the tree contains the predicted class and the scores for its subtree. The current version of the sentiment annotator of CoreNLP includes 5 score classes: very negative, negative, neutral, positive, and very positive. 


To record the sentiment scores of text messages, this notebook extracts the probability distribution associated with the 5 score classes (very negative to very positive) for each sentence within a message. A score of -2, -1, 0, 1, 2 is assigned to the 5 classes, and with that, the expected score for each sentence is calculated as: 

$$ E_i = \sum_{j=1}^{5} P_{i,j} \, s_j $$ 

where $E_i$ is the expected score for sentence *i*, $P_{i,j}$ is the probability that sentence *i* belongs to score class *j*, and $s_j \in \{-2,-1,0,1,2\}$ is the score assigned to class *j*. 

Ultimately, the frequency of score classes for the sentences included in one text message is calculated by binning the expected scores of all sentences. 

The average of the expected scores is calculated to indicate the overall score of a whole text message. 

Throughout the sentiment analysis, the number of sentences and words within each message are also extracted and added to the data frame.


In [None]:
def SentimentAnalysis(text):
    '''Run the Stanford CoreNLP sentiment annotator on one message.

    Each sentiment class (very negative .. very positive) is mapped to a
    score in (-2, -1, 0, 1, 2).

    Parameters
    ----------
    text : str
        The message to annotate.

    Returns
    -------
    tuple
        (num_sentence, num_words, expected_text, sigma_text, freq, sent_dist)

        num_sentence  : number of sentences found by CoreNLP.
        num_words     : number of whitespace-separated tokens in ``text``.
        expected_text : expected score of the class distribution averaged
                        over all sentences of the message.
        sigma_text    : standard deviation of the score under that averaged
                        distribution.
        freq          : ``np.histogram`` result ``(counts, bin_edges)`` of the
                        per-sentence expected scores over 5 equal-width bins
                        spanning [-2, 2].
        sent_dist     : list with the 5-class probability distribution of
                        each sentence, as returned by the server.

        When the server finds no sentence, the scores are reported as a
        neutral 0.0 with zero spread and all histogram counts are 0.

    Raises
    ------
    TypeError
        If the server response could not be parsed as JSON.
    '''
    annotation = nlp.annotate(text,
                   properties={
                       'annotators': 'sentiment, ner, pos',
                       'outputFormat': 'json',
                       'timeout': 150000})

    # pycorenlp hands back the raw response text when the body is not valid
    # JSON (e.g. a server-side timeout message); fail with a readable error
    # instead of "string indices must be integers".
    if not isinstance(annotation, dict):
        raise TypeError(
            'CoreNLP did not return JSON: {!r}'.format(str(annotation)[:200]))

    sentences = annotation['sentences']
    num_sentence = len(sentences)
    num_words = len(text.split())

    scores = np.array([-2, -1, 0, 1, 2], dtype=float)
    bin_edges = np.linspace(-2, 2, 6)

    # One probability vector per sentence, taken from the sentence-level
    # annotation returned by the server.
    sent_dist = [s['sentimentDistribution'] for s in sentences]

    if not sent_dist:
        # Nothing to average: report a neutral score and an empty histogram.
        freq = np.histogram([], bin_edges)
        return num_sentence, num_words, 0.0, 0.0, freq, sent_dist

    dist_matrix = np.asarray(sent_dist, dtype=float)

    # E_i = sum_j P_ij * s_j for every sentence i, then binned into 5 classes.
    expected = np.dot(dist_matrix, scores)
    freq = np.histogram(expected, bin_edges)

    # Message-level statistics are taken from the mean class distribution.
    mean_dist = dist_matrix.mean(axis=0)
    expected_text = np.dot(mean_dist, scores)
    # Standard deviation: the square root is applied to the summed,
    # probability-weighted squared deviations (not to each term separately).
    variance_text = np.dot(mean_dist, (scores - expected_text) ** 2)
    sigma_text = np.sqrt(variance_text)

    return num_sentence, num_words, expected_text, sigma_text, freq, sent_dist



def get_sentiments(X):
    '''Add the sentiment-analysis outputs to the data frame, in place.

    Every value of ``X['clean_message']`` is converted to ``str`` and passed
    to ``SentimentAnalysis``. Rows are matched by position, so the frame may
    carry any index (filtered, shuffled, non-unique).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``clean_message`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object ``X`` with these columns added (or overwritten):
        ``num_sentence``, ``num_words``, ``expected_sentiment_text``,
        ``sigma_sentiment_text`` and the five ``freq-*`` count columns.
        A row whose analysis fails keeps 0 in every added column and the
        failure is printed together with its cause.

    Raises
    ------
    KeyError
        If ``X`` has no ``clean_message`` column.
    '''
    # Column name -> dtype. Scores are floats, everything else is a count;
    # giving each column its final dtype avoids writing floats into int
    # columns.
    col_dtypes = [
        ('num_sentence', np.int64),
        ('num_words', np.int64),
        ('expected_sentiment_text', np.float64),
        ('sigma_sentiment_text', np.float64),
        ('freq-very_negative', np.int64),
        ('freq-negative', np.int64),
        ('freq-neutral', np.int64),
        ('freq-positive', np.int64),
        ('freq-very_positive', np.int64),
    ]
    failed_row = (0, 0, 0.0, 0.0, 0, 0, 0, 0, 0)

    rows = []
    for position, message in enumerate(X['clean_message']):
        try:
            (num_sentence, num_words, expected_text,
             sigma_text, freq, _) = SentimentAnalysis(str(message))
            counts = freq[0]
            rows.append((num_sentence, num_words, expected_text, sigma_text,
                         counts[0], counts[1], counts[2], counts[3], counts[4]))
        except Exception as exc:
            # Best effort: one bad message must not abort the whole run, but
            # the cause is reported so failures can be diagnosed.
            print("error where i =", position, "-", repr(exc))
            rows.append(failed_row)

    # Assign whole columns as plain arrays so values line up by position
    # regardless of the frame's index labels.
    for col_pos, (name, dtype) in enumerate(col_dtypes):
        X[name] = np.asarray([row[col_pos] for row in rows], dtype=dtype)

    return X

In [None]:
class Sentiment(BaseEstimator, TransformerMixin):
    '''Stateless transformer that delegates the whole transformation to a
    user-supplied callable.'''

    def __init__(self, func):
        # Callable applied to the input in `transform`; stored under the
        # same name as the constructor argument.
        self.func = func

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Apply the wrapped callable to X and return its result.'''
        transformed = self.func(X)
        return transformed

In [None]:
# Wrap the scoring function in the transformer and run it over the messages.
# get_sentiments writes its columns into `df` itself; the returned frame is
# left as the last expression of the cell.
sent = Sentiment(func=get_sentiments)
sent.fit_transform(df)

In [None]:
# Persist the frame with the sentiment columns. The context manager flushes
# and closes the file deterministically, so the pickle is complete on disk
# as soon as this cell finishes.
with open('df_sentiments.pkd', 'wb') as sentiments_file:
    dill.dump(df, sentiments_file)