In [1]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from itertools import chain, product
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
import pandas as pd
import re
import time
from nltk.tag import pos_tag,map_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer as form_replacer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import spacy

In [None]:
# Download the NLTK 'universal_tagset' resource.
# NOTE(review): presumably required by `map_tag` imported in the first cell;
# none of the visible cells call it -- confirm whether this download is still needed.
import nltk
nltk.download('universal_tagset')

# Sentiment Aware Metric (SAM)

Calculates the sentiment scores of the mismatched words between hyp and ref. It computes the absolute difference between the sum of the sentiment scores of the mismatched words in hyp and the sum of the sentiment scores of the mismatched words in ref.

# Steps
1. Calculate the exact match between hyp and ref
2. Calculate the Sentiment Distance between the unmatched words according to the Sentiment Words Lexicon "prior sentiment scores":
    1. decontract negative (e.g "don't" to 'do not')
    2. Assigns pos tags by Spacy, get the dependency relation
    3. Lemmatize with WordNet lemmatizer
    4. Change the lemmas into the SW lexicon entry form lemma#pos
    5. returns an enumerated list of lemma#pos (exclude the lemmas that do not have a POS in SW and hence zero sentiment score)
    6. change determiners ('DET') such as 'no', and conjunctions ('CCONJ') such as 'or', to Noun
    7. Compute the average between the scores of the mismatched words in hyp and ref
### NOTE:
The sentiment value of 'not' is increased to reflect a proportional penalty to the flipping of the sentiment

# Calculate Exact Match

In [None]:
def _generate_enums(hypothesis, reference, preprocess=str.lower): # this tokenizes and returnd (index,w) tuple
    """
    Takes in string inputs for hypothesis and reference and returns
    enumerated word lists for each of them

    :param hypothesis: hypothesis string
    :type hypothesis: str
    :param reference: reference string
    :type reference: str
    :preprocess: preprocessing method (default str.lower)
    :type preprocess: method
    :return: enumerated words list
    :rtype: list of 2D tuples, list of 2D tuples
    """
    hypothesis_list = list(enumerate(preprocess(hypothesis).split()))
    reference_list = list(enumerate(preprocess(reference).split()))
    return hypothesis_list, reference_list

In [None]:
def exact_match(hypothesis, reference):
    """
    Align identical words of a hypothesis and a reference sentence.

    The two strings are tokenized and enumerated by ``_generate_enums`` and
    the resulting lists are handed to ``_match_enums``.

    :param hypothesis: hypothesis string
    :type hypothesis: str
    :param reference: reference string
    :type reference: str
    :return: matched (hypothesis index, reference index) pairs, the
             unmatched hypothesis (index, word) tuples and the unmatched
             reference (index, word) tuples
    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
    """
    return _match_enums(*_generate_enums(hypothesis, reference))

In [None]:
# Takes the two (index, word) lists and builds a list of (hyp_index, ref_index) tuples for every pair of identical words,
# then returns it together with what is left unmatched, as (hyp_index, word) and (ref_index, word) tuples.
def _match_enums(enum_hypothesis_list, enum_reference_list):
    """
    matches exact words in hypothesis and reference and returns
    a word mapping between enum_hypothesis_list and enum_reference_list
    based on the enumerated word id.

    :param enum_hypothesis_list: enumerated hypothesis list
    :type enum_hypothesis_list: list of tuples
    :param enum_reference_list: enumerated reference list
    :type enum_reference_list: list of 2D tuples
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
    """
    word_match = []
    for i in range(len(enum_hypothesis_list))[::-1]:
        for j in range(len(enum_reference_list))[::-1]:
            if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
                word_match.append( # attach the indeces without the words
                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                )
                (enum_hypothesis_list.pop(i)[1], enum_reference_list.pop(j)[1]) # take out the matched tuple from the original list
                break
    return word_match, enum_hypothesis_list, enum_reference_list

# Calculate Sentiment Distance between the Unmatched words

In [None]:
# Function to decontract negatives and other abbr

def decontracted(phrase):
    """
    Expand English contractions (e.g. "don't" -> "do not").

    Matching is case-insensitive and accepts both the straight (') and the
    typographic (\u2019) apostrophe, so "Won't" and "won\u2019t" are expanded
    as well. For the irregular forms "won't" and "can't" the expansion is
    capitalised when the contraction started with a capital letter.

    Note that "'s" is always expanded to " is", so possessives such as
    "John's" become "John is".

    :param phrase: text that may contain contractions
    :type phrase: str
    :return: the text with contractions expanded
    :rtype: str
    """
    apostrophe = "['\u2019]"  # straight or typographic apostrophe

    def _matching_case(expansion):
        # Build a replacement callback that capitalises the expansion when the
        # matched contraction began with a capital ("Won't" -> "Will not").
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return _replace

    # Irregular forms go first: the generic "n't" rule below would otherwise
    # turn them into "wo not" / "ca not".
    phrase = re.sub(r"\bwon" + apostrophe + r"t\b", _matching_case("will not"),
                    phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\bcan" + apostrophe + r"t\b", _matching_case("can not"),
                    phrase, flags=re.IGNORECASE)

    # Generic suffix rules; "n't" must be applied before the bare "'t" rule.
    suffix_rules = (
        ("n" + apostrophe + "t", " not"),
        (apostrophe + "re", " are"),
        (apostrophe + "s", " is"),
        (apostrophe + "d", " would"),
        (apostrophe + "ll", " will"),
        (apostrophe + "t", " not"),
        (apostrophe + "ve", " have"),
        (apostrophe + "m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)

    return phrase



In [None]:
# Functions that turn POS tags (spaCy, NLTK or Penn Treebank) into WordNet POS tags so they are compatible with the SW lexicon.
# Determiners and conjunctions are mapped to noun so that their scores can be retrieved from SW.
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Return True when ``tag`` is treated as a noun.

    Besides the noun tags, 'CCONJ' and 'DET' are deliberately included in
    this group.
    """
    noun_like_tags = ('NOUN', 'CCONJ', 'DET', 'NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_like_tags


def is_verb(tag):
    """Return True when ``tag`` is one of the verb tags."""
    verb_tags = ('VERB', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags


def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags."""
    adverb_tags = ('ADV', 'RB', 'RBR', 'RBS')
    return tag in adverb_tags


def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags."""
    adjective_tags = ('ADJ', 'JJ', 'JJR', 'JJS')
    return tag in adjective_tags


def penn_to_wn(tag):
    """Map a POS tag to the corresponding WordNet POS constant.

    The tag is tested against the adjective, noun, adverb and verb groups,
    in that order; the first group that contains it decides the result.

    :param tag: POS tag string
    :return: wn.ADJ, wn.NOUN, wn.ADV or wn.VERB, or None when the tag
             belongs to none of the groups
    """
    # The WordNet attribute is resolved only for the group that matches.
    group_checks = (
        (is_adjective, "ADJ"),
        (is_noun, "NOUN"),
        (is_adverb, "ADV"),
        (is_verb, "VERB"),
    )
    for belongs_to_group, wn_attribute in group_checks:
        if belongs_to_group(tag):
            return getattr(wn, wn_attribute)
    return None

In [None]:
# function that takes the raw hyp/ref and:
# 1. decontract negative (e.g "don't" to 'do not')
# 2. Assigns pos tags by Spacy, get the dependency relation
# 3. Lemmatize with WordNet lemmatizer
# 4. Get the lemmas in the SW lexicon entry form lemma#pos
# 5. returns an enumerated list of lemma#pos (excludes words that do not have lemmas in SW, 0 sentiment score)



# Loaded spaCy pipelines keyed by model name, so a model is read from disk
# only once instead of on every preprocess2 call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def preprocess2(tran):
    """
    Turn a raw sentence into an enumerated list of 'lemma#pos' entries.

    Steps:
      1. expand contractions with ``decontracted``;
      2. lower-case the text and tag it with the spaCy 'en_core_web_sm' model;
      3. map each token's POS tag to a WordNet POS with ``penn_to_wn``;
         tokens whose tag has no WordNet POS are dropped;
      4. lemmatize the token text with the WordNet lemmatizer and append
         '#<pos>' to the lemma.

    The returned indices are positions in the FILTERED list (dropped tokens
    do not consume an index).

    Errors raised by the lemmatizer itself are no longer swallowed; only the
    "no WordNet POS" case is skipped.

    :param tran: raw hypothesis or reference sentence
    :type tran: str
    :return: enumerated 'lemma#pos' entries
    :rtype: list of 2D tuples
    """
    doc = _get_spacy_pipeline()(decontracted(tran).lower())
    lemmatizer = form_replacer()  # one lemmatizer for the whole sentence

    tokens = []
    for token in doc:
        wn_pos = penn_to_wn(token.pos_)
        if wn_pos is None:
            # Tag has no WordNet POS, so it cannot be looked up as lemma#pos.
            continue
        tokens.append(lemmatizer.lemmatize(token.text, wn_pos) + "#" + wn_pos)

    return list(enumerate(tokens))

In [None]:
# Example: lemma#pos entries for a sample sentence. The sentence is written
# inline so this cell does not depend on a variable defined in a later cell.
preprocess2("the sad Twitter brings really depression")

[(0, 'the#n'),
 (1, 'sad#a'),
 (2, 'twitter#n'),
 (3, 'bring#v'),
 (4, 'really#r'),
 (5, 'depression#n')]

In [None]:
# function to get exact matches indexes, and mismatched hyp and ref words with POS tags

def senti_exact_match(hyp,ref):
    """
    Match identical 'lemma#pos' entries of a hypothesis and a reference.

    Both sentences are run through ``preprocess2`` and the resulting
    enumerated lists are aligned with ``_match_enums`` (the same matching
    routine used by ``exact_match``).

    :param hyp: hypothesis (translation) string
    :type hyp: str
    :param ref: reference string
    :type ref: str
    :return: matched (hypothesis index, reference index) pairs, unmatched
             hypothesis (index, 'lemma#pos') tuples, unmatched reference
             (index, 'lemma#pos') tuples
    :rtype: list of 2D tuples, list of 2D tuples, list of 2D tuples
    """
    return _match_enums(preprocess2(hyp), preprocess2(ref))

In [None]:
# Example pair used by the demo cells: `h` is passed as the hypothesis and `f` as the reference.
h = "the sad Twitter brings really depression" # preprocess2 lemmatizes "brings" to "bring", so it can match "bring" in the reference
f = " the sad twitter doesn't really bring happiness"

In [None]:
# Shows (matched index pairs, unmatched hypothesis entries, unmatched reference entries) for the example pair.
senti_exact_match(h,f)

([(4, 5), (3, 6), (2, 2), (1, 1), (0, 0)],
 [(5, 'depression#n')],
 [(3, 'do#v'), (4, 'not#r'), (7, 'happiness#n')])

In [None]:
# function to calculate the sentiment score difference between mismatched words

def senti_score(hyp, ref, senti_dict=None,
                lexicon_path="C:/Users/sadan/OneDrive/PhD/metric/metrics/senti_lexicon/SentiWords.txt"):
    """
    Absolute difference between the summed sentiment scores of the
    mismatched words in the hypothesis and in the reference.

    :param hyp: hypothesis (translation) string
    :param ref: reference string
    :param senti_dict: optional pre-loaded lexicon mapping 'lemma#pos' to a
        score; when given, no file is read (useful when scoring many pairs)
    :param lexicon_path: tab-separated lexicon file without header
        ('lemma#pos' <TAB> score), read only when ``senti_dict`` is None
    :return: (diff, hyp_ls, ref_ls, hyp_values, ref_values) where hyp_ls and
        ref_ls are the unmatched (index, 'lemma#pos') tuples and hyp_values /
        ref_values are the scores of those entries that exist in the
        lexicon, in the order the words appear in hyp_ls / ref_ls
    """
    _, hyp_ls, ref_ls = senti_exact_match(hyp, ref)

    if senti_dict is None:
        column_names = ['lemmaPOS', 'score']
        df = pd.read_csv(lexicon_path, header=None, delimiter="\t", names=column_names)
        senti_dict = dict(df.values.tolist())

    # Direct dictionary lookups instead of scanning every lexicon key.
    hyp_values = [senti_dict[entry] for _, entry in hyp_ls if entry in senti_dict]
    ref_values = [senti_dict[entry] for _, entry in ref_ls if entry in senti_dict]

    diff = abs(sum(hyp_values) - sum(ref_values))
    return diff, hyp_ls, ref_ls, hyp_values, ref_values

In [None]:
# function to calculate the sentiment score difference between mismatched words

def senti_score2(hyp, ref, senti_dict=None,
                 lexicon_path="C:/Users/sadan/OneDrive/PhD/metric/metrics/senti_lexicon/SentiWords.txt"):
    """
    Summary counts and summed sentiment scores for the mismatched words of a
    hypothesis/reference pair.

    :param hyp: hypothesis (translation) string
    :param ref: reference string
    :param senti_dict: optional pre-loaded lexicon mapping 'lemma#pos' to a
        score; when given, no file is read (useful when scoring many pairs)
    :param lexicon_path: tab-separated lexicon file without header
        ('lemma#pos' <TAB> score), read only when ``senti_dict`` is None
    :return: dict with the number of unmatched hypothesis entries
        ('trans_mismatch_num'), the number of unmatched reference entries
        ('ref_mismatch_num'), the summed lexicon scores of each side
        ('trans_senti_score', 'ref_senti_score') and the number of matched
        word pairs ('num_matched_words')
    """
    matched_words, hyp_ls, ref_ls = senti_exact_match(hyp, ref)

    if senti_dict is None:
        column_names = ['lemmaPOS', 'score']
        df = pd.read_csv(lexicon_path, header=None, delimiter="\t", names=column_names)
        senti_dict = dict(df.values.tolist())

    # Direct dictionary lookups instead of scanning every lexicon key;
    # entries missing from the lexicon contribute nothing.
    hyp_values = [senti_dict[entry] for _, entry in hyp_ls if entry in senti_dict]
    ref_values = [senti_dict[entry] for _, entry in ref_ls if entry in senti_dict]

    return {'trans_mismatch_num': len(hyp_ls),
            'ref_mismatch_num': len(ref_ls),
            'trans_senti_score': sum(hyp_values),
            'ref_senti_score': sum(ref_values),
            'num_matched_words': len(matched_words)}

In [None]:
# function to calculate the sentiment score difference between mismatched words (take zero out)

def senti_score3(hyp, ref, senti_dict=None,
                 lexicon_path="C:/Users/sadan/OneDrive/PhD/metric/metrics/senti_lexicon/SentiWords.txt"):
    """
    Detailed sentiment comparison of the mismatched words of a
    hypothesis/reference pair, with scores rounded to two decimals.

    :param hyp: hypothesis (translation) string
    :param ref: reference string
    :param senti_dict: optional pre-loaded lexicon mapping 'lemma#pos' to a
        score; when given, no file is read (useful when scoring many pairs)
    :param lexicon_path: tab-separated lexicon file without header
        ('lemma#pos' <TAB> score), read only when ``senti_dict`` is None
    :return: dict with the unmatched entries of each side, their rounded
        lexicon scores (in word order; entries missing from the lexicon are
        left out), the number of unmatched entries per side, the rounded
        absolute difference of the two score sums and the number of matched
        word pairs
    """
    matched_words, hyp_ls, ref_ls = senti_exact_match(hyp, ref)

    if senti_dict is None:
        column_names = ['lemmaPOS', 'score']
        df = pd.read_csv(lexicon_path, header=None, delimiter="\t", names=column_names)
        senti_dict = dict(df.values.tolist())

    def _rounded_scores(entries):
        # Look each unmatched entry up directly instead of scanning the lexicon.
        scores = []
        for _, entry in entries:
            if entry not in senti_dict:
                continue
            try:
                scores.append(round(senti_dict[entry], 2))
            except TypeError:
                # Non-numeric score in the lexicon: skip the entry, as the
                # previous blanket `except` did, but without hiding other errors.
                continue
        return scores

    hyp_values = _rounded_scores(hyp_ls)
    ref_values = _rounded_scores(ref_ls)

    diff = abs(sum(ref_values) - sum(hyp_values))
    return {'trans_mismatch': hyp_ls,
            'trans_senti_scores': hyp_values,
            'transi_mis_len': len(hyp_ls),
            'ref_mismatch': ref_ls,
            'ref_senti_scores': ref_values,
            'ref_mis_len': len(ref_ls),
            'absulte_Diff': round(diff, 2),
            'num_matched_words': len(matched_words)}
            

In [None]:
# function to calculate the sentiment score difference between mismatched words (take zero out)

def senti_score4(hyp, ref, senti_dict=None,
                 lexicon_path="C:/Users/sadan/OneDrive/PhD/metric/metrics/senti_lexicon/SentiWords.txt"):
    """
    Rounded sentiment scores of the mismatched words of a
    hypothesis/reference pair.

    :param hyp: hypothesis (translation) string
    :param ref: reference string
    :param senti_dict: optional pre-loaded lexicon mapping 'lemma#pos' to a
        score; when given, no file is read (useful when scoring many pairs)
    :param lexicon_path: tab-separated lexicon file without header
        ('lemma#pos' <TAB> score), read only when ``senti_dict`` is None
    :return: (hyp_values, ref_values): the lexicon scores, rounded to two
        decimals, of the unmatched hypothesis and reference entries that
        exist in the lexicon, in word order
    """
    _, hyp_ls, ref_ls = senti_exact_match(hyp, ref)

    if senti_dict is None:
        column_names = ['lemmaPOS', 'score']
        df = pd.read_csv(lexicon_path, header=None, delimiter="\t", names=column_names)
        senti_dict = dict(df.values.tolist())

    # Direct dictionary lookups instead of scanning every lexicon key.
    hyp_values = [round(senti_dict[entry], 2) for _, entry in hyp_ls if entry in senti_dict]
    ref_values = [round(senti_dict[entry], 2) for _, entry in ref_ls if entry in senti_dict]

    return hyp_values, ref_values

# Assigning penalty

In [None]:
import numpy
def senti_penalty(ls):
    """
    Magnitude-weighted mean of a list of sentiment scores.

    Computes sum(|s| * s) / sum(|s|), so scores with a larger absolute value
    weigh more.

    :param ls: sentiment scores
    :type ls: list of numbers
    :return: the weighted mean, or 0.0 when ``ls`` is empty or contains only
             zeros (previously this divided by zero and produced nan)
    """
    magnitudes = [abs(score) for score in ls]
    total_magnitude = sum(magnitudes)
    if total_magnitude == 0:
        # No sentiment-bearing scores at all: nothing to penalise.
        return 0.0
    return numpy.dot(magnitudes, ls) / total_magnitude

