## Data Preparation and Modification

In [1]:
import pandas as pd
import csv
import time

In [179]:

# Factor should be an exact decimal
def make_new_dataset(path, limit, factor):
    limit = abs(limit)

    base_data = pd.read_csv('data/lexicons/NRC-VAD-Lexicon-v2.1.csv', delimiter='\t')
    given_data = pd.read_csv(path, delimiter=',')

    terms = base_data['term'].tolist()
    arousal = base_data['arousal'].tolist()
    dominance = base_data['dominance'].tolist()


    result_terms = []
    result_valence = []
    result_arousal = []
    result_dominance = []

    available_fieldnames = given_data.columns.tolist()
    for name in available_fieldnames:
        if name == 'term':
            given_terms = given_data['term'].tolist()
        elif name == 'valence':
            given_valence = given_data['valence'].tolist()
        elif name == 'arousal':
            given_arousal = given_data['arousal'].tolist()
        elif name == 'dominance':
            given_dominance = given_data['dominance'].tolist()

    
    sig_figs = int(len(str(factor)[2:]))

    ref_vals = []
    for i in range(int(1 / factor) + 1):
        ref_val = i * factor 
        ref_vals.append(round(ref_val, sig_figs))
    
    for i in range(len(given_terms)):
        word = given_terms[i]
        val = given_valence[i]
        try:
            arr = given_arousal[i]
        except:
            try:
                j = terms.index(word) # try to find term
                arr = arousal[j]
            except:
                continue
        try:
            dom = given_dominance[i]
        except:
            try:
                j = terms.index(word) # try to find term
                dom = dominance[j]
            except:
                continue 

        try:
            if ' ' in str(word):
                word = word.replace(' ', '-')
                given_terms[i] = word 
        except:
            pass
        
        
        val = round(float(val), sig_figs)
        arr = round(float(arr), sig_figs)
        dom = round(float(dom), sig_figs)

        scaled_val = round(val / (2 * limit), sig_figs)
        scaled_arr = round(arr / (2 * limit), sig_figs)
        scaled_dom = round(dom / (2 * limit), sig_figs)

        if val != 0:
            val = 0.5 + scaled_val 
        else:
            val = 0.5
        
        if arr != 0: 
            arr = 0.5 + scaled_arr 
        else:
            arr = 0.5
        
        if dom != 0:
            dom = 0.5 + scaled_dom 
        else:
            dom = 0.5

        val_proximity_list = []
        arr_proximity_list = []
        dom_proximity_list = []

        for ref_val in ref_vals:
            val_proximity = round(abs(val - ref_val), sig_figs)
            arr_proximity = round(abs(arr - ref_val), sig_figs)
            dom_proximity = round(abs(dom - ref_val), sig_figs)

            val_proximity_list.append(val_proximity)
            arr_proximity_list.append(arr_proximity)
            dom_proximity_list.append(dom_proximity)


        min_val_proximity = min(val_proximity_list)
        min_arr_proximity = min(arr_proximity_list)
        min_dom_proximity = min(dom_proximity_list)

        try:
            min_val_index = val_proximity_list.index(round(min_val_proximity, sig_figs))
        except:
            print(min_val_proximity)
            print(val_proximity_list)
            print(ref_vals)
            print(val)
            print(i)
        min_arr_index = arr_proximity_list.index(round(min_arr_proximity, sig_figs))
        min_dom_index = dom_proximity_list.index(round(min_dom_proximity, sig_figs))


        val = ref_vals[min_val_index]
        arr = ref_vals[min_arr_index]
        dom = ref_vals[min_dom_index]

        result_terms.append(word) 
        result_valence.append(round(val, sig_figs)) 
        result_arousal.append(round(arr, sig_figs ))
        result_dominance.append(round(dom, sig_figs))

    data = {'term': result_terms, 'valence': result_valence , 'arousal': result_arousal, 'dominance': result_dominance}
    df = pd.DataFrame(data)
    df.to_csv(f'data/lexicons/data-{factor}.csv', index=False)
    

In [30]:
import csv 
import pandas as pd

def txt_to_csv(path):
    
    new_lines = []
  
    with open(path, 'r') as file:
           for line in file:
                 objects = []
                 temp_obj = ""
                 for char in line:
                       if char == "\t" or char == "\n":
                             objects.append(temp_obj)
                             temp_obj = ""
                       else:
                             temp_obj += char
                 new_lines.append(objects)
              
    print(new_lines[1][5])
    df = pd.DataFrame({new_lines[0][0]: [sub[0] for sub in new_lines] , new_lines[0][1]: [sub[1] for sub in new_lines], new_lines[0][2]: [sub[2] for sub in new_lines], new_lines[0][3]: [sub[3] for sub in new_lines] , new_lines[0][4]: [sub[4] for sub in new_lines], new_lines[0][5]: [sub[5] for sub in new_lines]})

    df.to_csv('SentiWordNet.csv', index=False)



In [31]:
txt_to_csv("data/lexicons/SentiWordNet_3.0.0.txt")

(usually followed by `to') having the necessary means or skill or know-how or authority to do something; "able to swim"; "she was able to program her computer"; "we were at last able to buy a car"; "able to get a grant for the project"


In [21]:
txt_to_csv(pos_path='data/products/1/kitchen/positive.txt', neg_path="data/products/1/kitchen/negative.txt")

In [112]:
thing = [1, 2, 3, 4, 5, 6]
print(thing[:1])

[1]


In [41]:
x = 0.1


In [181]:
path = "data/lexicons/vader_lexicon.csv"
make_new_dataset(path=path, factor=x, limit=4) # Create a new dataset with x increments

[-1.5, -0.4, -1.5]


In [173]:
txt_to_csv("data/lexicons/vader_lexicon.txt", '\t', 2)

In [37]:
df = pd.read_csv("data/lexicons/SentiWordNet.csv")


pos_scores = df["PosScore"].tolist()
neg_scores = df["NegScore"].tolist()
term_groups = df["SynsetTerms"].tolist()

pos_scores.remove("PosScore")
neg_scores.remove("NegScore")
term_groups.remove("SynsetTerms")

new_combined_scores = []

for j, k in zip(pos_scores, neg_scores): # Valence is simply computed by combining negative and positive sentiment, with 0.5 being neutral (objective), 0 negative, and 1 positive
    
    j = float(j)
    k = float(k)
    comb_score = 0
    if j > k:
        comb_score = round(0.5 + ((j - k) / 2), 1)
    elif k > j:
        comb_score = round(0.5 - ((k - j) / 2), 1)
    elif k == j:
        comb_score = 0.5

    new_combined_scores.append(comb_score)

new_df = pd.DataFrame({"SynsetTerms": term_groups, "valence": new_combined_scores})
new_df.to_csv("valence_sentiwordnet.csv", index=False)


In [62]:
terms = []
valences = []
group_ids = []
seen_terms = set()


for j, group in enumerate(term_groups):
    group_valence = new_combined_scores[j]
    temp_word = ""
    for i, char in enumerate(group):
        if char == " ":
            temp_word = temp_word.split("#", 1)[0]
            if temp_word not in seen_terms:
              terms.append(temp_word)
              valences.append(group_valence) # also takes and holds the valence of only the first appearing definition of the word
              group_ids.append(j)
              seen_terms.add(temp_word)
            temp_word = ""
        elif i == (len(group) - 1):
            temp_word += char 
            temp_word = temp_word.split("#", 1)[0]
            if temp_word not in seen_terms:
              terms.append(temp_word)
              valences.append(group_valence)
              group_ids.append(j)
              seen_terms.add(temp_word)
            temp_word = ""
        else:
            temp_word += char 



df = pd.DataFrame({"term": terms, "valence": valences, "groupid": group_ids})
df.to_csv("cleaned_sentiwordnet.csv", index=False)

print(terms[0])
print(valences[0])
    

            


able
0.6


In [26]:
crc_df = pd.read_csv("data/lexicons/NRC-CONV.csv")
crc_terms = crc_df["term"].tolist()
crc_arousal = crc_df["arousal"].tolist()
crc_dominance = crc_df["dominance"].tolist()

senti_df = pd.read_csv("cleaned_sentiwordnet.csv")
terms = senti_df["term"].tolist()
valences = senti_df["valence"].tolist()
group_ids = senti_df["groupid"].tolist()

term = []
valence = []
arousal = []
dominance = []
seen_ids = []
for i, t in enumerate(terms):
    if t in crc_terms:
        crc_index = crc_terms.index(t)
        val = valences[i]
        arr = crc_arousal[crc_index]
        dom = crc_dominance[crc_index]

        term.append(t)
        valence.append(val)
        arousal.append(arr)
        dominance.append(dom)
        seen_ids.append(group_ids[i])
        #print(f"appended {t} with {group_ids[i]}")
    elif group_ids[i] in seen_ids:
        
        seen_index = seen_ids.index(group_ids[i])

        x = term[seen_index]
    
        crc_index = crc_terms.index(x)
        val = valences[i]
        arr = crc_arousal[crc_index]
        dom = crc_dominance[crc_index]

        
        term.append(t)
        valence.append(val)
        arousal.append(arr)
        dominance.append(dom)
        seen_ids.append(group_ids)


final_df = pd.DataFrame({"term": term , "valence": valence, "arousal": arousal, "dominance": dominance })
final_df.to_csv("SentiWordNet-CONV.csv", index=False)





In [20]:
crc_df = pd.read_csv("data/lexicons/NRC-CONV.csv")
crc_terms = crc_df["term"].tolist()
crc_arousal = crc_df["arousal"].tolist()
crc_dominance = crc_df["dominance"].tolist()

senti_df = pd.read_csv("data/lexicons/vader_lexicon.csv")
terms = senti_df["term"].tolist()
valences = senti_df["valence"].tolist()

term = []
valence = []
arousal = []
dominance = []
for i, t in enumerate(terms):
    val = valences[i]
    if t in crc_terms:
        crc_index = crc_terms.index(t)
        arr = crc_arousal[crc_index]
        dom = crc_dominance[crc_index]
        #print(f"appended {t} with {group_ids[i]}")
    else:
        val = valences[i]
        arr = -0.1
        dom = -0.1

    term.append(t)
    valence.append(val)
    arousal.append(arr)
    dominance.append(dom)

final_df = pd.DataFrame({"term": term , "valence": valence, "arousal": arousal, "dominance": dominance })
final_df.to_csv("vader-CONV.csv", index=False)


## Sentiment Analysis Algorithm

In [1]:

import re
import nltk
import string
import time
import pandas as pd
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import TweetTokenizer 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rahsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Cleaning inputs 
def decompose_review(review):
    tokenizer = TweetTokenizer()
    bow = tokenizer.tokenize(text=str(review).lower())
    return bow 

In [3]:
decompose_review("row row row your boat didn't like the end of the album!!!")

['row',
 'row',
 'row',
 'your',
 'boat',
 "didn't",
 'like',
 'the',
 'end',
 'of',
 'the',
 'album',
 '!',
 '!',
 '!']

In [64]:
# Grabbing the data 

data = pd.read_csv(f'data/lexicons/vader-CONV.csv') # Change this

terms = data['term'].tolist()
valence = data['valence'].tolist()
arousal = data['arousal'].tolist()
dominance = data['dominance'].tolist()

lexicon = {}
key = {}
#subjects_map = {'movies': [[0.4, 0.45, 0.3], [0.1, 0.4, 0.8]],  'movie': [0.4, 0.45]} #Mapping of subjects to transformations


for term, val, arr, dom in zip(terms, valence, arousal, dominance):
  #lexicon[term] = [val, arr, dom]
  lexicon[term] = [round((val + 4) / 8,1), arr, dom]



### Electra (Alpha)

Constants

In [81]:
def transformation(array, map): #Check eligibility and return a transformation for a term based on a given map
    val = array[0]
    points = 0
    for x, y in zip(array, map):
        if x == y or y == 1.1:
            points += 1
        elif x == -0.01:
            break
        else:
            pass
    if points == 3:
        val_mag = round(abs(val - 0.5), 2)
        if val < 0.5:
            val = 0.5 + val_mag
        elif val > 0.5:
            val = 0.5 - val_mag
        else:
            pass
    return val
        


In [79]:

def inference(review, maps=False, map=[0.5, 0.5, 0.5], sig_figs=2):
  start = time.perf_counter()
  bow = decompose_review(review)
  #print(f'starting task for sentence: {sentence} from user: {user}')

  tot_sentiment = 0
  term_count = 0
  for i, word in enumerate(bow):
      try:
        array = lexicon[word]
      except:
        array = [0.5, 0.5, 0.5]
      val = array[0]
      
      if maps:
        val = transformation(array=array, map=map)
      
      val = round(val, sig_figs)
      #print(f'word: {word}, valence: {val}')
      if val != 0.5:
        tot_sentiment += val 
        term_count += 1

  try:
    final_sentiment = round((tot_sentiment / term_count), sig_figs)
  except:
    final_sentiment = 0.5
  end = time.perf_counter()
  elapsed = end - start
  return (final_sentiment, elapsed)

In [73]:
print(inference("the cow  jumped over the moon!"))
print(inference("the bad cow  jumped over the moon!"))

(0.5, 0.00014659999942523427)
(0.2, 0.00031970000054570846)


In [42]:
print(inference("the cow  jumped over the moon!"))

(0.5, 0.00014930000179447234)


### Electra 

In [None]:

# nearest word with a valence not equal to 0.5 is amplified 

In [8]:
# Modified VADER Algorithm for Experimentation 

# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2025 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
#         Malavika Suresh <malavika.suresh0794@gmail.com> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.

"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""

import math
import re
import string
from itertools import product

import nltk.data
from nltk.util import pairwise
import pandas as pd 

transformations = True 


class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    ##Constants##
    # (empirically derived mean sentiment intensity rating increase for booster words)
    B_INCR = 0.293
    B_DECR = -0.293

    # (empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word)
    C_INCR = 0.733

    N_SCALAR = -0.74

    NEGATE = {
        "aint",
        "arent",
        "cannot",
        "cant",
        "couldnt",
        "darent",
        "didnt",
        "doesnt",
        "ain't",
        "aren't",
        "can't",
        "couldn't",
        "daren't",
        "didn't",
        "doesn't",
        "dont",
        "hadnt",
        "hasnt",
        "havent",
        "isnt",
        "mightnt",
        "mustnt",
        "neither",
        "don't",
        "hadn't",
        "hasn't",
        "haven't",
        "isn't",
        "mightn't",
        "mustn't",
        "neednt",
        "needn't",
        "never",
        "none",
        "nope",
        "nor",
        "not",
        "nothing",
        "nowhere",
        "oughtnt",
        "shant",
        "shouldnt",
        "uhuh",
        "wasnt",
        "werent",
        "oughtn't",
        "shan't",
        "shouldn't",
        "uh-uh",
        "wasn't",
        "weren't",
        "without",
        "wont",
        "wouldnt",
        "won't",
        "wouldn't",
        "rarely",
        "seldom",
        "despite",
    }

    # booster/dampener 'intensifiers' or 'degree adverbs'
    # https://en.wiktionary.org/wiki/Category:English_degree_adverbs

    BOOSTER_DICT = {
        "absolutely": B_INCR,
        "amazingly": B_INCR,
        "awfully": B_INCR,
        "completely": B_INCR,
        "considerably": B_INCR,
        "decidedly": B_INCR,
        "deeply": B_INCR,
        "effing": B_INCR,
        "enormously": B_INCR,
        "entirely": B_INCR,
        "especially": B_INCR,
        "exceptionally": B_INCR,
        "extremely": B_INCR,
        "fabulously": B_INCR,
        "flipping": B_INCR,
        "flippin": B_INCR,
        "fricking": B_INCR,
        "frickin": B_INCR,
        "frigging": B_INCR,
        "friggin": B_INCR,
        "fully": B_INCR,
        "fucking": B_INCR,
        "greatly": B_INCR,
        "hella": B_INCR,
        "highly": B_INCR,
        "hugely": B_INCR,
        "incredibly": B_INCR,
        "intensely": B_INCR,
        "majorly": B_INCR,
        "more": B_INCR,
        "most": B_INCR,
        "particularly": B_INCR,
        "purely": B_INCR,
        "quite": B_INCR,
        "really": B_INCR,
        "remarkably": B_INCR,
        "so": B_INCR,
        "substantially": B_INCR,
        "thoroughly": B_INCR,
        "totally": B_INCR,
        "tremendously": B_INCR,
        "uber": B_INCR,
        "unbelievably": B_INCR,
        "unusually": B_INCR,
        "utterly": B_INCR,
        "very": B_INCR,
        "almost": B_DECR,
        "barely": B_DECR,
        "hardly": B_DECR,
        "just enough": B_DECR,
        "kind of": B_DECR,
        "kinda": B_DECR,
        "kindof": B_DECR,
        "kind-of": B_DECR,
        "less": B_DECR,
        "little": B_DECR,
        "marginally": B_DECR,
        "occasionally": B_DECR,
        "partly": B_DECR,
        "scarcely": B_DECR,
        "slightly": B_DECR,
        "somewhat": B_DECR,
        "sort of": B_DECR,
        "sorta": B_DECR,
        "sortof": B_DECR,
        "sort-of": B_DECR,
    }

    # check for special case idioms using a sentiment-laden keyword known to SAGE
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # for removing punctuation
    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")

    PUNC_LIST = [
        ".",
        "!",
        "?",
        ",",
        ";",
        ":",
        "-",
        "'",
        '"',
        "!!",
        "!!!",
        "??",
        "???",
        "?!?",
        "!?!",
        "?!?!",
        "!?!?",
    ]

    def __init__(self):
        pass

    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        for first, second in pairwise(input_words):
            if second.lower() == "least" and first.lower() != "at":
                return True
        return False

    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score

    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # check if booster/dampener word is in ALLCAPS (while others aren't)
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar


class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        if not isinstance(text, str):
            text = str(text.encode("utf-8"))
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = {w for w in words_only if len(w) > 1}
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing puncutation
        Leaves contractions and most emoticons
            Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different


class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file=f"data/lexicons/vader-CONV.csv", # Change this 
    ):
        
        data = pd.read_csv(lexicon_file) 

        terms = data['term'].tolist()
        valence = data['valence'].tolist()
        arousal = data['arousal'].tolist()
        dominance = data['dominance'].tolist()

        self.lexicon = {}

        for term, val, arr, dom in zip(terms, valence, arousal, dominance):
          self.lexicon[term] = [val, arr, dom]
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def polarity_scores(self, text, transformations, mapping=[0.5, 0.5, 0.5]):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative value are negative
        valence.

        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
            are interested in processing the text in the hashtags too, then we recommend
            preprocessing your data to remove the #, after which the hashtag text may be
            matched as if it was a normal word in the sentence.
        """
        # text, words_and_emoticons, is_cap_diff = self.preprocess(text)
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        for item in words_and_emoticons:
            valence = 0
            i = words_and_emoticons.index(item)
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments, mapping, transformations)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments, mapping, transformations):
        is_cap_diff = sentitext.is_cap_diff
        score = 0 
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            array = self.lexicon[item_lowercase]
            valence = array[0]
            crc_val = round((valence + 4) / 8,1)
            crc_vad = [crc_val, array[1], array[2]]
            if transformations:
             for x,y in zip(crc_vad , mapping):
                if x == y or y == 1.1:
                    score += 1
                else:
                    score = score

            
            if score == 3:
                    valence = valence * -1
           
            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately preceed the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)
        

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict



In [74]:
def vader_inference(review, maps=False, map=[0.5, 0.5, 0.5]):
  start = time.perf_counter()
  sia = SentimentIntensityAnalyzer()
  sentiment = sia.polarity_scores(review, transformations=maps, mapping=map)['compound']
  if sentiment < 0:
    sentiment = 0.4
  elif sentiment > 0:
    sentiment = 0.6
  elif sentiment == 0:
    sentiment = 0.5
  end = time.perf_counter()
  elapsed = end - start
  return (sentiment, elapsed) 

In [10]:
sentence = "bad bad bad"
print(vader_inference(review=sentence, maps=False, map=[0.2, 0.6, 0.3]))

(0.4, 0.011015300000508432)


In [None]:
sentence = ""
print(inference(review=sentence))
print(vader_inference(review=sentence))

(0.2, 0.00010760000077425502)
-0.6229
(0.4, 0.008932199998525903)


## Loading Test/Train Data

In [8]:
import os

### Product Reviews (Amazon)

In [85]:
books_df = pd.read_csv('data/products/1/books/cleaned.csv')  
dvd_df = pd.read_csv('data/products/1/dvd/cleaned.csv') 
electronics_df = pd.read_csv('data/products/1/electronics/cleaned.csv') 
kitchen_df = pd.read_csv('data/products/1/kitchen/cleaned.csv')   

pos_books_df = books_df[books_df["polarity"] == "positive"]
pos_dvd_df = dvd_df[dvd_df["polarity"] == "positive"]
pos_electronics_df = electronics_df[electronics_df["polarity"] == "positive"]
pos_kitchen_df = kitchen_df[kitchen_df["polarity"] == "positive"]

neg_books_df = books_df[books_df["polarity"] == "negative"]
neg_dvd_df = dvd_df[dvd_df["polarity"] == "negative"]
neg_electronics_df = electronics_df[electronics_df["polarity"] == "negative"]
neg_kitchen_df = kitchen_df[kitchen_df["polarity"] == "negative"]

pos_reviews = pos_books_df["review"].tolist() + pos_dvd_df["review"].tolist() + pos_electronics_df["review"].tolist() + pos_kitchen_df["review"].tolist()
neg_reviews = neg_books_df["review"].tolist() + neg_dvd_df["review"].tolist() + neg_electronics_df["review"].tolist() + neg_kitchen_df["review"].tolist()

pos_reviews = pos_reviews[:25000]
neg_reviews = neg_reviews[:25000]

train_pos_reviews = pos_reviews[:(int(len(pos_reviews) / 2))]
test_pos_reviews = pos_reviews[(-1 * int(len(pos_reviews) / 2)):]

train_neg_reviews = neg_reviews[:(int(len(neg_reviews) / 2))]
test_neg_reviews = neg_reviews[(-1 * int(len(neg_reviews) / 2)):]

train_dataset = [[[x for x in train_pos_reviews], ["positive" for i in range(len(train_pos_reviews))]], [[x for x in train_neg_reviews], ["negative" for i in range(len(train_neg_reviews))]]]
test_dataset = [[[x for x in test_pos_reviews], ["positive" for i in range(len(test_pos_reviews))]], [[x for x in test_neg_reviews], ["negative" for i in range(len(test_neg_reviews))]]]


In [86]:
len(train_dataset[1][1])

2000

### Movies (IMDB)

In [90]:
df = pd.read_csv('data/movies/1/cleaned.csv') 
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv("cleaned.csv", index=False)

In [76]:
# Train/Test Split

df = pd.read_csv('data/movies/1/cleaned.csv') #Change this 

pos_df = df[df["polarity"] == "positive"]
neg_df = df[df["polarity"] == "negative"]

pos_reviews = pos_df["review"].tolist()
neg_reviews = neg_df["review"].tolist()

train_pos_reviews = pos_reviews[:(int(len(pos_reviews) / 2))]
test_pos_reviews = pos_reviews[(-1 * int(len(pos_reviews) / 2)):]

train_neg_reviews = neg_reviews[:(int(len(neg_reviews) / 2))]
test_neg_reviews = neg_reviews[(-1 * int(len(neg_reviews) / 2)):]

train_dataset = [[[x for x in train_pos_reviews], ["positive" for i in range(len(train_pos_reviews))]], [[x for x in train_neg_reviews], ["negative" for i in range(len(train_neg_reviews))]]]
test_dataset = [[[x for x in test_pos_reviews], ["positive" for i in range(len(test_pos_reviews))]], [[x for x in test_neg_reviews], ["negative" for i in range(len(test_neg_reviews))]]]


In [77]:
train_dataset[0][0][1]

'Yes, this production is long (good news for Bronte fans!) and it has a somewhat dated feel, but both the casting and acting are so brilliant that you won\'t want to watch any other versions!<br /><br />Timothy Dalton IS Edward Rochester... it\'s that simple. I don\'t care that other reviewers claim he\'s too handsome. Dalton is attractive, certainly, but no pretty-boy. In fact he possesses a craggy, angular dark charm that, in my mind, is quite in keeping with the mysterious, very masculine Mr R. And he takes on Rochester\'s sad, tortured persona so poignantly. He portrays ferocity when the scene calls for it, but also displays Rochester\'s tender, passionate, emotional side as well. (IMO the newer A&E production suffers in that Ciaran Hinds - whom I normally adore - seems to bluster and bully his way throughout. I\'ve read the book many times and I never felt that Rochester was meant to be perceived as a nonstop snarling beast.)<br /><br />When I reread the novel, I always see Zelah 

### Microblogging (sentiment140)

In [89]:
df = pd.read_csv('data/microblogging/1/uncleaned.csv') 
df = df.drop(columns=["id", "date", "flag", "user"])
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv("cleaned.csv", index=False)

In [90]:
df = pd.read_csv('data/microblogging/1/cleaned.csv') #Change this 

pos_df = df[df["polarity"] == 4]
neg_df = df[df["polarity"] == 0]

pos_reviews = pos_df["review"].tolist()[:25000]
neg_reviews = neg_df["review"].tolist()[:25000]

train_pos_reviews = pos_reviews[:(int(len(pos_reviews) / 2))]
test_pos_reviews = pos_reviews[(-1 * int(len(pos_reviews) / 2)):]

train_neg_reviews = neg_reviews[:(int(len(neg_reviews) / 2))]
test_neg_reviews = neg_reviews[(-1 * int(len(neg_reviews) / 2)):]

train_dataset = [[[x for x in train_pos_reviews], ["positive" for i in range(len(train_pos_reviews))]], [[x for x in train_neg_reviews], ["negative" for i in range(len(train_neg_reviews))]]]
test_dataset = [[[x for x in test_pos_reviews], ["positive" for i in range(len(test_pos_reviews))]], [[x for x in test_neg_reviews], ["negative" for i in range(len(test_neg_reviews))]]]


In [91]:
len(train_dataset[0][0])

12500

### News (Financial/Stock)

In [93]:
df = pd.read_csv('data/news/1/cleaned.csv') #Change this 

pos_df = df[df["polarity"] == "positive"]
neg_df = df[df["polarity"] == "negative"]

pos_reviews = pos_df["review"].tolist()[:25000]
neg_reviews = neg_df["review"].tolist()[:25000]

train_pos_reviews = pos_reviews[:(int(len(pos_reviews) / 2))]
test_pos_reviews = pos_reviews[(-1 * int(len(pos_reviews) / 2)):]

train_neg_reviews = neg_reviews[:(int(len(neg_reviews) / 2))]
test_neg_reviews = neg_reviews[(-1 * int(len(neg_reviews) / 2)):]

train_dataset = [[[x for x in train_pos_reviews], ["positive" for i in range(len(train_pos_reviews))]], [[x for x in train_neg_reviews], ["negative" for i in range(len(train_neg_reviews))]]]
test_dataset = [[[x for x in test_pos_reviews], ["positive" for i in range(len(test_pos_reviews))]], [[x for x in test_neg_reviews], ["negative" for i in range(len(test_neg_reviews))]]]


In [94]:
train_dataset[0][0][1]

"For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ."

In [34]:
len(train_dataset[0][0])

926

## Experimentation

function for testing inferences with the dataset

In [33]:
def test(dataset=train_dataset, ver="basic", data_exclusion=None, val=0.5, arr=0.5, dom=0.5 ): # 'f' version of above testing function
    right_trans_pos = 0
    right_notrans_pos = 0

    wrong_trans_pos = 0
    wrong_notrans_pos = 0

    right_trans_neg = 0
    right_notrans_neg = 0

    wrong_trans_neg = 0
    wrong_notrans_neg = 0
    
    total = 0
    total_pos = 0
    total_neg = 0
    
    total_time_w_trans = 0
    total_time_wo_trans = 0

    test_map = [val, arr, dom]

    if data_exclusion is not None and data_exclusion == "positive":
         pass
    else:
     for i in range(len(dataset[0][0])):
          text = str(dataset[0][0][i])
          sentiment = dataset[0][1][i] # positive texts in the dataset

          if ver == "basic":
               results_w_trans = inference(text, maps=True, map=test_map)
               result_w_trans = round(results_w_trans[0], 2)
               time_w_trans = results_w_trans[1]

               results_wo_trans = inference(text, maps=False, map=test_map)
               result_wo_trans = round(results_wo_trans[0], 2)
               time_wo_trans = results_wo_trans[1]


          elif ver == "vader":
               results_w_trans = vader_inference(text, maps=True, map=test_map)
               result_w_trans = round(results_w_trans[0], 2)
               time_w_trans = results_w_trans[1]

               results_wo_trans = vader_inference(text, maps=False, map=test_map)
               result_wo_trans = round(results_wo_trans[0], 2)
               time_wo_trans = results_wo_trans[1]

          
          total += 1
          total_pos += 1
          total_time_w_trans += time_w_trans
          total_time_wo_trans += time_wo_trans
          if sentiment == "positive" and result_w_trans < 0.5:
                    wrong_trans_pos += 1
          elif sentiment == "positive" and result_w_trans > 0.5 :
                    right_trans_pos += 1 
          if sentiment == "positive" and result_wo_trans < 0.5:
                     wrong_notrans_pos += 1 
          elif sentiment == "positive" and result_wo_trans > 0.5:
                     right_notrans_pos += 1 

    if data_exclusion is not None and data_exclusion == "negative":
         pass
    else:   
     for i in range(len(dataset[1][0])):
          text = str(dataset[1][0][i])
          sentiment = dataset[1][1][i] # negative texts in the dataset
          if ver == "basic":
               results_w_trans = inference(text, maps=True, map=test_map)
               result_w_trans = round(results_w_trans[0], 2)
               time_w_trans = results_w_trans[1]

               results_wo_trans = inference(text, maps=False, map=test_map)
               result_wo_trans = round(results_wo_trans[0], 2)
               time_wo_trans = results_wo_trans[1]

          elif ver == "vader":
               results_w_trans = vader_inference(text, maps=True, map=test_map)
               result_w_trans = round(results_w_trans[0], 2)
               time_w_trans = results_w_trans[1]

               results_wo_trans = vader_inference(text, maps=False, map=test_map)
               result_wo_trans = round(results_wo_trans[0], 2)
               time_wo_trans = results_wo_trans[1]


          total += 1
          total_neg += 1 
          total_time_w_trans += time_w_trans
          total_time_wo_trans += time_wo_trans
          if sentiment == "negative" and result_w_trans > 0.5:
                    wrong_trans_neg += 1 
          elif sentiment == "negative" and result_w_trans < 0.5:
                    right_trans_neg += 1 
          if sentiment == "negative" and result_wo_trans > 0.5:
                     wrong_notrans_neg += 1 
          elif sentiment == "negative" and result_wo_trans < 0.5:     
                     right_notrans_neg += 1 


    trans_precision = right_trans_pos / (right_trans_pos + wrong_trans_neg)
    notrans_precision = right_notrans_pos / (right_notrans_pos + wrong_notrans_neg)

    trans_recall = right_trans_pos / (right_trans_pos + wrong_trans_pos)
    notrans_recall = right_notrans_pos / (right_notrans_pos + wrong_notrans_pos)

    trans_fone = 2 / ((1/trans_precision) + (1/trans_recall))
    notrans_fone = 2 / ((1/notrans_precision) + (1/notrans_recall))

    total_trans_right = right_trans_neg + right_trans_pos
    total_notrans_right = right_notrans_neg + right_notrans_pos

    print("With Transformations: " + f"{total_trans_right} / {total}" + f" in elapsed time {total_time_w_trans} seconds \n precision: {trans_precision}, recall: {trans_recall}, f1 score: {trans_fone}")
    print("Without Transformations: " + f"{total_notrans_right} / {total}" + f" in elapsed time {total_time_wo_trans} seconds \n precision: {notrans_precision}, recall: {notrans_recall}, f1 score: {notrans_fone}")
    

### CPU friendly finding function

In [34]:
data = pd.read_csv(f'data/lexicons/vader-CONV.csv') # Change this

terms = data['term'].tolist()
valence = data['valence'].tolist()
arousal = data['arousal'].tolist()
dominance = data['dominance'].tolist()

lexicon = {}
for term, val, arr, dom in zip(terms, valence, arousal, dominance):
    #lexicon[term] = [val, arr, dom]
    lexicon[term] = [round((val + 4) / 8, 1), arr, dom]

In [54]:
reviews = train_dataset[0][0] + train_dataset[1][0]
labels = train_dataset[0][1] + train_dataset[1][1]
for i in range(len(reviews)):
    review = reviews[i]
    bow = decompose_review(str(review))
    reviews[i] = bow 


In [55]:
def training_inference(reviews, compatible_words, sig_figs=2):
  sentiments = []
  for review in reviews:
   num_words = 0
   tot_sent = 0
   for word in review:
     try:
      val = round(lexicon[word][0], sig_figs)
     except:
      continue              
     
     if val != 0.5:
       if word in compatible_words:
         if val < 0.5:
           val = 0.5 + abs(val - 0.5)
         else:
           val = 0.5 - abs(val - 0.5)
       tot_sent += val 
       num_words += 1 

   if num_words != 0:
      final_sentiment = round((tot_sent / num_words), sig_figs)
   else:
      final_sentiment = 0.5

   sentiments.append(final_sentiment)
   
  return sentiments 

In [56]:
from itertools import product

values = [round(v, 2) for v in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1]]
map_combinations = list(product(values, repeat=3))  

accuracies = []
progress = 0

all_compatible_words = []

for combination in map_combinations:
  compatible_words = set()
  for word in lexicon.keys():
    array = lexicon[word]
    if (array[0] == combination[0] or combination[0] == 1.1) and (array[1] == combination[1] or combination[1] == 1.1) and (array[2] == combination[2] or combination[2] == 1.1):
      compatible_words.add(word)
  all_compatible_words.append(compatible_words)

def work(i, sig_figs=2):
  compatible_words = all_compatible_words[i]
  if not compatible_words:
    return 0
  else:
   sentiments = training_inference(reviews, compatible_words=compatible_words, sig_figs=sig_figs)
   sentiments = ["positive" if x > 0.5 else "negative" if x < 0.5 else "neutral" for x in sentiments]
   correct = [sentiment == label for sentiment, label in zip(sentiments, labels)]
   accuracy = sum(correct) / len(correct)
   return accuracy 
 

In [57]:
all_compatible_words[160]

set()

In [58]:

def find(sig_figs):
    accuracies = []
    for i in range(len(map_combinations)):
        accuracy = work(i, sig_figs=sig_figs)
        accuracies.append(accuracy)
        print(f"{i} / {len(map_combinations)}")
        
    
    best_idx = accuracies.index(max(accuracies))
    return map_combinations[best_idx]

In [76]:
index = map_combinations.index((0.7, 0.3, 1.1))
print(work(index, 2))
print(index)
print(map_combinations)
len(map_combinations)

0.69844
1055
[(0, 0, 0), (0, 0, 0.1), (0, 0, 0.2), (0, 0, 0.3), (0, 0, 0.4), (0, 0, 0.5), (0, 0, 0.6), (0, 0, 0.7), (0, 0, 0.8), (0, 0, 0.9), (0, 0, 1.0), (0, 0, 1.1), (0, 0.1, 0), (0, 0.1, 0.1), (0, 0.1, 0.2), (0, 0.1, 0.3), (0, 0.1, 0.4), (0, 0.1, 0.5), (0, 0.1, 0.6), (0, 0.1, 0.7), (0, 0.1, 0.8), (0, 0.1, 0.9), (0, 0.1, 1.0), (0, 0.1, 1.1), (0, 0.2, 0), (0, 0.2, 0.1), (0, 0.2, 0.2), (0, 0.2, 0.3), (0, 0.2, 0.4), (0, 0.2, 0.5), (0, 0.2, 0.6), (0, 0.2, 0.7), (0, 0.2, 0.8), (0, 0.2, 0.9), (0, 0.2, 1.0), (0, 0.2, 1.1), (0, 0.3, 0), (0, 0.3, 0.1), (0, 0.3, 0.2), (0, 0.3, 0.3), (0, 0.3, 0.4), (0, 0.3, 0.5), (0, 0.3, 0.6), (0, 0.3, 0.7), (0, 0.3, 0.8), (0, 0.3, 0.9), (0, 0.3, 1.0), (0, 0.3, 1.1), (0, 0.4, 0), (0, 0.4, 0.1), (0, 0.4, 0.2), (0, 0.4, 0.3), (0, 0.4, 0.4), (0, 0.4, 0.5), (0, 0.4, 0.6), (0, 0.4, 0.7), (0, 0.4, 0.8), (0, 0.4, 0.9), (0, 0.4, 1.0), (0, 0.4, 1.1), (0, 0.5, 0), (0, 0.5, 0.1), (0, 0.5, 0.2), (0, 0.5, 0.3), (0, 0.5, 0.4), (0, 0.5, 0.5), (0, 0.5, 0.6), (0, 0.5, 0.7), (0

1728

### GPU Finding Function

In [None]:
import torch 
import pandas as pd

print(torch.cuda.is_available()) 

if torch.cuda.is_available():
    torch.cuda.empty_cache()

True


In [16]:
# Initialize lexicon for a vectorized approach

data = pd.read_csv(f'data/lexicons/vader-CONV.csv') # Change this

terms = data['term'].tolist()
valence = data['valence'].tolist()
arousal = data['arousal'].tolist()
dominance = data['dominance'].tolist()

lexicon = {}
for term, val, arr, dom in zip(terms, valence, arousal, dominance):
    #lexicon[term] = [val, arr, dom]
    lexicon[term] = [round((val + 4) / 8,1), arr, dom]

# Create an UNK token to handle unknown words (as they are handled in the original inference function)
UNK_TOKEN = '<UNK>'
if UNK_TOKEN not in lexicon:
    lexicon[UNK_TOKEN] = [0.5, 0.5, 0.5]

vocab = list(lexicon.keys())
word_to_index = {word: i for i, word in enumerate(vocab)}
lexicon_matrix = torch.tensor([lexicon[word] for word in vocab], dtype=torch.float32).cuda()
unk_index = word_to_index[UNK_TOKEN]

In [30]:
# Prepare data for training 

reviews = train_dataset[0][0] + train_dataset[1][0]
labels = train_dataset[0][1] + train_dataset[1][1]
for i in range(len(reviews)):
    review = reviews[i]
    bow = decompose_review(str(review))
    reviews[i] = bow 

    
# Store the index of each word in each review and pad

max_len = max(len(r) for r in reviews)
sentence_indices = torch.full((len(reviews), max_len), fill_value=-1, dtype=torch.long)

for i, r in enumerate(reviews):
    indices = [word_to_index.get(w, unk_index) for w in r]  
    sentence_indices[i, :len(indices)] = torch.tensor(indices, dtype=torch.long)

sentence_indices = sentence_indices.cuda()
mask = sentence_indices != -1  # mask for padded positions

# Store all possible val, arr, dom combinations

#x_vals = torch.arange(0.0, 1.2, 0.1) # in increments of x
#y_vals = torch.arange(0.0, 1.2, 0.1)
#z_vals = torch.arange(0.0, 1.2, 0.1)
#maps_combinations = torch.cartesian_prod(x_vals, y_vals, z_vals).to(dtype=torch.float32, device='cuda')
maps_combinations = torch.tensor([[0.7, 0.3, 1.1]]).cuda()

In [31]:
maps_combinations

tensor([[0.7000, 0.3000, 1.1000]], device='cuda:0')

In [29]:
maps_combinations

tensor([[0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1000],
        [0.0000, 0.0000, 0.2000],
        ...,
        [1.1000, 1.1000, 0.9000],
        [1.1000, 1.1000, 1.0000],
        [1.1000, 1.1000, 1.1000]], device='cuda:0')

In [32]:
# gpu compatible inference function 

def inference_batch(sentences_idx, mask, lexicon_matrix, maps_combination=None, sig_figs=2):
    """
    Returns: predicted x averages per sentence (shape: num_sentences)
    """
    safe_idx = sentences_idx.clone()
    safe_idx[~mask] = 0
    lex_vals = lexicon_matrix[safe_idx]
    lex_vals[~mask] = 0.5

    vals = lex_vals[:,:,0]  # x component

    if maps_combination is not None:
        sub_trans = maps_combination  # shape (3,)
        matches = ((lex_vals == sub_trans) | (sub_trans == 1.1))
        points = matches.sum(dim=2)
        mask_points = (points == 3) & mask

        # compute magnitude and round to requested decimals using helper
        val_mag = torch.abs(vals - 0.5)
        val_mag = torch.round(val_mag, decimals=sig_figs)

        vals = vals.clone()
        vals[(mask_points) & (vals < 0.5)] = 0.5 + val_mag[(mask_points) & (vals < 0.5)]
        vals[(mask_points) & (vals > 0.5)] = 0.5 - val_mag[(mask_points) & (vals > 0.5)]

    # round vals to decimals using helper
    vals = torch.round(vals, decimals=sig_figs) 


     # Create a boolean mask for sentimental tokens (val != 0.5) and not padding
    sentimental_mask = (vals != 0.5) & mask

    # Sum sentimental vals per sentence and count sentimental tokens per sentence
    sentimental_sum = (vals * sentimental_mask.float()).sum(dim=1)
    sentimental_count = sentimental_mask.sum(dim=1).float()

    # Avoid division by zero: where count == 0, return neutral 0.5; else average
    avg_x = torch.where(sentimental_count > 0,
                        sentimental_sum / sentimental_count,
                        torch.full_like(sentimental_sum, 0.5))

    return avg_x  # shape: (num_sentences,)

In [33]:
# Simple function for computing accuracy over many predictions and labels

def compute_accuracy(pred_avg_x, labels):
    pred_labels = ["positive" if v > 0.5 else "negative" if v < 0.5 else "neutral" for v in pred_avg_x.cpu().tolist()]
    correct = [p == t for p, t in zip(pred_labels, labels)]
    acc = sum(correct)/len(correct)
    return acc

In [34]:
# Function for finding the most accurate val, arr, dom map

def find(sig_figs):
    accuracies = []

    for m in maps_combinations:
        avg_x = inference_batch(sentence_indices, mask, lexicon_matrix, m, sig_figs=sig_figs)
        acc = compute_accuracy(avg_x, labels)
        accuracies.append(acc)
    
    print(max(accuracies))
    print(len(accuracies))

    accuracies = torch.tensor(accuracies)  # (num_maps,)
     
    best_idx = torch.argmax(accuracies)
    best_map = maps_combinations[best_idx]
    
    best_acc = accuracies[best_idx]
    print(f"Best map: {best_map}")
    print(f"Best accuracy: {best_acc.item()}")


### General Testing

In [25]:
print(len(train_dataset[0][0]))
len(reviews)

12500


25000

In [35]:
find(2)

0.70496
1
Best map: tensor([0.7000, 0.3000, 1.1000], device='cuda:0')
Best accuracy: 0.7049599885940552


In [None]:
find(2) #Best map for movies

0 / 1728
1 / 1728
2 / 1728
3 / 1728
4 / 1728
5 / 1728
6 / 1728
7 / 1728
8 / 1728
9 / 1728
10 / 1728
11 / 1728
12 / 1728
13 / 1728
14 / 1728
15 / 1728
16 / 1728
17 / 1728
18 / 1728
19 / 1728
20 / 1728
21 / 1728
22 / 1728
23 / 1728
24 / 1728
25 / 1728
26 / 1728
27 / 1728
28 / 1728
29 / 1728
30 / 1728
31 / 1728
32 / 1728
33 / 1728
34 / 1728
35 / 1728
36 / 1728
37 / 1728
38 / 1728
39 / 1728
40 / 1728
41 / 1728
42 / 1728
43 / 1728
44 / 1728
45 / 1728
46 / 1728
47 / 1728
48 / 1728
49 / 1728
50 / 1728
51 / 1728
52 / 1728
53 / 1728
54 / 1728
55 / 1728
56 / 1728
57 / 1728
58 / 1728
59 / 1728
60 / 1728
61 / 1728
62 / 1728
63 / 1728
64 / 1728
65 / 1728
66 / 1728
67 / 1728
68 / 1728
69 / 1728
70 / 1728
71 / 1728
72 / 1728
73 / 1728
74 / 1728
75 / 1728
76 / 1728
77 / 1728
78 / 1728
79 / 1728
80 / 1728
81 / 1728
82 / 1728
83 / 1728
84 / 1728
85 / 1728
86 / 1728
87 / 1728
88 / 1728
89 / 1728
90 / 1728
91 / 1728
92 / 1728
93 / 1728
94 / 1728
95 / 1728
96 / 1728
97 / 1728
98 / 1728
99 / 1728
100 / 1728

(0.7, 0.3, 1.1)

In [None]:
find(2) #Best map for amazon products

0 / 1728
1 / 1728
2 / 1728
3 / 1728
4 / 1728
5 / 1728
6 / 1728
7 / 1728
8 / 1728
9 / 1728
10 / 1728
11 / 1728
12 / 1728
13 / 1728
14 / 1728
15 / 1728
16 / 1728
17 / 1728
18 / 1728
19 / 1728
20 / 1728
21 / 1728
22 / 1728
23 / 1728
24 / 1728
25 / 1728
26 / 1728
27 / 1728
28 / 1728
29 / 1728
30 / 1728
31 / 1728
32 / 1728
33 / 1728
34 / 1728
35 / 1728
36 / 1728
37 / 1728
38 / 1728
39 / 1728
40 / 1728
41 / 1728
42 / 1728
43 / 1728
44 / 1728
45 / 1728
46 / 1728
47 / 1728
48 / 1728
49 / 1728
50 / 1728
51 / 1728
52 / 1728
53 / 1728
54 / 1728
55 / 1728
56 / 1728
57 / 1728
58 / 1728
59 / 1728
60 / 1728
61 / 1728
62 / 1728
63 / 1728
64 / 1728
65 / 1728
66 / 1728
67 / 1728
68 / 1728
69 / 1728
70 / 1728
71 / 1728
72 / 1728
73 / 1728
74 / 1728
75 / 1728
76 / 1728
77 / 1728
78 / 1728
79 / 1728
80 / 1728
81 / 1728
82 / 1728
83 / 1728
84 / 1728
85 / 1728
86 / 1728
87 / 1728
88 / 1728
89 / 1728
90 / 1728
91 / 1728
92 / 1728
93 / 1728
94 / 1728
95 / 1728
96 / 1728
97 / 1728
98 / 1728
99 / 1728
100 / 1728

(0.7, 0.3, 1.1)

In [None]:
find(2)  #Best map for sentiment140 

0 / 1728
1 / 1728
2 / 1728
3 / 1728
4 / 1728
5 / 1728
6 / 1728
7 / 1728
8 / 1728
9 / 1728
10 / 1728
11 / 1728
12 / 1728
13 / 1728
14 / 1728
15 / 1728
16 / 1728
17 / 1728
18 / 1728
19 / 1728
20 / 1728
21 / 1728
22 / 1728
23 / 1728
24 / 1728
25 / 1728
26 / 1728
27 / 1728
28 / 1728
29 / 1728
30 / 1728
31 / 1728
32 / 1728
33 / 1728
34 / 1728
35 / 1728
36 / 1728
37 / 1728
38 / 1728
39 / 1728
40 / 1728
41 / 1728
42 / 1728
43 / 1728
44 / 1728
45 / 1728
46 / 1728
47 / 1728
48 / 1728
49 / 1728
50 / 1728
51 / 1728
52 / 1728
53 / 1728
54 / 1728
55 / 1728
56 / 1728
57 / 1728
58 / 1728
59 / 1728
60 / 1728
61 / 1728
62 / 1728
63 / 1728
64 / 1728
65 / 1728
66 / 1728
67 / 1728
68 / 1728
69 / 1728
70 / 1728
71 / 1728
72 / 1728
73 / 1728
74 / 1728
75 / 1728
76 / 1728
77 / 1728
78 / 1728
79 / 1728
80 / 1728
81 / 1728
82 / 1728
83 / 1728
84 / 1728
85 / 1728
86 / 1728
87 / 1728
88 / 1728
89 / 1728
90 / 1728
91 / 1728
92 / 1728
93 / 1728
94 / 1728
95 / 1728
96 / 1728
97 / 1728
98 / 1728
99 / 1728
100 / 1728

(0.7, 0.6, 0.6)

In [59]:
find(2) #Best map for financial news 

0 / 1728
1 / 1728
2 / 1728
3 / 1728
4 / 1728
5 / 1728
6 / 1728
7 / 1728
8 / 1728
9 / 1728
10 / 1728
11 / 1728
12 / 1728
13 / 1728
14 / 1728
15 / 1728
16 / 1728
17 / 1728
18 / 1728
19 / 1728
20 / 1728
21 / 1728
22 / 1728
23 / 1728
24 / 1728
25 / 1728
26 / 1728
27 / 1728
28 / 1728
29 / 1728
30 / 1728
31 / 1728
32 / 1728
33 / 1728
34 / 1728
35 / 1728
36 / 1728
37 / 1728
38 / 1728
39 / 1728
40 / 1728
41 / 1728
42 / 1728
43 / 1728
44 / 1728
45 / 1728
46 / 1728
47 / 1728
48 / 1728
49 / 1728
50 / 1728
51 / 1728
52 / 1728
53 / 1728
54 / 1728
55 / 1728
56 / 1728
57 / 1728
58 / 1728
59 / 1728
60 / 1728
61 / 1728
62 / 1728
63 / 1728
64 / 1728
65 / 1728
66 / 1728
67 / 1728
68 / 1728
69 / 1728
70 / 1728
71 / 1728
72 / 1728
73 / 1728
74 / 1728
75 / 1728
76 / 1728
77 / 1728
78 / 1728
79 / 1728
80 / 1728
81 / 1728
82 / 1728
83 / 1728
84 / 1728
85 / 1728
86 / 1728
87 / 1728
88 / 1728
89 / 1728
90 / 1728
91 / 1728
92 / 1728
93 / 1728
94 / 1728
95 / 1728
96 / 1728
97 / 1728
98 / 1728
99 / 1728
100 / 1728

(0.3, 0.5, 0.2)

In [None]:
test(test_dataset, ver="vader", val=0.7, arr=0.3, dom=1.1) # imdb movie results (electra)

With Transformations: 18150 / 25000 in elapsed time 295.7918605012819 seconds 
 precision: 0.7118986962385734, recall: 0.760566762728146, f1 score: 0.7354284387336482
Without Transformations: 17497 / 25000 in elapsed time 293.7831619002973 seconds 
 precision: 0.6512752326846367, recall: 0.862493997118617, f1 score: 0.7421487603305785


In [82]:
test(test_dataset, ver="basic", val=0.7, arr=0.3, dom=1.1) # imdb movie results (basic algorithm)

With Transformations: 17710 / 25000 in elapsed time 40.52297809987431 seconds 
 precision: 0.7241108581298579, recall: 0.7643792474722361, f1 score: 0.7437003588275612
Without Transformations: 16759 / 25000 in elapsed time 36.35650519993942 seconds 
 precision: 0.6460424651700432, recall: 0.8714098145412769, f1 score: 0.7419907067742725


In [None]:
test(test_dataset, ver="vader", val=0.7, arr=0.3, dom=1.1) #  amazon results (electra)

With Transformations: 2810 / 4000 in elapsed time 42.64958379997188 seconds 
 precision: 0.676410045286126, recall: 0.8289606458123108, f1 score: 0.7449557923373384
Without Transformations: 2706 / 4000 in elapsed time 42.79603260001386 seconds 
 precision: 0.6275838698746188, recall: 0.934409687184662, f1 score: 0.7508615446989662


In [87]:
test(test_dataset, ver="basic", val=0.7, arr=0.3, dom=1.1) #  amazon results (basic algorithm)

With Transformations: 2755 / 4000 in elapsed time 6.49634910007444 seconds 
 precision: 0.6925734024179621, recall: 0.835852006253257, f1 score: 0.7574970484061394
Without Transformations: 2633 / 4000 in elapsed time 5.892666299987468 seconds 
 precision: 0.6330686917500863, recall: 0.9434156378600823, f1 score: 0.7576946911795084


In [None]:
test(test_dataset, ver="vader", val=0.7, arr=0.6, dom=0.6) #sentiment140 results (electra)

With Transformations: 13202 / 25000 in elapsed time 246.5840627996513 seconds 
 precision: 0.6699938709394975, recall: 0.8519260743709641, f1 score: 0.7500857717002402
Without Transformations: 13054 / 25000 in elapsed time 245.350983900571 seconds 
 precision: 0.6597620474193272, recall: 0.8578742348358375, f1 score: 0.7458873621056706


In [92]:
test(test_dataset, ver="basic", val=0.7, arr=0.6, dom=0.6) #sentiment140 results (basic algorithm)

With Transformations: 12518 / 25000 in elapsed time 2.484996199986199 seconds 
 precision: 0.6710191920077119, recall: 0.872195010821278, f1 score: 0.758494304110946
Without Transformations: 12364 / 25000 in elapsed time 2.192644299935637 seconds 
 precision: 0.6591510803655308, recall: 0.877344549278163, f1 score: 0.7527552911343022


In [None]:
test(test_dataset, ver="vader", val=0.3, arr=0.5, dom=0.2) #financial news results (electra)

With Transformations: 719 / 1356 in elapsed time 12.66397259994119 seconds 
 precision: 0.7741514360313316, recall: 0.9265625, f1 score: 0.8435277382645805
Without Transformations: 706 / 1356 in elapsed time 12.61593320000975 seconds 
 precision: 0.7791495198902606, recall: 0.8875, f1 score: 0.8298027757487216


In [95]:
test(test_dataset, ver="basic", val=0.3, arr=0.5, dom=0.2) #financial news results (basic algorithm)

With Transformations: 708 / 1356 in elapsed time 0.179400699929829 seconds 
 precision: 0.7850467289719626, recall: 0.9318541996830428, f1 score: 0.8521739130434782
Without Transformations: 677 / 1356 in elapsed time 0.16095270002551842 seconds 
 precision: 0.8079763663220089, recall: 0.8937908496732027, f1 score: 0.8487199379363849
