In [1]:
# Folder containing the downloaded lexicon files. The trailing slash is kept
# because the paths are built by plain string concatenation (DATA + file name).
DATA = './downloaded_lexica/'

# File names of the individual lexica inside DATA.
AFINN = 'AFINN-111.txt'
NRC = 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
BING_POS, BING_NEG = 'bing-positive-words.txt', 'bing-negative-words.txt'


Load Lexicon Data

In [2]:
# essential modules imported

import pandas as pd
import os
from collections import Counter
import numpy as np 
import matplotlib as plt

In [5]:
def load_lexicon_generic(filepath, sep='\t', score_col=None, sentiment_col=None, fixed_sentiment=None, encoding='latin-1'):
    """
    Load a delimited lexicon file into a standardized DataFrame.

    Blank lines and lines starting with ';' are skipped. Column 0 of every
    remaining line is taken as the word and lower-cased.

    Parameters
    ----------
    filepath : str
        Path of the lexicon file to read.
    sep : str, default tab
        Field separator used to split each line.
    score_col : int or None
        Index of the integer score column. When None, every row gets the
        score 1. When given, a row whose score field is missing or is not an
        integer gets the score 0.
    sentiment_col : int or None
        Index of the sentiment/emotion label column (the label is
        lower-cased). When given, only rows with score == 1 are returned.
    fixed_sentiment : str or None
        Label assigned to rows that do not take one from `sentiment_col`.
    encoding : str, default 'latin-1'
        Text encoding used to open the file.

    Returns
    -------
    pandas.DataFrame
        Columns 'word', 'score' and 'sentiment' ('sentiment' is None where no
        label applies).
    """
    words = []
    scores = []
    sentiments = []

    with open(filepath, 'r', encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip empty lines and ';' comment lines (used by the Bing files)
            if not line or line.startswith(';'):
                continue
            parts = line.split(sep)

            # 1. Word (always in col 0)
            words.append(parts[0].lower())

            # 2. Numeric score. The default of 1 applies only when no score
            #    column was requested. If one was requested but the row lacks
            #    it, the row is malformed and scored 0 (same as a non-integer
            #    field) rather than being silently promoted to 1.
            if score_col is None:
                scores.append(1)
            else:
                try:
                    scores.append(int(parts[score_col]))
                except (IndexError, ValueError):
                    scores.append(0)

            # 3. Sentiment/emotion from the file (NRC), else the fixed label
            #    (Bing), else None.
            if sentiment_col is not None and len(parts) > sentiment_col:
                sentiments.append(parts[sentiment_col].lower())
            else:
                sentiments.append(fixed_sentiment)

    # Dataframe creation
    df = pd.DataFrame({
        'word': words,
        'score': scores,
        'sentiment': sentiments
    })

    # Clean up for NRC: keep only rows where the association flag is 1
    if sentiment_col is not None:
        df = df[df['score'] == 1].copy()

    return df

In [6]:
# AFINN: parsed with the default tab separator, integer score read from column 1.
afinn_df = load_lexicon_generic(f"{DATA}{AFINN}", score_col=1)

# NRC: emotion label read from column 1, 0/1 association flag from column 2.
nrc_df = load_lexicon_generic(f"{DATA}{NRC}", score_col=2, sentiment_col=1)

# Bing: one word list per polarity; tag each list with its fixed sentiment,
# then stack both into a single frame with a fresh index.
bing_pos = load_lexicon_generic(f"{DATA}{BING_POS}", fixed_sentiment='positive')
bing_neg = load_lexicon_generic(f"{DATA}{BING_NEG}", fixed_sentiment='negative')
bing_df = pd.concat((bing_pos, bing_neg), ignore_index=True)

print("All lexica loaded successfully.")

All lexica loaded successfully.


Sentiment Analysis

In [7]:
# Sentence punctuation removed from the edges of a token when the raw token
# itself is not a lexicon entry. Symbols such as '+', '*' and '-' are left
# alone so that a token carrying them is never altered.
_EDGE_PUNCTUATION = '.,;:!?"\'()[]{}'


def _matched_tokens(tokens, vocabulary):
    """Return, in phrase order, the lexicon form of each token found in `vocabulary`."""
    matches = []
    for token in tokens:
        # Exact match first: anything that matched before still matches identically.
        if token in vocabulary:
            matches.append(token)
            continue
        stripped = token.strip(_EDGE_PUNCTUATION)
        if stripped and stripped in vocabulary:
            matches.append(stripped)
    return matches


def analyse_generic(phrase, lexicon_type, lexicon_df):
    """
    Score a phrase against one lexicon.

    The phrase is lower-cased and split on whitespace. A token is looked up
    as-is first; if unknown, it is retried with leading/trailing sentence
    punctuation removed, so "love!" matches the entry "love".

    Parameters
    ----------
    phrase : str
        Text to analyse.
    lexicon_type : str
        'afinn', 'bing' or 'nrc' (case-insensitive).
    lexicon_df : pandas.DataFrame
        Lexicon with a 'word' column, plus 'score' (AFINN) or 'sentiment'
        (Bing, NRC).

    Returns
    -------
    dict
        AFINN: 'lexicon', 'total_score', 'matched_words' (a count),
        'word_scores', 'average_score'.
        BING: 'lexicon', 'positive_count', 'negative_count', 'net_sentiment',
        'positive_words', 'negative_words'.
        NRC: 'lexicon', 'emotion_counts', 'matched_words' (word -> list of
        emotion labels), 'total_matches', 'total_positive', 'total_negative',
        'net_score'.

    Raises
    ------
    ValueError
        If `lexicon_type` is not one of the three supported names.
    """
    words = phrase.lower().split()
    kind = lexicon_type.lower()

    if kind == 'afinn':
        lex_dict = dict(zip(lexicon_df['word'], lexicon_df['score']))

        word_scores = [(w, lex_dict[w]) for w in _matched_tokens(words, lex_dict)]
        total_score = sum(score for _, score in word_scores)
        return {
            'lexicon': 'AFINN',
            'total_score': total_score,
            'matched_words': len(word_scores),
            'word_scores': word_scores,
            'average_score': total_score / len(word_scores) if word_scores else 0
        }

    elif kind == 'bing':
        # Identify positive and negative words
        pos_words = set(lexicon_df[lexicon_df['sentiment'] == 'positive']['word'])
        neg_words = set(lexicon_df[lexicon_df['sentiment'] == 'negative']['word'])

        # Resolve each token once against the whole vocabulary so both
        # polarity lists see the same form of the token.
        matched = _matched_tokens(words, pos_words | neg_words)
        pos_matches = [w for w in matched if w in pos_words]
        neg_matches = [w for w in matched if w in neg_words]
        return {
            'lexicon': 'BING',
            'positive_count': len(pos_matches),
            'negative_count': len(neg_matches),
            # From parallel counting to an actual score
            'net_sentiment': len(pos_matches) - len(neg_matches),
            'positive_words': pos_matches,
            'negative_words': neg_matches
        }

    elif kind == 'nrc':
        # Build the word -> emotions lookup in a single pass instead of
        # filtering the whole DataFrame once per token.
        emotions_by_word = {}
        for word, emotion in zip(lexicon_df['word'], lexicon_df['sentiment']):
            emotions_by_word.setdefault(word, []).append(emotion)

        emotion_counts = Counter()
        matched_words = {}
        for w in _matched_tokens(words, emotions_by_word):
            for e in emotions_by_word[w]:
                emotion_counts[e] += 1
                # Keep track of which emotions each matched word contributed
                matched_words.setdefault(w, []).append(e)

        # 1. Define emotion categories
        pos_emotions = {'joy', 'trust', 'surprise', 'anticipation', 'positive'}
        neg_emotions = {'anger', 'fear', 'sadness', 'disgust', 'negative'}

        # 2. Sum the counts per category (a Counter yields 0 for absent labels)
        total_positive = sum(emotion_counts[e] for e in pos_emotions)
        total_negative = sum(emotion_counts[e] for e in neg_emotions)

        # 3. Net score
        net_score = total_positive - total_negative

        return {
            'lexicon': 'NRC',
            'emotion_counts': dict(emotion_counts),
            'matched_words': matched_words,
            'total_matches': sum(emotion_counts.values()),
            'total_positive': total_positive,
            'total_negative': total_negative,
            'net_score': net_score
        }

    else:
        raise ValueError(f"Unknown lexicon type: {lexicon_type}")


In [8]:
def score_phrase(phrase, lexicon_type, lexicon_df):
    """
    Score a phrase against one lexicon.

    Thin convenience wrapper: forwards all three arguments unchanged to
    `analyse_generic` and returns whatever it returns.
    """
    result = analyse_generic(phrase, lexicon_type, lexicon_df)
    return result

Example Analysis

In [9]:
# Example phrase mixing one positive word ("love") with one negative word ("hate"):
# in the recorded run, the AFINN total score and the Bing net sentiment both cancel to 0.
myphrase = "I love you but I hate the current political situation"

In [10]:

# Score the same phrase with each of the three lexica.
afinn_res = score_phrase(myphrase, 'afinn', afinn_df)
bing_res  = score_phrase(myphrase, 'bing',  bing_df)
nrc_res   = score_phrase(myphrase, 'nrc',   nrc_df)

print("\nAFINN result:")
print(f"  Total Score: {afinn_res['total_score']}")
print(f"  Average Score: {afinn_res['average_score']}")
print(f"  Matched Words: {afinn_res['matched_words']}")
print(f"  Word Scores: {afinn_res['word_scores']}")

print("\nBING result:")
print(f"  Positive Count: {bing_res['positive_count']}")
print(f"  Negative Count: {bing_res['negative_count']}")
print(f"  Net Sentiment: {bing_res['net_sentiment']}")
print(f"  Positive Words: {bing_res['positive_words']}")
print(f"  Negative Words: {bing_res['negative_words']}")

print("\nNRC result:")
print(f"  Total Matches: {nrc_res['total_matches']}")
print(f"  Emotion Counts: {nrc_res['emotion_counts']}")
# `matched_words` is keyed by word; each value is the list of emotion labels
# that word contributed, so the listing is "emotions by word".
print("  Matched Emotions by Word:")
for word, emotions in nrc_res['matched_words'].items():
    print(f"    {word.capitalize()}: {emotions}")




AFINN result:
  Total Score: 0
  Average Score: 0.0
  Matched Words: 2
  Word Scores: [('love', 3), ('hate', -3)]

BING result:
  Positive Count: 1
  Negative Count: 1
  Net Sentiment: 0
  Positive Words: ['love']
  Negative Words: ['hate']

NRC result:
  Total Matches: 7
  Emotion Counts: {'joy': 1, 'positive': 1, 'anger': 1, 'disgust': 1, 'fear': 1, 'negative': 1, 'sadness': 1}
  Matched Words by Emotion:
    Love: ['joy', 'positive']
    Hate: ['anger', 'disgust', 'fear', 'negative', 'sadness']
