In [1]:
import pandas as pd
import re
from collections import Counter
from datetime import datetime
import calendar

In [2]:
def conv(s):
    """
    Parse a cleaned "{entity:sentiment,entity:sentiment,...}" string into a dict.

    GPT-3.5 returns its answer in the requested format in most, but not all,
    cases, so this parser is best-effort: fragments that do not look like an
    ``entity:sentiment`` pair are skipped instead of raising.

    Args:
        s: The model output after whitespace/quote clean-up. The first and last
           characters are dropped (they are expected to be the enclosing
           braces). Non-string values, e.g. NaN for an empty cell, give ``{}``.

    Returns:
        dict mapping the text before the first colon of each fragment to the
        text between the first and second colon, with commas removed.
    """
    if not isinstance(s, str):
        # Missing cells arrive as float NaN / None and cannot be sliced.
        return {}

    s = s[1:-1]
    dic = {}

    # Cut after every "<sentiment>," token rather than at every comma, so a
    # comma inside an entity name does not split a pair in two.
    positions = [match.end() for match in re.finditer(r'\b(?:positive|negative|neutral)\b,', s, flags=re.IGNORECASE)]
    for ss in [s[i:j] for i, j in zip([0] + positions, positions + [None])]:
        tmp = ss.split(':')
        if len(tmp) < 2:
            # No colon at all: not an entity:sentiment pair.
            continue
        dic[tmp[0]] = tmp[1].replace(",", "")

    return dic

def prepros(df):
    """
    Normalise the raw ``sentiments`` strings and turn each one into a dict.

    The column is rewritten in place on ``df``: newlines and quote characters
    are removed, whitespace after ``{``, ``:`` and ``,`` and before ``}`` is
    stripped, and finally every string is parsed with ``conv``.

    Returns the same DataFrame object that was passed in.
    """
    # (pattern, replacement) pairs, applied in this order.
    cleanup_rules = (
        ('[\n"\']', ''),
        (r'{\s+', '{'),
        (r'\s+}', '}'),
        (r':\s+', ':'),
        (r',\s+', ','),
    )
    for pattern, replacement in cleanup_rules:
        df.sentiments = df.sentiments.str.replace(pattern, replacement, regex=True)

    # String -> {entity: sentiment} dict.
    df.sentiments = df.sentiments.apply(conv)

    return df

def map_to_root(df, mapping):
    """
    Group sentiments by root political entity.

    1. Entity names are translated with the annotated mapping (Top Entity
       folder): ``mapping`` is indexed by the raw entity string and its ``map``
       column holds the root name.
    2. Entities that are not in the mapping index are left out
       (after the top-100 the frequency drops significantly).

    Args:
        df: DataFrame whose ``sentiments`` column holds
            ``{entity: sentiment}`` dicts.
        mapping: DataFrame indexed by entity string, with a ``map`` column.

    Returns:
        dict mapping each root entity to the list of sentiments recorded for
        it, in the order they were encountered.
    """
    # A set gives O(1) membership tests; the index would otherwise be scanned
    # once per entity key.
    top = set(mapping.index)
    top_dic = {}
    for dic in df.sentiments:
        for k, sentiment in dic.items():
            if k not in top:
                continue
            root = mapping.loc[k].loc['map']
            top_dic.setdefault(root, []).append(sentiment)

    return top_dic

def sent_count(l):
    """
    Count how often each sentiment label occurs in a list.

    Example: ['positive', 'positive', 'negative', 'neutral', 'positive']
    gives {'positive': 3, 'negative': 1, 'neutral': 1}.

    Labels are compared after stripping surrounding whitespace and
    lower-casing, because ``conv`` recognises the sentiment words
    case-insensitively and would otherwise hand over e.g. 'Positive' only for
    it to be dropped here. Anything that is not one of the three labels
    (including non-string items) is ignored.

    Returns:
        dict with exactly the keys 'positive', 'negative' and 'neutral'.
    """
    dic = {'positive': 0, 'negative': 0, 'neutral': 0}
    for i in l:
        if not isinstance(i, str):
            continue
        label = i.strip().lower()
        if label in dic:
            dic[label] += 1

    return dic

def polarity_score(d):
    """
    Compute the polarity score (P - N) / T of a sentiment-count dict.

    P and N are the 'positive' and 'negative' counts and T is the sum of all
    values in ``d``.

    Args:
        d: dict with at least the keys 'positive' and 'negative' and numeric
           values.

    Returns:
        The score, or 0 when there are no counts at all (T == 0).

    Raises:
        KeyError: if 'positive' or 'negative' is missing from ``d``.
    """
    P = d['positive']
    N = d['negative']
    T = sum(d.values())

    # Explicit guard instead of catching every exception from the division.
    if T == 0:
        return 0
    return (P - N) / T

In [3]:
def Main(f, y):
    """
    Build the per-entity sentiment counts for one organisation.

    Args:
        f: Excel file name of the organisation; it is read from both the
           'Top Entity/' and the 'Entity Sentiment Data/' folder.
        y: Value to match against the ``date_year`` column, or the string
           'all' to use the full data.

    Returns:
        dict whose keys are the root political entities and whose values are
        sentiment-count dicts ({'positive': .., 'negative': .., 'neutral': ..}),
        ordered by total number of sentiments, most frequent first.
    """
    mapping = pd.read_excel('Top Entity/' + f)
    # Keep only the entities annotated for inclusion.
    mapping = mapping[mapping.include == 1]

    # Index by the raw entity string so map_to_root can look entities up.
    mapping.index = list(mapping.ent)

    df = pd.read_excel('Entity Sentiment Data/' + f)

    if y != 'all':
        # Copy the filtered rows: prepros assigns to the ``sentiments`` column,
        # and doing that on a slice triggers pandas' SettingWithCopyWarning.
        df = df[df.date_year == y].copy()

    df = prepros(df)

    top_senti_list = map_to_root(df, mapping)

    # Replace each list of sentiments by its count dict.
    for k in top_senti_list:
        top_senti_list[k] = sent_count(top_senti_list[k])

    # Most frequently mentioned entity first (dicts keep insertion order).
    top_senti_list = dict(
        sorted(
            top_senti_list.items(),
            key=lambda x: x[1]['positive'] + x[1]['negative'] + x[1]['neutral'],
            reverse=True,
        )
    )

    return top_senti_list

In [4]:
files = [
    'checkyourfact.xlsx',
    'politifact.xlsx',
    'snopes.xlsx',
    'altnews.xlsx',
    'boomlive.xlsx',
    'opindia.xlsx',
]

# Loop over the organisations: first sum the sentiment counts of all top
# entities, then print the overall polarity score of the organisation.
for f in files:
    top_senti_all = Main(f, 'all')

    alloverall = {'positive': 0, 'negative': 0, 'neutral': 0}

    for i in top_senti_all:
        tmp = top_senti_all[i]
        for j in tmp:
            try:
                alloverall[j] += tmp[j]
            except KeyError:
                # Unexpected sentiment label: report the entity and the label.
                print(i, j)

    print(f, "\t", polarity_score(alloverall))

checkyourfact.xlsx 	 -0.11610169491525424
politifact.xlsx 	 -0.10361752408652972
snopes.xlsx 	 -0.27574266447967927
altnews.xlsx 	 -0.2747336377473364
boomlive.xlsx 	 -0.18501805054151624
opindia.xlsx 	 -0.24912689173457508
