In [1]:
import pandas as pd
import re

In [2]:
def conv(s):
    """
    Parse a model-generated sentiment string into an ``{entity: sentiment}`` dict.

    GPT-3.5 usually answers in the requested dict-like format, e.g.
    ``{entity a:positive,entity b:negative}``, but not always. The text is
    therefore cut right after every ``positive,`` / ``negative,`` / ``neutral,``
    (case-insensitive) and each piece is split into an entity and a sentiment.

    Parameters
    ----------
    s : str
        Raw sentiment text, optionally wrapped in ``{`` ... ``}``. Anything that
        is not a string (for example NaN) is treated as "no entities".

    Returns
    -------
    dict
        Maps each entity (text before the last ``:`` of a piece) to its
        sentiment (text after it, with commas removed). Pieces without a ``:``
        are skipped. If an entity occurs more than once, the last one wins.
    """
    # Non-string cells cannot be sliced or regex-searched; report no entities.
    if not isinstance(s, str):
        return {}

    # Drop the surrounding braces only when they are really there, so that an
    # unwrapped answer does not lose its first and last characters.
    s = s.strip()
    if s.startswith('{'):
        s = s[1:]
    if s.endswith('}'):
        s = s[:-1]

    dic = {}

    # End offsets of every "<sentiment>," occurrence: each one closes a pair.
    positions = [match.end() for match in re.finditer(r'\b(?:positive|negative|neutral)\b,', s, flags=re.IGNORECASE)]
    for ss in [s[i:j] for i, j in zip([0] + positions, positions + [None])]:
        # Split on the LAST colon: the sentiment is always the trailing field,
        # while the entity itself may contain colons.
        entity, sep, sentiment = ss.rpartition(':')
        if not sep:
            # No colon at all -> not an entity:sentiment pair; skip it.
            continue
        dic[entity] = sentiment.replace(",", "")

    return dic

def prepros(df):
    """
    Normalise the raw ``sentiments`` strings and parse each one into a dict.

    The ``sentiments`` column of ``df`` is cleaned step by step (the column is
    overwritten on the passed-in frame) and finally mapped through ``conv``.
    The same DataFrame object is returned.
    """
    # (pattern, replacement) pairs, applied in exactly this order.
    cleanup_steps = (
        ('[\n"\']', ''),  # drop newlines and both kinds of quote characters
        (r'{\s+', '{'),   # no whitespace right after the opening brace
        (r'\s+}', '}'),   # no whitespace right before the closing brace
        (r':\s+', ':'),   # no whitespace after a colon
        (r',\s+', ','),   # no whitespace after a comma
    )
    for pattern, replacement in cleanup_steps:
        df.sentiments = df.sentiments.str.replace(pattern, replacement, regex=True)

    # Turn every cleaned string into an {entity: sentiment} dict.
    df.sentiments = df.sentiments.apply(conv)

    return df

In [3]:
# Workbooks to process; each one is read from the "Entity Sentiment Data/" folder.
files = [
    'checkyourfact.xlsx',
    'politifact.xlsx',
    'snopes.xlsx',
    'altnews.xlsx',
    'boomlive.xlsx',
    'opindia.xlsx',
]

for f in files:
    # Parse the sentiment column, then find the 100 most frequent entities of
    # this workbook so they can be annotated by hand.
    df = prepros(pd.read_excel("Entity Sentiment Data/"+f))

    # After prepros every `sentiments` value is a dict keyed by entity, so
    # extending with the dict itself collects its entity names.
    entities = []
    for dic in df.sentiments:
        entities.extend(dic)

    # value_counts() sorts by frequency (descending); keep the first 100.
    top100 = pd.Series(entities).value_counts().head(100)

    tmp = pd.DataFrame()
    tmp['ent'] = top100.index
    tmp['count'] = top100.values

    # Once the file below has been saved, it is annotated manually so that only the
    # top political entities are kept and name variants get one uniform mapping,
    # e.g. "pm modi", "pm narendra modi" and "narendra modi" all map to "narendra modi".
    # The save stays commented out here (presumably so a re-run does not overwrite
    # the annotated files - confirm before re-enabling).
    # tmp.to_excel('Top Entity/'+f, index=False)

In [4]:
# Per workbook: how the manually annotated `include` flag is distributed over the
# top-100 entities (presumably 1 = kept as a political entity - confirm).
for f in files:
    df = pd.read_excel("Top Entity/"+f)
    include_counts = df.include.value_counts()

    # File name, the counts, then one blank line as a separator.
    print(f, include_counts, "", sep="\n")

checkyourfact.xlsx
include
0    74
1    26
Name: count, dtype: int64

politifact.xlsx
include
0    59
1    41
Name: count, dtype: int64

snopes.xlsx
include
0    69
1    31
Name: count, dtype: int64

altnews.xlsx
include
0    58
1    42
Name: count, dtype: int64

boomlive.xlsx
include
0    62
1    38
Name: count, dtype: int64

opindia.xlsx
include
1    51
0    49
Name: count, dtype: int64

