In [7]:
import re
import os

import pandas as pd

def count_words(clean_data):
    """Tally how often each item occurs in ``clean_data``.

    Args:
        clean_data: Iterable of hashable items (for example n-grams).

    Returns:
        dict mapping every distinct item to its number of occurrences.
    """
    tally = {}
    for item in clean_data:
        # .get() supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

def correct_entity_labels(named_entites):
    """Re-label entities whose text matches a curated keyword list.

    ``entity_text`` is converted to lower-case strings, then every rule in
    the table below is tried in order. When several rules match the same
    text, the last matching rule decides the label. Rows that match no rule
    keep their existing label.

    Args:
        named_entites: DataFrame with ``entity_text`` and ``entity_label``
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with both columns updated.
    """
    flags = re.IGNORECASE

    # Ordered (label, pattern) rules -- order matters, the last match wins.
    rules = (
        ('EVENT', re.compile(r'covid|coronavirus|vaccine|omicron|delta|vaccinat|vaxx|pandemic|mask|\bncov\b', flags)),
        # d- and r- are recategorized to represent democrat or republican
        ('ORG', re.compile(r'white house covid-19 team|glaxosmithkline|office of vaccines research and review|front line covid-19 critical care alliance|house select subcommittee on the coronavirus crisis|^d-|^r-|refinitiv lipper|england patriots|scarlet.*knights|astros|task(.*)force|democrat|republican|delta air|cornell|mckesson', flags)),
        ('ORG', re.compile(r'pfizer|moderna|johnson', flags)),
        ('EVENT', re.compile(r'twitter .*|mandate|^tts$', flags)),
        ('PERSON', re.compile(r'warren|trump|psaki|palin|fauci|santa claus|desantis|cuomo|newsom|de blasio|biden|horak|mayorkas|michelle|kilgore|murphy|walensky', flags)),
        ('GPE', re.compile(r'eswatini|seattle|america| u\.s\.|beijing|louisiana|michigan|saxony', flags)),
        ('FAC', re.compile(r'flightaware|allegiant stadium|covid data tracker|long covid clinic at|vaccine adverse event reporting system|international vaccine access center', flags)),
        ('LAW', re.compile(r'E.*O.*2021-18|nuremberg code|arizona constitution|Civil Rights Act', flags)),
        ('PRODUCT', re.compile(r'hunger games', flags)),
    )

    # Stringify *before* lower-casing: `.str.lower()` turns non-string cells
    # into NaN, so lower-casing first would store them as the text 'nan'.
    text = named_entites['entity_text'].astype(str).str.lower()
    labels = named_entites['entity_label']

    for label, pattern in rules:
        # Element-wise match on the text column only; this avoids a row-wise
        # DataFrame.apply per rule and also works on an empty frame.
        hits = text.map(
            lambda value, p=pattern: isinstance(value, str) and p.search(value) is not None
        )
        # Pass a plain boolean array so no index alignment takes place.
        labels = labels.mask(hits.astype(bool).to_numpy(), label)

    named_entites['entity_text'] = text
    named_entites['entity_label'] = labels

    return named_entites


def clean_entities(df, source):
    """Normalise entity text and drop rows that are not real entities.

    Steps, in order:
      1. Rows with missing ``entity_text`` are dropped.
      2. ``entity_text`` is stringified, lower-cased and passed through the
         ordered substitutions below (leading "the ", possessives, double
         quotes, covid-19 spelling variants).
      3. Rows matching any removal pattern are dropped: news outlets, image
         credits, junk tokens, Fox ad text, plus source-specific patterns
         when ``source`` is ``'NPR'`` or ``'FOX'``.
      4. For ``'FOX'``, text containing "abbott" is labelled ``PERSON``.
      5. Text is stripped and rows shorter than two characters are dropped.

    Args:
        df: DataFrame with ``entity_text`` and ``entity_label`` columns.
            It is not modified; the function works on a copy.
        source: Outlet name. Only ``'NPR'`` and ``'FOX'`` enable extra rules.

    Returns:
        A new, cleaned DataFrame.
    """
    # Ordered substitutions; each one runs on the output of the previous one.
    # NOTE(review): 'covid.*19' is greedy and rewrites everything between the
    # first "covid" and the last "19" -- confirm that is intended.
    substitutions = (
        (re.compile(r'^the '), ''),
        (re.compile(r'\'s'), ''),
        (re.compile(r's\''), ''),
        (re.compile(r'’s'), ''),
        (re.compile(r's’'), ''),
        (re.compile(r'\"'), ''),
        (re.compile(r'covid.*19'), 'covid-19'),
        (re.compile(r'covid.*19 -'), 'covid-19'),
        (re.compile(r'rain due to covid'), 'covid-19'),
    )

    removal_patterns = [
        # news outlets
        re.compile(r'fox|llc|abc|associated press|^cbs$|reuters|cnn|\bap\b|npr|nbc|buzzfeed|tribune|boston globe|bloomberg|vox|washington post|huffpost|new york post|new york times|usa today'),
        # image credits
        re.compile(r'photo|getty|flickr|istock|^afp$'),
        # junk; 'nan' is anchored so that only a stringified missing value is
        # removed, not every name containing "nan" (nancy, finance, ...)
        re.compile(r'^nan$|file|^quote$|^quotes$'),
        # fox ads
        re.compile(r'factset|mutual fund| lipper|^app$'),
    ]

    if source == 'NPR':
        # replacement-character glyphs
        removal_patterns.append(re.compile(r'�'))
        # reporters / photographers
        removal_patterns.append(re.compile(r'rick bowmer|alex brandon|john minchillo|marta lavandier|daniel wood|jonathan franklin|bill chappell|moffett|scott neuman|matthew s. schwartz|mary altaffer|sarah silbiger|lynne sladky|ethan miller|scott hensley|ed jones|scott heins|vanessa romo|rachel treisman|jaclyn diaz'))

    if source == 'FOX':
        # reporters / photographers
        removal_patterns.append(re.compile(r'evan vucci|^michelle$|michael lee|kyle morris|andrew mark miller|austin'))

    def normalise(text):
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        return text

    def is_unwanted(text):
        return any(pattern.search(text) for pattern in removal_patterns)

    # Copy so the caller's frame is untouched and later column assignments
    # never land on a slice of another frame.
    df = df[df['entity_text'].notna()].copy()
    df['entity_text'] = df['entity_text'].astype(str).str.lower().map(normalise)

    unwanted = df['entity_text'].map(is_unwanted).astype(bool).to_numpy()
    df = df[~unwanted].copy()

    if source == 'FOX':
        names = re.compile(r'abbott')
        is_named = df['entity_text'].map(
            lambda text: names.search(text) is not None
        ).astype(bool).to_numpy()
        df['entity_label'] = df['entity_label'].mask(is_named, 'PERSON')

    df['entity_text'] = df['entity_text'].str.strip()

    # A boolean mask (rather than drop-by-index-label) stays correct when the
    # index contains duplicate labels.
    long_enough = (df['entity_text'].str.len() >= 2).to_numpy()

    return df[long_enough].copy()

def to_upper(row):
    """Upper-case ``row`` when it has at most three characters.

    Longer strings are returned unchanged.
    """
    return row.upper() if len(row) <= 3 else row

def title_case(row):
    """Title-case ``row`` when it has more than three characters.

    Strings of three characters or fewer are returned unchanged.
    """
    if len(row) <= 3:
        return row
    return row.title()

def correct_case(df):
    """Apply display casing to ``entity_text``.

    Each value goes through ``to_upper`` and then ``title_case``. The column
    is overwritten on ``df`` itself, and the same frame is returned.
    """
    recased = df['entity_text'].apply(to_upper).apply(title_case)
    df['entity_text'] = recased
    return df


def find_proportion(df):
    """Return each row's share of the summed ``count`` column, in percent."""
    counts = df['count']
    total = counts.sum()
    share = counts / total
    return share * 100


def export_measurements(df, ts, filter_by, source,
                        results_dir='/home/stephbuon/projects/entascope/results/'):
    """Clean the entities, measure one label and write the result to CSV.

    The frame is label-corrected and cleaned, restricted to rows whose
    ``entity_label`` equals ``filter_by``, and given three extra columns:
    ``source``, ``count`` (occurrences of each ``entity_text``) and
    ``proportion`` (percentage of the summed counts, i.e. relative to
    entities of the same label only). Rows are de-duplicated, re-cased and
    sorted by descending proportion, then written to
    ``<results_dir>/<ts>/<source>_<filter_by>.csv``.

    Args:
        df: DataFrame with ``entity_text`` and ``entity_label`` columns.
            It is not modified; a copy is processed.
        ts: Name of the sub-directory of ``results_dir`` to write into.
        filter_by: Entity label to keep, e.g. ``'PERSON'``.
        source: Outlet name; stored in the ``source`` column, used in the
            file name and forwarded to ``clean_entities``.
        results_dir: Root directory for result files. Missing directories
            are created.

    Returns:
        None.
    """
    # Copy first: label correction writes to its argument, and callers pass
    # the same frame once per label.
    df = correct_entity_labels(df.copy())
    df = clean_entities(df, source)

    # Drop the 'Unnamed: 0' column only when it is present, instead of
    # swallowing every exception.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    df = df[df['entity_label'] == filter_by].copy()

    df['source'] = source
    df['count'] = df.groupby('entity_text')['entity_text'].transform('count')

    # Collapse repeated rows so each distinct row contributes its count once.
    df = df.drop_duplicates()

    df['proportion'] = find_proportion(df)

    df = correct_case(df)
    df = df.sort_values(by=['proportion'], ascending=False)

    out_dir = os.path.join(results_dir, ts)
    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(out_dir, exist_ok=True)

    df.to_csv(os.path.join(out_dir, source + '_' + filter_by + '.csv'))

In [4]:
# Entity labels for which a separate results file is exported.
ent_labels = [
    'PERSON',
    'EVENT',
    'ORG',
]

In [5]:
# Load the NPR named entities for the scrape dated `ts` and export one
# results file per label in `ent_labels`.
ts = '12-27-2021'
npr_ne = pd.read_csv(
    '/home/stephbuon/projects/entascope/scraped_pages/'
    + ts + '/' + 'named_entities_NPR_' + ts + '.csv'
)

for label in ent_labels:
    export_measurements(npr_ne, ts, filter_by=label, source='NPR')

In [8]:
# Load the FOX named entities for the scrape dated `ts` and export one
# results file per label in `ent_labels`.
ts = '12-27-2021'
fox_ne = pd.read_csv(
    '/home/stephbuon/projects/entascope/scraped_pages/'
    + ts + '/' + 'named_entities_FOX_' + ts + '.csv'
)

for label in ent_labels:
    export_measurements(fox_ne, ts, filter_by=label, source='FOX')