In [2]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import spacy
import ast
import re
import itertools
from thefuzz import process
from itertools import *
import neuralcoref
import en_core_web_sm
from bs4 import BeautifulSoup
import requests
import gender_guesser.detector as gender

import sys
sys.path.insert(0, 'GenderGapTracker/NLP/main')



## Sample Extractions
The following code shows how to extract (speaker, organization) pairs with our methods, how to split the results by organization type, and how to infer the most likely gender of each speaker.

### Part 1: Extractions

In [11]:
import importlib.util
import sys

from GenderGapTracker.NLP.main.quote_extractor import extract_quotes
from GenderGapTracker.NLP.main.entity_gender_annotator import (
    merge_nes, remove_invalid_nes, quote_assign
)
from GenderGapTracker.NLP.main.utils import (
    remove_accents, preprocess_text
)

# Load the en_core_web_sm spaCy model and add a NeuralCoref component
# (max_dist=200) to its pipeline under the name 'neuralcoref'.
# `nlp` is the shared pipeline used by the functions defined below.
nlp = en_core_web_sm.load()
coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
nlp.add_pipe(coref, name='neuralcoref')

# Select functions from GenderTracker file.
def collect_quotes(quotes):
    """Turn extracted quote dicts into ``{'speaker', 'quote'}`` records for tabular display.

    A quote is kept when its named entity type is 'PERSON', or when its quote type
    is 'Heuristic' (the relaxed case, where the speaker name may be blank).
    The quote text is passed through ``preprocess_text``.
    """
    def _is_reportable(entry):
        # PERSON speakers always qualify; heuristic quotes qualify even without one.
        return (
            entry.get('named_entity_type') == 'PERSON'
            or entry.get('quote_type') == 'Heuristic'
        )

    return [
        {
            'speaker': entry.get('named_entity', ""),
            'quote': preprocess_text(entry.get('quote', "")),
        }
        for entry in quotes
        if _is_reportable(entry)
    ]

def extract_quotes_and_entities(sample_text):
    """Convert raw text to a spaCy doc and return its people, sources and quotes.

    Returns a 3-tuple ``(people, sources, quotes_and_sources)``:
      * ``people``  - named-entity keys merged with every quoted speaker (unordered),
      * ``sources`` - the distinct speakers found in the quotes (unordered),
      * ``quotes_and_sources`` - the records built by ``collect_quotes``, sorted by
        speaker name in descending order.
    """
    doc = nlp(preprocess_text(str(sample_text)))
    raw_quotes = extract_quotes(doc_id="temp000", doc=doc, write_tree=False)
    named_entities = remove_invalid_nes(merge_nes(doc))

    # Only the third element of quote_assign's result (the combined quotes) is used.
    _unused_a, _unused_b, all_quotes = quote_assign(named_entities, raw_quotes, doc)

    # reverse=True: speakers come out in descending name order.
    quotes_and_sources = sorted(
        collect_quotes(all_quotes),
        key=lambda record: record['speaker'],
        reverse=True,
    )

    # Distinct speakers, then the union with the entity keys in case the two disagree.
    sources = list(set([record['speaker'] for record in quotes_and_sources]))
    people = list(set(list(named_entities.keys())).union(set(sources)))

    return people, sources, quotes_and_sources

# LIST OF REPORTING VERBS (asserts, declares, says, etc.)
# One entry per line of the rules file. Only the line terminator is stripped:
# slicing with [:-1] would cut the last letter off a final line that has no
# trailing newline. The `with` block closes the file, so no explicit close().
with open('GenderGapTracker/NLP/main/rules/quote_verb_list.txt', 'r') as f:
    reporting_verbs = [line.rstrip('\n') for line in f]

In [13]:
def _entities_by_label(text):
    """Run the shared `nlp` pipeline on `text` and group unique entity strings by label."""
    grouped = {}
    for ent in nlp(text).ents:
        grouped.setdefault(ent.label_, set()).add(str(ent))
    return {label: list(texts) for label, texts in grouped.items()}


def parse_gtracker(df, ents):
    """Attach PERSON and ORG entities to each extracted quote.

    Parameters
    ----------
    df : DataFrame with "text" and "source" columns; its index labels must be
        integers (used for the progress print) and must also exist in `ents`.
    ents : DataFrame whose "val" column holds, per row label, the
        ``(people, sources, quotes_and_sources)`` triple (or its string repr).

    Returns
    -------
    (qse, missed_speaker) : two DataFrames with positional column names.
        ``qse`` rows are ``[idx, sentence_text, persons, orgs, source]`` for quotes
        where both a PERSON and an ORG were found in the text before the quote or,
        when neither kind was found before it, in the text after it.
        ``missed_speaker`` rows are ``[idx, quote]`` for all other quotes.
        Rows are numbered 0..n-1; an empty result is a column-less empty frame.

    Rows are collected in lists and converted once at the end, because
    ``DataFrame.append`` no longer exists in pandas >= 2.0 and was quadratic.
    """
    qse_rows = []
    missed_rows = []
    for idx, row in df.iterrows():
        if idx % 1000 == 0:
            print(idx)
        # The stored value may be a real tuple or its string form; str() + literal_eval
        # handles both.
        _people, _sources, quote_records = ast.literal_eval(str(ents.loc[idx, "val"]))
        sents = [sent.text for sent in nlp(row["text"]).sents]
        for record in quote_records:
            quote = record['quote']
            # Drop the first and last character (the quotation marks) before searching.
            bare_quote = quote[1:-1]
            if not bare_quote:
                # Nothing to search for; str.split('') would raise ValueError.
                missed_rows.append([idx, quote])
                continue
            matching = [s for s in sents if bare_quote in s]
            # NOTE: this is the repr of the list of matching sentences without the outer
            # brackets, so the stored text carries Python string quoting. Kept as-is
            # so the values written to `qse` do not change.
            match_sent = str(matching)[1:-1]
            around_quote = match_sent.split(bare_quote)

            # Look for the speaker in the text that precedes the quote first.
            entities = _entities_by_label(around_quote[0])
            found_any = 'PERSON' in entities or 'ORG' in entities
            if not found_any and len(around_quote) > 1:
                # Neither kind of entity before the quote: try the text right after it.
                entities = _entities_by_label(around_quote[1])

            if 'PERSON' in entities and 'ORG' in entities:
                qse_rows.append(
                    [idx, match_sent, entities['PERSON'], entities['ORG'], row["source"]]
                )
            else:
                missed_rows.append([idx, quote])
    return pd.DataFrame(qse_rows), pd.DataFrame(missed_rows)


In [19]:
# CODE TO PROCESS A DATAFRAME
def get_qse_ner(df):
    """Run quote extraction on every row of `df`, then attach persons and orgs.

    `df` needs a "text" column. Each row's text goes through
    ``extract_quotes_and_entities``; the results are stored in a helper frame
    (columns 'idx' and 'val', indexed by 'idx') that is handed to
    ``parse_gtracker`` together with `df`.

    Returns the ``(quotes_with_entities, missed_quotes)`` pair from ``parse_gtracker``.
    """
    print("Running GenderTracker Dependency Parse Method...")
    extracted = []
    for label in df.index.values:
        extracted.append([label, extract_quotes_and_entities(df.loc[label, "text"])])
    df_ents = pd.DataFrame(extracted)
    print("Done.")

    # Name the two positional columns and index the frame by the original row label.
    df_ents.columns = ['idx', 'val']
    df_ents.index = df_ents['idx']

    print("Finding quotes with persons and orgs attached...")
    quotes_with_entities, missed_quotes = parse_gtracker(df, df_ents)
    print("Done.")
    return (quotes_with_entities, missed_quotes)

In [46]:
# Extract (expert, organization) for dependency parse (DEP Method)
# Only the first 30 rows of data/data.csv are processed in this sample run.
aylien_data = pd.read_csv("data/data.csv")
aylien_qse, aylien_missed = get_qse_ner(aylien_data[:30])
# Renumber the rows 0..n-1 and name the five positional columns of the result.
# (The column assignment raises if no quote was attributed, i.e. the frame is empty.)
aylien_qse.index = np.arange(0,aylien_qse.shape[0])
aylien_qse.columns = ['idx','quote','people','orgs','source']

# TODO: NER

Running GenderTracker Dependency Parse Method...
Done.
Finding quotes with persons and orgs attached...
0
Done.


### Part 2: Distinguishing by Organization Type

In [39]:
# return dataframe softmatching >= thresh% to an organization in org_df
# and dataframe of instances which failed to softmatch.
def match_org_test(df, org_df, thresh):
    """Soft-match each row's organizations against a list of known organization names.

    Parameters
    ----------
    df : DataFrame with 'idx', 'quote', 'people' and 'orgs' columns and integer
        index labels. 'orgs' holds a list of names or its string repr.
    org_df : collection of reference organization names passed to ``process.extract``.
    thresh : minimum fuzzy score for a row to count as matched.

    Returns
    -------
    (matches, missed)
        ``matches`` has the positional columns 0..4:
        ``[row label, idx, quote, people, best matching reference name]``.
        These columns exist even when nothing matched, so callers can always drop
        column 0 and rename the rest.
        ``missed`` holds the rows of `df` (original labels and columns) whose best
        score stayed below `thresh`, or that had no usable organization at all.

    Rows are collected in lists and converted once, because ``DataFrame.append``
    no longer exists in pandas >= 2.0.
    """
    match_rows = []
    missed_positions = []
    for position, (idx, match) in enumerate(df.iterrows()):
        if idx % 100 == 0:
            print(idx)
        best_org = None
        best_score = 0
        for t_org in ast.literal_eval(str(match['orgs'])):
            t_org = t_org.replace("\\", "")
            # Skip organizations 1 or 2 chars long
            if len(t_org) <= 2:
                continue
            candidates = process.extract(t_org, org_df, limit=1)
            if not candidates:
                # Empty reference list: nothing to compare against.
                continue
            this_org, this_score = candidates[0][0], candidates[0][1]
            if this_score > best_score:
                best_org, best_score = this_org, this_score
        # best_org is reset per row, so a row can never inherit the previous row's match.
        if best_org is not None and best_score >= thresh:
            match_rows.append([idx, match['idx'], match['quote'], match['people'], best_org])
        else:
            missed_positions.append(position)
    matches = pd.DataFrame(match_rows, columns=list(range(5)))
    missed = df.iloc[missed_positions].copy()
    return (matches, missed)

In [61]:
# Split by organization type
# Read in educational institutions and match where we can.
# The three stages are chained: each one only searches the rows the previous
# stage could not match (`missed`).
# match_org_test returns five positional columns; column 0 (the row label of the
# frame that was searched) is dropped before the remaining four are named.

# DEP Method
times15 = pd.read_csv('data/timesData-2015rankings.csv')
educ_qse, missed = match_org_test(aylien_qse,times15.university_name.unique(),90)
educ_qse = educ_qse.drop(columns = [0])
educ_qse.columns = ['idx','quote','people','educ_inst']
educ_qse.index = np.arange(educ_qse.shape[0])

gov = pd.read_csv("data/gov_agencies.csv")
gov_qse, missed = match_org_test(missed,gov["0"].unique(),90)
gov_qse = gov_qse.drop(columns = [0])
gov_qse.columns = ['idx','quote','people','orgs']
gov_qse.index = np.arange(gov_qse.shape[0])

thinktanks = pd.read_csv("data/thinktanks.csv")
thinkt_qse, missed = match_org_test(missed, thinktanks["tt_name_en"].unique(),90)
# Column 0 must be dropped here as well; otherwise four names are assigned to
# five columns and pandas raises a length-mismatch error.
thinkt_qse = thinkt_qse.drop(columns = [0])
thinkt_qse.columns = ['idx','quote','people','orgs']
thinkt_qse.index = np.arange(thinkt_qse.shape[0])

# TODO: NER

0


### Part 3: Gender Inference

In [37]:
# Read andy_dict.csv and unknown_dict.csv and turn each into a Person -> Gender
# lookup. Double-quote characters are removed from both columns first.
unknown = pd.read_csv("data/unknown_dict.csv").loc[:, ['Person', 'Gender']]
unknown['Person'] = unknown['Person'].str.replace('"', '')
unknown['Gender'] = unknown['Gender'].str.replace('"', '')
unknown_dict = dict(zip(unknown['Person'], unknown['Gender']))

andy = pd.read_csv("data/andy_dict.csv").loc[:, ['Person', 'Gender']]
andy['Person'] = andy['Person'].str.replace('"', '')
andy['Gender'] = andy['Gender'].str.replace('"', '')
andy_dict = dict(zip(andy['Person'], andy['Gender']))

def _infer_gender(first_name, detector, use_dict=True):
    """Return the gender label for a first name: lookup dictionaries first, then the detector."""
    if use_dict and first_name in unknown_dict:
        label = unknown_dict[first_name]
    elif use_dict and first_name in andy_dict:
        label = andy_dict[first_name]
    else:
        label = detector.get_gender(first_name)
    # Fold the "mostly_*" labels into their base class, whatever their origin.
    return {"mostly_male": "male", "mostly_female": "female"}.get(label, label)


def get_genders_total(df, usedict = True):
    """Count the genders of all people listed in each row (repeats included).

    Parameters
    ----------
    df : DataFrame with a "people" column holding a list of names or its string repr.
        The frame is modified in place.
    usedict : when truthy, `unknown_dict` and `andy_dict` are consulted (in that
        order) before falling back to the gender_guesser detector.

    Returns
    -------
    The same `df`, with these columns added or overwritten:
    ``gender_t`` (string repr of the per-person labels) and the integer counters
    ``male_t``, ``female_t``, ``unknown_t`` and ``andy_t``.

    Raises
    ------
    KeyError if a label outside male/female/unknown/andy comes back from a dictionary.
    """
    detector = gender.Detector()
    # gender_t holds strings, so it starts as a string column; starting it at 0
    # would later mean assigning a str into an integer column.
    df["gender_t"] = ""
    for counter in ("male_t", "female_t", "unknown_t", "andy_t"):
        df[counter] = 0
    for i, row in df.iterrows():
        row_gender = []
        # str() lets the column hold either real lists or their string form.
        for person in ast.literal_eval(str(row["people"])):
            first_name = person.split(" ")[0]
            g = _infer_gender(first_name, detector, usedict)
            row_gender.append(g)
            df.loc[i, str(g) + "_t"] += 1
        df.loc[i, "gender_t"] = str(row_gender)
    return df

def get_genders_unique(df):
    """Count the genders of people per row, counting each distinct first name only once.

    A first name that fuzzy-matches (score > 90) a name seen earlier, in this row
    or a previous one, is treated as a repeat: its running count goes up and it
    adds nothing to the row's gender counters.

    Parameters
    ----------
    df : DataFrame with a "people" column holding a list of names or its string repr.
        The frame is modified in place.

    Returns
    -------
    (df, past)
        ``df`` gains ``gender_u`` (string repr of the labels of the names first seen
        in that row) and the integer counters ``male_u``, ``female_u``,
        ``unknown_u`` and ``andy_u``.
        ``past`` has one row per distinct first name with the columns
        ``Name``, ``Count`` (number of sightings) and ``Gender``.

    Raises
    ------
    KeyError if a label outside male/female/unknown/andy comes back from a dictionary.
    """
    detector = gender.Detector()

    def _lookup(first_name):
        # Dictionaries take precedence over the detector; "mostly_*" is folded
        # into its base class.
        if first_name in unknown_dict:
            label = unknown_dict[first_name]
        elif first_name in andy_dict:
            label = andy_dict[first_name]
        else:
            label = detector.get_gender(first_name)
        return {"mostly_male": "male", "mostly_female": "female"}.get(label, label)

    # gender_u holds strings, so it starts as a string column rather than 0.
    df["gender_u"] = ""
    for counter in ("male_u", "female_u", "unknown_u", "andy_u"):
        df[counter] = 0

    # Names seen so far, kept as parallel lists and converted to a frame once at
    # the end (DataFrame.append no longer exists in pandas >= 2.0).
    names, counts, genders = [], [], []
    for i, row in df.iterrows():
        row_gender = []
        for person in ast.literal_eval(str(row["people"])):
            first_name = person.split(" ")[0]
            closest = process.extract(first_name, names, limit=1) if names else []
            if closest and closest[0][1] > 90:
                # Repeat sighting: update the tally in place. (The former chained
                # `past.loc[...]["Count"] = ...` wrote to a temporary copy.)
                counts[names.index(closest[0][0])] += 1
                continue
            g = _lookup(first_name)
            row_gender.append(g)
            df.loc[i, str(g) + "_u"] += 1
            names.append(first_name)
            counts.append(1)
            genders.append(g)
        # Written once per row, so rows with no new names get "[]" as well.
        df.loc[i, "gender_u"] = str(row_gender)

    past = pd.DataFrame(
        {"Name": names, "Count": counts, "Gender": genders},
        columns=["Name", "Count", "Gender"],
    )
    return (df, past)


# Creates four new columns in df that signal the number of people quoted
# with that gender
def gender_counts_total(df):
    """Add per-row gender counts, resolving 'unknown' people against earlier known names.

    Parameters
    ----------
    df : DataFrame with 'people' and 'gender' columns, each holding a list (or its
        string repr); the two lists are paired element by element. Modified in place.

    Returns
    -------
    (df, past)
        ``df`` gains the integer columns 'male', 'female', 'unknown' and 'andy'.
        A person labelled 'unknown' is counted under the gender of the closest
        previously seen name when the fuzzy score is >= 90, otherwise under 'unknown'.
        ``past`` lists every person with a known gender: columns 'Names', 'Count'
        (never filled, all NaN) and 'Gender' (0 male, 1 female, 2 andy), indexed by
        the row label of `df` the person came from.

    Raises
    ------
    KeyError for a gender label that is not in the mapping below.
    """
    for counter in ('male', 'female', 'unknown', 'andy'):
        df[counter] = 0
    # [0,1,2,3] = [male,female,andy,unknown]
    gender_dict = {'male': 0, 'mostly_male': 0, 'female': 1, 'mostly_female': 1, 'andy': 2, 'unknown': 3}
    rev_gender_dict = {0: 'male', 1: 'female', 2: 'andy', 3: 'unknown'}

    first_code = {}      # cleaned name -> gender code of its first sighting
    past_names, past_codes, past_index = [], [], []
    for i, row in df.iterrows():
        genders = ast.literal_eval(str(row['gender']))
        people = ast.literal_eval(str(row['people']))
        for p, g in zip(people, genders):
            p = re.sub(r'[()]', '', p)  # strip parentheses from the name
            if g == 'unknown':
                # Fuzzy matching is only needed for unknown people.
                target = 'unknown'
                closest = process.extract(p, list(first_code), limit=1) if first_code else []
                if closest and closest[0][1] >= 90:
                    target = rev_gender_dict[first_code[closest[0][0]]]
                df.loc[i, target] += 1
            else:
                code = gender_dict[g]
                # mostly_male / mostly_female are counted under male / female.
                df.loc[i, rev_gender_dict[code]] += 1
                first_code.setdefault(p, code)
                past_names.append(p)
                past_codes.append(code)
                past_index.append(i)

    # Built once at the end; DataFrame.append no longer exists in pandas >= 2.0.
    past = pd.DataFrame(
        {'Names': past_names, 'Count': [float('nan')] * len(past_names), 'Gender': past_codes},
        columns=['Names', 'Count', 'Gender'],
        index=past_index,
    )
    return df, past

In [62]:
# get_genders_total parses the "people" column with ast.literal_eval, so the
# lists are stored as their string form first.
aylien_qse.people = aylien_qse.people.astype(str)

# DEP
# get_genders_total adds its columns to the frame it receives and returns that
# same frame, so each whole_* name refers to the (modified) input frame.
whole_qse = get_genders_total(aylien_qse)
whole_educ = get_genders_total(educ_qse)
whole_gov = get_genders_total(gov_qse)
whole_thinkt = get_genders_total(thinkt_qse)

# TODO: NER