# Names

In [17]:
import sys; sys.path.insert(0,'..')
from bechdeltest import *

## Getting all unique names

In [18]:
# Corpus metadata; its `.index` is iterated as the text ids in get_all_full_names().
df = get_corpus_metadata()
# df

In [19]:
# get_text_cast('Ted-Lasso')
# get_text_dialogue('Ted-Lasso')

In [20]:
def get_text_names(text_id):
    """Count every name attached to one text, normalised to title case.

    Names are gathered from the cast table (``actor_name`` and ``char_name``
    columns) and from the dialogue table (``speaker`` column). Missing
    values (``None`` / ``NaN``) are skipped so they are not counted as the
    literal names ``"None"`` / ``"Nan"``.

    Parameters
    ----------
    text_id
        Identifier handed to ``get_text_cast`` and ``get_text_dialogue``.

    Returns
    -------
    Counter
        Title-cased name -> number of occurrences across both tables.
    """
    df_cast = get_text_cast(text_id)
    # An empty cast table may lack the expected columns, so only read them when rows exist.
    cast_names = (list(df_cast.actor_name) + list(df_cast.char_name)) if len(df_cast) else []

    df_dial = get_text_dialogue(text_id)
    dial_names = list(df_dial.speaker) if len(df_dial) else []

    return Counter(
        # str() keeps non-string cells (e.g. numeric speaker labels) from raising.
        str(name).title()
        for name in cast_names + dial_names
        if not pd.isna(name)
    )

In [21]:
def get_all_full_names():
    """Return name -> occurrence count over the whole corpus, cached as JSON.

    On first use the counts from ``get_text_names`` are summed over every id
    in ``df.index`` and written to ``all_full_names.json`` under
    ``PATH_DATA``, ordered from most to least common. Later calls read that
    file back instead of recomputing.

    Returns
    -------
    dict
        Name -> count.
    """
    ofn = os.path.join(PATH_DATA, 'all_full_names.json')
    if os.path.exists(ofn):
        with open(ofn) as f:
            return json.load(f)

    names = Counter()
    for text_id in tqdm(df.index):
        names += get_text_names(text_id)
    name_counts = dict(names.most_common())
    with open(ofn, 'w') as of:
        json.dump(name_counts, of, indent=4)
    # Return the freshly built mapping as well: the cache-building call used to
    # fall through and hand None to its caller.
    return name_counts

## Testing whether names are real person names

In [22]:
def check_real_names_stanza(names):
    """Flag which strings Stanza's English NER tags as containing a person.

    Every name is run through a tokenize/mwt/ner pipeline on its own and maps
    to True when at least one recognised entity has type 'PERSON'.

    Returns a dict of name -> bool.
    """
    import stanza
    ner_pipeline = stanza.Pipeline('en', verbose=False, processors='tokenize,mwt,ner')
    is_person = {}
    for candidate in tqdm(names):
        entity_types = [entity.type for entity in ner_pipeline(candidate).entities]
        is_person[candidate] = 'PERSON' in entity_types
    return is_person

    

In [23]:
def check_real_names_spacy(names):
    """Flag which strings spaCy's small English model tags as containing a person.

    The model has to be installed beforehand:
        python -m spacy download en_core_web_sm

    Every name is processed on its own and maps to True when at least one of
    the document's entities carries the label 'PERSON'.

    Returns a dict of name -> bool.
    """
    import spacy
    from tqdm import tqdm

    # The model is loaded with spacy.load defaults; only its entities are inspected.
    model = spacy.load("en_core_web_sm")
    verdicts = {}
    for candidate in tqdm(names):
        labels = [span.label_ for span in model(candidate).ents]
        verdicts[candidate] = 'PERSON' in labels
    return verdicts

    


In [24]:
# check_real_names_spacy(['Barack', 'Spacy Jones', 'Headmistress'])

In [31]:
def get_all_full_names_real(force=False):
    """Return full name -> bool (spaCy found a PERSON entity), cached as JSON.

    Parameters
    ----------
    force : bool, default False
        Recompute and overwrite ``all_full_names_real.json`` even when it
        already exists. This is the same switch ``get_all_first_names_real``
        offers.

    Returns
    -------
    dict
        Name -> True/False as produced by ``check_real_names_spacy``.
    """
    ofn = os.path.join(PATH_DATA, 'all_full_names_real.json')
    if not force and os.path.exists(ofn):
        with open(ofn) as f:
            return json.load(f)

    name_reald = check_real_names_spacy(get_all_full_names())
    with open(ofn, 'w') as of:
        json.dump(name_reald, of)
    return name_reald


In [33]:
# Full name -> bool (PERSON entity found), loaded from the JSON cache when it exists.
all_full_names_real = get_all_full_names_real()
# all_full_names_real

In [54]:
def get_all_first_names_real(force=False):
    """Return first name -> bool (spaCy found a PERSON entity), cached as JSON.

    The first whitespace-separated token is taken from every full name that
    ``get_all_full_names_real`` marked as real (after ``strip()`` and
    ``strip_punct``), and the distinct tokens are checked again with
    ``check_real_names_spacy``.

    Parameters
    ----------
    force : bool, default False
        Rebuild ``all_first_names_real.json`` even if it already exists.
    """
    cache_path = os.path.join(PATH_DATA, 'all_first_names_real.json')
    if not force and os.path.exists(cache_path):
        with open(cache_path) as fh:
            return json.load(fh)

    first_names = Counter()
    for full_name, is_real in get_all_full_names_real().items():
        if not is_real:
            continue
        first_names[strip_punct(full_name.strip()).split()[0]] += 1

    # Iterating the Counter yields each distinct first name once.
    verdicts = check_real_names_spacy(first_names)
    with open(cache_path, 'w') as fh:
        json.dump(verdicts, fh)
    return verdicts

In [55]:
# valid_first_names = {strip_punct(name) for name,isreal in get_all_first_names_real().items() if isreal}
# len(valid_first_names)

In [58]:
# get_all_first_names_real()

In [60]:
# random.sample(list(valid_first_names),10)

## Genderize

In [61]:
def genderfy(names, api_key=None):
    """Look up the likely gender of each name through the genderize.io client.

    Parameters
    ----------
    names : list of str
        Names to look up.
    api_key : str, optional
        genderize.io API key. When omitted, the ``GENDERIZE_API_KEY``
        environment variable is used; if that is unset too, the client is
        created without a key.

    Returns
    -------
    Whatever ``Genderize.get`` returns for ``names``. The caller in this
    notebook JSON-dumps it and reads ``name`` and ``probability`` fields from
    it (presumably a list of per-name dicts; confirm against the client docs).
    """
    import os
    from genderize import Genderize

    # Credentials must not be committed with the notebook, so the key comes
    # from the caller or the environment rather than a literal in the source.
    if api_key is None:
        api_key = os.environ.get('GENDERIZE_API_KEY') or None
    client = Genderize(
        api_key=api_key,
        timeout=5.0
    )
    return client.get(names)

In [63]:
# genderfy(['Ryan','Tabitha'])

In [86]:
def get_all_first_names_real_genderized(force=False, min_prob=.9):
    """Return a gender table for the first names that passed the NER check.

    The raw ``genderfy`` response is cached in
    ``all_first_names_real_genderized.json`` under ``PATH_DATA``. The
    ``min_prob`` filter is applied after loading, so changing it does not
    need a new API call.

    Parameters
    ----------
    force : bool, default False
        Query the API again and overwrite the cache even if it exists.
    min_prob : float, default 0.9
        Keep only rows whose ``probability`` is at least this value.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``name``, sorted by ``probability`` descending.
    """
    ofn = os.path.join(PATH_DATA, 'all_first_names_real_genderized.json')
    if force or not os.path.exists(ofn):
        all_first_names_real = get_all_first_names_real()
        # Only look up names the NER check accepted. Sending every key would
        # discard those verdicts and spend API quota on non-names.
        objs = [name for name, isreal in all_first_names_real.items() if isreal]
        res = genderfy(objs) if objs else []
        with open(ofn, 'w') as of:
            json.dump(res, of)

    with open(ofn) as f:
        ld = json.load(f)
    if not ld:
        # No rows means no 'name' column to index on; return an empty table
        # with the column names this function reads from a populated response.
        return pd.DataFrame(columns=['gender', 'probability', 'count']).rename_axis('name')
    odf = pd.DataFrame(ld).set_index('name')
    odf = odf.sort_values('probability', ascending=False)
    odf = odf[odf.probability >= min_prob]
    return odf


In [87]:
# Names whose genderize probability is at least the default min_prob (0.9), highest first.
gender_df = get_all_first_names_real_genderized()
gender_df

Unnamed: 0_level_0,gender,probability,count
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Armek,male,1.0,1
Copeland,male,1.0,7
Jeananne,female,1.0,7
Arikah,male,1.0,1
Kalilah,female,1.0,6
...,...,...,...
Osi,male,0.9,583
Fido,male,0.9,767
Adler,male,0.9,333
Feet,male,0.9,129


Unnamed: 0_level_0,gender,probability,count
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Armek,male,1.0,1
Copeland,male,1.0,7
Jeananne,female,1.0,7
Arikah,male,1.0,1
Kalilah,female,1.0,6
...,...,...,...
Osi,male,0.9,583
Fido,male,0.9,767
Adler,male,0.9,333
Feet,male,0.9,129
