In [1]:
import nomquamgender as nqg
import pandas as pd
import os, random, sys
from collections import Counter

if '/home/rm868/LLM-publication-patterns/data_prep' not in sys.path:
    sys.path.append('/home/rm868/LLM-publication-patterns/data_prep')

%load_ext autoreload
%autoreload 2
import preprocess_utils
from preprocess_utils import PROCESSED_DATA_DIR

In [23]:
# Load the cs/stat paper metadata (JSON Lines, one record per line).
# `id` is forced to str so pandas does not coerce the identifiers to numbers.
metadata = pd.read_json(
    os.path.join(PROCESSED_DATA_DIR, 'cs_stat_metadata.json'),
    lines=True,
    orient='records',
    dtype={'id': str},
)

In [5]:
# Translation from the raw labels returned by the nomquamgender classifier to
# the gender strings used throughout this notebook. Built once rather than
# once per author.
_NQG_LABEL_TO_GENDER = {'gm': 'male', 'gf': 'female', '-': 'unknown'}


def predict_genders_for_single_paper_using_nqg(detector, author_list):
    """
    Predict a gender label for each author of a single paper.

    Predictions use the nomquamgender package:
    https://pypi.org/project/nomquamgender/0.1.0/.

    Parameters
    ----------
    detector : object exposing `classify(names)`, e.g. an `nqg.NBGC` instance.
    author_list : the author names of one paper, passed straight to
        `detector.classify`.

    Returns
    -------
    list of str
        One of 'male', 'female' or 'unknown' per label returned by the
        detector, in the same order.

    Raises
    ------
    AssertionError
        If the detector returns a label other than 'gm', 'gf' or '-'.
    """
    raw_labels = detector.classify(author_list)

    # Build a new list instead of overwriting the detector's return value in
    # place: this works for any iterable (not only mutable lists) and leaves
    # the detector's own data untouched.
    predictions = []
    for label in raw_labels:
        assert label in _NQG_LABEL_TO_GENDER, 'Unexpected NQG label: %r' % (label,)
        predictions.append(_NQG_LABEL_TO_GENDER[label])
    return predictions

def compute_female_frac_from_predictions(L):
    """
    Return the share of 'female' labels among the 'female'/'male' labels in L.

    Any other label (e.g. 'unknown') is ignored. Returns None when L contains
    no 'female' or 'male' label at all, i.e. the paper has no usable prediction.
    """
    n_female = 0
    n_known = 0
    for pred in L:
        if pred == 'female':
            n_female += 1
            n_known += 1
        elif pred == 'male':
            n_known += 1

    # No predictions above confidence threshold for this paper.
    if n_known == 0:
        return None
    return n_female / n_known

def infer_gender_of_authors(metadata, nqg_threshold=0.1):
    '''
    Given a paper dataframe, use NQG to predict a gender for each author name.
    See https://peerj.com/articles/cs-156/ for a recent review.

    A new nqg.NBGC detector is created on every call and its `threshold`
    attribute is set to `nqg_threshold`. The threshold is printed, followed by
    one line per predicted label giving its count and its percentage of all
    predictions pooled over every paper.

    Returns the result of mapping predict_genders_for_single_paper_using_nqg
    over metadata['authors'] (one list of labels per paper).
    '''
    detector = nqg.NBGC()
    detector.threshold = nqg_threshold
    print("NQG detector threshold is", detector.threshold)

    def predict_for_paper(author_list):
        return predict_genders_for_single_paper_using_nqg(detector, author_list)

    predicted_genders = metadata['authors'].map(predict_for_paper)

    # Tally labels paper by paper; labels are reported in first-seen order.
    label_counts = Counter()
    for paper_predictions in predicted_genders:
        label_counts.update(paper_predictions)
    n_predictions = sum(label_counts.values())

    for label, n_label in label_counts.items():
        print('%-50s %i %2.1f%%' % (label, n_label, 100 * n_label/n_predictions))
    return predicted_genders

In [26]:
# Columns are collected in a plain dict first and turned into a DataFrame at the end.
gender_columns = {'id': metadata['id'], 'authors': metadata['authors']}

# Compute gender predictions at many different thresholds (one column each).
for threshold in [0.01, 0.05, 0.1, 0.15, 0.2, 0.25]:
    prediction_col = 'nqg_uncertainty_threshold_%2.3f' % threshold
    gender_columns[prediction_col] = infer_gender_of_authors(metadata, nqg_threshold=threshold)

# Compute female_frac at each threshold: every column except the two inputs
# holds per-paper prediction lists.
gender_inference_cols = [name for name in gender_columns if name not in ('id', 'authors')]
for col in gender_inference_cols:
    gender_columns['inferred_female_frac_%s' % col] = list(
        map(compute_female_frac_from_predictions, gender_columns[col])
    )

inferred_gender_df = pd.DataFrame(gender_columns)

NQG detector threshold is 0.01
Inferred genders are (for all papers, not LM papers - don't use these stats in paper):
male                                               724178 43.5%
unknown                                            864595 51.9%
female                                             77317 4.6%
NQG detector threshold is 0.05
Inferred genders are (for all papers, not LM papers - don't use these stats in paper):
male                                               846531 50.8%
unknown                                            666107 40.0%
female                                             153452 9.2%
NQG detector threshold is 0.1
Inferred genders are (for all papers, not LM papers - don't use these stats in paper):
male                                               919664 55.2%
unknown                                            575174 34.5%
female                                             171252 10.3%
NQG detector threshold is 0.15
Inferred genders are (for all papers, not L

In [32]:
from preprocess_utils import GENDER_PATH
# Persist the per-paper predictions as JSON Lines (one record per line).
inferred_gender_df.to_json(
    GENDER_PATH,
    lines=True,
    orient='records',
)

### Compute reference stats for the LM metadata dataframe

In [6]:
# Load the LM-paper metadata (JSON Lines); `id` is kept as a string.
lm_metadata = pd.read_json(
    os.path.join(PROCESSED_DATA_DIR, 'lm_papers_metadata.json'),
    lines=True,
    orient='records',
    dtype={'id': str},
)

# Compute gender predictions at many different thresholds.
# Only the printed label counts are of interest here; the returned predictions are discarded.
for threshold in [0.01, 0.05, 0.1, 0.15, 0.2, 0.25]:
    infer_gender_of_authors(lm_metadata, nqg_threshold=threshold)

NQG detector threshold is 0.01
male                                               31558 37.4%
unknown                                            48345 57.4%
female                                             4367 5.2%
NQG detector threshold is 0.05
male                                               37016 43.9%
unknown                                            38936 46.2%
female                                             8318 9.9%
NQG detector threshold is 0.1
male                                               41278 49.0%
unknown                                            33598 39.9%
female                                             9394 11.1%
NQG detector threshold is 0.15
male                                               44203 52.5%
unknown                                            29898 35.5%
female                                             10169 12.1%
NQG detector threshold is 0.2
male                                               47243 56.1%
unknown                          