This notebook is used for writing the narrative and exporting the data.

In [39]:
import pickle
import math
import json
import numpy as np
import matplotlib.pyplot as plt

# Key under which Columbia appears in the per-institution tables loaded below.
CUNAME = 'Columbia University in the City of New York'


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    NOTE(review): unpickling can execute arbitrary code, so this should only be
    pointed at data files produced by this project's own pipeline.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Totals tables are indexed as table[year][institution]; the detailed table is
# indexed as D_TOTALS[year][institution][topic][field].
S_TOTALS = _load_pickle('../data/S_Sex_Graduates.p')
E_TOTALS = _load_pickle('../data/E_Sex_Graduates.p')
D_TOTALS = _load_pickle('../data/SE-Detailed_Sex.p')

# Topic name -> totals table for that topic.
TOTALS = {
    'Science': S_TOTALS,
    'Engineering': E_TOTALS,
}

years = sorted(S_TOTALS.keys())
# Institutions are taken from the most recent year of the Science totals.
institutions = sorted(S_TOTALS[max(years)].keys())

# Collect every field name that shows up for Columbia in any year, per topic.
field_sets = {
    'Science': set(),
    'Engineering': set()
}
for year in years:
    inst_stats = D_TOTALS[year][CUNAME]
    for seh, seh_stats in inst_stats.items():
        field_sets[seh].update(seh_stats.keys())

# Topic name -> alphabetically sorted list of field names.
fields = {seh: sorted(names) for seh, names in field_sets.items()}

print(f'{len(institutions)} institutions and {len(fields["Science"]) + len(fields["Engineering"])} fields.')

11 institutions and 39 fields.


In [40]:
# Returns female and male headcounts for a field in an institution.
# NOTE: a later cell defines another `get_counts` without the `seh` argument;
# once that cell has run, it replaces this definition.
def get_counts(seh, field, inst='Columbia University in the City of New York', default=None):
    """Return `(females, males)`, each a list with one entry per item in `years`.

    seh     -- topic key looked up under `D_TOTALS[year][inst]`
    field   -- field key looked up under that topic
    inst    -- institution key looked up under `D_TOTALS[year]`
    default -- value stored in both lists for a year in which `field` is absent

    When the field is present for a year, a missing 'Female' or 'Male' entry
    counts as 0.
    """
    by_sex = {'Female': [], 'Male': []}
    for yr in years:
        record = D_TOTALS[yr][inst][seh].get(field)
        for sex, series in by_sex.items():
            # A missing record is a hole (-> default); a missing sex is a zero.
            series.append(default if record is None else record.get(sex, 0))
    return by_sex['Female'], by_sex['Male']

# Topic name -> fields that have data in every year (no holes in either series).
complete_fields = {name: [] for name in ('Science', 'Engineering')}
# Field name -> topic it belongs to, recorded for every field, complete or not.
topic_lookup = {}
for topic, complete in complete_fields.items():
    for field in fields[topic]:
        females, males = get_counts(topic, field)
        has_hole = None in females or None in males
        if not has_hole:
            complete.append(field)
        topic_lookup[field] = topic
print(f'{len(complete_fields["Science"]) + len(complete_fields["Engineering"])} complete fields.')

19 complete fields.


In [46]:
# Gets female and male headcounts for a field; the topic is found via `topic_lookup`.
def get_counts(field, inst='Columbia University in the City of New York', default=None):
    """Return `(females, males)`, each a list with one entry per item in `years`.

    field   -- field key; its topic is read from `topic_lookup[field]`
    inst    -- institution key looked up under `D_TOTALS[year]`
    default -- value stored in both lists for a year in which `field` is absent

    When the field is present for a year, a missing 'Female' or 'Male' entry
    counts as 0.
    """
    topic = topic_lookup[field]
    # One record (or None for a hole) per year, in the order of `years`.
    yearly = [D_TOTALS[yr][inst][topic].get(field) for yr in years]
    females = [default if rec is None else rec.get('Female', 0) for rec in yearly]
    males = [default if rec is None else rec.get('Male', 0) for rec in yearly]
    return females, males

# Gets counts from a totals dictionary
def get_total_counts(topic, inst='Columbia University in the City of New York', default=None):
    """Return `(females, males)` totals for `inst`, one entry per item in `years`.

    topic   -- key into `TOTALS` selecting which totals table to read
    inst    -- institution key looked up under `TOTALS[topic][year]`
    default -- value stored in both lists for a year in which `inst` is absent

    Present values are converted with `int()`. A record that exists but lacks
    'Female' or 'Male' raises KeyError.
    """
    totals = TOTALS[topic]
    females = []
    males = []
    for year in years:
        stats = totals[year].get(inst)
        if stats is None:
            # Honour the caller's `default` for missing years.
            females.append(default)
            males.append(default)
        else:
            females.append(int(stats['Female']))
            males.append(int(stats['Male']))
    return females, males

# Manually calculates totals by adding together the counts of all fields, where holes are defaulted to 0
def get_summed_counts(topic):
    """Return `(females, males)`: per-year sums over every field in `fields[topic]`.

    Each field's series comes from `get_counts(field, default=0)`, so a year in
    which a field is absent adds nothing to that year's sum.
    """
    n_years = len(years)
    female_sums = [0] * n_years
    male_sums = [0] * n_years
    per_field = [get_counts(name, default=0) for name in fields[topic]]

    for field_females, field_males in per_field:
        for idx in range(n_years):
            female_sums[idx] += field_females[idx]
            male_sums[idx] += field_males[idx]
    return female_sums, male_sums

# Verifies that get_summed_counts matches the totals dictionary
def verify_sums(topic):
    """Return True when the totals for `topic` equal the per-field sums.

    Compares `get_total_counts(topic)` with `get_summed_counts(topic)` for both
    the female and the male series. `get_total_counts` is called with its
    default arguments, so a year with no totals record yields `None` there and
    will not equal a summed value.
    """
    total_f, total_m = get_total_counts(topic)
    summed_f, summed_m = get_summed_counts(topic)
    return total_f == summed_f and total_m == summed_m

In [47]:
# Check each topic in turn; evaluation stops at the first one that fails.
all(verify_sums(topic) for topic in ('Science', 'Engineering'))

True

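Therefore, GSS is not imputing data for the years in which Columbia does not report: the published totals equal the sums of the per-field counts when missing years are counted as 0, so the holes are effectively filled in with 0.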