Converts the master CSV (breaks down years by institution, sex, and race/ethnicity for grad students and postdoctorates) into two tree-based dictionaries: one for grad students and one for postdocs. Each dictionary is pickled.

In [1]:
import pickle
import pandas as pd

In [33]:
# Converts a dataframe into a nested dictionary
# https://stackoverflow.com/questions/19798112/convert-pandas-dataframe-to-a-nested-dict
def recur_dictify(frame):
    if len(frame.columns) == 1:
        if frame.values.size == 1: return frame.values[0][0]
        return frame.values.squeeze()
    grouped = frame.groupby(frame.columns[0])
    d = {k: recur_dictify(g.ix[:,1:]) for k,g in grouped}
    return d

In [46]:
def pickle_general():
    df = pd.read_csv(
        f'../data/Sex.csv', 
        thousands=',',
        header=6, # starting at 6th row
        skipfooter=6, # ignore last 4 lines of data (footer)
        engine='python', # default pandas engine does not support skipfooter
        index_col=False, # don't use first column (year) as dataframe index
        names = ['Year', 'Institution Name', 'Sex', 'Postdoctorates', 'Graduate Students']
    )

    # Delete some aggregate values we don't need for this analysis
    df = df[ df['Institution Name'] != 'Total for selected values' ]

    # Rename all instances of 'Columbia Univ in the City of New York'
    # to 'Columbia University in the City of New York'
    df['Institution Name'] = df['Institution Name'].str.replace(
        'Columbia Univ in the City of New York', 
        'Columbia University in the City of New York'
    )

    with open(f'../data/Sex_Postdoctorates.p', 'wb') as f:
        postdocs = df.drop('Graduate Students', 1)
        pickle.dump(recur_dictify(postdocs), f)

    with open(f'../data/Sex_Graduates.p', 'wb') as f:
        grads = df.drop('Postdoctorates', 1)
        pickle.dump(recur_dictify(grads), f)

In [None]:
pickle_general()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [40]:
def pickle_data(scope):
    df = pd.read_csv(
        f'../data/{scope}_Sex_RaceEthnicity.csv', 
        thousands=',',
        header=6, # starting at 6th row
        skipfooter=6, # ignore last 4 lines of data (footer)
        engine='python', # default pandas engine does not support skipfooter
        index_col=False, # don't use first column (year) as dataframe index
        names = ['Year', 'Institution Name', f'{scope} Fields', 'Sex', 'Race and Ethnicity', 'Postdoctorates', 'Graduate Students']
    )

    # Delete some aggregate values we don't need for this analysis
    df = df[ df['Institution Name'] != 'Total for selected values' ]
    df = df[ df[f'{scope} Fields'] != 'Total for selected values' ]

    # Rename all instances of 'Columbia Univ in the City of New York'
    # to 'Columbia University in the City of New York'
    df['Institution Name'] = df['Institution Name'].str.replace(
        'Columbia Univ in the City of New York', 
        'Columbia University in the City of New York'
    )

    # Pickle a postdocs dataframe
    with open(f'../data/{scope}_Sex_RaceEthnicity_Postdoctorates.p', 'wb') as f:
        postdocs = df.drop('Graduate Students', 1)
        postdocs = postdocs[postdocs['Postdoctorates'] > 0]
        pickle.dump(recur_dictify(postdocs), f)
    # Pickle a graduate students dataframe
    with open(f'../data/{scope}_Sex_RaceEthnicity_Graduates.p', 'wb') as f:
        grads = df.drop('Postdoctorates', 1)
        grads = grads[grads['Graduate Students'] > 0]
        pickle.dump(recur_dictify(grads), f)

In [55]:
pickle_scope('Broad')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [56]:
pickle_scope('Detailed')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
