In [1]:
import pandas as pd
import numpy as np
import random as rnd
import re

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
%matplotlib inline



In [2]:
# Read the hiring edge list and the institution ranks, in that order.
edgelist_df, ranks_df = (
    pd.read_csv(csv_path)
    for csv_path in ('edge-lists.csv', 'us-faculty-hiring-networks/data/ranks.csv')
)

In [3]:
# Display the edge list exactly as loaded (pandas truncates the rendering).
edgelist_df

Unnamed: 0,TaxonomyLevel,TaxonomyValue,InstitutionId,InstitutionName,DegreeInstitutionId,DegreeInstitutionName,Total,Men,Women
0,Field,Mathematics,82,Georgia Tech,243.0,University of Alabama,1.0,1.0,0.0
1,Domain,Mathematics and Computing,82,Georgia Tech,243.0,University of Alabama,1.0,1.0,0.0
2,Academia,Academia,82,Georgia Tech,243.0,University of Alabama,1.0,1.0,0.0
3,Field,Mathematics,82,Georgia Tech,251.0,UC Berkeley,3.0,1.0,2.0
4,Domain,Mathematics and Computing,82,Georgia Tech,251.0,UC Berkeley,20.0,14.0,5.0
...,...,...,...,...,...,...,...,...,...
411539,Academia,Academia,395,Texas Southern,557973.0,Xavier University of Louisiana,1.0,1.0,0.0
411540,Academia,Academia,101,Jewish Theological Seminary,53.0,"City University of New York, The",1.0,1.0,0.0
411541,Academia,Academia,50,Concordia Seminary,58.0,Drew,1.0,1.0,0.0
411542,Academia,Academia,50,Concordia Seminary,548894.0,Pontifica Institute di Studi Arabi e d'Islamis...,1.0,0.0,0.0


In [22]:
# Display the ranks table exactly as loaded (pandas truncates the rendering).
ranks_df

Unnamed: 0,Rank,InstitutionId,InstitutionName,TaxonomyValue,TaxonomyLevel
0,0,31.0,California Institute of Technology,Academia,Academia
1,1,123.0,Massachusetts Institute of Technology,Academia,Academia
2,2,87.0,Harvard University,Academia,Academia
3,3,167.0,Princeton University,Academia,Academia
4,4,198.0,Stanford University,Academia,Academia
...,...,...,...,...,...
11117,39,350.0,Utah State University,Veterinary Medical Sciences,Field
11118,40,137.0,Montana State University,Veterinary Medical Sciences,Field
11119,41,328.0,University of Rhode Island,Veterinary Medical Sciences,Field
11120,42,190.0,South Dakota State University,Veterinary Medical Sciences,Field


In [4]:
# Number of edge-list rows at each taxonomy level.
edgelist_df['TaxonomyLevel'].value_counts()

Field       219403
Domain      129857
Academia     62284
Name: TaxonomyLevel, dtype: int64

In [5]:
#edgelist_df=pd.merge(edgelist_df, ranks_df.drop(columns='InstitutionName'), how="left", on=['InstitutionId', 'TaxonomyValue', 'TaxonomyLevel'])

Replacing Academia with Domain in the TaxonomyLevel column

In [6]:
# Relabel the 'Academia' taxonomy level as 'Domain'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed column: that chained
# in-place form operates on an intermediate Series and is not guaranteed to
# update edgelist_df (it has no effect under pandas Copy-on-Write).
edgelist_df['TaxonomyLevel'] = edgelist_df['TaxonomyLevel'].replace({'Academia': 'Domain'})

Dropping Field, for now

In [7]:
# Keep every row whose TaxonomyLevel is anything other than "Field".
edgelist_df = edgelist_df.loc[edgelist_df['TaxonomyLevel'] != "Field"]

Dropping the TaxonomyLevel column

In [8]:
# TaxonomyLevel is not carried forward; TaxonomyValue stays in the frame.
edgelist_df = edgelist_df.drop("TaxonomyLevel", axis="columns")

Removing the null values of DegreeInstitutionName

In [9]:
# Discard rows that have no DegreeInstitutionName.
edgelist_df = edgelist_df.dropna(axis="index", subset=['DegreeInstitutionName'])

Computing the count of unknown gender for each row (Total minus Men minus Women)

In [10]:
# Whatever is left of Total after subtracting Men and Women is written back
# into the Total column, which is then renamed to 'Unknown'.
edgelist_df['Total'] = edgelist_df['Total'].sub(edgelist_df['Men']).sub(edgelist_df['Women'])
edgelist_df = edgelist_df.rename(columns={'Total': 'Unknown'})

Building, for each row, a list that holds one gender label (U, M or W) per counted entry

In [11]:
# For every row, repeat each label as many times as its count column says:
# 'U' for Unknown, then 'M' for Men, then 'W' for Women (counts are cast to int).
edgelist_df['Gender'] = edgelist_df.apply(
    lambda row: (
        ['U'] * int(row['Unknown'])
        + ['M'] * int(row['Men'])
        + ['W'] * int(row['Women'])
    ),
    axis=1,
)

Exploding along the gender column and dropping the Unknown, Men and Women columns

In [12]:
# One output row per label in the Gender list (the original index label is
# repeated for each), after which the three count columns are removed.
edgelist_df = (
    edgelist_df
    .explode("Gender")
    .drop(columns=["Unknown", "Men", "Women"])
)

In [13]:
# Display the frame after the explode step: one row per gender label.
edgelist_df

Unnamed: 0,TaxonomyValue,InstitutionId,InstitutionName,DegreeInstitutionId,DegreeInstitutionName,Gender
1,Mathematics and Computing,82,Georgia Tech,243.0,University of Alabama,M
2,Academia,82,Georgia Tech,243.0,University of Alabama,M
4,Mathematics and Computing,82,Georgia Tech,251.0,UC Berkeley,U
4,Mathematics and Computing,82,Georgia Tech,251.0,UC Berkeley,M
4,Mathematics and Computing,82,Georgia Tech,251.0,UC Berkeley,M
...,...,...,...,...,...,...
411539,Academia,395,Texas Southern,557973.0,Xavier University of Louisiana,M
411540,Academia,101,Jewish Theological Seminary,53.0,"City University of New York, The",M
411541,Academia,50,Concordia Seminary,58.0,Drew,M
411542,Academia,50,Concordia Seminary,548894.0,Pontifica Institute di Studi Arabi e d'Islamis...,U


In [14]:
# Per-column count of missing values.
edgelist_df.isnull().sum()

TaxonomyValue            0
InstitutionId            0
InstitutionName          0
DegreeInstitutionId      0
DegreeInstitutionName    0
Gender                   0
dtype: int64

In [15]:
# Attach the hiring institution's Rank for the row's TaxonomyValue.
# 'TaxonomyLevel' was dropped from edgelist_df earlier in the notebook, so it
# cannot be a join key (listing it in `on` raises KeyError). The join therefore
# uses InstitutionName and TaxonomyValue only, and TaxonomyLevel arrives from
# ranks_df as an ordinary column.
# Field-level rank rows are excluded because Field-level edges were removed
# from edgelist_df as well; this keeps a Field that shares its name with a
# Domain from producing extra matches.
# how="left" keeps edge rows with no matching rank (Rank is NaN for those).
edgelist_df = pd.merge(
    edgelist_df,
    ranks_df.loc[ranks_df['TaxonomyLevel'] != 'Field'].drop(columns='InstitutionId'),
    how="left",
    on=['InstitutionName', 'TaxonomyValue'],
)

In [16]:
#edgelist_df.rename(columns = {'TaxonomyValue':'Domain'}, inplace = True)

In [18]:
# Number of rows that did not receive a Rank from the merge.
edgelist_df['Rank'].isna().sum()

29342

In [19]:
# Show only the rows that have a Rank.
# NOTE(review): the filtered frame is displayed but not assigned, so
# edgelist_df itself still contains the unranked rows - confirm this is intended.
edgelist_df.loc[edgelist_df['Rank'].notna()]

Unnamed: 0,TaxonomyValue,InstitutionId,InstitutionName,DegreeInstitutionId,DegreeInstitutionName,Gender,Rank,TaxonomyLevel
1,Academia,82,Georgia Tech,243.0,University of Alabama,M,44.0,Academia
22,Academia,82,Georgia Tech,251.0,UC Berkeley,U,44.0,Academia
23,Academia,82,Georgia Tech,251.0,UC Berkeley,U,44.0,Academia
24,Academia,82,Georgia Tech,251.0,UC Berkeley,U,44.0,Academia
25,Academia,82,Georgia Tech,251.0,UC Berkeley,U,44.0,Academia
...,...,...,...,...,...,...,...,...
568374,Academia,395,Texas Southern,557973.0,Xavier University of Louisiana,M,296.0,Academia
568375,Academia,101,Jewish Theological Seminary,53.0,"City University of New York, The",M,41.0,Academia
568376,Academia,50,Concordia Seminary,58.0,Drew,M,274.0,Academia
568377,Academia,50,Concordia Seminary,548894.0,Pontifica Institute di Studi Arabi e d'Islamis...,U,274.0,Academia


In [20]:
# Number of rows for each TaxonomyValue.
edgelist_df['TaxonomyValue'].value_counts()

Academia                     274474
Natural Sciences              69014
Medicine and Health           51103
Humanities                    41096
Social Sciences               37554
Engineering                   28811
Applied Sciences              27335
Mathematics and Computing     25412
Education                     13580
Name: TaxonomyValue, dtype: int64