In [1]:
%matplotlib inline

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

from collections import Counter

## Research area

In [2]:
# Load the researcher table with every column read as a string; missing
# values become '' so later `!= ''` filters and `.split(...)` calls work
# on every cell.
researcher_df = (
    pd.read_csv('dataset/researcher.csv', dtype=str)
    .fillna('')
)
researcher_df.shape

(774733, 9)

In [3]:
# Preview the first record to check the column layout.
researcher_df.head(1)

Unnamed: 0,PID,FirstName,MiddleName,LastName,Institution,InstitutionMAGID,ResearchArea,ORCID,MAGAuthorID
0,1,STEPHEN,V,DAVID,Oregon Health and Science University,165690674,"neuro,csd,bme",0000-0003-4135-3104,2171827615


In [4]:
def count_column(df, col, sep, fractional=False, skip_empty=False):
    """Count how often each token occurs in a delimiter-separated string column.

    Every cell of ``df[col]`` is split on ``sep`` and each resulting token is
    tallied, once per occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to tally.
    col : str
        Name of a column whose cells are strings (``str.split`` is called on
        each cell, so missing values must already be replaced by strings).
    sep : str
        Delimiter passed to ``str.split``.
    fractional : bool, default False
        If True, each row contributes a total weight of 1 shared equally among
        its tokens (``1 / len(tokens)`` each) instead of 1 per token.
    skip_empty : bool, default False
        If True, empty tokens are dropped before counting, and a row left with
        no tokens contributes nothing. With the default (False) an empty cell
        is tallied under the key ``''`` because ``''.split(sep)`` is ``['']``.

    Returns
    -------
    dict
        Mapping of token -> count (int), or token -> summed weight (float)
        when ``fractional`` is True.
    """
    counts = {}
    for cell in df[col]:
        tokens = cell.split(sep)
        if skip_empty:
            tokens = [token for token in tokens if token]
            if not tokens:
                # Nothing left to count; also avoids dividing by zero below.
                continue
        # Fractional weights are based on the tokens that are actually counted.
        weight = 1 / len(tokens) if fractional else 1
        for token in tokens:
            counts[token] = counts.get(token, 0) + weight
    return counts

def count_major_area():
    """Summarise researcher counts per research area.

    Reads the module-level ``researcher_df`` and tallies the comma-separated
    ``ResearchArea`` column twice: once over all rows and once over only the
    rows whose ``MAGAuthorID`` is non-empty.

    Returns
    -------
    pandas.DataFrame
        One row per area with columns ``area``, ``num_researchers`` and
        ``num_researchers_matched``, ordered by ``num_researchers``
        descending.
    """
    with_mag_id = researcher_df[researcher_df.MAGAuthorID != '']
    totals = count_column(researcher_df, 'ResearchArea', ',')
    matched = count_column(with_mag_id, 'ResearchArea', ',')

    # Largest areas first; an area with no matched rows gets 0.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    records = [[area, total, matched.get(area, 0)] for area, total in ranked]
    return pd.DataFrame(
        records,
        columns=['area', 'num_researchers', 'num_researchers_matched'],
    )

# Build the per-area summary table; the shape is (distinct areas, 3 columns).
area_df = count_major_area()
area_df.shape

(112, 3)

In [5]:
# Add percentage columns to the summary table.
# NOTE: a researcher listing several areas is counted once in each area, so
# the sums used as denominators below are totals of area assignments.
area_df['pct_matched'] = (
    area_df['num_researchers_matched'].div(area_df['num_researchers']).mul(100)
)
area_df['pct_researchers'] = (
    area_df['num_researchers'].div(area_df['num_researchers'].sum()).mul(100)
)
area_df['pct_researchers_matched'] = (
    area_df['num_researchers_matched']
    .div(area_df['num_researchers_matched'].sum())
    .mul(100)
)

In [6]:
# Emit the 20 areas with the most researchers (area_df is already sorted by
# num_researchers, descending) as a LaTeX table with one-decimal floats.
print(
    area_df
    .head(20)[['area', 'num_researchers', 'pct_researchers', 'num_researchers_matched', 'pct_matched']]
    .to_latex(index=False, float_format='{:0.1f}'.format)
)

\begin{tabular}{lrrrr}
\toprule
            area &  num\_researchers &  pct\_researchers &  num\_researchers\_matched &  pct\_matched \\
\midrule
           neuro &           135756 &             16.7 &                    93769 &         69.1 \\
       chemistry &           104450 &             12.9 &                    85585 &         81.9 \\
           etree &            56898 &              7.0 &                    45004 &         79.1 \\
            educ &            56580 &              7.0 &                    17978 &         31.8 \\
         physics &            49582 &              6.1 &                    37714 &         76.1 \\
            math &            35651 &              4.4 &                    22707 &         63.7 \\
      literature &            28257 &              3.5 &                     7449 &         26.4 \\
       sociology &            25453 &              3.1 &                    12618 &         49.6 \\
            econ &            23497 &              2.9

## Mentorship

In [7]:
# Load the mentorship table with every column read as a string. Unlike the
# researcher table, missing values are left as NaN here (no fillna).
mentorship_df = pd.read_csv('dataset/mentorship.csv', dtype=str)
mentorship_df.shape

(743176, 8)

In [8]:
# Preview the first record to check the column layout.
mentorship_df.head(1)

Unnamed: 0,CID,MenteeID,MentorID,MentorshipType,Institution,InstitutionMAGID,StartYear,StopYear
0,2,2,3,1,"University of California, Berkeley",95457486,2000,2005


In [16]:
# Tally the MentorshipType codes; dropna=False keeps a separate bucket for
# missing values if there are any.
mentorship_df.MentorshipType.value_counts(dropna=False)

1    630439
2     68652
0     18850
4     17833
3      7402
Name: MentorshipType, dtype: int64