In [101]:
import pandas as pd
import os
from fuzzywuzzy import fuzz
from tqdm import tqdm
import numpy as np

In [102]:
def clean_uni_name(x):
    
    x = x.lower().replace("-",' ')
    y = ' '.join([word for word in x.split() if word not in {"of","at"}])
    return y

def uni_name_fuzzy_matching(x,y):
    
    return True if fuzz.ratio(x,y) > 85 else False

sigmoid = lambda x: 1 / (1 + np.exp(-x))

In [103]:
cur_path = os.getcwd()
file_path_usnews = os.path.relpath('..\\data\\prestige\\us_news_rankings.csv', cur_path)
usnews = pd.read_csv(file_path_usnews)
usnews['University'] = usnews['University'].apply(lambda x: clean_uni_name(x))
usnews

Unnamed: 0,University,Rank,Score
0,carnegie mellon university,#1,5
1,massachusetts institute technology,#1,5
2,stanford university,#1,5
3,university california berkeley,#1,5
4,university illinois urbana champaign,#5,4.6
...,...,...,...
183,western michigan university,#181,1.4
184,oakland university,#185,1.3
185,liu post,#186,1.2
186,nova southeastern university,#186,1.2


In [104]:
file_path_csrankings = os.path.relpath('..\\data\\prestige\\csrankings.csv', cur_path)
csrankings = pd.read_csv(file_path_csrankings)
csrankings['Institution'] = csrankings['Institution'].apply(lambda x: clean_uni_name(x))
csrankings

Unnamed: 0,Rank,Institution,Count,Faculty
0,1,carnegie mellon university,104.0,104
1,2,cornell university,64.0,55
2,3,stanford university,52.8,58
3,4,univ. illinois urbana champaign,45.5,55
4,5,university maryland college park,44.1,52
...,...,...,...,...
171,167,university nevada las vegas,1.1,1
172,173,montana state university,1.0,1
173,173,university alabama huntsville,1.0,1
174,173,university california santa cruz,1.0,1


In [105]:
file_path_pi_prestige = os.path.relpath('..\\data\\prestige\\epistemilogy_prestige.csv', cur_path)
pi_prestige = pd.read_csv(file_path_pi_prestige)
pi_prestige['institution'] = pi_prestige['institution'].apply(lambda x: clean_uni_name(x))
pi_prestige

Unnamed: 0,# u,pi,USN2010,NRC95,Region,institution
0,1,2.23,1,1,West,stanford university
1,2,2.31,1,3,West,university california berkeley
2,3,3.52,1,2,Northeast,massachusetts institute technology
3,4,5.24,11,12,West,california institute technology
4,5,6.12,17,11,Northeast,harvard university
...,...,...,...,...,...,...
201,202,182.28,127,91,West,new mexico institute mining and technology
202,203,182.47,.,92,West,university nevada reno
203,204,186.81,127,92,South,university alabama tuscaloosa
204,205,186.84,127,82,South,university north texas denton


In [None]:
all_paper_data = pd.read_pickle("..\\data\\all_paper_data.pkl")

In [15]:
institutions = dict()
len(all_paper_data['AA'])

35034

In [106]:
uni_data = all_paper_data['AA'].explode('event_params').reset_index(drop = True)
unis = {}

for author in uni_data:
    if author['AfId'] not in uni_data and 'AfN' in author:
        unis[author['AfId']] = author['AfN']
        
affiliation_df = pd.DataFrame(list(unis.items()), index=range(0,len(unis)), columns=['id', 'affiliation'])


In [107]:
pi_prestige_values = []
csrankings_values = []
usnews_values = []
for idx,row in tqdm(affiliation_df.iterrows()):
    pi_prestige['fuzz_matching'] = pi_prestige['institution'].apply(lambda x : uni_name_fuzzy_matching(x,row['affiliation']))
    match = pi_prestige.loc[pi_prestige['fuzz_matching'] == True]
    if len(match)>0:
        pi_prestige_values.append(1/match['pi'].values[0])
    else:
        pi_prestige_values.append(float('NaN'))
        
    csrankings['fuzz_matching'] = csrankings['Institution'].apply(lambda x : uni_name_fuzzy_matching(x,row['affiliation']))
    match = csrankings.loc[csrankings['fuzz_matching'] == True]
    if len(match)>0:
        csrankings_values.append(match['Count'].values[0])
    else:
        csrankings_values.append(float('NaN'))
    
    usnews['fuzz_matching'] = usnews['University'].apply(lambda x : uni_name_fuzzy_matching(x,row['affiliation']))
    match = usnews.loc[pi_prestige['fuzz_matching'] == True]
    if len(match)>0:
        usnews_values.append(float(match['Score'].values[0]))
    else:
        usnews_values.append(float('NaN'))

affiliation_df['pi'] = pi_prestige_values
affiliation_df['csrankings'] = csrankings_values
affiliation_df['usnews'] = usnews_values

affiliation_df['pi'] = affiliation_df['pi'].sub(affiliation_df['pi'].min()).div((affiliation_df['pi'].max() - affiliation_df['pi'].min()))
affiliation_df['csrankings'] = affiliation_df['csrankings'].sub(affiliation_df['csrankings'].min()).div((affiliation_df['csrankings'].max() - affiliation_df['csrankings'].min()))
affiliation_df['usnews'] = affiliation_df['usnews'].sub(affiliation_df['usnews'].min()).div((affiliation_df['usnews'].max() - affiliation_df['usnews'].min()))

2383it [01:30, 26.21it/s]


In [112]:
prestige = []
for idx,row in affiliation_df.iterrows():
    values, weights = [],[]
    if row['pi'] != float("NaN"):
        values.append(3*row['pi'])
        weights.append(3)
    if row['csrankings'] != float("NaN"):
        values.append(1*row['csrankings'])
        weights.append(1)
    if row['usnews'] != float("NaN"):
        values.append(2*row['usnews'])
        weights.append(2)
    if len(values) != 0:
        prestige.append(sum(values)/sum(weights))
    else:
        prestige.append(float('NaN'))
affiliation_df['prestige'] = prestige

In [115]:
affiliation_df[affiliation_df['prestige'].isna()==False]

Unnamed: 0,id,affiliation,pi,csrankings,usnews,prestige
8,57206974,new york university,0.111467,0.177670,0.736842,0.330959
10,161318765,university of california los angeles,0.114316,0.320388,0.763158,0.364942
12,97018004,stanford university,1.000000,0.502913,1.000000,0.917152
15,20089843,princeton university,0.229858,0.112621,0.842105,0.414401
17,63966007,massachusetts institute of technology,0.628989,0.412621,1.000000,0.716598
...,...,...,...,...,...,...
2151,89466785,minia university,0.006965,0.073786,0.210526,0.085955
2223,307268238,kaduna state university,0.008281,0.006796,0.236842,0.084220
2244,186803428,brock university,0.090620,0.133010,0.684211,0.295548
2290,161057412,university of new hampshire,0.002009,0.018447,0.105263,0.039167
