In [76]:
import math
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm

# External Data
## CS Rankings Author info

Taken from https://github.com/emeryberger/CSrankings/tree/gh-pages which is the repo behind https://csrankings.org 

Run `make generated-author-info.csv` which should make the csv. Each row contains the score for every author, year, and conference combination. This means that there are multiple rows for each professor.

## CS Professors

Taken from https://drafty.cs.brown.edu/csprofessors but they hide the data. So, after reading source code, exploit a [thing they left in](https://github.com/brownhci/drafty/blob/212bd995c857a34c74c7a71d67e1556c1ca7ea97/backend/src/controllers/datasharing.ts#L31) during development and use https://drafty.cs.brown.edu/data/csv/csprofessors/csprofessors_93318b344889ccef41d46b5f83d63de5 

## Placement Rank

Taken from https://drafty.cs.brown.edu/csopenrankings/placement-rank.html which is just copy and paste (and `M-x query-replace <tab> ,`). I think I could have done this myself, but why do that when someone else has already done the work. 

## Best Paper awards

A collection of best paper awards are listed on https://jeffhuang.com/best_paper_awards/

However, I needed to do some html parsing in order to get the data into a csv format. That is done in [another notebook](./best-paper.ipynb)

# Internal Data
## Preferences
This is a list of conferences and areas and my interest in them. This is used in order to create weights. It is manually generated

## Universities
This is a list of north american universities and their aliases (e.g. NYU and New York University). It is manually generated (hence why it is only north american universities)

## Statistic Weights
This is just a csv that contains weights for me to import and use in the final weighing stage.

# [Data Archive Download Link](http://store.sachiniyer.com/s/9NczHAmPJFiQgys/download/data.zip)


In [2]:
dfcsrankings = pd.read_csv("data/generated-author-info.csv")
dfcsrankings.describe()

Unnamed: 0,count,adjustedcount,year
count,199906.0,199906.0,199906.0
mean,1.332511,0.384866,2012.689984
std,0.845756,0.281289,8.774593
min,1.0,0.008696,1970.0
25%,1.0,0.2,2008.0
50%,1.0,0.33333,2015.0
75%,1.0,0.5,2020.0
max,24.0,4.9778,2023.0


In [3]:
dfcsrankings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199906 entries, 0 to 199905
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   name           199906 non-null  object 
 1   dept           199906 non-null  object 
 2   area           199906 non-null  object 
 3   count          199906 non-null  float64
 4   adjustedcount  199906 non-null  float64
 5   year           199906 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 9.2+ MB


In [4]:
dfcsrankings.sample(3)

Unnamed: 0,name,dept,area,count,adjustedcount,year
123082,Nick G. Duffield,Texas A&M University,sigcomm,1.0,0.5,2000
187982,Yanyan Jiang 0001,Nanjing University,ase,2.0,0.375,2019
77104,Jia Xu 0004,Stevens Institute of Technology,iclr,1.0,0.33333,2023


In [5]:
dfcsprofs = pd.read_csv("data/csprofessors.csv")
dfcsprofs.describe()

Unnamed: 0,UniqueId,JoinYear
count,5630.0,5512.0
mean,3098.262877,2007.741473
std,1784.465144,11.730682
min,1.0,1963.0
25%,1569.25,2001.0
50%,3081.5,2011.0
75%,4623.75,2017.0
max,6212.0,2024.0


In [6]:
dfcsprofs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   UniqueId    5630 non-null   int64  
 1   FullName    5629 non-null   object 
 2   University  5630 non-null   object 
 3   JoinYear    5512 non-null   float64
 4   SubField    5625 non-null   object 
 5   Bachelors   5331 non-null   object 
 6   Doctorate   5618 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 308.0+ KB


In [7]:
dfcsprofs.sample(3)

Unnamed: 0,UniqueId,FullName,University,JoinYear,SubField,Bachelors,Doctorate
953,1048,Forrest Sheng Bao,Iowa State University,2017.0,Artificial Intelligence,,Texas Tech University
4882,5370,Lance R. Williams,University of New Mexico,1997.0,Computer Vision,Pennsylvania State University,University of Massachusetts Amherst
852,934,Barbara J. Grosz,Harvard University,1986.0,Machine Learning & Data Mining,Cornell University,"University of California, Berkeley"


In [8]:
dfplacement = pd.read_csv("data/placement-rank.csv", delimiter="|")
dfplacement.describe()

Unnamed: 0,num,size,bachelors,doctorate,placementscore
count,183.0,115.0,168.0,159.0,183.0
mean,92.0,45.121739,84.5,80.0,1.523333
std,52.971691,33.42605,48.641546,46.043458,1.664107
min,1.0,1.0,1.0,1.0,0.82
25%,46.5,22.5,42.75,40.5,0.865
50%,92.0,39.0,84.5,80.0,1.0
75%,137.5,60.0,126.25,119.5,1.415
max,183.0,253.0,168.0,159.0,15.4


In [9]:
dfplacement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   num             183 non-null    int64  
 1   university      183 non-null    object 
 2   size            115 non-null    float64
 3   bachelors       168 non-null    float64
 4   doctorate       159 non-null    float64
 5   placementscore  183 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 8.7+ KB


In [10]:
dfplacement.sample(3)

Unnamed: 0,num,university,size,bachelors,doctorate,placementscore
173,174,University of New Orleans,,153.0,,0.83
169,170,University of North Texas,,159.0,154.0,0.83
65,66,Texas A&M University,61.0,88.0,61.0,1.18


In [11]:
dfbestpaper = pd.read_csv("data/best-paper.csv", delimiter="|")
dfbestpaper = dfbestpaper.map(lambda x: x.strip() if isinstance(x, str) else x)
dfbestpaper.describe()

Unnamed: 0,Year,Coauthors
count,4815.0,4815.0
mean,2014.076843,5.455659
std,5.956105,3.199965
min,1996.0,1.0
25%,2010.0,3.0
50%,2015.0,5.0
75%,2019.0,7.0
max,2022.0,26.0


In [12]:
dfbestpaper.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Author      4815 non-null   object
 1   University  4815 non-null   object
 2   Conference  4815 non-null   object
 3   Area        4815 non-null   object
 4   Year        4815 non-null   int64 
 5   Coauthors   4815 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 225.8+ KB


In [13]:
dfbestpaper.sample(3)

Unnamed: 0,Author,University,Conference,Area,Year,Coauthors
1581,Jaejoon Lee,University of Lancaster,ICSE,Software Engineering,2018,12
844,Zhenhua Liu,Stony Brook University,INFOCOM,Networking,2020,4
3036,Mark A. Ruzon,Google,CVPR,Computer Vision,2013,6


In [14]:
dfpreference = pd.read_csv("data/preferences.csv")
dfpreference["id"] = range(len(dfpreference))
dfpreference.describe()

Unnamed: 0,preference,id
count,99.0,99.0
mean,2.747475,49.0
std,1.311981,28.722813
min,1.0,0.0
25%,2.0,24.5
50%,3.0,49.0
75%,4.0,73.5
max,5.0,98.0


In [15]:
dfpreference.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   conference  99 non-null     object
 1   preference  99 non-null     int64 
 2   id          99 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 2.4+ KB


In [16]:
dfpreference.sample(3)

Unnamed: 0,conference,preference,id
1,iros,2,1
26,focs,3,26
46,emnlp,5,46


In [17]:
university_alias = {}
with open("data/universities.csv", "r") as file:
    for line in file:
        unis = line.split(",")
        unis.pop()
        first = unis.pop(0)
        university_alias[first] = unis
count = 5
for k, v in university_alias.items():
    if count < 0:
        break
    count -= 1
    print(f"{k} {v}")
print(len(university_alias))

University of Kentucky []
Colorado State University []
University of Illinois at Chicago []
Duke University []
Florida International University []
UCCS []
186


In [18]:
def get_alias(uni):
    if uni in university_alias:
        return uni
    for k, v in university_alias.items():
        if uni in v:
            return k
    return None

In [188]:
dfstatistics = pd.read_csv("data/statistics.csv")
dfstatistics.describe()

Unnamed: 0,weight
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


In [189]:
dfstatistics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   stat    126 non-null    object 
 1   weight  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 2.1+ KB


In [190]:
dfstatistics.sample()

Unnamed: 0,stat,weight
63,paper_score_professors_sum,


# Methodology

I want to do a little better than following the rankings blindly, here are the factors I want to consider.

## Data

1. Professor Conference Score (essentially the csrankings.org score)
2. Best Paper Awards
3. Placement Rank (taken from the drafty placement rank, which is pagerank on professors original universities and then employment universities)
4. Variance of Professors

## Method

### Professor Score
A professor will be given a score based on their conference score and best papers. 

Both will have weights by:
- Will be weighted by logarithmically by the year published (the older, the less score)
- Will be weighted by how interested I am in the area (weights defined manually)

### Meta Scores

#### Placement Rank
A Placement Rank will be computed based on the professor score (the places where more important professors originate from matters more).

Logarithmic scale will be used to normalize the professor scores here.

#### Professor Score Variance
Variance of professor score will then be computed

#### Professor Score Sum
A sum of professors score will then be computed

### Final Score

I will take the Meta Scores, normalize them with min max scaling. Then I will apply some weights and get my results.



# Computing Professor Scores

- I will use the `generated-author-info` dataset for each professors to give their scores for each unique year-conference combination
- I will use the `best-paper` dataset for each professor to give a bonus for each professor with a best paper
- I will use `preference` dataset (which I manually created) to give weight preferences.


In [22]:
def year_weight(year):
    # 1970 - 2023
    year = 2023 - year + 3
    year *= 0.5
    return math.log(year)


# [year_weight(i) * 0.333 for i in range(1970, 2024)]

In [23]:
def area_weight(area, i=False):
    # score of 1-5
    area_score = dfpreference[dfpreference["conference"] == area]["preference"].values[
        0
    ]
    adjusted_score = math.log(area_score + 1, 8)
    if i:
        return (
            dfpreference[dfpreference["conference"] == area]["id"].values[0],
            adjusted_score,
        )
    return adjusted_score


# [math.log(i+1, math.e ** 0.125)*0.125 for i in range(1,6)]

In [24]:
names = []


def match_name(name, column, dataframe, one=False):
    clean_name = name.strip()
    last_four = clean_name[-4:]
    if last_four.isdigit():
        clean_name = clean_name[:-4]
    clean_name = clean_name.strip().lower()
    rows = dataframe[dataframe[column].str.strip().str.lower() == clean_name]
    if one:
        if rows.empty:
            return None
        return pd.DataFrame(rows.iloc[0])
    return rows


# match_name("Jiawei Han 0001", "FullName" , dfcsprofs)

In [25]:
# dfcsprofs[dfcsprofs['FullName'] == 'Jiawei Han']

In [101]:
dfauthorscore = pd.DataFrame(
    columns=["name", "university", "score", "score_scaled_raw", "score_scaled", "score_scaled_avg"]
)

scaled_arrays = []

for _ in range(len(dfpreference)):
    scaled_arrays.append(0.0)

In [102]:
for index, row in tqdm(
    dfcsrankings.iterrows(), desc="Prof Scores", unit="item", total=len(dfcsrankings)
):
    name = row["name"]
    university = row["dept"]

    year = row["year"]
    area = row["area"]

    yearweight = year_weight(year)
    areaindex, areaweight = area_weight(area, True)

    count = row["adjustedcount"] * yearweight * areaweight
    
    existing_rows = dfauthorscore[dfauthorscore["name"] == name]

    if existing_rows.empty:
        add_array = list(scaled_arrays)
        add_array[areaindex] = count
        dfauthorscore.loc[len(dfauthorscore)] = [name, university, count, add_array, 0.0, 0.0]
    else:
        for idx in existing_rows.index:
            dfauthorscore.at[idx, "score_scaled_raw"][areaindex] += count
            dfauthorscore.at[idx, "score"] += count

Prof Scores: 100%|██████████| 199906/199906 [07:14<00:00, 459.67item/s]


In [103]:
sum_scaled_arrays_author_avg = list(scaled_arrays)
sum_scaled_arrays_author = list(scaled_arrays)
for index, row in tqdm(
    dfauthorscore.iterrows(), desc="Prof Scores", unit="item", total=len(dfauthorscore)
):
    scaled_raw = row["score_scaled_raw"]
    for i, v in enumerate(scaled_raw):
        if v != 0:
            sum_scaled_arrays_author[i] += v
            sum_scaled_arrays_author_avg[i] += 1

Prof Scores: 100%|██████████| 16202/16202 [00:01<00:00, 11510.28item/s]


In [104]:
for index, val in enumerate(sum_scaled_arrays_author):
    if sum_scaled_arrays_author_avg[index] != 0.0:
        sum_scaled_arrays_author_avg[index] =  val / sum_scaled_arrays_author_avg[index]

In [105]:
for index, row in tqdm(
    dfauthorscore.iterrows(), desc="Prof Scores", unit="item", total=len(dfauthorscore)
):
    score_scaled = 0.0
    score_scaled_avg = 0.0
    for i, v in enumerate(row["score_scaled_raw"]):
        if v == 0.0:
            score_scaled += 0.0
            score_scaled_avg += sum_scaled_arrays_author_avg[i]
        else:
            score_scaled += v / sum_scaled_arrays_author[i]
            score_scaled_avg += v / sum_scaled_arrays_author[i]
        

    dfauthorscore.at[index, "score_scaled"] = score_scaled 
    dfauthorscore.at[index, "score_scaled_avg"] = score_scaled_avg 

Prof Scores: 100%|██████████| 16202/16202 [00:02<00:00, 5649.96item/s]


In [106]:
dfauthorscore.describe()

Unnamed: 0,score,score_scaled,score_scaled_avg
count,16202.0,16202.0,16202.0
mean,5.28928,0.004814,77.147363
std,8.421237,0.007913,4.119078
min,0.007387,9e-06,51.392457
25%,0.831027,0.000694,75.005517
50%,2.286453,0.002037,78.25088
75%,6.095181,0.005407,80.331681
max,109.848482,0.135769,82.157568


In [107]:
dfauthorscore.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16202 entries, 0 to 16201
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              16202 non-null  object 
 1   university        16202 non-null  object 
 2   score             16202 non-null  float64
 3   score_scaled_raw  16202 non-null  object 
 4   score_scaled      16202 non-null  float64
 5   score_scaled_avg  16202 non-null  float64
dtypes: float64(3), object(3)
memory usage: 1.4+ MB


In [108]:
dfauthorscore.sample(3)

Unnamed: 0,name,university,score,score_scaled_raw,score_scaled,score_scaled_avg
2868,Daniel Cremers,TU Munich,55.115252,"[0.0, 3.79989262898989, 0.14931328910233793, 0...",0.020963,67.769338
3793,Emanuele Rodolà,Sapienza University of Rome,8.90112,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.24278814984332564,...",0.004372,72.868096
5958,Jakub M. Tomczak,VU Amsterdam,1.9038,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.5343469671151485, ...",0.000997,78.425768


In [109]:
dfpaperscore = pd.DataFrame(
    columns=["name", "university", "score", "score_scaled_raw", "score_scaled", "score_scaled_avg"]
)
for index, row in tqdm(
    dfbestpaper.iterrows(),
    desc="Best Paper Scores",
    unit="item",
    total=len(dfbestpaper),
):
    name = row["Author"]
    university = row["University"]

    year = row["Year"]
    area = row["Area"]
    coauthors = 1.0 / float(row["Coauthors"])

    yearweight = year_weight(year)
    areaindex, areaweight = area_weight(area, True)

    count = yearweight * areaweight * coauthors

    existing_rows = dfpaperscore[dfpaperscore["name"] == name]

    if existing_rows.empty:
        add_array = list(scaled_arrays)
        add_array[areaindex] = count
        dfpaperscore.loc[len(dfpaperscore)] = [name, university, count, add_array, 0.0, 0.0]
    else:
        for idx in existing_rows.index:
            dfpaperscore.at[idx, "score_scaled_raw"][areaindex] += count
            dfpaperscore.at[idx, "score"] += count

Best Paper Scores: 100%|██████████| 4815/4815 [00:12<00:00, 385.52item/s]


In [110]:
sum_scaled_arrays_paper_avg = list(scaled_arrays)
sum_scaled_arrays_paper = list(scaled_arrays)
for index, row in tqdm(
    dfpaperscore.iterrows(), desc="Prof Scores", unit="item", total=len(dfpaperscore)
):
    scaled_raw = row["score_scaled_raw"]
    for i, v in enumerate(scaled_raw):
        if v != 0:
            sum_scaled_arrays_paper[i] += v
            sum_scaled_arrays_paper_avg[i] += 1

Prof Scores: 100%|██████████| 4099/4099 [00:00<00:00, 10634.67item/s]


In [111]:
for index, val in enumerate(sum_scaled_arrays_paper):
    if sum_scaled_arrays_paper_avg[index] != 0.0:
        sum_scaled_arrays_paper_avg[index] =  val / sum_scaled_arrays_paper_avg[index]

In [112]:
for index, row in tqdm(
    dfpaperscore.iterrows(), desc="Prof Scores", unit="item", total=len(dfpaperscore)
):
    score_scaled = 0.0
    score_scaled_avg = 0.0
    for i, v in enumerate(row["score_scaled_raw"]):
        if v == 0.0:
            score_scaled += 0.0
            score_scaled_avg += sum_scaled_arrays_paper_avg[i]
        else:
            score_scaled += v / sum_scaled_arrays_paper[i]
            score_scaled_avg += v / sum_scaled_arrays_paper[i]
        
    dfpaperscore.at[index, "score_scaled"] = score_scaled
    dfpaperscore.at[index, "score_scaled_avg"] = score_scaled_avg

Prof Scores: 100%|██████████| 4099/4099 [00:00<00:00, 5095.35item/s]


In [113]:
dfpaperscore.describe()

Unnamed: 0,score,score_scaled,score_scaled_avg
count,4099.0,4099.0,4099.0
mean,0.363439,0.005123,7.030055
std,0.417404,0.012055,0.164583
min,0.016968,0.000174,6.028081
25%,0.141717,0.000925,7.024425
50%,0.236656,0.002263,7.090929
75%,0.398169,0.005314,7.113757
max,5.022018,0.333333,7.599636


In [114]:
dfpaperscore.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4099 entries, 0 to 4098
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4099 non-null   object 
 1   university        4099 non-null   object 
 2   score             4099 non-null   float64
 3   score_scaled_raw  4099 non-null   object 
 4   score_scaled      4099 non-null   float64
 5   score_scaled_avg  4099 non-null   float64
dtypes: float64(3), object(3)
memory usage: 353.2+ KB


In [115]:
dfpaperscore.sample(3)

Unnamed: 0,name,university,score,score_scaled_raw,score_scaled,score_scaled_avg
1890,Bohan Zhang,Peking University,0.088293,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.001022,7.055767
1760,Ashish Shrivastava,Apple,0.132439,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.003023,7.051794
503,Nathan Klein,University of Washington,0.263175,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.001812,6.583222


In [117]:
warnings.simplefilter(action='ignore', category=FutureWarning)

dfscore = dfauthorscore.copy()

dfscore.rename(
    columns={
        "score": "author_score",
        "score_scaled": "author_score_scaled",
        "score_scaled_avg": "author_score_scaled_avg",
    },
    inplace=True,
)

dfscore = dfscore.drop(["score_scaled_raw"], axis=1)
dfscore["paper_score"] = None 
dfscore["paper_score_scaled"] = None
dfscore["paper_score_scaled_avg"] = None

for index, row in tqdm(
    dfpaperscore.iterrows(), desc="Merge Scores", unit="item", total=len(dfpaperscore)
):
    name = row["name"]
    university = row["university"]
    paper_score = row["score"]
    paper_score_scaled = row["score_scaled"]
    paper_score_scaled_avg = row["score_scaled_avg"]
    
    existing_rows = match_name(name, "name", dfscore)

    if existing_rows.empty:
        dfscore.loc[len(dfscore)] = [
            name,
            university,
            None,
            None,
            None,
            paper_score,
            paper_score_scaled,
            paper_score_scaled_avg,
        ]
    else:
        for idx in existing_rows.index:
            dfscore.loc[
                idx,
                [
                    "paper_score",
                    "paper_score_scaled",
                    "paper_score_scaled_avg",
                ]
            ] = [
                    paper_score,
                    paper_score_scaled,
                    paper_score_scaled_avg,
                ]

Merge Scores: 100%|██████████| 4099/4099 [00:52<00:00, 77.75item/s]


In [128]:
dfscore["university"] = dfscore["university"].apply(get_alias)
dfscore = dfscore.dropna(subset=["university"])
dfscore["name_lower"] = dfscore["name"].str.lower()
author_score_wnan = dfscore["author_score"].copy()
paper_score_wnan = dfscore["paper_score"].copy()
dfscore["combined_score"] = author_score_wnan.fillna(0) + paper_score_wnan.fillna(0)
dfscore.describe()

Unnamed: 0,author_score,author_score_scaled,author_score_scaled_avg,paper_score,paper_score_scaled,paper_score_scaled_avg,combined_score
count,6224.0,6224.0,6224.0,1732.0,1732.0,1732.0,7493.0
mean,7.105932,0.007176,76.334659,0.411352,0.005701,7.030447,5.997569
std,10.088763,0.010359,4.496159,0.457307,0.012018,0.168211,9.576129
min,0.007792,1.3e-05,51.392457,0.016968,0.000179,6.028081,0.007792
25%,1.289504,0.001216,73.990002,0.152715,0.001147,7.024408,0.618674
50%,3.424615,0.003506,77.428259,0.26824,0.002849,7.0909,2.430092
75%,8.729862,0.008655,79.747139,0.46226,0.006314,7.113877,7.217203
max,109.848482,0.135769,82.157354,5.022018,0.333333,7.599636,109.848482


In [129]:
dfscore.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7493 entries, 4 to 19418
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     7493 non-null   object 
 1   university               7493 non-null   object 
 2   author_score             6224 non-null   float64
 3   author_score_scaled      6224 non-null   float64
 4   author_score_scaled_avg  6224 non-null   float64
 5   paper_score              1732 non-null   float64
 6   paper_score_scaled       1732 non-null   float64
 7   paper_score_scaled_avg   1732 non-null   float64
 8   name_lower               7493 non-null   object 
 9   combined_score           7493 non-null   float64
dtypes: float64(7), object(3)
memory usage: 902.0+ KB


In [130]:
dfscore.sample(3)

Unnamed: 0,name,university,author_score,author_score_scaled,author_score_scaled_avg,paper_score,paper_score_scaled,paper_score_scaled_avg,name_lower,combined_score
11111,Pratap Tokekar,University of Maryland - College Park,8.275424,0.003426,75.708183,,,,pratap tokekar,8.275424
1058,Antonella Di Lillo,Brandeis University,0.184839,3.6e-05,80.370293,,,,antonella di lillo,0.184839
17966,Stacy Branham,Univ. of Maryland - Baltimore County,,,,0.162357,0.000554,7.091275,stacy branham,0.162357


# University dataframe

Now I will create a dataframe which has all people who currently work at the university, got their bachelors there, or got their doctorate there.

## Steps
1. Drop any universities that do not fit `get_alias`. This comprises of a lot of international universities that I don't have enough data on (and I don't want to apply to). Quite a lot of universities were filtered out in this stage.
2. Match the names and find out where profs got bachelors and phd from the `dfscore` and `dfcsprofs` dataframes. There will be a lot of misses here as well.
3. Modulate the dataframe to pivot off of universities with an array of authors instead of single author university pairings. 

## Name matching

Name matching will be done by lowering the case of everyones names and pivoting against the `name` field of the `dfscore` (which is from `dfcsrankings`) dataframe and the `FullName` field of the `dfcsprofs` dataframe. This will not end up with 100% matches (instead probably a lot less). There are also a lot of universities in both of these dataframes that wias filtered out with the `get_alias` function previously)


In [121]:
dfscore_placement = dfscore.copy()
dfscore_placement["placement"] = np.nan
dfscore_placement["bachelors"] = np.nan
dfscore_placement["doctorate"] = np.nan

dfcsprofs_placement = dfcsprofs.copy()
dfcsprofs_placement["placement"] = dfcsprofs_placement["University"].apply(get_alias)
dfcsprofs_placement["bachelors"] = dfcsprofs_placement["Bachelors"].apply(get_alias)
dfcsprofs_placement["doctorate"] = dfcsprofs_placement["Doctorate"].apply(get_alias)


for index, row in tqdm(
    dfscore_placement.iterrows(),
    desc="Merge Placement",
    unit="item",
    total=len(dfscore_placement),
):
    prof_row = match_name(row["name"], "FullName", dfcsprofs_placement, True)
    if prof_row is not None:
        dfscore_placement.loc[
            index, ["placement", "bachelors", "doctorate"]
        ] = dfcsprofs_placement.loc[
            prof_row.iloc[0].index[0], ["placement", "bachelors", "doctorate"]
        ]

dfscore_placement.describe()

Merge Placement: 100%|██████████| 7493/7493 [00:36<00:00, 206.03item/s]


Unnamed: 0,author_score,author_score_scaled,author_score_scaled_avg,paper_score,paper_score_scaled,paper_score_scaled_avg,combined_score
count,6224.0,6224.0,6224.0,1732.0,1732.0,1732.0,7493.0
mean,7.105932,0.007176,76.334659,0.411352,0.005701,7.030447,5.997569
std,10.088763,0.010359,4.496159,0.457307,0.012018,0.168211,9.576129
min,0.007792,1.3e-05,51.392457,0.016968,0.000179,6.028081,0.007792
25%,1.289504,0.001216,73.990002,0.152715,0.001147,7.024408,0.618674
50%,3.424615,0.003506,77.428259,0.26824,0.002849,7.0909,2.430092
75%,8.729862,0.008655,79.747139,0.46226,0.006314,7.113877,7.217203
max,109.848482,0.135769,82.157354,5.022018,0.333333,7.599636,109.848482


In [131]:
dfscore_placement.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7493 entries, 4 to 19418
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     7493 non-null   object 
 1   university               7493 non-null   object 
 2   author_score             6224 non-null   float64
 3   author_score_scaled      6224 non-null   float64
 4   author_score_scaled_avg  6224 non-null   float64
 5   paper_score              1732 non-null   float64
 6   paper_score_scaled       1732 non-null   float64
 7   paper_score_scaled_avg   1732 non-null   float64
 8   name_lower               7493 non-null   object 
 9   combined_score           7493 non-null   float64
 10  placement                3175 non-null   object 
 11  bachelors                1263 non-null   object 
 12  doctorate                2673 non-null   object 
dtypes: float64(7), object(6)
memory usage: 1.1+ MB


In [132]:
dfscore_placement.sample(3)

Unnamed: 0,name,university,author_score,author_score_scaled,author_score_scaled_avg,paper_score,paper_score_scaled,paper_score_scaled_avg,name_lower,combined_score,placement,bachelors,doctorate
9776,Mohammad Alian,University of Kansas,0.742571,0.002191,79.688409,,,,mohammad alian,0.742571,University of Kansas,,
15216,Yan-Bin Jia,Iowa State University,18.222773,0.005498,78.305503,,,,yan-bin jia,18.222773,Iowa State University,,Carnegie Mellon University
3038,David Bindel,Cornell University,5.094721,0.005169,66.769255,,,,david bindel,5.094721,Cornell University,University of Maryland - College Park,


In [163]:
unique_unis = np.unique(
    np.concatenate(
        [
            dfscore_placement["university"].dropna(),
            dfscore_placement["bachelors"].dropna(),
            dfscore_placement["doctorate"].dropna(),
        ]
    )
)

length = len(unique_unis)
inner_arrays = []
for _ in range(length):
    inner_arrays.append(np.array([]))

dfuniversity = pd.DataFrame(
    {
        "university": unique_unis,
        "professors": inner_arrays,
        "bachelors": inner_arrays,
        "doctorate": inner_arrays,
    }
)
dfuniversity.sample(3)

Unnamed: 0,university,professors,bachelors,doctorate
102,University of Alabama - Birmingham,[],[],[]
61,Northwestern University,[],[],[]
143,University of New Hampshire,[],[],[]


In [164]:
for index, row in tqdm(
    dfscore_placement.iterrows(),
    desc="Merge Scores",
    unit="item",
    total=len(dfscore_placement),
):
    name = row["name"]
    university = row["university"]
    bachelors = row["bachelors"]
    doctorate = row["doctorate"]

    existing_rows = dfuniversity[dfuniversity["university"] == university]
    if not existing_rows.empty:
        for idx in existing_rows.index:
            dfuniversity.at[idx, "professors"] = np.append(
                dfuniversity.at[idx, "professors"], name
            )

    existing_rows = dfuniversity[dfuniversity["university"] == bachelors]
    if not existing_rows.empty:
        for idx in existing_rows.index:
            dfuniversity.at[idx, "bachelors"] = np.append(
                dfuniversity.at[idx, "bachelors"], name
            )

    existing_rows = dfuniversity[dfuniversity["university"] == doctorate]
    if not existing_rows.empty:
        for idx in existing_rows.index:
            dfuniversity.at[idx, "doctorate"] = np.append(
                dfuniversity.at[idx, "doctorate"], name
            )

Merge Scores: 100%|██████████| 7493/7493 [00:07<00:00, 957.59item/s] 


In [165]:
dfuniversity.sample(3)

Unnamed: 0,university,professors,bachelors,doctorate
125,University of Louisiana - Lafayette,"[Anthony S. Maida, Arun Kulshreshth, Arun Lakh...",[],[]
49,Mississippi State University,"[Byron J. Williams, Cindy L. Bethel, Eric A. H...","[Bradley Reaves, Patrick G. Bridges]",[]
106,University of Arkansas - Little Rock,Peiyi Tang,[],[]


# Creating scores
While I am doing this, I will keep best paper and author score seperate. I will end up with 6 columns

- author score of people currently working there
- best paper score of people currently working there
- author score of people who got their bachelors there 
- best paper score of people who got their bachelors there 
- author score of people who got their doctorate there
- best paper score of people who got their doctorate there

I will start with empty arrays and then add to them as I lookup the scores of the authors from the `dfscore` dataframe. I also kept one array with the Nonetypes and one array without the Nonetypes to compute more statistics later.

In [166]:
combinations = {}

root = ["author_score", "paper_score"]
extension = ["", "_scaled", "_scaled_avg"]
level = ["_professors", "_bachelors", "_doctorate"]

for r in root:
    for e in extension:
        for l in level:
            combinations[f"{r}{e}{l}"] = ( f"{r}{e}", l[1:])
combinations

{'author_score_professors': ('author_score', 'professors'),
 'author_score_bachelors': ('author_score', 'bachelors'),
 'author_score_doctorate': ('author_score', 'doctorate'),
 'author_score_scaled_professors': ('author_score_scaled', 'professors'),
 'author_score_scaled_bachelors': ('author_score_scaled', 'bachelors'),
 'author_score_scaled_doctorate': ('author_score_scaled', 'doctorate'),
 'author_score_scaled_avg_professors': ('author_score_scaled_avg',
  'professors'),
 'author_score_scaled_avg_bachelors': ('author_score_scaled_avg', 'bachelors'),
 'author_score_scaled_avg_doctorate': ('author_score_scaled_avg', 'doctorate'),
 'paper_score_professors': ('paper_score', 'professors'),
 'paper_score_bachelors': ('paper_score', 'bachelors'),
 'paper_score_doctorate': ('paper_score', 'doctorate'),
 'paper_score_scaled_professors': ('paper_score_scaled', 'professors'),
 'paper_score_scaled_bachelors': ('paper_score_scaled', 'bachelors'),
 'paper_score_scaled_doctorate': ('paper_score_sca

In [167]:
def get_match(name, type):
    match = dfscore[dfscore["name_lower"] == name.lower()]
    if not match.empty:
        value = match.iloc[0][type]
        if not math.isnan(value):
            return value
    return None


def clean_none_value(value):
    if value == None:
        return 0.0
    return value


def lookup_scores(names, t):
    values = []
    if isinstance(names, str):
        res = get_match(names, t)
        values.append(clean_none_value(res))
        return values

    if names.ndim == 0:
        res = get_match(names.item(), t)
        values.append(clean_none_value(res))
        return values

    for name in names:
        res = get_match(name, t)
        values.append(clean_none_value(res))
    return values


In [168]:
tqdm.pandas(desc="Processing")
# lambda x: custom_function(x, param1=param1_value, param2=param2_value)

for c, l in combinations.items():
    dfuniversity[c] = dfuniversity[l[1]].progress_apply(
       lambda x: lookup_scores(x, l[0])
    )


Processing: 100%|██████████| 185/185 [00:07<00:00, 23.89it/s]
Processing: 100%|██████████| 185/185 [00:01<00:00, 145.72it/s]
Processing: 100%|██████████| 185/185 [00:02<00:00, 69.74it/s] 
Processing: 100%|██████████| 185/185 [00:07<00:00, 24.33it/s]
Processing: 100%|██████████| 185/185 [00:01<00:00, 144.62it/s]
Processing: 100%|██████████| 185/185 [00:02<00:00, 68.13it/s]
Processing: 100%|██████████| 185/185 [00:07<00:00, 24.37it/s]
Processing: 100%|██████████| 185/185 [00:01<00:00, 144.36it/s]
Processing: 100%|██████████| 185/185 [00:02<00:00, 68.65it/s]
Processing: 100%|██████████| 185/185 [00:07<00:00, 24.34it/s]
Processing: 100%|██████████| 185/185 [00:01<00:00, 144.53it/s]
Processing: 100%|██████████| 185/185 [00:02<00:00, 67.51it/s]
Processing: 100%|██████████| 185/185 [00:07<00:00, 24.87it/s]
Processing: 100%|██████████| 185/185 [00:01<00:00, 134.19it/s]
Processing: 100%|██████████| 185/185 [00:02<00:00, 70.09it/s] 
Processing: 100%|██████████| 185/185 [00:07<00:00, 24.49it/s]
P

In [169]:
dfuniversity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 22 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   university                          185 non-null    object
 1   professors                          185 non-null    object
 2   bachelors                           185 non-null    object
 3   doctorate                           185 non-null    object
 4   author_score_professors             185 non-null    object
 5   author_score_bachelors              185 non-null    object
 6   author_score_doctorate              185 non-null    object
 7   author_score_scaled_professors      185 non-null    object
 8   author_score_scaled_bachelors       185 non-null    object
 9   author_score_scaled_doctorate       185 non-null    object
 10  author_score_scaled_avg_professors  185 non-null    object
 11  author_score_scaled_avg_bachelors   185 non-null    object

# Computing Statistics

Here I am computing just a bunch of statistics. I iterate through each one of the arrays for all the different data type and compute statistics on each

It is not too complicated, but allows for me a lot of granularity of control in the weighing stage and computing the final score.

In [170]:
stats_columns = ["sum", "mean", "median", "min", "max", "var", "std"]

In [171]:
add_columns = []
for d in combinations.keys():
    for s in stats_columns:
        add_columns.append(f"{d}_{s}")
        
        
default_value = None 
dfadd_columns = pd.DataFrame({column_name: [default_value] * len(dfuniversity) for column_name in add_columns})

dfuniversity = pd.concat([dfuniversity, dfadd_columns], axis=1)

dfuniversity.sample(3)

Unnamed: 0,university,professors,bachelors,doctorate,author_score_professors,author_score_bachelors,author_score_doctorate,author_score_scaled_professors,author_score_scaled_bachelors,author_score_scaled_doctorate,...,paper_score_scaled_avg_bachelors_max,paper_score_scaled_avg_bachelors_var,paper_score_scaled_avg_bachelors_std,paper_score_scaled_avg_doctorate_sum,paper_score_scaled_avg_doctorate_mean,paper_score_scaled_avg_doctorate_median,paper_score_scaled_avg_doctorate_min,paper_score_scaled_avg_doctorate_max,paper_score_scaled_avg_doctorate_var,paper_score_scaled_avg_doctorate_std
135,University of Minnesota,"[Abhishek Chandra, Ali Anwar 0001, Anand R. Tr...","[David Mohaisen, Douglas Thain, Gary M. Olson,...","[Ahmed Eldawy, Amr Magdy 0001, Ananth Grama, A...","[2.727952264894627, 1.1016497579260773, 1.2881...","[2.4858660686743095, 9.124068863045782, 2.5552...","[3.3925190048646625, 1.7795386458206712, 10.08...","[0.008068789189390378, 0.0029553315635866485, ...","[0.00516342303322149, 0.01877773498389329, 0.0...","[0.0017964868752140604, 0.0009517544083221853,...",...,,,,,,,,,,
108,University of British Columbia,"[Aastha Mehta, Alan J. Hu, Alan K. Mackworth, ...","[Andrew Gordon Wilson, Caroline Lemieux, David...","[Andrew Roth, Chen Greif, Christopher Batty, C...","[0.7580178767048009, 10.769116231332792, 11.34...","[10.417883560793838, 1.961920491040219, 9.3814...","[0.18098333807255743, 0.6073422289456853, 4.51...","[0.0016858186778143242, 0.01896773370244168, 0...","[0.004655312049754712, 0.004442739360032666, 0...","[7.735066305656366e-05, 0.001025010049649023, ...",...,,,,,,,,,,
151,University of Ottawa,"[Ahmed Karmouch, Amiya Nayak, Amy P. Felty, Ca...","[Maria Gorlatova, Yang Xu 0023, Yue Dong 0002]",[],"[1.0479829740507656, 0.7449012369694961, 2.373...","[0.5966790589299827, 1.8386078314581304, 2.239...",[],"[0.002319769168250341, 0.0019732710266134245, ...","[0.0023908565423374243, 0.0009460724779937992,...",[],...,,,,,,,,,,


In [172]:
dfuniversity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Columns: 148 entries, university to paper_score_scaled_avg_doctorate_std
dtypes: object(148)
memory usage: 214.0+ KB


In [173]:
def filter_array(data):
    data = list(filter(lambda x: x is not None, data))
    data = [x for x in data if not math.isnan(x)]
    data = [x for x in data if not np.isnan(x)]
    return data


def variance_hander(data):
    if len(data) <= 1:
        return 0
    return np.nanvar(data, ddof=1)


def get_handler(func, data):
    filtered_data = filter_array(data)
    if len(filtered_data) == 0:
        return 0.0
    return func(filtered_data)


def get_sum(data):
    return get_handler(np.nansum, data)


def get_mean(data):
    return get_handler(np.nanmean, data)


def get_median(data):
    return get_handler(np.nanmedian, data)


def get_min(data):
    return get_handler(np.nanmin, data)


def get_max(data):
    return get_handler(np.nanmax, data)


def get_var(data):
    return get_handler(variance_hander, data)


def get_std(data):
    return get_handler(np.nanstd, data)

In [174]:
stat_function_map = {
    "sum": get_sum,
    "mean": get_mean,
    "median": get_median,
    "min": get_min,
    "max": get_max,
    "var": get_var,
    "std": get_std,
}

In [175]:
count = 0
for data in tqdm(combinations.keys(), desc="Data Types", unit="item", total=len(combinations.keys())):
    for stat in stats_columns:
        stat_name = data + "_" + stat
        for index, row in dfuniversity.iterrows():
            values = row[data]
            dfuniversity.at[index, stat_name] = stat_function_map[stat](values)

        sum_column = dfuniversity[stat_name].sum(skipna=True)
        dfuniversity[stat_name] = dfuniversity[stat_name].apply(
            lambda x: x / sum_column
            if (pd.notna(x) or x == 0.0)
            else sum_column / len(dfuniversity)
        )

Data Types: 100%|██████████| 18/18 [00:04<00:00,  3.86item/s]


In [187]:
def rewrite_stats_file(file_path):
    with open(file_path, "w") as file:
        file.write(f"stat,weight\n")
        for i, item in enumerate(add_columns):
            file.write(f"{str(item)},")
            if i < len(add_columns) - 1:
                file.write("\n")
            
#rewrite_stats_file("data/statistics.csv")

# Final Rank
Here I take some weights I made manually and assign importance to each of the stats on a 1-5 scale. Then I can compute a final rank and see the colleges.

Also, just for fun I can see who my top profs are as well.

In [191]:
dfuniversity["final_score"] = np.nan
for index, row_uni in tqdm(
    dfuniversity.iterrows(), desc="Final Score", unit="item", total=len(dfuniversity)
):
    scores = []
    for _, row_stat in dfstatistics.iterrows():
        statistic = row_stat["stat"]
        weight = row_stat["weight"]
        if not np.isnan(weight):
            partial_score = row_uni[statistic] * float(weight) * 0.2
            scores.append(partial_score)
    scores = list(filter(lambda x: x != 0.0, scores))
    score = sum(scores) / len(scores) if scores else 0.0
    dfuniversity.at[index, "final_score"] = score

Final Score: 100%|██████████| 185/185 [00:01<00:00, 93.08item/s] 


In [192]:
final_results = dfuniversity.sort_values("final_score", ascending=False)
final_results.head(5)

Unnamed: 0,university,professors,bachelors,doctorate,author_score_professors,author_score_bachelors,author_score_doctorate,author_score_scaled_professors,author_score_scaled_bachelors,author_score_scaled_doctorate,...,paper_score_scaled_avg_bachelors_var,paper_score_scaled_avg_bachelors_std,paper_score_scaled_avg_doctorate_sum,paper_score_scaled_avg_doctorate_mean,paper_score_scaled_avg_doctorate_median,paper_score_scaled_avg_doctorate_min,paper_score_scaled_avg_doctorate_max,paper_score_scaled_avg_doctorate_var,paper_score_scaled_avg_doctorate_std,final_score
0,Arizona State University,"[Adam Doupé, Adil Ahmad, Andréa W. Richa, Avir...","[Adam Doupé, Amy Voida, Anthony Gitter, Cynthi...","[Bernd Hamann, Dejun Yang, Hongxin Hu, Jia Yu ...","[2.87755166470041, 1.509272136753373, 2.206527...","[2.87755166470041, 4.55387042625105, 0.3119670...","[11.283021335835768, 0.16215917908794278, 3.01...","[0.00475879575046829, 0.002648566895424584, 0....","[0.00475879575046829, 0.002316514291693613, 0....","[0.012156214431460663, 0.0007845635038285948, ...",...,0.015072,0.018325,0.002988,0.006167,0.0,0.0,0.017806,0.009373,0.01426,0.0
127,University of Maryland - College Park,"[A. Udaya Shankar, Abhinav Bhatele, Abhinav Sh...","[Aaron Schulman, Adam Bates 0001, Adam W. Barg...","[Aaron Schulman, Alexandros Labrinidis, Alvaro...","[5.552525594464088, 4.544028392484438, 10.6306...","[2.966065301184756, 5.486671024175045, 2.18963...","[2.966065301184756, 6.85587752341778, 1.901759...","[0.01049914824011296, 0.006684006547722314, 0....","[0.009938260157331841, 0.008459690796096597, 0...","[0.009938260157331841, 0.0036902527902503736, ...",...,0.012185,0.016625,0.017761,0.008499,0.0,0.0,0.017866,0.011856,0.016444,0.0
118,University of Georgia,"[Ari Schlesinger, Gagan Agrawal, Geng Yuan, Ha...","[Emily Wall, Michael W. Berry]",Dajiang Zhu,"[0.6414513347999425, 21.5533829118081, 1.11475...","[0.6978282814162997, 1.098050404427458]",[0.4559268232494603],"[0.0003263007168417166, 0.031928257786641746, ...","[0.0007941890816685657, 0.0022572601575487393]",[9.095316789513547e-05],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,University of Hawaii at Manoa,"[Carleton A. Moore, Daniel D. Suthers, Dusko P...",[],[],"[0.2366559640865425, 3.2292690439308784, 1.350...",[],[],"[0.0006572094811419788, 0.0006413957826421726,...",[],[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,University of Idaho,"[Hasan M. Jamil, Jim Alves-Foss, Robert B. Hec...",[],Sebastian G. Elbaum,"[3.7971852005866413, 2.838689707595685, 0.7579...",[],[21.5171355495396],"[0.0021041063409559806, 0.0051647646617229495,...",[],[0.031295763295581],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [194]:
final_results.head(25)[
    [
        "university",
        "final_score",
    ]
].reset_index()

Unnamed: 0,index,university,final_score
0,0,Arizona State University,0.0
1,127,University of Maryland - College Park,0.0
2,118,University of Georgia,0.0
3,119,University of Hawaii at Manoa,0.0
4,120,University of Idaho,0.0
5,121,University of Illinois at Chicago,0.0
6,122,University of Iowa,0.0
7,123,University of Kansas,0.0
8,124,University of Kentucky,0.0
9,125,University of Louisiana - Lafayette,0.0


In [195]:
dfscore.sort_values("combined_score", ascending=False).head(20)

Unnamed: 0,name,university,author_score,author_score_scaled,author_score_scaled_avg,paper_score,paper_score_scaled,paper_score_scaled_avg,name_lower,combined_score
10960,Philip S. Yu,University of Illinois at Chicago,109.848482,0.068918,52.578448,,,,philip s. yu,109.848482
6494,Jiawei Han 0001,Univ. of Illinois at Urbana-Champaign,100.007356,0.062009,57.120725,,,,jiawei han 0001,100.007356
14087,Tuomas Sandholm,Carnegie Mellon University,96.699988,0.048898,68.207344,,,,tuomas sandholm,96.699988
2819,Dan Roth,University of Pennsylvania,94.035226,0.042387,66.731485,2.242618,0.01634,6.775143,dan roth,96.277844
9860,Moshe Y. Vardi,Rice University,94.277997,0.091237,57.688012,,,,moshe y. vardi,94.277997
8919,Martial Hebert,Carnegie Mellon University,92.681259,0.035878,66.32362,0.405501,0.009256,7.058027,martial hebert,93.08676
14297,Vijay Kumar 0001,University of Pennsylvania,92.118075,0.050487,72.924504,,,,vijay kumar 0001,92.118075
2967,Daniela Rus,Massachusetts Institute of Technology,81.876629,0.057168,64.173604,,,,daniela rus,81.876629
13537,Takeo Kanade,Carnegie Mellon University,80.590609,0.024356,67.535817,,,,takeo kanade,80.590609
2520,Christos Faloutsos,Carnegie Mellon University,77.823569,0.051885,62.381408,1.592371,0.036245,6.121925,christos faloutsos,79.41594
