This notebook uses `scholarly` API (https://github.com/scholarly-python-package/scholarly) to enrich CS faculty info

In [1]:
from scrap_cs_faculty import *

In [2]:
org_list = ["CMU-CS","Cornell-CS","MIT-AID","MIT-CS","Stanford-CS","UCB-CS","UIUC-CS",]
idx_org = org_list.index("Cornell-CS")
org = org_list[idx_org]
org2 = org.split("-")[0] 

## read CS faculty info scraped previously

In [5]:
file_xlsx = f"faculty-{org}.xlsx"
xlsxf = pd.ExcelFile(file_xlsx)

In [6]:
xlsxf.sheet_names

['Faculty']

In [7]:
df = xlsxf.parse('Faculty')

In [8]:
df

Unnamed: 0,name,job_title,phd_univ,phd_year,research_area,research_concentration,research_focus,url,img_url,phone,email,cell_phone,office_address,department,school
0,"Acar, Umut",Associate Professor,,,,,,https://csd.cmu.edu/people/faculty/umut-acar,,(412) 268-6791,uacar@andrew.cmu.edu,,9101 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
1,"Ada, Anil",Associate Teaching Professor,,,,,,https://csd.cmu.edu/people/faculty/anil-ada,,(412) 268-3835,aada@andrew.cmu.edu,,6215 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
2,"Akoglu, Leman","Associate Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/leman-akoglu,,(412) 268-3043,lakoglu@andrew.cmu.edu,,2118C Hamburg Hall,Computer Science,Carnegie Mellon Univ
3,"Aldrich, Jonathan","Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/jonathan-al...,,(412) 268-7278,aldrich@cs.cmu.edu,,422 TCS Hall,Computer Science,Carnegie Mellon Univ
4,"Amvrosiadis, George","Assistant Research Professor, Affiliated Faculty",,,,,,https://csd.cmu.edu/people/faculty/george-amvr...,,,gamvrosi@andrew.cmu.edu,,2311 Mehrabian Collaborative Innovation Center,Computer Science,Carnegie Mellon Univ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,"Wing, Jeannette",Adjunct Faculty,,,,,,https://csd.cmu.edu/people/faculty/jeannette-wing,,(412) 268-2000,jw35@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ
116,"Woodruff, David",Professor,,,,,,https://csd.cmu.edu/people/faculty/david-woodr...,,,dwoodruf@andrew.cmu.edu,,7217 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
117,"Xhakaj, Franceska",Assistant Teaching Professor,,,,,,https://csd.cmu.edu/people/faculty/franceska-x...,,,francesx@andrew.cmu.edu,,4003 Gates and Hillman Centers,Computer Science,Carnegie Mellon Univ
118,"Zhang, Hui",Consulting Professor,,,,,,https://csd.cmu.edu/people/faculty/hui-zhang-0,,,huiz1@andrew.cmu.edu,,,Computer Science,Carnegie Mellon Univ


In [9]:
names = df["name"].to_list()

In [10]:
len(names), names 

(120,
 ['Acar, Umut',
  'Ada, Anil',
  'Akoglu, Leman',
  'Aldrich, Jonathan',
  'Amvrosiadis, George',
  'Andersen, David',
  'Balcan, Maria',
  'Balzer, Stephanie',
  'Beckmann, Nathan',
  'Berger, Daniel',
  'Blelloch, Guy',
  'Blum, Manuel',
  'Bogart, Christopher',
  'Brookes, Stephen',
  'Brumley, David',
  'Bryant, Randal',
  'Carrasquel, Jacobo',
  'Cervesato, Iliano',
  'Chen, Tianqi',
  'Chrysanthis, Panos',
  'Conitzer, Vincent',
  'Cortina, Thomas',
  'Crane, Keenan',
  'Crary, Karl',
  'Dannenberg, Roger',
  'DeYoung, Henry',
  'Donahue, Chris',
  'Eckhardt, Dave',
  'Erdmann, Michael',
  'Faloutsos, Christos',
  'Fanti, Giulia',
  'Fatahalian, Kayvon',
  'Fredrikson, Matt',
  'Frieze, Alan',
  'Ganger, Gregory',
  'Garlan, David',
  'Gibbons, Phillip',
  'Gibson, Garth',
  'Gligor, Virgil',
  'Goldstein, Seth',
  'Goyal, Vipul',
  'Gupta, Anupam',
  'Guruswami, Venkatesan',
  'Haeupler, Bernhard',
  'Harchol-Balter, Mor',
  'Harper, Robert',
  'Heule, Marijn',
  'Hodgins,

## fetch google scholar data for all CS faculty

In [11]:
ntest = -1  #  2 # 

In [12]:
pub_data = []
for n, name in enumerate(names[:ntest]):
#     if n < 5: continue 
    print(f"n, name = {n}, {name} ...")
    
    author_org = f"{name} {org2}"
    norm_auth_org = normalize_str(author_org)
    file_author = f"data/GScholar_{norm_auth_org}.json"

    try:
        search_query = scholarly.search_author(author_org)
        init_result = next(search_query)
    except Exception as ex:
        print(str(ex))
        init_result = None 
        
    if init_result is None:
        print(f"Failed search_author()")
        continue

    scholar_id = init_result.get("scholar_id", "")
    if not scholar_id:
        print(f"Missing scholar_id")
        continue        
    
    # fetch data
    author = scholarly.fill(init_result)
    
    author_dict = {}
    # fill data cell
    author_dict["name"] = name 
    author_dict["file_author"] = file_author 
    author_dict["scholar_id"] = scholar_id 
    author_dict["affiliation"] = author.get("affiliation", "")
    author_dict["interests"] = "; ".join(author.get("interests", []))
    author_dict["url_author"] = get_scholar_page(scholar_id)
    author_dict["url_picture"] = author.get("url_picture", "")
    author_dict["url_homepage"] = author.get("homepage", "")
    author_dict["citedby"] = author.get("citedby", 0)
    author_dict["citedby5y"] = author.get("citedby5y", 0)
    author_dict["hindex"] = author.get("hindex", 0)
    author_dict["hindex5y"] = author.get("hindex5y", 0)
    author_dict["i10index"] = author.get("i10index", 0)
    author_dict["i10index5y"] = author.get("i10index5y", 0)
    author_dict["num_papers"] = len(author.get("publications", []))
    author_dict["num_coauthors"] = len(author.get("coauthors", []))
      
    # fill row 
    author_data = []
    for c in SCHOLAR_HEADER:
        author_data.append(author_dict.get(c))
        
    # accumulate row
    pub_data.append(author_data)
    
    # persist author data
    with open(Path(file_author), "w", encoding="utf-8") as f:
        f.write(json.dumps(author))
    
    delay = randint(1,5)
    sleep(delay)

n, name = 0, Acar, Umut ...
n, name = 1, Ada, Anil ...

Failed search_author()
n, name = 2, Akoglu, Leman ...
n, name = 3, Aldrich, Jonathan ...
n, name = 4, Amvrosiadis, George ...
n, name = 5, Andersen, David ...
n, name = 6, Balcan, Maria ...
n, name = 7, Balzer, Stephanie ...
n, name = 8, Beckmann, Nathan ...
n, name = 9, Berger, Daniel ...

Failed search_author()
n, name = 10, Blelloch, Guy ...
n, name = 11, Blum, Manuel ...

Failed search_author()
n, name = 12, Bogart, Christopher ...
n, name = 13, Brookes, Stephen ...

Failed search_author()
n, name = 14, Brumley, David ...
n, name = 15, Bryant, Randal ...
n, name = 16, Carrasquel, Jacobo ...
n, name = 17, Cervesato, Iliano ...
n, name = 18, Chen, Tianqi ...
n, name = 19, Chrysanthis, Panos ...

Failed search_author()
n, name = 20, Conitzer, Vincent ...
n, name = 21, Cortina, Thomas ...
n, name = 22, Crane, Keenan ...
n, name = 23, Crary, Karl ...
n, name = 24, Dannenberg, Roger ...
n, name = 25, DeYoung, Henry ...

Failed searc

In [13]:
len(pub_data)

97

## write out xlsx

In [14]:
df_out = pd.DataFrame(pub_data, columns=SCHOLAR_HEADER)

In [15]:
df_out.head()

Unnamed: 0,name,affiliation,interests,num_papers,num_coauthors,citedby,hindex,i10index,citedby5y,hindex5y,i10index5y,scholar_id,url_author,url_picture,url_homepage,file_author
0,"Acar, Umut","Carnegie Mellon University, Department of Comp...",,144,36,4104,34,74,1459,20,47,wJ4NywgAAAAJ,https://scholar.google.com/citations?user=wJ4N...,https://scholar.google.com/citations?view_op=m...,http://www.umut-acar.org/,data/GScholar_acar_umut_cmu.json
1,"Akoglu, Leman","Associate Professor, Carnegie Mellon University",AI/ML; Unsupervised Learning; Anomaly/Fraud/Ev...,188,45,10997,49,87,8295,41,78,4ITkr_kAAAAJ,https://scholar.google.com/citations?user=4ITk...,https://scholar.google.com/citations?view_op=m...,https://www.cs.cmu.edu/~lakoglu/,data/GScholar_akoglu_leman_cmu.json
2,"Aldrich, Jonathan","Professor of Computer Science, Carnegie Mellon...",Programming Languages; Software Engineering,282,17,6254,38,93,1557,24,46,AzHmOtcAAAAJ,https://scholar.google.com/citations?user=AzHm...,https://scholar.google.com/citations?view_op=m...,http://www.cs.cmu.edu/~aldrich/,data/GScholar_aldrich_jonathan_cmu.json
3,"Amvrosiadis, George","Carnegie Mellon University, Amazon Web Services",Computer Systems; Operating Systems; Distribut...,46,20,997,13,18,782,13,18,ygiY7C8AAAAJ,https://scholar.google.com/citations?user=ygiY...,https://scholar.google.com/citations?view_op=m...,http://www.amvrosiadis.com/,data/GScholar_amvrosiadis_george_cmu.json
4,"Andersen, David","Professor of Computer Science, Carnegie Mellon...",Distributed systems; networks; operating syste...,221,55,25327,70,133,11668,49,96,wUArfPgAAAAJ,https://scholar.google.com/citations?user=wUAr...,https://scholar.google.com/citations?view_op=m...,http://www.cs.cmu.edu/~dga/,data/GScholar_andersen_david_cmu.json


In [16]:
# import xlsxwriter
file_xlsx = f"data/cs-faculty-gscholar-{org2}-{n}.xlsx"
writer = pd.ExcelWriter(Path(file_xlsx), engine='xlsxwriter')
df_out.to_excel(writer, sheet_name=org2, index=False)
writer.save()

  writer.save()
