In [26]:
import pickle
import re
from collections import defaultdict

import pandas as pd
import scholarly

In [28]:
# Please change this to the year you are processing. It selects which year's
# authors are scraped and which papers_<year>.pkl file is loaded, and it is the
# cutoff used when counting citations and publications further down.
global_year = 2017

## Read dataset

In [8]:
# Load the pickled feature table, keep only the columns used in this notebook,
# and turn it into a dict keyed by the table's index.
wanted_columns = ["id", "title", "label", "authors"]
data_raw = pd.read_pickle('../features/all_data_features_17_20.pkl')
data = data_raw[wanted_columns]
org_papers = data.to_dict(orient='index')

### Segregate authors based on year

In [24]:
# One author set per year. The year of a paper is the part of its key that
# precedes the first "_".
yearwise_authors = dict((y, set()) for y in range(2017, 2021))

for paper_key, paper in org_papers.items():
    paper_year = int(paper_key.split("_")[0])
    yearwise_authors[paper_year].update(paper["authors"])

### Scrape GS author info

In [27]:
# Scraped results: author name -> list of (author, filled author) candidates.
author_info = {}
# Progress counter for the scrape loop; starts at the number of authors already stored.
count = len(author_info)

# Matches a single capital letter with an optional dot, surrounded by spaces
# (e.g. the " A. " in "John A. Smith"). Raw string so "\." is not treated as an
# invalid string escape; the compiled pattern is unchanged.
p = re.compile(r" [A-Z]\.? ")

In [None]:
# Scrape Google Scholar candidates for every author of `global_year` that is
# not yet in `author_info`.

# Compiled locally so this cell does not rely on the global `p`, which other
# cells rebind as a loop variable. Matches " X " / " X. " (a single capital
# letter with an optional dot between spaces).
middle_initial_re = re.compile(r" [A-Z]\.? ")
authors_to_scrape = yearwise_authors[global_year]

for a in authors_to_scrape:
    if a in author_info:
        continue

    if count % 10 == 0:
        # The original referenced an undefined `all_authors` here.
        print("Status {} / {}".format(count, len(authors_to_scrape)))

    auth_candidates = []
    try:
        for i in scholarly.search_author(a):
            auth_candidates.append((i, i.fill()))

        if not auth_candidates:
            # No author found: retry with the single-letter initial removed.
            a_clean = middle_initial_re.sub(" ", a)
            for i in scholarly.search_author(a_clean):
                auth_candidates.append((i, i.fill()))

        # Only stored when no exception occurred, so failed authors are
        # retried the next time this cell runs.
        author_info[a] = auth_candidates
    except Exception as ex:
        # Best-effort scraping: report the failure (with its cause) and move on.
        print("Exception: ", a, repr(ex))
    count += 1

### Save info

In [None]:
# Persist the scraped candidates so the disambiguation step can reload them.
with open("author_info_scholar.pickle", "wb") as pickle_file:
    pickle.dump(author_info, pickle_file)

## Disambiguate authors

### 1. Read GS scraped data

In [None]:
# Reload the scraped candidates written by the "Save info" cell.
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open("author_info_scholar.pickle", "rb") as pickle_file:
    all_authors_gs = pickle.load(pickle_file)

### 2. Read conflicts data

In [None]:
data_path = "../../ICLR data/masterdata_unbalanced/"
years = [global_year]

# `papers_data` is reassigned on every iteration, so it ends up holding the
# pickle of the last year listed in `years`.
for y in years:
    papers_file = data_path + "papers_{}.pkl".format(y)
    papers_data = pd.read_pickle(papers_file)

In [None]:
# For every entry of `papers_data`, pull the "conflicts" and "authorids" fields
# out of its "content" record.
conflicts_dict = {
    pid: papers_data[pid]["content"]["conflicts"] for pid in papers_data
}
emails_dict = {
    pid: papers_data[pid]["content"]["authorids"] for pid in papers_data
}

### 3. Start disambiguation

In [None]:
def _normalize_title(title):
    """Lower-case `title` and keep only its alphabetic characters."""
    return ''.join(filter(str.isalpha, title.lower()))


def _gs_email(candidate):
    """Return the candidate's `email` attribute lower-cased, or "" if it is missing/empty."""
    return (getattr(candidate, "email", None) or "").lower()


def _match_by_title(candidates, paper_title):
    """Return the first candidate with a publication whose normalized title equals the paper's, else None."""
    wanted = _normalize_title(paper_title)
    if not wanted:
        return None
    for candidate in candidates:
        # `publications` is treated as optional, as in the statistics cell.
        for pub in getattr(candidate, "publications", []):
            if _normalize_title(pub.bib.get("title", "")) == wanted:
                return candidate
    return None


def _match_by_affiliation(candidates, conflict_affils):
    """Return the candidates whose GS e-mail contains one of the paper's conflict strings.

    Each candidate is listed at most once, even if several conflict strings
    match it (the original appended it once per matching string, which made a
    unique match look ambiguous).
    """
    matches = []
    for candidate in candidates:
        gs_email = _gs_email(candidate)
        # Empty conflict strings are skipped: "" is a substring of everything.
        if any(affil and affil.lower() in gs_email for affil in conflict_affils):
            matches.append(candidate)
    return matches


def _match_by_author_email(auth_name, candidates, paper_emails):
    """Pick the one candidate whose GS e-mail contains the domain of the author's own e-mail id.

    The author's e-mail id is the single entry of `paper_emails` that contains
    any part of the author's name (compared case-insensitively). Returns None
    unless exactly one e-mail id and exactly one candidate are identified.
    """
    name_parts = [part.lower() for part in auth_name.split(" ") if part]
    own_emails = {
        email for email in paper_emails
        if any(part in email.lower() for part in name_parts)
    }
    if len(own_emails) != 1:
        return None
    domain = next(iter(own_emails)).split("@")[-1].lower()
    identified = [c for c in candidates if domain in _gs_email(c)]
    # The original returned the last candidate inspected (`m[0]`) instead of
    # the single identified one.
    return identified[0] if len(identified) == 1 else None


# author name -> disambiguated GS author object
paper_authors_info = defaultdict(dict)
# paper key -> names of its authors that were disambiguated
per_paper = defaultdict(list)

still_not_found = 0
total_auth_count = 0
found_count = 0
not_in_gs = 0
cc = 0

still_not_found_list = []

for paper_id, paper in org_papers.items():
    for auth in paper["authors"]:
        total_auth_count += 1

        if auth not in all_authors_gs:
            not_in_gs += 1
            continue

        # Each stored entry is a tuple; its first element is the author object.
        candidates = [entry[0] for entry in all_authors_gs[auth]]

        if len(candidates) == 1:
            disambiguated_author = candidates[0]
        else:
            # 1) Look for this paper among each candidate's publications.
            disambiguated_author = _match_by_title(candidates, paper["title"])
            if disambiguated_author is None:
                # 2) Paper not found: fall back to conflicts / e-mail information.
                #    `.get` because the conflicts data may not cover every
                #    paper in `org_papers`; a missing entry means "no hint".
                affil_matches = _match_by_affiliation(
                    candidates, conflicts_dict.get(paper_id) or ()
                )
                if len(affil_matches) == 1:
                    disambiguated_author = affil_matches[0]
                elif len(affil_matches) > 1:
                    # 3) Still ambiguous: use the author's own e-mail domain.
                    disambiguated_author = _match_by_author_email(
                        auth, affil_matches, emails_dict.get(paper_id) or ()
                    )

        if disambiguated_author is None:
            still_not_found += 1
            still_not_found_list.append(auth)
            continue

        found_count += 1
        cc += 1
        paper_authors_info[auth] = disambiguated_author
        per_paper[paper_id].append(auth)

In [None]:
# Report the counters collected during disambiguation, one per line.
summary_rows = [
    ("Found authors: ", found_count),
    ("Added to dict: ", cc),
    ("Multiple entries in GS but no publication with the same title found or empty GS: ", still_not_found),
    ("NOt in GS data: ", not_in_gs),
    ("Total authors to be disambiguated: ", total_auth_count),
]
for label, value in summary_rows:
    print(label, value)

## Save citation count, publication count, and hindex

In [None]:
# Attach three per-paper lists with one value per disambiguated author that has
# the corresponding GS attribute: citation count, h-index, publication count.
for k, v in org_papers.items():
    v["pub_count_gs"] = []
    v["cit_count_gs"] = []
    v["hindex"] = []

    for a in v["authors"]:
        if a not in paper_authors_info:
            continue
        gs_fields = vars(paper_authors_info[a])

        # Citations: total, minus those received in global_year or later.
        if gs_fields.get("citedby", 0) > 0:
            per_year = gs_fields.get("cites_per_year", {})
            later_citations = sum(
                per_year[y] for y in per_year if y > (global_year - 1)
            )
            v["cit_count_gs"].append(gs_fields["citedby"] - later_citations)

        # h-index: mean of hindex and hindex5y when both exist, else hindex alone.
        if "hindex" in gs_fields:
            if "hindex5y" in gs_fields:
                h_value = (gs_fields["hindex5y"] + gs_fields["hindex"]) / 2
            else:
                h_value = gs_fields["hindex"]
            v["hindex"].append(h_value)

        # Publications: those dated before global_year, plus those with no year.
        if "publications" in gs_fields:
            bibs = [vars(pub)["bib"] for pub in gs_fields["publications"]]
            earlier = sum(
                1 for bib in bibs
                if "year" not in bib or bib["year"] < global_year
            )
            v["pub_count_gs"].append(earlier)