In [105]:
import os, sys, re, ast
import csv
import pandas as pd
from time import sleep
import numpy as np
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval
from pybliometrics.scopus.utils import config
# The Scopus API key is a credential: read it from the environment rather than
# committing it to the notebook, and never echo it into the cell output.
# NOTE(review): any key that was previously committed here should be revoked.
config['Authentication']['APIKey'] = os.environ["SCOPUS_API_KEY"]
print("Scopus API key loaded from the SCOPUS_API_KEY environment variable")
import bct
from scholarly import scholarly, ProxyGenerator
from collections import Counter

[API key redacted]


In [107]:
def get_coauthor_matrix(df):
    """Build a co-authorship count matrix from an author table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a "Name" column and a "Coauthors" column. Each "Coauthors" cell
        is expected to be the string form of a dict mapping a coauthor's name
        to a publication count, e.g. "{'Jane Doe': 3}".

    Returns
    -------
    numpy.ndarray
        Square float matrix with one row/column per row of ``df`` (in row
        order). For an author at position ``a`` who lists a coauthor at
        position ``c`` with count ``k``, ``matrix[c][a]`` is increased by ``k``.

    Notes
    -----
    * Coauthors that do not appear in the "Name" column are skipped; they no
      longer abort processing of the author's remaining coauthors.
    * Authors whose "Coauthors" cell is missing, unparsable, not a dict, or
      empty are reported on stdout as having no coauthors.
    * Names are resolved to positions (first occurrence wins), so the result
      does not depend on the DataFrame's index labels.
    """
    names = df.loc[:, "Name"]
    coauthor_matrix = np.zeros((names.size, names.size))

    # Positional lookup table; setdefault keeps the first occurrence of a
    # repeated name, matching a "first match" lookup.
    position = {}
    for pos, name in enumerate(names):
        position.setdefault(name, pos)

    for auth_name, raw_coauthors in zip(names, df.loc[:, "Coauthors"]):
        row = position[auth_name]

        # Only parsing problems mean "no usable coauthor data"; anything else
        # should surface instead of being swallowed.
        try:
            coauthors = ast.literal_eval(raw_coauthors)
        except (ValueError, SyntaxError, TypeError):
            coauthors = None

        if not isinstance(coauthors, dict) or len(coauthors) == 0:
            print(auth_name, "has no coauthors")
            continue

        for coauthor, num_publications in coauthors.items():
            col = position.get(coauthor)
            if col is None:
                # Coauthor is outside the author list: not a node in this graph.
                continue
            coauthor_matrix[col][row] += num_publications

    print("\n")
    return coauthor_matrix

In [106]:
# Markup tags stripped from Scopus titles before they are normalised.
_SCOPUS_MARKUP_TAGS = ("<sup>", "</sup>", "<inf>", "</inf>")


def _strip_scopus_markup(title):
    """Remove the <sup>/<inf> tags listed in _SCOPUS_MARKUP_TAGS from a title."""
    for tag in _SCOPUS_MARKUP_TAGS:
        title = title.replace(tag, "")
    return title


def _normalize_title(title):
    """Lower-case a title, replace every non-word character with a space and
    collapse the result to single-space-separated words."""
    return " ".join(re.sub(r'[^\w]', ' ', title.lower()).split())


def _repeated_titles(titles):
    """Return the distinct titles that occur more than once in ``titles``."""
    return [title for title, count in Counter(titles).items() if count > 1]


def get_publication_details(author_row, sv_df, gs_df) -> dict:
    """Compare one author's Scopus and Google Scholar publication lists.

    Parameters
    ----------
    author_row
        Row label passed to ``DataFrame.at``; the same label is used for both
        tables, so they must describe the same author under that label.
    sv_df : pandas.DataFrame
        Scopus/SciVal table with "Name", "Scopus ID" and "Coauthors" columns.
    gs_df : pandas.DataFrame
        Google Scholar table with "Name", "Scholar ID" and "Coauthors" columns.

    Returns
    -------
    dict
        Names, publication counts, shared / source-only title counts, the raw
        coauthor cells, and empty placeholders for the normalised betweenness
        values. Note that ``'gs_duplicates_count'`` and
        ``'sv_duplicates_count'`` hold the *lists* of duplicated titles, not
        their lengths.

    Notes
    -----
    Performs network requests against Scopus (via pybliometrics) and Google
    Scholar (via scholarly, through a free proxy set up on every call).
    """
    # --- Scopus profile and publications ---
    sv_name = sv_df.at[author_row, "Name"]
    sv_id = sv_df.at[author_row, "Scopus ID"]
    sv_coauthors = sv_df.at[author_row, "Coauthors"]
    # get_documents() can yield None rather than an empty list when the author
    # has no documents; treat that as "no publications" instead of crashing.
    sv_pubs = AuthorRetrieval(sv_id).get_documents() or []
    sv_titles = [
        _normalize_title(_strip_scopus_markup(pub.title)) for pub in sv_pubs
    ]

    # --- Google Scholar profile and publications ---
    gs_name = gs_df.at[author_row, "Name"]
    gs_id = gs_df.at[author_row, "Scholar ID"]
    gs_coauthors = gs_df.at[author_row, "Coauthors"]
    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    author = scholarly.search_author_id(gs_id)
    author = scholarly.fill(
        author, sections=['basics', 'indices', 'publications', 'counts']
    )
    gs_titles = [
        _normalize_title(pub['bib']['title']) for pub in author['publications']
    ]

    # --- Set comparison on the normalised titles ---
    sv_title_set = set(sv_titles)
    gs_title_set = set(gs_titles)
    shared_titles = list(gs_title_set & sv_title_set)
    sv_only_titles = list(sv_title_set - gs_title_set)
    gs_only_titles = list(gs_title_set - sv_title_set)

    author_dict = {
        'sv_name': sv_name,
        'gs_name': gs_name,
        'gs_count': len(gs_titles),
        'gs_duplicates_count': _repeated_titles(gs_titles),
        'sv_count': len(sv_titles),
        'sv_duplicates_count': _repeated_titles(sv_titles),
        'gs_only_count': len(gs_only_titles),
        'sv_only_count': len(sv_only_titles),
        'shared_count': len(shared_titles),
        'gs_coauthors': gs_coauthors,
        'sv_coauthors': sv_coauthors,
        'gs_betweenness_centrality_normed': "",
        'sv_betweenness_centrality_normed': "",
        # Title listings are intentionally left out of the record; enable if needed:
        # 'gs_only_pubs': "\n".join(gs_only_titles),
        # 'sv_only_pubs': "\n".join(sv_only_titles),
        # 'shared_pubs': "\n".join(shared_titles),
    }
    return author_dict

In [108]:
# Load the per-author tables exported from SciVal/Scopus and Google Scholar.
sv_df = pd.read_csv("scival_outputs/scival_authorlist_publications_official.csv")
gs_df = pd.read_csv("gs_outputs/gs_authorlist_publications.csv")

# Weighted co-authorship matrices (entries are publication counts).
sv_coauthor_matrix = get_coauthor_matrix(sv_df)
gs_coauthor_matrix = get_coauthor_matrix(gs_df)

# betweenness_bin is defined for binary connection matrices; feeding it
# publication counts distorts the path lengths/counts it derives from the
# matrix, so reduce each matrix to 0/1 adjacency first.
sv_adjacency = (sv_coauthor_matrix != 0).astype(float)
gs_adjacency = (gs_coauthor_matrix != 0).astype(float)

# Betweenness centrality, plus the value normalised by (N-1)(N-2).
sv_node_count = len(sv_adjacency)
sv_betweenness = bct.betweenness_bin(sv_adjacency)
sv_betweenness_normed = sv_betweenness/((sv_node_count-1)*(sv_node_count-2))

gs_node_count = len(gs_adjacency)
gs_betweenness = bct.betweenness_bin(gs_adjacency)
gs_betweenness_normed = gs_betweenness/((gs_node_count-1)*(gs_node_count-2))

# Node degree: number of non-zero connections per author.
gs_degrees = bct.degrees_und(gs_coauthor_matrix)
sv_degrees = bct.degrees_und(sv_coauthor_matrix)

Annie Vogel-Ciernia has no coauthors
Mark S. Cembrowski has no coauthors
Michael J. Gordon has no coauthors
Manu S. Madhav has no coauthors
Brian D. Fisher has no coauthors
Emily Lauren Sylwestrak has no coauthors


Michael Gordon has no coauthors
Manu S Madhav has no coauthors
Emily Sylwestrak has no coauthors




In [95]:
# Sanity check: show which container type holds the normalised betweenness values.
print(type(gs_betweenness_normed))

<class 'numpy.ndarray'>


In [109]:
# Build one comparison record per author row and attach the graph metrics
# computed for that same row position.
dict_list = []
for row in range(sv_df.shape[0]):
    author_dict = get_publication_details(row, sv_df, gs_df)
    author_dict.update({
        'gs_betweenness_centrality': gs_betweenness[row],
        'sv_betweenness_centrality': sv_betweenness[row],
        'gs_betweenness_centrality_normed': gs_betweenness_normed[row],
        'sv_betweenness_centrality_normed': sv_betweenness_normed[row],
        'gs_degree': gs_degrees[row],
        'sv_degree': sv_degrees[row],
    })
    dict_list.append(author_dict)

In [None]:
# Number of per-author records accumulated in dict_list.
print(len(dict_list))

41


In [111]:
# Replace the lists of duplicated titles with their lengths. The isinstance
# guard makes the cell safe to re-run: values that are already counts are
# left untouched instead of raising TypeError on len(int).
for d in dict_list:
    for key in ("gs_duplicates_count", "sv_duplicates_count"):
        duplicates = d[key]
        if not isinstance(duplicates, int):
            d[key] = len(duplicates)

In [110]:
# Export the per-author comparison records.
# newline="" is required by the csv module so the writer controls line endings
# (otherwise blank rows appear on Windows); UTF-8 keeps non-ASCII author names
# intact regardless of the platform's default encoding.
with open("authors_comparisons_realnumduplicates.csv", "w", newline="", encoding="utf-8") as csv_file:
    header = [
        'sv_name',
        'gs_name',
        'gs_count',
        'gs_duplicates_count',
        'sv_count',
        'sv_duplicates_count',
        'gs_only_count',
        'sv_only_count',
        'shared_count',
        'gs_coauthors',
        'sv_coauthors',
        'gs_betweenness_centrality',
        'sv_betweenness_centrality',
        'gs_betweenness_centrality_normed',
        'sv_betweenness_centrality_normed',
        'gs_degree',
        'sv_degree',
        'gs_only_pubs',
        'sv_only_pubs',
        'shared_pubs',
    ]
    # Columns with no matching key in a record are written as empty strings
    # (DictWriter's default restval).
    writer = csv.DictWriter(csv_file, fieldnames=header)
    writer.writeheader()
    writer.writerows(dict_list)