In [2]:
import pandas as pd
from dotenv import load_dotenv
import csv
import glob
load_dotenv()

from src.Authors.Orcid_ProfessorDataCollector import ProfessorDataCollector
from src.Authors.Gscholar_AuthorsDataCollection import update_dataset


In [2]:
# Load the two processed inputs: the authors table and the papers table.
authors, papers = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/Authors.csv',
        '../../data/processed/papers.csv',
    )
)

## Retrieve ORCID IDs of External Authors

In [36]:
# Exclude one paper by DOI before checking its authors.
# NOTE(review): the reason for excluding this DOI is not recorded here — confirm.
papers = papers[papers['Doi'] != '10.15252/msb.20199110']

# ORCID IDs that are already present in the authors table.
authors_orcids = set(authors['ORCID ID'].dropna().unique())

# Each paper's 'Authors' cell is treated as a comma-separated list of ORCID IDs.
# - dropna(): a missing cell must not become the literal string 'nan'.
# - split on ',' then strip: tolerate "a,b" as well as "a, b".
paper_orcids = (
    papers['Authors']
    .dropna()
    .astype(str)
    .str.split(',')
    .explode()
    .str.strip()
)

# Unique and sorted, so the row order of the saved file is reproducible;
# a later cell selects batches of this table by position.
missing_orcids = sorted(set(paper_orcids[paper_orcids != '']) - authors_orcids)

if missing_orcids:
    missing_orcids_df = pd.DataFrame({'ORCID ID': missing_orcids})
    missing_orcids_df.to_csv('missing_orcids.csv', index=False)
    print("File with missing ORCIDs successfully created!")
else:
    # Do not fall back to reading a stale (or absent) missing_orcids.csv.
    missing_orcids_df = pd.DataFrame(columns=['ORCID ID'])
    print("No missing ORCIDs found.")
missing_orcids_df

File with missing ORCIDs successfully created!


## Collect Authors' Data from ORCID Profiles via the ORCID Website

In [38]:
# Run the ORCID collector on one positional window of the missing-ORCID table.
# NOTE(review): the 500:696 window is hard-coded; presumably other windows were
# processed by editing these bounds between runs — confirm.
batch_rows = slice(500, 696)
df_orcid = pd.DataFrame(missing_orcids_df).iloc[batch_rows]

collector_with_orcid = ProfessorDataCollector(df_orcid)
collector_with_orcid.collect_professor_data()

# Expose the collector's `df` attribute under the name `dataext`.
dataext = collector_with_orcid.df

In [1]:
# Merge every CSV matching the pattern below into a single Merged_Authors.csv.
file_pattern = '../../data/raw/Authors_ex*.csv'

# glob returns matches in arbitrary order; sort so the merged row order is reproducible.
matched_files = sorted(glob.glob(file_pattern))
if not matched_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No files match pattern: {file_pattern}")

all_data = pd.concat([pd.read_csv(file) for file in matched_files], ignore_index=True)
all_data.to_csv('../../data/raw/Merged_Authors.csv', index=False)
print("Files merged successfully into Merged_Authors.csv")


Files merged successfully into Merged_Authors.csv


## Collect Authors' Data (Citations, Keywords, H-index) via Google Scholar

In [9]:
# Enrich the merged authors file through `update_dataset` and save the result.
dataset_path = "../../data/raw/Merged_Authors.csv"  # Path to the dataset
# NOTE(review): a single empty-string keyword is passed; its effect depends on
# update_dataset, which is not visible here — confirm this is intended.
organization_keywords = ['']

df = update_dataset(dataset_path, organization_keywords)
# Write to the exact path the formatting step reads back
# ('../../data/raw/Merged_Authors_TOT.csv'); the previous target used a different
# root ('../data'), folder ('processed') and filename case ('Tot').
df.to_csv('../../data/raw/Merged_Authors_TOT.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)


Processing: Giuseppe Banfi
Problem, take a look
No information found for: Giuseppe Banfi
Processing: Luca Mariot
Updating information for: Luca Mariot
Processing: Anshika Sharma
Updating information for: Anshika Sharma
Processing: Gabriela Nicole González Sáez
Problem, take a look
No information found for: Gabriela Nicole González Sáez
Processing: Luigi De Giovanni
Updating information for: Luigi De Giovanni
Processing: FABRIZIO PASTORE
Updating information for: FABRIZIO PASTORE
Processing: Linda Greta Dui
Updating information for: Linda Greta Dui
Processing: Maks Ovsjanikov
Updating information for: Maks Ovsjanikov
Processing: Lidia Alecci
Updating information for: Lidia Alecci
Processing: Antonio Miranda-Escalada
Updating information for: Antonio Miranda-Escalada
Processing: Paolo Cazzaniga
Updating information for: Paolo Cazzaniga
Processing: Fabio Roberto Vitello
Problem, take a look
No information found for: Fabio Roberto Vitello
Processing: Lilia Alberghina
Updating information f

## Format Dataset to Match Authors Dataset Structure

In [22]:
# Load the enriched authors file and reduce it to a fixed set of columns.
df = pd.read_csv('../../data/raw/Merged_Authors_TOT.csv')
columns_to_keep = [
    'Given Name', 'Family Name',
    'Role_Title', 'ORCID ID', 'Organization', 'Keywords', 'H Index', 'Citations'
]

# Select the columns and rename in one chained expression. Renaming with
# inplace=True on a column subset of `df` can raise SettingWithCopyWarning.
filtered_df = df[columns_to_keep].rename(columns={'Role_Title': 'Role'})

In [26]:
# index=False: do not write the RangeIndex as a data column; otherwise it comes
# back as an 'Unnamed: 0' column when the file is read again.
filtered_df.to_csv('../../data/processed/Authors_external.csv', index=False)

In [6]:
# Read the external and internal author tables from the processed data folder.
external, internal = map(
    pd.read_csv,
    (
        '../../data/processed/Authors_external_final.csv',
        '../../data/processed/Authors_internal.csv',
    ),
)

In [5]:
# Preview the first rows of the internal authors table.
internal.head()

Unnamed: 0.1,Unnamed: 0,Given Name,Family Name,Department Code,Specific Field,Role,ORCID ID,Organization,Keywords,H Index,Citations,Past Institutions
0,0,MARCO,ANTONIOTTI,(INFO-01/A),Computer Science,Full Professor,0000-0002-2823-6838,University of Milan Bicocca,"Computational Biology, Bioinformatics, Compute...",27.0,2594.0,"['Mylan (Switzerland)', 'Courant Institute of ..."
1,1,FRANCESCA,ARCELLI FONTANA,(IINF-05/A),Information Processing Systems,Full Professor,0000-0002-1195-530X,University of Milan Bicocca,"Software Engineering, Refactoring, Managing Te...",40.0,5789.0,"['University of Salerno', 'Menarini Group (Ita..."
2,2,STEFANIA,BANDINI,(INFO-01/A),Computer Science,Full Professor,0000-0002-7056-0543,University of Milan Bicocca,"artificial intelligence, complex systems, crow...",35.0,4738.0,"['Istituto Nazionale di Fisica Nucleare, Sezio..."
3,3,PAOLA,BONIZZONI,(INFO-01/A),Computer Science,Full Professor,0000-0001-7289-4988,University of Milan Bicocca,"Computer science, Bioinformatics",27.0,2458.0,"['University of Milan', 'École Polytechnique',..."
4,4,DAVIDE,CIUCCI,(INFO-01/A),Computer Science,Full Professor,0000-0002-8083-7809,University of Milan Bicocca,"Rough sets, uncertainty management, fuzzy logi...",34.0,3449.0,"['University of Milan', 'Université Toulouse I..."


In [8]:
# Inspect the column names of the external authors table.
external.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Given Name', 'Family Name', 'Role',
       'ORCID ID', 'Organization', 'Keywords', 'H Index', 'Citations',
       'institution'],
      dtype='object')

In [9]:
# Columns retained in the final external-authors table. The loaded frame also
# carries 'Organization' and two 'Unnamed' columns, none of which are kept.
final_external_columns = [
    'Given Name', 'Family Name', 'Role',
    'ORCID ID', 'Keywords', 'H Index', 'Citations',
    'institution',
]
external = external.loc[:, final_external_columns]

In [14]:
# index=False: this file is read back earlier in the notebook and rewritten here;
# writing the index would add another 'Unnamed: 0*' column on every round trip.
external.to_csv('../../data/processed/Authors_external_final.csv', index=False)