In [2]:
import pandas as pd
from dotenv import load_dotenv
import csv

load_dotenv()

from src.Authors.orcid_data_utilities import collect_professor_data1,extract_department_and_name



In [6]:
#Refine raw dataset from scraping to Orcid extraction 
data = pd.read_csv('../../data/raw/staff_data.csv')
names_split = data['Given Name'].str.split(' ', n=1, expand=True)

data['Family Name'] = names_split[0].str.upper()
data['Given Name'] = names_split[1].str.upper()
data = data[['Family Name', 'Given Name', 'SSD', 'Ruolo']]
data['SSD'] = data['SSD'].str.replace('SSD:', '', regex=False).str.strip()
data[['Department Code', 'Specific Name']] = data['SSD'].apply(extract_department_and_name)
data = data[['Given Name','Family Name','Department Code', 'Specific Name', 'Ruolo']]

data


Unnamed: 0,Given Name,Family Name,Department Code,Specific Name,Ruolo
0,MARCO,ANTONIOTTI,(INFO-01/A),Informatica,Professore/ssa ordinario/a
1,FONTANA FRANCESCA,ARCELLI,(IINF-05/A),Sistemi di elaborazione delle informazioni,Professore/ssa ordinario/a
2,STEFANIA,BANDINI,(INFO-01/A),Informatica,Professore/ssa ordinario/a
3,PAOLA,BONIZZONI,(INFO-01/A),Informatica,Professore/ssa ordinario/a
4,DAVIDE,CIUCCI,(INFO-01/A),Informatica,Professore/ssa ordinario/a
...,...,...,...,...,...
134,MATTIA,SGRO,,SSD non disponibile,Dottorando/a
135,SAHAR,SHAH,,SSD non disponibile,Dottorando/a
136,MOHAMED NASSER HASSAN,SWEILAM,,SSD non disponibile,Dottorando/a
137,JACOPO,TALPINI,,SSD non disponibile,Dottorando/a


In [7]:
#Traslating Informations
data.rename(columns={'Ruolo': 'Role', 'Specific Name': 'Specific Field'}, inplace=True)

role_translation = {
    'Professore/ssa ordinario/a': 'Full Professor',
    'Professore/ssa associato/a': 'Associate Professor',
    'Ricercatore/rice': 'Researcher',
    'Ricercatore/rice a tempo determinato': 'Fixed-term Researcher',
    'Professore/ssa emerito/a': 'Professor Emeritus',
    'Assegnista di ricerca': 'Research Fellow',
    'Dottorando/a': 'PhD Student'
}
data['Role'].replace(role_translation, inplace=True)

specific_field_translation = {
    'Informatica': 'Computer Science',
    'Sistemi di elaborazione delle informazioni': 'Information Processing Systems',
    'Ricerca operativa': 'Operations Research',
    'SSD non disponibile': 'SSD not available'
}
data['Specific Field'].replace(specific_field_translation, inplace=True)
data.to_csv('../../data/processed/professors_dip_info.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Role'].replace(role_translation, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Specific Field'].replace(specific_field_translation, inplace=True)


Unnamed: 0,Given Name,Family Name,Department Code,Specific Field,Role
0,MARCO,ANTONIOTTI,(INFO-01/A),Computer Science,Full Professor
1,FONTANA FRANCESCA,ARCELLI,(IINF-05/A),Information Processing Systems,Full Professor
2,STEFANIA,BANDINI,(INFO-01/A),Computer Science,Full Professor
3,PAOLA,BONIZZONI,(INFO-01/A),Computer Science,Full Professor
4,DAVIDE,CIUCCI,(INFO-01/A),Computer Science,Full Professor
...,...,...,...,...,...
134,MATTIA,SGRO,,SSD not available,PhD Student
135,SAHAR,SHAH,,SSD not available,PhD Student
136,MOHAMED NASSER HASSAN,SWEILAM,,SSD not available,PhD Student
137,JACOPO,TALPINI,,SSD not available,PhD Student


In [2]:
# Retriving data
professors_df = pd.read_csv('../../data/processed/professors_dip_info.csv')
professors_info = collect_professor_data1(professors_df)
professors_info_df = pd.DataFrame(professors_info)
professors_info_df.to_csv('../../data/processed/professors_orcid_info2.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

print("CSV file created successfully!")


Loop count: 1
Processing professor: MARCO ANTONIOTTI
Approved professor: MARCO ANTONIOTTI with organization Università degli Studi di Milano-Bicocca
Loop count: 2
Processing professor: FRANCESCA ARCELLI FONTANA
Approved professor: FRANCESCA ARCELLI FONTANA with organization Universita degli Studi di Milano-Bicocca
Loop count: 3
Processing professor: STEFANIA BANDINI
Approved professor: STEFANIA BANDINI with organization University of Milano-Bicocca
Loop count: 4
Processing professor: PAOLA BONIZZONI
Approved professor: PAOLA BONIZZONI with organization Università degli Studi di Milano-Bicocca
Loop count: 5
Processing professor: DAVIDE  CIUCCI
Approved professor: DAVIDE  CIUCCI with organization University of Milano-Bicocca
Loop count: 6
Processing professor: GABRIELE GIANINI
Approved professor: GABRIELE GIANINI with organization Università degli Studi di Milano-Bicocca
Loop count: 7
Processing professor: LEONARDO MARIANI
Approved professor: LEONARDO MARIANI with organization University

In [3]:
professors_info_df

Unnamed: 0,Given Name,Family Name,Department Code,Specific Field,Role,ORCID ID,Organization,Status,Role Title,Start Date,Keywords,ResearcherID,Number of Works
0,MARCO,ANTONIOTTI,(INFO-01/A),Computer Science,Full Professor,0000-0002-2823-6838,Università degli Studi di Milano-Bicocca,APPROVED,Professor,,"Bioinformatics, Computational Biology, Compute...",,19
1,FRANCESCA,ARCELLI FONTANA,(IINF-05/A),Information Processing Systems,Full Professor,0000-0002-1195-530X,Universita degli Studi di Milano-Bicocca,APPROVED,Associate Professor,,Nessuna,,46
2,STEFANIA,BANDINI,(INFO-01/A),Computer Science,Full Professor,0000-0002-7056-0543,University of Milano-Bicocca,APPROVED,Professor,,"Artificial Intelligence, Agent-based Simulatio...",,227
3,PAOLA,BONIZZONI,(INFO-01/A),Computer Science,Full Professor,0000-0001-7289-4988,Università degli Studi di Milano-Bicocca,APPROVED,Professor,2007,Nessuna,,132
4,DAVIDE,CIUCCI,(INFO-01/A),Computer Science,Full Professor,0000-0002-8083-7809,University of Milano-Bicocca,APPROVED,Associate Professor,2017,Nessuna,,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,MATTIA,SGRO,,SSD not available,PhD Student,0009-0003-5838-5329,Università degli Studi di Milano-Bicocca,APPROVED,PhD Student,2023,Nessuna,,1
135,SAHAR,SHAH,,SSD not available,PhD Student,0000-0002-9943-935X,,TO CHECK,,,Nessuna,,1
136,MOHAMED NASSER HASSAN,SWEILAM,,SSD not available,PhD Student,0009-0004-7796-0613,University of Milano-Bicocca,APPROVED,PhD student,,Nessuna,,0
137,JACOPO,TALPINI,,SSD not available,PhD Student,0000-0003-1556-6296,,TO CHECK,,,Nessuna,,0


In [4]:

missing_orcid_df = professors_df_info[professors_df['ORCID ID'].isna()]

for index, row in missing_orcid_df.iterrows():
    given_name = row['Given Name']
    family_name = row['Family Name']
    print(f"Professor missing ORCID ID: {given_name} {family_name}")


Professor missing ORCID ID: NYSSEN PENALOZA
Professor missing ORCID ID: FEDERICO BERGAMINI
Professor missing ORCID ID: MATTEO ROBERT CHILD
Professor missing ORCID ID: SEYED MOJTABA DARYABARI
Professor missing ORCID ID: MANUEL ELIA
Professor missing ORCID ID: JACOPO MALTAGLIATI
Professor missing ORCID ID: SANDIP JAYANTLIAL MODHA
Professor missing ORCID ID: SEYEDIMAN SEYEDI
Professor missing ORCID ID: ALBERTO MINORA
Professor missing ORCID ID: FEDERICO PIROLA


In [7]:
federico_pirola_index = professors_info_df[(professors_info_df['Given Name'] == 'FEDERICO') & (professors_info_df['Family Name'] == 'PIROLA')].index

if not federico_pirola_index.empty:
    
    professors_info_df.loc[federico_pirola_index, 'ORCID ID'] = '0009-0000-7090-4716'
    professors_info_df.loc[federico_pirola_index, 'Organization'] = 'University of Milano-Bicocca: Milan, IT'
    professors_info_df.loc[federico_pirola_index, 'Role Title'] = 'PhD candidate (Computer Science)'
    professors_info_df.loc[federico_pirola_index, 'Start Date'] = '2023-10'
    professors_info_df.loc[federico_pirola_index, 'Status'] = 'APPROVED'
else:
    print("NOT FOUND")

professors_info_df.to_csv('../../data/processed/professors_orcid_info.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)



In [8]:
professors_info_df

Unnamed: 0,Given Name,Family Name,Department Code,Specific Field,Role,ORCID ID,Organization,Status,Role Title,Start Date,Keywords,ResearcherID,Number of Works
0,MARCO,ANTONIOTTI,(INFO-01/A),Computer Science,Full Professor,0000-0002-2823-6838,Università degli Studi di Milano-Bicocca,APPROVED,Professor,,"Bioinformatics, Computational Biology, Compute...",,19
1,FRANCESCA,ARCELLI FONTANA,(IINF-05/A),Information Processing Systems,Full Professor,0000-0002-1195-530X,Universita degli Studi di Milano-Bicocca,APPROVED,Associate Professor,,Nessuna,,46
2,STEFANIA,BANDINI,(INFO-01/A),Computer Science,Full Professor,0000-0002-7056-0543,University of Milano-Bicocca,APPROVED,Professor,,"Artificial Intelligence, Agent-based Simulatio...",,227
3,PAOLA,BONIZZONI,(INFO-01/A),Computer Science,Full Professor,0000-0001-7289-4988,Università degli Studi di Milano-Bicocca,APPROVED,Professor,2007,Nessuna,,132
4,DAVIDE,CIUCCI,(INFO-01/A),Computer Science,Full Professor,0000-0002-8083-7809,University of Milano-Bicocca,APPROVED,Associate Professor,2017,Nessuna,,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,MATTIA,SGRO,,SSD not available,PhD Student,0009-0003-5838-5329,Università degli Studi di Milano-Bicocca,APPROVED,PhD Student,2023,Nessuna,,1
135,SAHAR,SHAH,,SSD not available,PhD Student,0000-0002-9943-935X,,TO CHECK,,,Nessuna,,1
136,MOHAMED NASSER HASSAN,SWEILAM,,SSD not available,PhD Student,0009-0004-7796-0613,University of Milano-Bicocca,APPROVED,PhD student,,Nessuna,,0
137,JACOPO,TALPINI,,SSD not available,PhD Student,0000-0003-1556-6296,,TO CHECK,,,Nessuna,,0


In [5]:
import pandas as pd
data = pd.read_csv('../../data/processed/papers.csv')
data.to_csv('../../data/processed/papers.csv')

In [11]:


orcid_list = data['Authors'][0].split(', ')
for orcid in orcid_list:
    print(orcid)

   

0000-0003-0617-6245
0000-0001-8695-649X
0000-0002-2568-9815
0009-0007-0154-0884
0000-0001-7362-3530
0000-0002-2823-6838
0000-0003-2973-5038
0000-0003-0384-3700


In [13]:
df = pd.read_csv('../../data/processed/Authors_inst.csv')
if orcid_list[0] in df['ORCID ID']:
    print('ok')

In [14]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('../../data/processed/Authors_inst.csv')

# Example ORCiD list (replace with your actual orcid_list)
orcid_list = data['Authors'][0].split(', ')

# Check if the first ORCiD in orcid_list is in the 'ORCID ID' column of df
if orcid_list[0] in df['ORCID ID'].values:
    print('ok')
else:
    print('not found')


not found


In [17]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('../../data/processed/Authors_inst.csv')

# Example ORCiD list (replace with your actual orcid_list)
orcid_list = ['0000-0003-2721-4888']

# Check if the first ORCiD in orcid_list is in the 'ORCID ID' column of df
if orcid_list[0] in df['ORCID ID'].values:
    print('ok')
else:
    print('not found')


not found
