In [1]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path, organization_filters=None):
        self.df = pd.read_csv(csv_file_path)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value', 'N/A')
        family_name = name_info.get('family-name', {}).get('value', 'N/A')

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

    def collect_professor_data(self):
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None)

        for index, row in self.df.iterrows():
            given_name = row.get('Given Name')
            family_name = row.get('Family Name')
            orcid_id = row.get('ORCID ID')

            if pd.notna(orcid_id):
                profile_data = self.get_orcid_profile(orcid_id)
                if profile_data:
                    prof_info = self.extract_profile_info(profile_data, orcid_id)
                    self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                        'Keywords', 'Number_of_Works', 'Status']] = [
                        prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                        prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'], 'APPROVED']
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                continue

            if given_name and family_name:
                search_results = self.search_orcid_by_name(given_name, family_name)

                if not search_results or 'expanded-result' not in search_results:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                    continue

                for result in search_results['expanded-result']:
                    orcid_id = result.get('orcid-id')
                    profile_data = self.get_orcid_profile(orcid_id)
                    if not profile_data:
                        continue

                    prof_info = self.extract_profile_info(profile_data, orcid_id)
                    organization_name = prof_info['Organization'].lower()

                    if not self.organization_filters or any(f in organization_name for f in self.organization_filters):
                        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                            'Keywords', 'Number_of_Works', 'Status']] = [
                            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'], 'APPROVED']
                        break
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'

        return self.df

    def save_to_csv(self, output_file_path):
        self.df.to_csv(output_file_path, index=False)

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    '''collector_with_orcid = ProfessorDataCollector('../../data/processed/Authors_with_orcid.csv')
    updated_df_orcid = collector_with_orcid.collect_professor_data()
    collector_with_orcid.save_to_csv('../../data/processed/Updated_Authors_with_orcid.csv')

    # Example with dataset containing only names
    filters = ['Bicocca', 'Milan', 'Milano']  # Optional filters
    collector_with_names = ProfessorDataCollector('../../data/processed/Authors_with_names.csv', organization_filters=filters)
    updated_df_names = collector_with_names.collect_professor_data()
    collector_with_names.save_to_csv('../../data/processed/Updated_Authors_with_names.csv')'''

data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']}
df_orcid = pd.DataFrame(data_orcid)

collector_with_orcid = ProfessorDataCollector(df_orcid)
updated_df_orcid = collector_with_orcid.collect_professor_data()
#collector_with_orcid.save_to_csv('../../data/processed/Updated_Authors_with_orcid.csv')

#df_orcid.to_csv('../../data/processed/Aprova.csv', index=False)
updated_df_orcid

TypeError: argument of type 'method' is not iterable

In [4]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value', 'N/A')
        family_name = name_info.get('family-name', {}).get('value', 'N/A')

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

    def collect_professor_data(self):
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None)

        for index, row in self.df.iterrows():
            given_name = row.get('Given Name')
            family_name = row.get('Family Name')
            orcid_id = row.get('ORCID ID')

            if pd.notna(orcid_id):
                profile_data = self.get_orcid_profile(orcid_id)
                if profile_data:
                    prof_info = self.extract_profile_info(profile_data, orcid_id)
                    self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                        'Keywords', 'Number_of_Works', 'Status']] = [
                        prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                        prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'], 'APPROVED']
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                continue

            if given_name and family_name:
                search_results = self.search_orcid_by_name(given_name, family_name)

                if not search_results or 'expanded-result' not in search_results:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                    continue

                for result in search_results['expanded-result']:
                    orcid_id = result.get('orcid-id')
                    profile_data = self.get_orcid_profile(orcid_id)
                    if not profile_data:
                        continue

                    prof_info = self.extract_profile_info(profile_data, orcid_id)
                    organization_name = prof_info['Organization'].lower()

                    if not self.organization_filters or any(f in organization_name for f in self.organization_filters):
                        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                            'Keywords', 'Number_of_Works', 'Status']] = [
                            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'], 'APPROVED']
                        break
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'

        return self.df

    def save_to_csv(self, output_file_path):
        self.df.to_csv(output_file_path, index=False)

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']
    }
    df_orcid = pd.DataFrame(data_orcid)
    collector_with_orcid = ProfessorDataCollector(df_orcid)
    updated_df_orcid = collector_with_orcid.collect_professor_data()
    collector_with_orcid.save_to_csv('../../data/processed/Updated_Authors_with_orcid.csv')

    # Example with dataset containing only names
    '''data_names = {
        'Given Name': ['John', 'Alice', 'Robert'],
        'Family Name': ['Doe', 'Smith', 'Brown']
    }
    df_names = pd.DataFrame(data_names)
    filters = ['Bicocca', 'Milan', 'Milano']  # Optional filters
    collector_with_names = ProfessorDataCollector(df_names, organization_filters=filters)
    updated_df_names = collector_with_names.collect_professor_data()
    collector_with_names.save_to_csv('../../data/processed/Updated_Authors_with_names.csv')'''


In [5]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        """
        Search for an ORCID profile by given name and family name.

        Args:
            given_name (str): The first name of the individual.
            family_name (str): The last name of the individual.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """

        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        """
        Retrieve an ORCID profile by ORCID ID.

        Args:
            orcid_id (str): The unique ORCID identifier.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """

        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        """
        Extract profile information from ORCID profile data.

        Args:
            profile_data (dict): The JSON data of the ORCID profile.
            orcid_id (str): The ORCID identifier.

        Returns:
            dict or None: A dictionary containing extracted profile information, or None if required data is missing.
                        The returned information includes given name, family name, keywords, latest role title, start date,
                        organization name, number of works, and the ORCID ID.
        """

        if not profile_data:
            return None

        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value')
        family_name = name_info.get('family-name', {}).get('value')

        if not given_names or not family_name:
            return None

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

    def collect_professor_data(self):
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None, Given_Name=None, Family_Name=None)

        for index, row in self.df.iterrows():
            given_name = row.get('Given Name')
            family_name = row.get('Family Name')
            orcid_id = row.get('ORCID ID')

            if pd.notna(orcid_id):
                profile_data = self.get_orcid_profile(orcid_id)
                prof_info = self.extract_profile_info(profile_data, orcid_id)
                if prof_info:
                    self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                        'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
                        prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                        prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'],
                        prof_info['Given Name'], prof_info['Family Name'], 'APPROVED']
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                continue

            if given_name and family_name:
                search_results = self.search_orcid_by_name(given_name, family_name)

                if not search_results or 'expanded-result' not in search_results:
                    self.df.at[index, 'Status'] = 'TO CHECK'
                    continue

                for result in search_results['expanded-result']:
                    orcid_id = result.get('orcid-id')
                    profile_data = self.get_orcid_profile(orcid_id)
                    prof_info = self.extract_profile_info(profile_data, orcid_id)
                    if not prof_info:
                        continue

                    organization_name = prof_info['Organization'].lower()

                    if not self.organization_filters or any(f in organization_name for f in self.organization_filters):
                        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                                            'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
                            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
                            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'],
                            prof_info['Given Name'], prof_info['Family Name'], 'APPROVED']
                        break
                else:
                    self.df.at[index, 'Status'] = 'TO CHECK'

        return self.df

    def save_to_csv(self, output_file_path):
        self.df.to_csv(output_file_path, index=False)

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']
    }
    df_orcid = pd.DataFrame(data_orcid)
    collector_with_orcid = ProfessorDataCollector(df_orcid)
    updated_df_orcid = collector_with_orcid.collect_professor_data()
    collector_with_orcid.save_to_csv('../../data/processed/Updated_Authors_with_orcid.csv')

    # Example with dataset containing only names
    '''data_names = {
        'Given Name': ['John', 'Alice', 'Robert'],
        'Family Name': ['Doe', 'Smith', 'Brown']
    }
    df_names = pd.DataFrame(data_names)
    filters = ['Bicocca', 'Milan', 'Milano']  # Optional filters
    collector_with_names = ProfessorDataCollector(df_names, organization_filters=filters)
    updated_df_names = collector_with_names.collect_professor_data()
    collector_with_names.save_to_csv('../../data/processed/Updated_Authors_with_names.csv')'''


In [7]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        """
        Search for an ORCID profile by given name and family name.

        Args:
            given_name (str): The first name of the individual.
            family_name (str): The last name of the individual.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """

        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        """
        Retrieve an ORCID profile by ORCID ID.

        Args:
            orcid_id (str): The unique ORCID identifier.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """

        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        """
        Extract profile information from ORCID profile data.

        Args:
            profile_data (dict): The JSON data of the ORCID profile.
            orcid_id (str): The ORCID identifier.

        Returns:
            dict or None: A dictionary containing extracted profile information, or None if required data is missing.
                        The returned information includes given name, family name, keywords, latest role title, start date,
                        organization name, number of works, and the ORCID ID.
        """

        if not profile_data:
            return None

        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value')
        family_name = name_info.get('family-name', {}).get('value')

        if not given_names or not family_name:
            return None

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

    def collect_professor_data(self):
        # Initialize DataFrame columns
        self.initialize_dataframe()

        for index, row in self.df.iterrows():
            given_name, family_name, orcid_id = self.extract_professor_basic_info(row)

            if pd.notna(orcid_id):
                self.handle_existing_orcid(index, orcid_id)
            elif given_name and family_name:
                self.handle_name_search(index, given_name, family_name)


    def initialize_dataframe(self):
        """
        Initialize the DataFrame with required columns.
        """
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                Start_Date=None, Keywords=None, Number_of_Works=None, Given_Name=None, Family_Name=None)


    def extract_professor_basic_info(self, row):
        """
        Extract given name, family name, and ORCID ID from a DataFrame row.

        Args:
            row (pd.Series): The DataFrame row.

        Returns:
            tuple: given name, family name, ORCID ID
        """
        given_name = row.get('Given Name')
        family_name = row.get('Family Name')
        orcid_id = row.get('ORCID ID')
        return given_name, family_name, orcid_id


    def handle_existing_orcid(self, index, orcid_id):
        """
        Handle data extraction for professors with an existing ORCID ID.

        Args:
            index (int): The DataFrame row index.
            orcid_id (str): The ORCID ID.
        """
        profile_data = self.get_orcid_profile(orcid_id)
        prof_info = self.extract_profile_info(profile_data, orcid_id)
        if prof_info:
            self.update_professor_info(index, prof_info, 'APPROVED')
        else:
            self.df.at[index, 'Status'] = 'TO CHECK'


    def handle_name_search(self, index, given_name, family_name):
        """
        Handle ORCID search by given name and family name.

        Args:
            index (int): The DataFrame row index.
            given_name (str): The given name of the professor.
            family_name (str): The family name of the professor.
        """
        search_results = self.search_orcid_by_name(given_name, family_name)

        if not search_results or 'expanded-result' not in search_results:
            self.df.at[index, 'Status'] = 'TO CHECK'
            return

        for result in search_results['expanded-result']:
            orcid_id = result.get('orcid-id')
            profile_data = self.get_orcid_profile(orcid_id)
            prof_info = self.extract_profile_info(profile_data, orcid_id)
            if not prof_info:
                continue

            if self.is_valid_organization(prof_info['Organization']):
                self.update_professor_info(index, prof_info, 'APPROVED')
                break


    def is_valid_organization(self, organization_name):
        """
        Check if the organization name matches any of the organization filters.

        Args:
            organization_name (str): The name of the organization.

        Returns:
            bool: True if the organization matches the filters or if no filters are set.
        """
        organization_name = organization_name.lower()
        return not self.organization_filters or any(f in organization_name for f in self.organization_filters)


    def update_professor_info(self, index, prof_info, status):
        """
        Update the DataFrame with professor information.

        Args:
            index (int): The DataFrame row index.
            prof_info (dict): The extracted professor information.
            status (str): The status to be assigned ('APPROVED' or 'TO CHECK').
        """
        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                            'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'],
            prof_info['Given Name'], prof_info['Family Name'], status
        ]

    

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']
    }
    df_orcid = pd.DataFrame(data_orcid)
    collector_with_orcid = ProfessorDataCollector(df_orcid)
    updated_df_orcid = collector_with_orcid.collect_professor_data()
    updated_df_orcid
    #collector_with_orcid.save_to_csv('../../data/processed/Updated_Authors_with_orcid.csv')

    # Example with dataset containing only names
    '''data_names = {
        'Given Name': ['John', 'Alice', 'Robert'],
        'Family Name': ['Doe', 'Smith', 'Brown']
    }
    df_names = pd.DataFrame(data_names)
    filters = ['Bicocca', 'Milan', 'Milano']  # Optional filters
    collector_with_names = ProfessorDataCollector(df_names, organization_filters=filters)
    updated_df_names = collector_with_names.collect_professor_data()
    collector_with_names.save_to_csv('../../data/processed/Updated_Authors_with_names.csv')'''


In [10]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    def collect_professor_data(self):
        # Initialize DataFrame columns
        self.initialize_dataframe()

        for index, row in self.df.iterrows():
            given_name, family_name, orcid_id = self.extract_professor_basic_info(row)

            if pd.notna(orcid_id):
                self.handle_existing_orcid(index, orcid_id)
            elif given_name and family_name:
                self.handle_name_search(index, given_name, family_name)

    def initialize_dataframe(self):
        """
        Initialize the DataFrame with required columns.
        """
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None, Given_Name=None, Family_Name=None)

    def extract_professor_basic_info(self, row):
        """
        Extract given name, family name, and ORCID ID from a DataFrame row.

        Args:
            row (pd.Series): The DataFrame row.

        Returns:
            tuple: given name, family name, ORCID ID
        """
        given_name = row.get('Given Name')
        family_name = row.get('Family Name')
        orcid_id = row.get('ORCID ID')
        return given_name, family_name, orcid_id

    def handle_existing_orcid(self, index, orcid_id):
        """
        Handle data extraction for professors with an existing ORCID ID.

        Args:
            index (int): The DataFrame row index.
            orcid_id (str): The ORCID ID.
        """
        profile_data = self.get_orcid_profile(orcid_id)
        prof_info = self.extract_profile_info(profile_data, orcid_id)
        if prof_info:
            self.update_professor_info(index, prof_info, 'APPROVED')
        else:
            self.df.at[index, 'Status'] = 'TO CHECK'

    def handle_name_search(self, index, given_name, family_name):
        """
        Handle ORCID search by given name and family name.

        Args:
            index (int): The DataFrame row index.
            given_name (str): The given name of the professor.
            family_name (str): The family name of the professor.
        """
        search_results = self.search_orcid_by_name(given_name, family_name)

        if not search_results or 'expanded-result' not in search_results:
            self.df.at[index, 'Status'] = 'TO CHECK'
            return

        for result in search_results['expanded-result']:
            orcid_id = result.get('orcid-id')
            profile_data = self.get_orcid_profile(orcid_id)
            prof_info = self.extract_profile_info(profile_data, orcid_id)
            if not prof_info:
                continue

            if self.is_valid_organization(prof_info['Organization']):
                self.update_professor_info(index, prof_info, 'APPROVED')
                break

    def is_valid_organization(self, organization_name):
        """
        Check if the organization name matches any of the organization filters.

        Args:
            organization_name (str): The name of the organization.

        Returns:
            bool: True if the organization matches the filters or if no filters are set.
        """
        organization_name = organization_name.lower()
        return not self.organization_filters or any(f in organization_name for f in self.organization_filters)

    def update_professor_info(self, index, prof_info, status):
        """
        Update the DataFrame with professor information.

        Args:
            index (int): The DataFrame row index.
            prof_info (dict): The extracted professor information.
            status (str): The status to be assigned ('APPROVED' or 'TO CHECK').
        """
        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                            'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'],
            prof_info['Given Name'], prof_info['Family Name'], status
        ]

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        """
        Search for an ORCID profile by given name and family name.

        Args:
            given_name (str): The first name of the individual.
            family_name (str): The last name of the individual.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        """
        Retrieve an ORCID profile by ORCID ID.

        Args:
            orcid_id (str): The unique ORCID identifier.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        """
        Extract profile information from ORCID profile data.

        Args:
            profile_data (dict): The JSON data of the ORCID profile.
            orcid_id (str): The ORCID identifier.

        Returns:
            dict or None: A dictionary containing extracted profile information, or None if required data is missing.
                        The returned information includes given name, family name, keywords, latest role title, start date,
                        organization name, number of works, and the ORCID ID.
        """
        if not profile_data:
            return None

        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value')
        family_name = name_info.get('family-name', {}).get('value')

        if not given_names or not family_name:
            return None

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }


In [23]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    def collect_professor_data(self):
        # Initialize DataFrame columns
        self.initialize_dataframe()

        for index, row in self.df.iterrows():
            given_name, family_name, orcid_id = self.extract_professor_basic_info(row)

            if pd.notna(orcid_id):
                self.handle_existing_orcid(index, orcid_id)
            elif given_name and family_name:
                self.handle_name_search(index, given_name, family_name)

    def initialize_dataframe(self):
        """
        Initialize the DataFrame with required columns.
        """
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None, Given_Name=None, Family_Name=None)

    def extract_professor_basic_info(self, row):
        """
        Extract given name, family name, and ORCID ID from a DataFrame row.

        Args:
            row (pd.Series): The DataFrame row.

        Returns:
            tuple: given name, family name, ORCID ID
        """
        given_name = row.get('Given Name')
        family_name = row.get('Family Name')
        orcid_id = row.get('ORCID ID')
        return given_name, family_name, orcid_id

    def handle_existing_orcid(self, index, orcid_id):
        """
        Handle data extraction for professors with an existing ORCID ID.

        Args:
            index (int): The DataFrame row index.
            orcid_id (str): The ORCID ID.
        """
        profile_data = self.get_orcid_profile(orcid_id)
        prof_info = self.extract_profile_info(profile_data, orcid_id)
        if prof_info:
            self.update_professor_info(index, prof_info, 'APPROVED')
        else:
            self.df.at[index, 'Status'] = 'TO CHECK'

    def handle_name_search(self, index, given_name, family_name):
        """
        Handle ORCID search by given name and family name.

        Args:
            index (int): The DataFrame row index.
            given_name (str): The given name of the professor.
            family_name (str): The family name of the professor.
        """
        print(f"DEBUG: Searching ORCID for given name: {given_name}, family name: {family_name}")
        search_results = self.search_orcid_by_name(given_name, family_name)
        print(f"DEBUG: Search results: {search_results}")

        if not search_results or 'expanded-result' not in search_results or not search_results['expanded-result']:
            print(f"DEBUG: No valid search results found for {given_name} {family_name}")
            self.df.at[index, 'Status'] = 'TO CHECK'
            return

        for result in search_results['expanded-result']:
            if not result:
                continue
            orcid_id = result.get('orcid-id')
            print(f"DEBUG: Found ORCID ID: {orcid_id}")
            profile_data = self.get_orcid_profile(orcid_id)
            print(f"DEBUG: Profile data for ORCID ID {orcid_id}: {profile_data}")
            prof_info = self.extract_profile_info(profile_data, orcid_id)
            if not prof_info:
                print(f"DEBUG: No valid profile info for ORCID ID {orcid_id}")
                continue

            if self.is_valid_organization(prof_info['Organization']):
                print(f"DEBUG: Valid organization found: {prof_info['Organization']}")
                self.update_professor_info(index, prof_info, 'APPROVED')
                break


    def is_valid_organization(self, organization_name):
        """
        Check if the organization name matches any of the organization filters.

        Args:
            organization_name (str): The name of the organization.

        Returns:
            bool: True if the organization matches the filters or if no filters are set.
        """
        organization_name = organization_name.lower()
        return not self.organization_filters or any(f in organization_name for f in self.organization_filters)

    def update_professor_info(self, index, prof_info, status):
        """
        Update the DataFrame with professor information.

        Args:
            index (int): The DataFrame row index.
            prof_info (dict): The extracted professor information.
            status (str): The status to be assigned ('APPROVED' or 'TO CHECK').
        """
        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                            'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
            prof_info['ORCID ID'], prof_info['Organization'], prof_info['Role Title'],
            prof_info['Start Date'], prof_info['Keywords'], prof_info['Number of Works'],
            prof_info['Given Name'], prof_info['Family Name'], status
        ]

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        """
        Search for an ORCID profile by given name and family name.

        Args:
            given_name (str): The first name of the individual.
            family_name (str): The last name of the individual.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        """
        Retrieve an ORCID profile by ORCID ID.

        Args:
            orcid_id (str): The unique ORCID identifier.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        """
        Extract profile information from ORCID profile data.

        Args:
            profile_data (dict): The JSON data of the ORCID profile.
            orcid_id (str): The ORCID identifier.

        Returns:
            dict or None: A dictionary containing extracted profile information, or None if required data is missing.
                        The returned information includes given name, family name, keywords, latest role title, start date,
                        organization name, number of works, and the ORCID ID.
        """
        if not profile_data:
            return None

        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value')
        family_name = name_info.get('family-name', {}).get('value')

        if not given_names or not family_name:
            return None

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date_info = latest_employment.get('start-date', {})
        start_date = start_date_info.get('year', {}).get('value') if start_date_info else 'N/A'
        start_date = start_date if start_date else 'N/A'
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    '''data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']
    }
    df_orcid = pd.DataFrame(data_orcid)
    collector_with_orcid = ProfessorDataCollector(df_orcid)
    collector_with_orcid.collect_professor_data()
    print(collector_with_orcid.df)'''

    # Example with dataset containing only names
    data_names = {
        'Given Name': ['Fabio','Stefania'],
        'Family Name': ['Mercorio', 'BANDINI']
    }
    df_names = pd.DataFrame(data_names)
    filters = ['Bicocca']  # Optional filters
    collector_with_names = ProfessorDataCollector(df_names, organization_filters=filters)
    collector_with_names.collect_professor_data()
    print(collector_with_names.df)


DEBUG: Searching ORCID for given name: Fabio, family name: Mercorio
DEBUG: Search results: {'expanded-result': [{'orcid-id': '0000-0001-6864-2702', 'given-names': 'Fabio', 'family-names': 'Mercorio', 'credit-name': None, 'other-name': [], 'email': ['fabio.mercorio@unimib.it'], 'institution-name': ["University of L'Aquila", 'University of Milan Bicocca', 'University of Milan-Bicocca', 'Università degli Studi di Milano-Bicocca']}], 'num-found': 1}
DEBUG: Found ORCID ID: 0000-0001-6864-2702
DEBUG: Profile data for ORCID ID 0000-0001-6864-2702: {'orcid-identifier': {'uri': 'https://orcid.org/0000-0001-6864-2702', 'path': '0000-0001-6864-2702', 'host': 'orcid.org'}, 'preferences': {'locale': 'it'}, 'history': {'creation-method': 'WEBSITE', 'completion-date': {'value': 1361094706180}, 'submission-date': {'value': 1361094706180}, 'last-modified-date': {'value': 1731120612563}, 'claimed': True, 'source': None, 'deactivation-date': None, 'verified-email': True, 'verified-primary-email': True}, 

In [16]:
import requests
import pandas as pd

class ProfessorDataCollector:
    def __init__(self, csv_file_path_or_df, organization_filters=None):
        if isinstance(csv_file_path_or_df, pd.DataFrame):
            self.df = csv_file_path_or_df
        else:
            self.df = pd.read_csv(csv_file_path_or_df)
        self.organization_filters = [f.lower() for f in organization_filters] if organization_filters else []

    def collect_professor_data(self):
        # Initialize DataFrame columns
        self.initialize_dataframe()

        for index, row in self.df.iterrows():
            given_name, family_name, orcid_id = self.extract_professor_basic_info(row)

            if pd.notna(orcid_id):
                self.handle_existing_orcid(index, orcid_id)
            elif given_name and family_name:
                self.handle_name_search(index, given_name, family_name)

    def initialize_dataframe(self):
        """
        Initialize the DataFrame with required columns.
        """
        self.df = self.df.assign(ORCID_ID=None, Organization=None, Status=None, Role_Title=None,
                                 Start_Date=None, Keywords=None, Number_of_Works=None, Given_Name=None, Family_Name=None)

    def extract_professor_basic_info(self, row):
        """
        Extract given name, family name, and ORCID ID from a DataFrame row.

        Args:
            row (pd.Series): The DataFrame row.

        Returns:
            tuple: given name, family name, ORCID ID
        """
        given_name = row.get('Given Name')
        family_name = row.get('Family Name')
        orcid_id = row.get('ORCID ID')
        return given_name, family_name, orcid_id

    def handle_existing_orcid(self, index, orcid_id):
        """
        Handle data extraction for professors with an existing ORCID ID.

        Args:
            index (int): The DataFrame row index.
            orcid_id (str): The ORCID ID.
        """
        profile_data = self.get_orcid_profile(orcid_id)
        prof_info = self.extract_profile_info(profile_data, orcid_id)
        if prof_info:
            self.update_professor_info(index, prof_info, 'APPROVED')
        else:
            self.df.at[index, 'Status'] = 'TO CHECK'

    def handle_name_search(self, index, given_name, family_name):
        """
        Handle ORCID search by given name and family name.

        Args:
            index (int): The DataFrame row index.
            given_name (str): The given name of the professor.
            family_name (str): The family name of the professor.
        """
        search_results = self.search_orcid_by_name(given_name, family_name)

        if not search_results or 'expanded-result' not in search_results:
            self.df.at[index, 'Status'] = 'TO CHECK'
            return

        for result in search_results['expanded-result']:
            orcid_id = result.get('orcid-id')
            profile_data = self.get_orcid_profile(orcid_id)
            prof_info = self.extract_profile_info(profile_data, orcid_id)
            if not prof_info:
                continue

            if self.is_valid_organization(prof_info['Organization']):
                self.update_professor_info(index, prof_info, 'APPROVED')
                break

    def is_valid_organization(self, organization_name):
        """
        Check if the organization name matches any of the organization filters.

        Args:
            organization_name (str): The name of the organization.

        Returns:
            bool: True if the organization matches the filters or if no filters are set.
        """
        if not organization_name:
            return False
        organization_name = organization_name.lower()
        return not self.organization_filters or any(f in organization_name for f in self.organization_filters)

    def update_professor_info(self, index, prof_info, status):
        """
        Update the DataFrame with professor information.

        Args:
            index (int): The DataFrame row index.
            prof_info (dict): The extracted professor information.
            status (str): The status to be assigned ('APPROVED' or 'TO CHECK').
        """
        self.df.loc[index, ['ORCID_ID', 'Organization', 'Role_Title', 'Start_Date',
                            'Keywords', 'Number_of_Works', 'Given_Name', 'Family_Name', 'Status']] = [
            prof_info.get('ORCID ID'), prof_info.get('Organization'), prof_info.get('Role Title'),
            prof_info.get('Start Date'), prof_info.get('Keywords'), prof_info.get('Number of Works'),
            prof_info.get('Given Name'), prof_info.get('Family Name'), status
        ]

    @staticmethod
    def search_orcid_by_name(given_name, family_name):
        """
        Search for an ORCID profile by given name and family name.

        Args:
            given_name (str): The first name of the individual.
            family_name (str): The last name of the individual.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f"https://pub.orcid.org/v3.0/expanded-search/?q=given-names:{given_name} AND family-name:{family_name}"
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def get_orcid_profile(orcid_id):
        """
        Retrieve an ORCID profile by ORCID ID.

        Args:
            orcid_id (str): The unique ORCID identifier.

        Returns:
            dict or None: The JSON response with ORCID profile data if successful, or None if the request fails.
        """
        url = f'https://pub.orcid.org/v3.0/{orcid_id}'
        headers = {'Accept': 'application/json'}
        response = requests.get(url, headers=headers)
        return response.json() if response.status_code == 200 else None

    @staticmethod
    def extract_profile_info(profile_data, orcid_id):
        """
        Extract profile information from ORCID profile data.

        Args:
            profile_data (dict): The JSON data of the ORCID profile.
            orcid_id (str): The ORCID identifier.

        Returns:
            dict or None: A dictionary containing extracted profile information, or None if required data is missing.
                        The returned information includes given name, family name, keywords, latest role title, start date,
                        organization name, number of works, and the ORCID ID.
        """
        if not profile_data:
            return None

        name_info = profile_data.get('person', {}).get('name', {})
        given_names = name_info.get('given-names', {}).get('value')
        family_name = name_info.get('family-name', {}).get('value')

        if not given_names or not family_name:
            return None

        keywords = profile_data.get('person', {}).get('keywords', {}).get('keyword', [])
        keywords_list = [kw.get('content') for kw in keywords]

        employment = profile_data.get('activities-summary', {}).get('employments', {}).get('affiliation-group', [])
        latest_employment = employment[0].get('summaries', [])[0].get('employment-summary', {}) if employment and employment[0].get('summaries') else {}
        role_title = latest_employment.get('role-title', 'N/A')
        start_date = latest_employment.get('start-date', {}).get('year', {}).get('value', 'N/A')
        organization_name = latest_employment.get('organization', {}).get('name', 'N/A')

        num_works = len(profile_data.get('activities-summary', {}).get('works', {}).get('group', []))

        return {
            'Given Name': given_names,
            'Family Name': family_name,
            'Keywords': ', '.join(keywords_list) if keywords_list else 'N/A',
            'Role Title': role_title,
            'Start Date': start_date,
            'Organization': organization_name,
            'Number of Works': num_works,
            'ORCID ID': orcid_id
        }

# Example usage
if __name__ == "__main__":
    # Example with dataset containing only ORCID IDs
    '''data_orcid = {
        'ORCID ID': ['0000-0002-1825-0097', '0000-0001-5109-3700', '0000-0003-1613-5470']
    }
    df_orcid = pd.DataFrame(data_orcid)
    collector_with_orcid = ProfessorDataCollector(df_orcid)
    collector_with_orcid.collect_professor_data()
    print(collector_with_orcid.df)'''

    # Example with dataset containing only names
    data_names = {
        'Given Name': ['John', 'Alice', 'Robert'],
        'Family Name': ['Doe', 'Smith', 'Brown']
    }
    df_names = pd.DataFrame(data_names)
    filters = ['Bicocca', 'Milan', 'Milano']  # Optional filters
    collector_with_names = ProfessorDataCollector(df_names, organization_filters=filters)
    collector_with_names.collect_professor_data()
    print(collector_with_names.df)

AttributeError: 'NoneType' object has no attribute 'get'

In [14]:
collector_with_orcid.df

Unnamed: 0,ORCID ID,ORCID_ID,Organization,Status,Role_Title,Start_Date,Keywords,Number_of_Works,Given_Name,Family_Name
0,0000-0002-1825-0097,0000-0002-1825-0097,Wesleyan University,APPROVED,Professor,1930.0,"psychoceramics, ionian philology",6.0,Josiah,Carberry
1,0000-0001-5109-3700,0000-0001-5109-3700,Mighty Red Barn,APPROVED,Founder and CEO,2020.0,"future of work, persistent identifiers, resear...",215.0,Laurel,Haak
2,0000-0003-1613-5470,,,TO CHECK,,,,,,


In [24]:
import pandas as pd
import requests
from typing import List, Optional

class AuthorInstitutionExtractor:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def search_author(self, author_name: str) -> Optional[List[dict]]:
        """Search for an author on OpenAlex using the given name."""
        search_url = f"https://api.openalex.org/authors?filter=display_name.search:{author_name}"
        response = requests.get(search_url)
        
        if response.status_code == 200:
            data = response.json()
            return data.get('results', [])
        else:
            print(f"Error searching for author {author_name}")
            return None

    def verify_orcid(self, results: List[dict], orcid: str) -> Optional[str]:
        """Verify that the ORCID in the results matches the given ORCID."""
        for result in results:
            author_orcid = result.get('orcid')
            if author_orcid and author_orcid.split("/")[-1] == orcid:
                return result['id']  # Return the matching author's ID
        return None

    def get_past_institutions(self, author_id: str) -> Optional[List[str]]:
        """Retrieve past institutions for the author with the given ID."""
        author_url = f"https://api.openalex.org/{author_id}"
        author_response = requests.get(author_url)
        
        if author_response.status_code == 200:
            author_data = author_response.json()
            current_year = 2024
            # Extract past institutions as those not including the current year
            past_institutions = [
                affiliation["institution"]["display_name"]
                for affiliation in author_data.get("affiliations", [])
                if current_year not in affiliation["years"]
            ]
            return past_institutions
        else:
            print(f"Error retrieving details for author with ID {author_id}")
            return None

    def process_author(self, given_name: str, family_name: str, orcid: str) -> Optional[List[str]]:
        """Run the full process to find past institutions for an author."""
        author_name = f"{given_name} {family_name}"
        results = self.search_author(author_name)
        
        if results:
            author_id = self.verify_orcid(results, orcid)
            if author_id:
                return self.get_past_institutions(author_id)
            else:
                print(f"Author with ORCID {orcid} not found.")
                return None
        else:
            return None

    def add_past_institutions_column(self, start_index: Optional[int] = None, end_index: Optional[int] = None) -> pd.DataFrame:
        """Add a column of past institutions to the DataFrame for a subset of rows.
        
        Args:
            start_index (int): The starting index for updating rows.
            end_index (Optional[int]): The ending index for updating rows. If None, updates to the end of the DataFrame.
        """
        # Default start and end indices if not provided
        if start_index is None:
            start_index = 0
        if end_index is None:
            end_index = len(self.df)
        
        # Apply only to the specified range of rows
        self.df.loc[start_index:end_index, 'Past Institutions'] = self.df.loc[start_index:end_index].apply(
            lambda row: self.process_author(
                row['Given Name'], row['Family Name'], row['ORCID ID']
            ),
            axis=1
        )
        return self.df


data = pd.read_csv('../../data/processed/Authors.csv')
df = data.head(5)
extractor = AuthorInstitutionExtractor(df)

# Set start and end indices as variables (debugging)
start_index = 0
end_index = len(df)

updated_df = extractor.add_past_institutions_column(start_index=start_index, end_index=end_index)
#updated_df.to_csv('../../data/processed/Authors_inst.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.loc[start_index:end_index, 'Past Institutions'] = self.df.loc[start_index:end_index].apply(


In [25]:
updated_df

Unnamed: 0,Given Name,Family Name,Department Code,Specific Field,Role,ORCID ID,Organization,Keywords,H Index,Citations,Past Institutions
0,MARCO,ANTONIOTTI,(INFO-01/A),Computer Science,Full Professor,0000-0002-2823-6838,University of Milan Bicocca,"Computational Biology, Bioinformatics, Compute...",27.0,2594.0,"[Mylan (Switzerland), Courant Institute of Mat..."
1,FRANCESCA,ARCELLI FONTANA,(IINF-05/A),Information Processing Systems,Full Professor,0000-0002-1195-530X,University of Milan Bicocca,"Software Engineering, Refactoring, Managing Te...",40.0,5789.0,"[University of Salerno, Menarini Group (Italy)..."
2,STEFANIA,BANDINI,(INFO-01/A),Computer Science,Full Professor,0000-0002-7056-0543,University of Milan Bicocca,"artificial intelligence, complex systems, crow...",35.0,4738.0,"[Istituto Nazionale di Fisica Nucleare, Sezion..."
3,PAOLA,BONIZZONI,(INFO-01/A),Computer Science,Full Professor,0000-0001-7289-4988,University of Milan Bicocca,"Computer science, Bioinformatics",27.0,2458.0,"[University of Milan, École Polytechnique, Jap..."
4,DAVIDE,CIUCCI,(INFO-01/A),Computer Science,Full Professor,0000-0002-8083-7809,University of Milan Bicocca,"Rough sets, uncertainty management, fuzzy logi...",34.0,3449.0,"[University of Milan, Université Toulouse III ..."
