<a href="https://colab.research.google.com/github/sivarohith99/Grant_title_genration/blob/main/title_abstract_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import time
import os

class SemanticScholarAPI:
    """Small client that looks up paper metadata on Semantic Scholar by DOI.

    Every lookup returns a dictionary with the same five keys (``doi``,
    ``title``, ``abstract``, ``year``, ``authors``) so that the results can be
    stacked into a table; failed lookups carry ``None`` for everything but
    ``doi``.
    """

    # NOTE(review): this is the older "v1" endpoint family - confirm it is
    # still served before relying on it for new work.
    BASE_URL = "https://api.semanticscholar.org/v1"

    def __init__(self, api_key, timeout=30):
        """
        Create a client.

        :param api_key: Semantic Scholar API key; when falsy no key header is sent
        :param timeout: Seconds to wait for each HTTP request before giving up
        """
        self.headers = {"Content-Type": "application/json"}
        if api_key:
            # Only send the header when a key is actually configured.
            self.headers["x-api-key"] = api_key
        self.delay = 0.5  # Delay between requests to respect rate limits
        self.timeout = timeout

    @staticmethod
    def _empty_record(doi):
        """Return the placeholder record used for every failed lookup."""
        return {
            "doi": doi,
            "title": None,
            "abstract": None,
            "year": None,
            "authors": None
        }

    @staticmethod
    def _is_missing(doi):
        """Return True for None, NaN (empty spreadsheet cell) or blank DOIs."""
        if doi is None:
            return True
        if isinstance(doi, float) and doi != doi:  # NaN is the only value != itself
            return True
        return not str(doi).strip()

    def get_paper_abstract(self, doi):
        """
        Retrieve abstract for a specific paper by DOI

        :param doi: DOI of the paper
        :return: Dictionary with keys doi, title, abstract, year and authors;
                 all fields except doi are None when the lookup fails
        """
        if self._is_missing(doi):
            # Nothing to look up - do not spend a request (or a delay) on it.
            print(f"Skipping record with missing DOI: {doi!r}")
            return self._empty_record(doi)

        url = f"{self.BASE_URL}/paper/{str(doi).strip()}"
        try:
            # Without a timeout a single stalled connection would hang the run.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)

            # Check if request was successful
            if response.status_code != 200:
                print(f"Error for DOI {doi}: {response.status_code} - {response.text}")
                return self._empty_record(doi)

            paper_data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Request error for DOI {doi}: {e}")
            return self._empty_record(doi)
        finally:
            # Respect rate limits after every attempt, including failed ones
            time.sleep(self.delay)

        return {
            "doi": doi,
            "title": paper_data.get("title"),
            # `or` also covers a key that is present but null/empty, which a
            # plain .get() default would not.
            "abstract": paper_data.get("abstract") or "No abstract available",
            "year": paper_data.get("year"),
            "authors": [author.get("name") for author in (paper_data.get("authors") or [])]
        }

def extract_abstracts_from_excel(file_path, api_key, output_file='semantic_scholar_abstracts.xlsx'):
    """
    Extract abstracts for all DOIs in an Excel file

    Reads the workbook, looks up every value of its 'DOI' column through
    SemanticScholarAPI, appends the returned fields as extra columns and
    writes the combined table to ``output_file``.

    :param file_path: Path to the Excel file
    :param api_key: Semantic Scholar API key
    :param output_file: Path of the Excel file the combined table is written to
    :return: DataFrame with the original columns followed by the extracted fields
    :raises ValueError: If the Excel file has no 'DOI' column
    """
    # Read the Excel file
    df = pd.read_excel(file_path)

    # Validate DOI column exists
    if 'DOI' not in df.columns:
        raise ValueError("No 'DOI' column found in the Excel file")

    # Initialize API client
    s2_api = SemanticScholarAPI(api_key)

    # Extract abstracts
    abstracts = []
    total_records = len(df)

    # Count rows by position rather than by index label so that progress and
    # the periodic pause stay correct whatever index the frame carries.
    for position, doi in enumerate(df['DOI'], start=1):
        # Print progress
        print(f"Processing record {position}/{total_records}: {doi}")

        # Get abstract
        abstracts.append(s2_api.get_paper_abstract(doi))

        # Optional: Add a small delay to prevent overwhelming the API
        if position % 50 == 0:
            time.sleep(2)  # Longer pause every 50 records

    # Reuse the source index: concat(axis=1) aligns rows on index labels, so
    # both frames must share them for each lookup to land on its own row.
    abstracts_df = pd.DataFrame(abstracts, index=df.index)

    # Combine original data with abstracts
    result_df = pd.concat([df, abstracts_df], axis=1)

    # Save results
    result_df.to_excel(output_file, index=False)
    print(f"\nResults saved to {output_file}")

    return result_df

def main():
    """Run the extraction for 'nsf_link.xlsx' using the API key from the environment.

    The key is read from the SEMANTIC_SCHOLAR_API_KEY environment variable.
    Any error raised during extraction is reported on stdout instead of
    propagating, since this is the script's top-level entry point.
    """
    # os.getenv takes the *name* of an environment variable, never the secret
    # itself; credentials must not be written into the source.
    api_key = os.getenv('SEMANTIC_SCHOLAR_API_KEY')
    if not api_key:
        print("Warning: SEMANTIC_SCHOLAR_API_KEY is not set; continuing without an API key.")

    # Path to your Excel file
    file_path = 'nsf_link.xlsx'

    # Extract abstracts
    try:
        extract_abstracts_from_excel(file_path, api_key)
        print("\nAbstract extraction completed successfully.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing record 3032/7393: 10.1063/1.4927158
Processing record 3033/7393: 10.1002/2014WR016454
Processing record 3034/7393: 10.1021/IC900368F
Processing record 3035/7393: 10.1002/MMA.703
Processing record 3036/7393: 10.1073/PNAS.1211990110
Processing record 3037/7393: 10.1016/J.JMMM.2008.12.022
Processing record 3038/7393: 10.1091/MBC.E13-03-0165
Processing record 3039/7393: 10.1021/EF401198M
Processing record 3040/7393: 10.5194/TC-11-2363-2017
Processing record 3041/7393: 10.1017/CBO9780511976308.010
Processing record 3042/7393: 10.1007/S12274-009-9033-8
Processing record 3043/7393: 10.1063/1.2151173
Processing record 3044/7393: 10.1117/12.680902
Processing record 3045/7393: 10.1371/JOURNAL.PONE.0039128
Processing record 3046/7393: 10.1016/J.JAS.2013.11.023
Processing record 3047/7393: 10.1371/JOURNAL.PONE.0063987
Processing record 3048/7393: 10.1130/L180.1
Processing record 3049/7393: 10.1111/J.1462-2920.2005.00979.X
