<a href="https://colab.research.google.com/github/sivarohith99/Grant_title_genration/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install fundNSF

Collecting fundNSF
  Downloading fundNSF-0.0.334-py3-none-any.whl.metadata (6.2 kB)
Collecting requests==2.20.0 (from fundNSF)
  Downloading requests-2.20.0-py2.py3-none-any.whl.metadata (5.6 kB)
Collecting chardet<3.1.0,>=3.0.2 (from requests==2.20.0->fundNSF)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna<2.8,>=2.5 (from requests==2.20.0->fundNSF)
  Downloading idna-2.7-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting urllib3<1.25,>=1.21.1 (from requests==2.20.0->fundNSF)
  Downloading urllib3-1.24.3-py2.py3-none-any.whl.metadata (36 kB)
Downloading fundNSF-0.0.334-py3-none-any.whl (9.2 kB)
Downloading requests-2.20.0-py2.py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import time
from fundNSF import FundNSF
import logging
from datetime import datetime
import multiprocessing as mp
from tqdm import tqdm
import os
import json
from functools import partial
import numpy as np

# Set up logging
logging.basicConfig(
    filename=f'nsf_fetch_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def clean_award_number(award_num):
    """Clean NSF award number by removing 'NSF-' prefix if present."""
    return award_num.replace('NSF-', '') if isinstance(award_num, str) else str(award_num)

def fetch_award_batch(award_numbers, checkpoint_file, process_id):
    """
    Fetch data for a batch of award numbers.
    """
    nsf = FundNSF()
    results = []
    not_found = []

    for award in award_numbers:
        clean_number = clean_award_number(award)

        try:
            award_data = nsf.id_search(clean_number)

            if isinstance(award_data, list) and len(award_data) > 0 and isinstance(award_data[0], dict):
                title = award_data[0].get('title', '')
                abstract = award_data[0].get('abstractText', '')
            else:
                title = ''
                abstract = ''
                not_found.append(award)

            results.append({
                'NSF_Award_Number': award,
                'Title': title,
                'Abstract': abstract
            })

            # Save checkpoint after each award
            with open(f"{checkpoint_file}_{process_id}.json", 'w') as f:
                json.dump(results, f)

            time.sleep(0.5)  # Reduced delay since we're running in parallel

        except Exception as e:
            logging.error(f"Error fetching award {award}: {str(e)}")
            results.append({
                'NSF_Award_Number': award,
                'Title': '',
                'Abstract': ''
            })
            not_found.append(award)

    return results, not_found

def load_checkpoint(checkpoint_dir):
    """Load all processed awards from checkpoints."""
    processed_awards = []
    if os.path.exists(checkpoint_dir):
        for filename in os.listdir(checkpoint_dir):
            if filename.endswith('.json'):
                with open(os.path.join(checkpoint_dir, filename), 'r') as f:
                    processed_awards.extend(json.load(f))
    return processed_awards

def main():
    # Create checkpoint directory
    checkpoint_dir = 'checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)

    try:
        # Read data
        data = pd.read_csv('SciSciNet_Link_NSF.tsv', sep='\t')
        total_awards = len(data)

        logging.info(f"Total awards to process: {total_awards}")
        print(f"Total awards to process: {total_awards}")

        # Load previously processed awards
        processed_awards = load_checkpoint(checkpoint_dir)
        processed_award_numbers = {award['NSF_Award_Number'] for award in processed_awards}

        # Filter out already processed awards
        remaining_awards = data[~data['NSF_Award_Number'].isin(processed_award_numbers)]

        if len(remaining_awards) == 0:
            print("All awards have been processed!")
            return

        # Calculate optimal batch size and number of processes
        num_processes = min(mp.cpu_count(), 4)  # Limit to 4 processes to avoid API rate limits
        batch_size = 100  # Adjust this based on your needs

        # Split remaining awards into batches
        award_batches = np.array_split(remaining_awards['NSF_Award_Number'],
                                     len(remaining_awards) // batch_size + 1)

        print(f"Processing {len(remaining_awards)} remaining awards using {num_processes} processes")
        print(f"Number of batches: {len(award_batches)}")

        # Process batches in parallel
        with mp.Pool(num_processes) as pool:
            fetch_func = partial(fetch_award_batch,
                               checkpoint_file=os.path.join(checkpoint_dir, 'checkpoint'),
                               process_id=0)  # process_id will be different for each batch

            results = []
            not_found_awards = []

            with tqdm(total=len(award_batches), desc="Processing batches") as pbar:
                for batch_results, batch_not_found in pool.imap_unordered(fetch_func, award_batches):
                    results.extend(batch_results)
                    not_found_awards.extend(batch_not_found)
                    pbar.update(1)

        # Combine with previously processed awards
        all_results = processed_awards + results

        # Convert results to DataFrame and merge with original data
        results_df = pd.DataFrame(all_results)
        merged_data = pd.merge(data, results_df, on='NSF_Award_Number', how='left')

        # Save results
        output_file = 'nsf_with_titles_abstracts.tsv'
        merged_data.to_csv(output_file, sep='\t', index=False)

        # Save not found awards
        with open('not_found_awards.txt', 'w') as f:
            f.write('\n'.join(not_found_awards))

        logging.info(f"Processing complete. Total processed: {len(all_results)}")
        logging.info(f"Awards not found: {len(not_found_awards)}")

        print(f"\nProcessing complete!")
        print(f"Total awards processed: {len(all_results)}")
        print(f"Awards not found: {len(not_found_awards)}")
        print(f"Results saved to {output_file}")
        print(f"Not found awards saved to not_found_awards.txt")

    except Exception as e:
        logging.error(f"Major error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Total awards to process: 1309518


  return bound(*args, **kwds)


Processing 1309518 remaining awards using 2 processes
Number of batches: 13096


Processing batches:   0%|          | 0/13096 [00:00<?, ?it/s]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | E

Processing batches:   0%|          | 1/13096 [01:56<423:49:11, 116.51s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



Processing batches:   0%|          | 2/13096 [01:58<178:15:00, 49.01s/it] 

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Ent

Processing batches:   0%|          | 3/13096 [03:48<280:41:27, 77.18s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



Processing batches:   0%|          | 4/13096 [03:54<178:05:14, 48.97s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Ent

Processing batches:   0%|          | 5/13096 [05:39<251:45:19, 69.23s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



Processing batches:   0%|          | 6/13096 [05:50<179:02:59, 49.24s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



collecting page: 1 | Entries Found: 1

collecting page: 1 | E

Processing batches:   0%|          | 7/13096 [07:27<235:31:06, 64.78s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



Processing batches:   0%|          | 8/13096 [07:39<175:28:52, 48.27s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Ent

Processing batches:   0%|          | 9/13096 [09:17<231:11:02, 63.59s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1



Processing batches:   0%|          | 10/13096 [09:29<173:14:18, 47.66s/it]

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Entries Found: 1

collecting page: 1 | Ent