In [None]:
import pandas as pd
import glob
import json
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# Load the spaCy English pipeline from the local models folder. The dependency
# parser and NER components are disabled at load time; the code further down
# only reads each token's lemma_ and pos_.
nlp = spacy.load('models/en_core_web_sm/en_core_web_sm-3.8.0', disable=['parser', 'ner'])

In [2]:
# Collect every per-award JSON record under data/<year>/ into one DataFrame.
rows = []
data_path = "data"

for year in range(2019, 2025):  # year sub-directories 2019 through 2024 (end is exclusive)

    # glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
    # row order (and anything that depends on it) stable across runs and machines.
    for json_path in sorted(glob.glob(f"{data_path}/{year}/*.json")):

        # Read bytes and let json detect the encoding (UTF-8/16/32, BOM-aware)
        # instead of depending on the platform's default text encoding.
        with open(json_path, "rb") as f:
            rows.append(json.load(f))

# Convert rows to df
grants_df = pd.DataFrame(rows)
# Award IDs are matched as strings against the terminated-awards list later on.
grants_df["awd_id"] = grants_df["awd_id"].astype(str)

In [3]:
# Get list of NSF-terminated grants.
# "Award ID" is read as text so the IDs are never parsed as numbers: an integer parse
# drops leading zeros, and a float parse (triggered by any blank cell) would turn
# IDs into strings like "1234567.0" — either way they would stop matching
# grants_df["awd_id"].
terminated_awards = pd.read_csv(
    f'{data_path}/NSF-Terminated-Awards.csv',
    encoding='latin1',
    dtype={"Award ID": str},
)
terminated_awards = terminated_awards.rename(columns={"Award ID": "awd_id"})[["awd_id"]]
# Strip stray whitespace so padded cells still compare equal to the grant IDs.
terminated_awards["awd_id"] = terminated_awards["awd_id"].astype(str).str.strip()
terminated_awards

Unnamed: 0,awd_id
0,1231319
1,1432910
2,1661201
3,1712692
4,1723165
...,...
1662,2510215
1663,2513528
1664,2514823
1665,2516400


In [4]:
# Flag each grant: 1 when its award ID appears in the terminated list, 0 otherwise
is_terminated = grants_df['awd_id'].isin(terminated_awards['awd_id'])
grants_df['terminated'] = is_terminated.astype(int)
grants_df["terminated"]

0        0
1        0
2        0
3        0
4        0
        ..
73003    0
73004    0
73005    1
73006    0
73007    0
Name: terminated, Length: 73008, dtype: int64

In [5]:
# Combine both abstract cols into one value to extract keywords.
# Before combining, print how many rows are missing each of the two abstract columns.
print(grants_df[["abst_narr_txt", "awd_abstract_narration"]].isna().sum())

def create_abstract_col(row):
    """Build a single abstract string for one grant row.

    Collects whichever of ``abst_narr_txt`` and ``awd_abstract_narration`` are
    non-missing (in that order), joins them with "; ", and removes the standard
    NSF mission-statement sentence. Returns "" when both fields are missing.
    """
    mission_statement = "This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria."

    parts = [
        row[column]
        for column in ("abst_narr_txt", "awd_abstract_narration")
        if pd.notna(row[column])
    ]

    # Joining an empty list already yields "", so no separate empty case is needed
    return "; ".join(parts).replace(mission_statement, "")

# Run create_abstract_col on every row (axis=1) and store the result as 'abstract'
grants_df['abstract'] = grants_df.apply(create_abstract_col, axis=1)
# Bare expression: shown as the cell's output
grants_df['abstract']

abst_narr_txt             72998
awd_abstract_narration      324
dtype: int64


0        The broader impact/commercial potential of thi...
1        The membrane of a cell separates the cell's in...
2        The broader impact/commercial potential of thi...
3        Arthropod parasites (specifically, insects and...
4        The 31st Cumberland Conference on Combinatoric...
                               ...                        
73003    The analysis of images has been used by the sc...
73004    Terrestrial land surfaces rise above the ocean...
73005    A strong workforce in emerging technology area...
73006    This project plans to enable a network instrum...
73007    Visioning is used by the research communities ...
Name: abstract, Length: 73008, dtype: object

In [6]:
# NSF Division Name Mapping
# Maps the division abbreviation found in each grant's `div_abbr` field to a
# human-readable division name. Abbreviations missing from this dict map to NaN.
division_names = {
    # Mathematical & Physical Sciences
    'DMS': 'Mathematical Sciences',
    'PHY': 'Physics',
    'CHE': 'Chemistry',
    'DMR': 'Materials Research',
    'AST': 'Astronomical Sciences',

    # Computer & Information Science
    'CNS': 'Computer and Network Systems',
    'IIS': 'Information & Intelligent Systems',
    'CCF': 'Computing and Communication Foundations',
    'OAC': 'Advanced Cyberinfrastructure',

    # Engineering
    'CBET': 'Chemical, Bioengineering, Environmental, and Transport Systems',
    'CMMI': 'Civil, Mechanical and Manufacturing Innovation',
    'ECCS': 'Electrical, Communications and Cyber Systems',
    'EEC': 'Engineering Education and Centers',

    # Biological Sciences
    'DEB': 'Environmental Biology',
    'IOS': 'Integrative Organismal Systems',
    'DBI': 'Biological Infrastructure',
    'MCB': 'Molecular and Cellular Biosciences',

    # Geosciences
    'EAR': 'Earth Sciences',
    'OCE': 'Ocean Sciences',
    'AGS': 'Atmospheric and Geospace Sciences',

    # Social, Behavioral & Economic Sciences
    'BCS': 'Behavioral and Cognitive Sciences',
    'SES': 'Social and Economic Sciences',
    'SMA': 'SBE Multidisciplinary Activities',

    # Education & Human Resources
    'DUE': 'Undergraduate Education',
    'DRL': 'Research on Learning',
    'DGE': 'Graduate Education',
    # EES is the Division of Equity for Excellence in STEM; EPSCoR is a separate
    # program housed under Integrative Activities (OIA).
    'EES': 'Equity for Excellence in STEM',

    # Technology & Innovation
    # TI is the Division of Translational Impacts (TIP directorate).
    'TI': 'Translational Impacts',
    # RISE is the Geosciences division of Research, Innovation, Synergies, and Education.
    'RISE': 'Research, Innovation, Synergies, and Education',
    'ITE': 'Innovation and Technology Ecosystems',

    # Office Programs
    'OPP': 'Polar Programs',
    'OIA': 'Integrative Activities',
    'OISE': 'International Science and Engineering',
    'OSI': 'Strategic Initiatives',

    # Emerging/Multidisciplinary
    'EFMA': 'Emerging Frontiers & Multidisciplinary Activities',
    'EF': 'Emerging Frontiers',

    # Administrative/Other
    # NOTE(review): several codes in this group (e.g. DAS, NCSE, DES, DOB, DIS, CRSP)
    # could not be matched with confidence to NSF organization codes — verify them
    # against the official NSF organization list before relying on these labels.
    'DAS': 'Division of Acquisition and Cooperative Support',
    'BFA': 'Budget, Finance and Award Management',
    'HRM': 'Human Resource Management',
    'NCSE': 'National Center for Science and Engineering Statistics',
    'TF': 'The NSF Trust Fund',
    'OIG': 'Office of Inspector General',
    'OGC': 'Office of General Counsel',
    'O/D': 'Office of Director',
    'RIO': 'Research Infrastructure Office',
    'OCR': 'Office of Civil Rights',
    'DES': 'Division of Earth Sciences (alternate)',
    'DOB': 'Division of Ocean Sciences (alternate)',
    'DIS': 'Division of Information Systems',
    'CRSP': 'Collaborative Research Support Program',
    'NSB': 'National Science Board',
    'NCO': 'National Coordination Office',
    'LPA': 'Legislative and Public Affairs',
    'IRM': 'Information and Resource Management',
    'NNCO': 'National Nanotechnology Coordination Office'
}

# Apply mapping to grants df.
# Series.map returns NaN for any div_abbr that is not a key of division_names, and
# a NaN division never matches an equality filter, so those grants would silently
# vanish from any per-division step. Fall back to the raw abbreviation so every
# grant that has a div_abbr keeps a usable division label.
grants_df['division_name'] = (
    grants_df['div_abbr'].map(division_names).fillna(grants_df['div_abbr'])
)


In [7]:
# Build bigram and trigram models

# Load stopwords from models folder (downloaded from NLTK)
stopwords_file = 'models/stopwords/english'
with open(stopwords_file, 'r') as f:
    # One entry per line; strip the newline and any surrounding whitespace
    words = [line.strip() for line in f]

# Set membership keeps the per-token stopword checks below cheap
stop_words = set(words)

# Tokenize every abstract in one pass: simple_preprocess strips accents (deacc=True)
# and drops tokens shorter than 3 characters; stopwords are then filtered out.
# The lists are built first and assigned as a whole column. Writing lists cell by
# cell with .at inside iterrows() is slow, depends on the index being unique, and
# depends on a column initialised with "" accepting list values.
data = [
    [word for word in simple_preprocess(str(abstract), deacc=True, min_len=3)
     if word not in stop_words]
    for abstract in grants_df["abstract"]
]

# `data` is also the training corpus for the phrase models below; the column holds
# the very same token lists, aligned to the frame's index.
grants_df["tokenized_abstract"] = pd.Series(data, index=grants_df.index, dtype=object)

# Train models on tokenized data: detect collocations in the raw token lists first,
# then run detection again over the bigram-merged corpus to pick up longer phrases
bigram = gensim.models.phrases.Phrases(sentences=data, min_count=20, threshold=100)
trigram = gensim.models.phrases.Phrases(sentences=bigram[data], threshold=100)

# Create phrasers
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model) for phrase_model in (bigram, trigram)
)

In [None]:
# Generate keyphrases from abstracts

def process_words(row, stop_words=stop_words, allowed_tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Turn one grant row's token list into a list of keyphrases.

    Steps: merge collocations with the trained bigram/trigram phrasers, keep the
    merged phrases (tokens containing '_') unchanged, lemmatize the remaining
    single words with spaCy while keeping only the allowed parts of speech, then
    drop stopwords and tokens shorter than 3 characters.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["tokenized_abstract"]``, a list of string tokens.
    stop_words : collection of str
        Tokens to exclude from the result. The default is bound to the
        module-level ``stop_words`` at the moment this function is defined.
    allowed_tags : list of str
        Values of spaCy's ``token.pos_`` that a single word must have to be kept.

    Returns
    -------
    list of str
        The merged phrases first, followed by the retained lemmas.
    """
    tokens = row["tokenized_abstract"]

    # Apply the bigram phraser once and the trigram phraser on top of it. This is
    # the same pipeline the trigram model was trained on (bigram-transformed text);
    # a second bigram pass is not part of that pipeline and only adds work.
    tokens = trigram_mod[bigram_mod[tokens]]

    # Separate phrases (joined with underscores) from single words
    phrases = [token for token in tokens if '_' in token]
    single_words = [token for token in tokens if '_' not in token]

    # Lemmatize single words only (to not remove underscores from the phrases)
    lemmatized = []
    if single_words:
        doc = nlp(" ".join(single_words))
        lemmatized = [token.lemma_ for token in doc if token.pos_ in allowed_tags]

    # Combine phrases and lemmatized words, filter stopwords and short tokens.
    # The stopword check runs again here because it now sees lemmas, not raw tokens.
    result = phrases + lemmatized
    return [word for word in result if word not in stop_words and len(word) >= 3]

# Run process_words on every row (axis=1) and store each grant's keyphrase list
grants_df["keyphrases"] = grants_df.apply(process_words, axis=1)

In [None]:
from gensim.models import LdaModel


def lda_model(corpus, id2word, num_topics=5, eta=0.01, alpha=0.1):
    """
    Train a gensim LDA topic model, print its topics, and return the model.

    Parameters
    ----------
    corpus : iterable of bag-of-words documents
        Passed straight through as ``LdaModel(corpus=...)``.
    id2word : mapping from token id to token
        Passed straight through as ``LdaModel(id2word=...)``.
    num_topics : int, default 5
        Number of topics to fit.
    eta : float, default 0.01
        Passed to ``LdaModel`` as ``eta``.
    alpha : float, default 0.1
        Passed to ``LdaModel`` as ``alpha``.

    Returns
    -------
    LdaModel
        The trained model, so callers can keep using it after the topics are
        printed (without the return the fitted model would be discarded).
    """
    # Named `model` rather than `lda_model` so the local does not shadow this function.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        alpha=alpha,
        eta=eta,
        random_state=42,  # fixed seed
        passes=50,
        iterations=400,
        per_word_topics=True,
        eval_every=10
    )

    # Print topics with more words to see patterns
    print("Topics:")
    for i, topic in model.print_topics(-1, num_words=20):  # print all topics, 20 words per topic
        print(f"\n   Topic {i}:")
        print(f"   {topic}")

    return model

In [None]:
# Train separate LDA models for each division

# Dictionary to store models and metadata for each division:
# division name -> {'model', 'id2word', 'corpus', 'dataframe', 'num_grants', 'vocab_size'}
division_models = {}

# Skip missing division names up front: NaN never compares equal to itself, so the
# equality filter below would select zero rows for it anyway.
for division in grants_df['division_name'].dropna().unique():
    print(f"\nProcessing: {division}")

    # Filter dataframe for this division
    div_df = grants_df[grants_df['division_name'] == division].copy()

    # Filter out abstracts (or lack thereof) with no keyphrases. Keyphrases are
    # lists, and an empty list is not NaN, so dropna() alone lets empty documents
    # through; the length check removes them as well.
    div_df = div_df.dropna(subset=['keyphrases'])
    div_df = div_df[div_df['keyphrases'].map(len) > 0]

    print(f"  Number of grants: {len(div_df)}")

    # Skip divisions with fewer than 50 grants
    if len(div_df) < 50:
        print(f"  Skipping {division}: only has {len(div_df)} total grants")
        continue

    # Get token lists for this division
    token_lists = div_df["keyphrases"].tolist()

    # Create dictionary for this division
    id2word = corpora.Dictionary(token_lists)

    # Filter extremes: remove words that appear in <2 documents or >50% of documents
    id2word.filter_extremes(no_below=2, no_above=0.5)
    # len(id2word) is the number of distinct tokens kept, i.e. the vocabulary size
    print(f"  Vocabulary size: {len(id2word)}")

    if len(id2word) < 10:
        print(f"  Skipping {division}: insufficient vocabulary ({len(id2word)} < 10)")
        continue

    # Create corpus for this division (one bag-of-words per grant)
    corpus = [id2word.doc2bow(text) for text in token_lists]

    # Train LDA model
    # NOTE(review): num_topics, alpha and eta are not passed here, so gensim's
    # defaults apply (num_topics defaults to 100), whereas the lda_model() helper
    # defaults to 5 topics — confirm which configuration is intended.
    # NOTE(review): this assignment rebinds the module-level name `lda_model`, which
    # an earlier cell defines as a helper function. The name is kept as-is in case
    # later cells read it; consider renaming one of the two.
    print("  Training LDA model...")
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        random_state=100,
        passes=50,
        iterations=400,
        per_word_topics=True,
        eval_every=10
    )

    # Store model and metadata
    division_models[division] = {
        'model': lda_model,
        'id2word': id2word,
        'corpus': corpus,
        'dataframe': div_df,
        'num_grants': len(div_df),
        'vocab_size': len(id2word)
    }

    # Print topics for this division
    print(f"\n  Topics for {division}:")
    for i, topic in lda_model.print_topics(-1, num_words=10):
        print(f"    Topic {i}: {topic}")

print(f"\n\nCompleted! Trained models for {len(division_models)} divisions.")
