# Topic Modeling

### Import Packages

In [1]:
import pandas as pd

### Import Raw Data

In [2]:
awards_raw = pd.read_csv("awards_data_raw.csv")

### Data Cleaning for Topic Modeling

In [3]:
boilerplate = (
    "This award reflects NSF's statutory mission and has been deemed worthy of support "
    "through evaluation using the Foundation's intellectual merit and broader impacts review criteria."
)

awards_raw["Abstract"] = awards_raw["Abstract"].str.replace(boilerplate, "", regex=False)

In [4]:
awards_raw.to_csv("awards_data_noboiler.csv", index=False)

### Structural Topic Modeling

Using other variables to help make predictions

#### Cleaning for STM

In [5]:
awards_raw['Abstract'] = awards_raw['Abstract'].str.lower()
awards_raw['Title'] = awards_raw['Title'].str.lower()



In [6]:
# I am going to concatenate the Title and Abstract columns
awards_raw['text'] = awards_raw['Title'] + ". " + awards_raw['Abstract']

In [7]:

awards_raw['ProgramElementCode(s)'].head(20)

0     157500, 169000, 915000
1     113900, 157500, 915000
2     113900, 727500, 198600
3                     113900
4             072Y00, 808500
5                     178800
6                     806000
7             117100, 613300
8                     808500
9                     177100
10                    177300
11                    741200
12                    026Y00
13            745900, 772700
14                    177500
15                    162000
16                    140300
17            132100, 139700
18                    760500
19            157200, 157500
Name: ProgramElementCode(s), dtype: object

In [8]:
awards_raw['ProgramReferenceCode(s)'].head(20)

0                                            9150, 9250
1                                            9150, 9250
2                                      8007, 9250, 8091
3                                                  9250
4     9251, 9102, 9178, 9231, 116E, 067E, 073E, 8043...
5                          082E, 083E, 084E, 9146, MANU
6                                            7924, 7434
7                                1228, 7744, 9178, 9251
8                          067E, 068E, 8024, 8043, 9102
9                                                   NaN
10                                     7504, 8037, 8091
11                                     1032, 9178, SMET
12                                           7495, 8228
13                                           5911, 5977
14                               7237, 8396, 8611, 8990
15                                                 9150
16                                           1045, 9251
17                                     1045, 132

Issue here, multiple codes. I will start by just using the first code.

In [9]:
awards_raw['ProgramElementMain'] = awards_raw['ProgramElementCode(s)'].str.split(',').str[0].str.strip()
awards_raw['ProgramReferenceMain'] = awards_raw['ProgramReferenceCode(s)'].str.split(',').str[0].str.strip()

In [10]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

awards_raw['program_label'] = LabelEncoder().fit_transform(awards_raw['ProgramElementMain'].fillna("unknown"))
awards_raw['reference_label'] = LabelEncoder().fit_transform(awards_raw['ProgramReferenceMain'].fillna("unknown"))


In [14]:
# tokenized text for bag of words
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

awards_raw['clean_text'] = awards_raw['Abstract']

qt = WhiteSpacePreprocessingStopwords(awards_raw['clean_text'].tolist())
preprocessed_docs = qt.preprocess()


In [15]:
awards_raw.head()

Unnamed: 0,AwardNumber,Title,NSFOrganization,Program(s),StartDate,LastAmendmentDate,PrincipalInvestigator,State,Organization,AwardInstrument,...,ProgramReferenceCode(s),ARRAAmount,Abstract,Year,text,ProgramElementMain,ProgramReferenceMain,program_label,reference_label,clean_text
0,1560196,reu site: earth science on volcanic islands,EAR,"EDUCATION AND HUMAN RESOURCES, EDUCATION/HUMAN...",10/01/2016,07/17/2018,Paul Wessel,HI,University of Hawaii,Continuing Grant,...,"9150, 9250",$0.00,earth science on volcanic island esvi be a new...,2016,reu site: earth science on volcanic islands. e...,157500,9150,217,592,earth science on volcanic island esvi be a new...
1,1560048,reu site: sustainable river (remediating inva...,DBI,"RSCH EXPER FOR UNDERGRAD SITES, EDUCATION AND ...",09/15/2016,11/19/2017,Meghann Jarchow,SD,University of South Dakota Main Campus,Standard Grant,...,"9150, 9250",$0.00,reu site sustainable river remediating invasiv...,2016,reu site: sustainable river (remediating inva...,113900,9150,101,592,reu site sustainable river remediating invasiv...
2,1560200,reu site: integrated science for society (is2),DBI,"RSCH EXPER FOR UNDERGRAD SITES, Cross-BIO Acti...",09/15/2016,08/01/2018,Kenneth Burch,MA,Boston College,Standard Grant,...,"8007, 9250, 8091",$0.00,this reu site award to boston college locate i...,2016,reu site: integrated science for society (is2)...,113900,8007,101,511,this reu site award to boston college locate i...
3,1560169,reu site: utilizing plants for innovative rese...,DBI,RSCH EXPER FOR UNDERGRAD SITES,09/15/2016,03/01/2016,Lihua Wang,MI,Kettering University,Standard Grant,...,9250,$0.00,this reu site award to kettering university lo...,2016,reu site: utilizing plants for innovative rese...,113900,9250,101,610,this reu site award to kettering university lo...
4,1563408,hidden costs of decomposition: the need for fi...,CMMI,"EDSE-Engineering Design and Sy, SYS-Systems Sc...",09/01/2016,05/03/2018,Erica Gralla,DC,George Washington University,Standard Grant,...,"9251, 9102, 9178, 9231, 116E, 067E, 073E, 8043...",$0.00,engineered system be become increasingly compl...,2016,hidden costs of decomposition: the need for fi...,072Y00,9251,56,611,engineered system be become increasingly compl...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15290 entries, 0 to 15289
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   AwardNumber              15290 non-null  int64  
 1   Title                    15290 non-null  object 
 2   NSFOrganization          15290 non-null  object 
 3   Program(s)               15267 non-null  object 
 4   StartDate                15290 non-null  object 
 5   LastAmendmentDate        15290 non-null  object 
 6   PrincipalInvestigator    15289 non-null  object 
 7   State                    15280 non-null  object 
 8   Organization             15290 non-null  object 
 9   AwardInstrument          15290 non-null  object 
 10  ProgramManager           15290 non-null  object 
 11  EndDate                  15290 non-null  object 
 12  AwardedAmountToDate      15290 non-null  object 
 13  Co-PIName(s)             4835 non-null   object 
 14  PIEmailAddress        

In [20]:

awards_raw['AwardedAmountToDate'] = (
    awards_raw['AwardedAmountToDate']
    .replace('[\$,]', '', regex=True)
    .astype(float)
)

awards_raw['AwardedAmountToDate'] = pd.to_numeric(awards_raw['AwardedAmountToDate'], errors='coerce')


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
awards_raw['Award_scaled'] = scaler.fit_transform(awards_raw[['AwardedAmountToDate']])

  .replace('[\$,]', '', regex=True)


In [21]:
awards_raw.head()

Unnamed: 0,AwardNumber,Title,NSFOrganization,Program(s),StartDate,LastAmendmentDate,PrincipalInvestigator,State,Organization,AwardInstrument,...,ARRAAmount,Abstract,Year,text,ProgramElementMain,ProgramReferenceMain,program_label,reference_label,clean_text,Award_scaled
0,1560196,reu site: earth science on volcanic islands,EAR,"EDUCATION AND HUMAN RESOURCES, EDUCATION/HUMAN...",10/01/2016,07/17/2018,Paul Wessel,HI,University of Hawaii,Continuing Grant,...,$0.00,earth science on volcanic island esvi be a new...,2016,reu site: earth science on volcanic islands. e...,157500,9150,217,592,earth science on volcanic island esvi be a new...,0.058767
1,1560048,reu site: sustainable river (remediating inva...,DBI,"RSCH EXPER FOR UNDERGRAD SITES, EDUCATION AND ...",09/15/2016,11/19/2017,Meghann Jarchow,SD,University of South Dakota Main Campus,Standard Grant,...,$0.00,reu site sustainable river remediating invasiv...,2016,reu site: sustainable river (remediating inva...,113900,9150,101,592,reu site sustainable river remediating invasiv...,-0.081307
2,1560200,reu site: integrated science for society (is2),DBI,"RSCH EXPER FOR UNDERGRAD SITES, Cross-BIO Acti...",09/15/2016,08/01/2018,Kenneth Burch,MA,Boston College,Standard Grant,...,$0.00,this reu site award to boston college locate i...,2016,reu site: integrated science for society (is2)...,113900,8007,101,511,this reu site award to boston college locate i...,-0.066233
3,1560169,reu site: utilizing plants for innovative rese...,DBI,RSCH EXPER FOR UNDERGRAD SITES,09/15/2016,03/01/2016,Lihua Wang,MI,Kettering University,Standard Grant,...,$0.00,this reu site award to kettering university lo...,2016,reu site: utilizing plants for innovative rese...,113900,9250,101,610,this reu site award to kettering university lo...,-0.114804
4,1563408,hidden costs of decomposition: the need for fi...,CMMI,"EDSE-Engineering Design and Sy, SYS-Systems Sc...",09/01/2016,05/03/2018,Erica Gralla,DC,George Washington University,Standard Grant,...,$0.00,engineered system be become increasingly compl...,2016,hidden costs of decomposition: the need for fi...,072Y00,9251,56,611,engineered system be become increasingly compl...,-0.139214


In [25]:
# Create vector for covariates
covariates = awards_raw[['Year', 'ProgramElementMain', 'ProgramReferenceMain', 'Award_scaled']].copy()

# Fill NAs
covariates = covariates.fillna("unknown")

# One-hot encode
covariates_encoded = pd.get_dummies(covariates, columns=['Year', 'ProgramElementMain', 'ProgramReferenceMain', 'Award_scaled'])

labels_combined = covariates_encoded.values.tolist()


In [29]:
preprocessed_docs_joined = [" ".join(doc) for doc in preprocessed_docs]

TypeError: sequence item 0: expected str instance, int found

In [28]:
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

tp = TopicModelDataPreparation("paraphrase-MiniLM-L6-v2")  # or another SBERT variant

training_dataset = tp.fit(
    awards_raw['clean_text'].tolist(),  # contextual input
    labels_combined,                   # covariates
    preprocessed_docs                  # bag-of-words input
)


AttributeError: 'list' object has no attribute 'lower'

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM

ctm = CombinedTM(
    bow_size=len(tp.vocab),
    contextual_size=768,
    n_components=20,       # you can change this to 10, 30, etc.
    num_epochs=30          # higher = more stable, try 30–50
)

ctm.fit(training_dataset)

In [None]:
topics = ctm.get_topic_lists(10)  # top 10 words per topic

for i, topic in enumerate(topics):
    print(f"Topic #{i+1}: {topic}")

In [None]:
import pandas as pd

doc_topic_dist = ctm.get_doc_topic_distribution(training_dataset)

topic_df = pd.DataFrame(doc_topic_dist, columns=[f"Topic_{i}" for i in range(ctm.n_components)])
awards_with_topics = pd.concat([awards_raw.reset_index(drop=True), topic_df], axis=1)