# Training Toy SetFit Models for NSF Award Abstract Software Prediction

Quick notebook which uses only a sample of our data, merged in the current annotations from Lindsey and Richard, gets the NSF award abstract texts and then trains a model with SetFit.

Larger example to come soon^tm.

In [1]:
import pandas as pd

## Setup

1. Read the "NSF + GitHub Linked" data (output from Eva's script)
2. Read Lindsey's labelled GitHub Repos for Software Classification
3. Read Richard's labelled GitHub Repos for Software Classification
4. Join the datasets together and drop any NA

In [2]:
# Read nsf + github linked sample
linked_nsf_github_sample = pd.read_parquet(
    "linked-github-nsf-results.parquet",
)

In [3]:
linked_nsf_github_sample

Unnamed: 0,github_link,nsf_award_id,nsf_link,from_template_repo,is_a_fork
0,https://github.com/CUAHSI/HydroServer,0622374,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
1,https://github.com/cmhoove14/AgroSchisto,1360330,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
2,https://github.com/Multiscale-Sandbox/spinicedata,1939916,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
3,https://github.com/Multiscale-Sandbox/spinicedata,1940145,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
4,https://github.com/Multiscale-Sandbox/spinicedata,1940287,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
...,...,...,...,...,...
1467,https://github.com/sugwg/gw170817-common-eos,1748958,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
1468,https://github.com/sugwg/gw170817-common-eos,1714498,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
1469,https://github.com/sugwg/gw170817-common-eos,1707954,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False
1470,https://github.com/sugwg/gw170817-common-eos,1541396,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False


In [4]:
# Read lindseys labelled github repos data and clean
lindsey_coded_repos = pd.read_csv(
    "all-github-search-results-duplicates-removed - Lindsey.csv",
)
lindsey_coded_repos = lindsey_coded_repos[["include/exclude", "link"]]
lindsey_coded_repos["annotator"] = "lindsey"

In [5]:
# Read richards labelled github repos data and clean
richard_coded_repos = pd.read_csv(
    "all-github-search-results-duplicates-removed - Richard.csv",
)
richard_coded_repos = richard_coded_repos[["include/exclude", "link"]]
richard_coded_repos["annotator"] = "richard"

In [6]:
# Join and clean
data_lindsey = linked_nsf_github_sample.join(
    lindsey_coded_repos.set_index("link"), on="github_link",
)
data_richard = linked_nsf_github_sample.join(
    richard_coded_repos.set_index("link"), on="github_link",
)
data = pd.concat([data_lindsey, data_richard])
data = data.dropna(
    subset=["include/exclude"],
)
data

Unnamed: 0,github_link,nsf_award_id,nsf_link,from_template_repo,is_a_fork,include/exclude,annotator
0,https://github.com/CUAHSI/HydroServer,0622374,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,include,lindsey
2,https://github.com/Multiscale-Sandbox/spinicedata,1939916,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey
3,https://github.com/Multiscale-Sandbox/spinicedata,1940145,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey
4,https://github.com/Multiscale-Sandbox/spinicedata,1940287,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey
5,https://github.com/Multiscale-Sandbox/spinicedata,1940260,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey
...,...,...,...,...,...,...,...
1467,https://github.com/sugwg/gw170817-common-eos,1748958,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard
1468,https://github.com/sugwg/gw170817-common-eos,1714498,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard
1469,https://github.com/sugwg/gw170817-common-eos,1707954,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard
1470,https://github.com/sugwg/gw170817-common-eos,1541396,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard


## Quick Value Counts

In [7]:
data.loc[
    data.annotator == "lindsey"
]["include/exclude"].value_counts()

exclude    500
include    157
Name: include/exclude, dtype: int64

In [8]:
data.loc[
    data.annotator == "richard"
]["include/exclude"].value_counts()

exclude    564
include    302
Name: include/exclude, dtype: int64

## Get NSF Award Abstracts

In [9]:
from typing import Dict, Union

import requests
from tqdm.contrib.concurrent import thread_map

from soft_search.constants import NSFFields

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def _thread_abstract_text(award_id: int) -> Dict[str, Union[int, str]]:
    response_data = requests.get(
        f"https://api.nsf.gov/"
        f"services/v1/awards/{award_id}.json"
        f"?printFields={NSFFields.abstractText}"
    ).json()
    
    # Handle data existance
    if "response" not in response_data:
        return None
    response_subset = response_data["response"]
    
    if "award" not in response_subset:
        return None
    award_data = response_subset["award"]
    
    if len(award_data) == 0:
        return None
    single_award = award_data[0]
    
    # Return the award id and the abstract text
    return {
        "award_id": award_id,
        "abstract_text": single_award[NSFFields.abstractText]
    }

# Thread gather texts
abstract_texts_list = thread_map(
    _thread_abstract_text,
    data.nsf_award_id.unique(),
)

# Filter failed values
abstract_texts = pd.DataFrame([at for at in abstract_texts_list if at is not None])

# Join to original data frame
data = data.join(abstract_texts.set_index("award_id"), on="nsf_award_id")

# Drop any rows that are missing abstract text
data = data.dropna(subset=["abstract_text"])
data

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 757/757 [01:24<00:00,  8.91it/s]


Unnamed: 0,github_link,nsf_award_id,nsf_link,from_template_repo,is_a_fork,include/exclude,annotator,abstract_text
0,https://github.com/CUAHSI/HydroServer,0622374,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,include,lindsey,This proposal advances integrative hydrologic ...
2,https://github.com/Multiscale-Sandbox/spinicedata,1939916,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey,This project addresses two of the most pressin...
3,https://github.com/Multiscale-Sandbox/spinicedata,1940145,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey,This project addresses two of the most pressin...
4,https://github.com/Multiscale-Sandbox/spinicedata,1940287,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey,This project addresses two of the most pressin...
5,https://github.com/Multiscale-Sandbox/spinicedata,1940260,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,lindsey,This project addresses two of the most pressin...
...,...,...,...,...,...,...,...,...
1467,https://github.com/sugwg/gw170817-common-eos,1748958,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard,The Kavli Institute for Theoretical Physics (K...
1468,https://github.com/sugwg/gw170817-common-eos,1714498,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard,Gamma-Ray Bursts (GRB) are intimately linked w...
1469,https://github.com/sugwg/gw170817-common-eos,1707954,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard,We have entered a new age of human exploration...
1470,https://github.com/sugwg/gw170817-common-eos,1541396,https://www.nsf.gov/awardsearch/showAward?AWD_...,False,False,exclude,richard,Investment in campus cyberinfrastructure (CI) ...


## Prep Data for Training

In [11]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [12]:
# Set up data splits of train=0.6 test=0.2 valid=0.2

# select only the columns we need
subset_data = data[["annotator", "abstract_text", "include/exclude"]]

# set the labels to ints
subset_data = subset_data.replace({"exclude": 0, "include": 1})

# lindsey
lindsey_data = subset_data.loc[subset_data.annotator == "lindsey"].drop(
    columns=["annotator"]
)
lindsey_train, lindsey_test_and_valid = train_test_split(
    lindsey_data,
    test_size=0.6,
    stratify=lindsey_data["include/exclude"],
)
lindsey_test, lindsey_valid = train_test_split(
    lindsey_test_and_valid,
    test_size=0.5,
    stratify=lindsey_test_and_valid["include/exclude"],
)

# richard
richard_data = subset_data.loc[subset_data.annotator == "richard"].drop(
    columns=["annotator"]
)
richard_train, richard_test_and_valid = train_test_split(
    richard_data,
    test_size=0.6,
    stratify=richard_data["include/exclude"],
)
richard_test, richard_valid = train_test_split(
    richard_test_and_valid,
    test_size=0.5,
    stratify=richard_test_and_valid["include/exclude"],
)

In [13]:
# Convert to Huggingface Dataset objects
lindsey_train = Dataset.from_pandas(lindsey_train, preserve_index=False)
lindsey_test = Dataset.from_pandas(lindsey_test, preserve_index=False)
lindsey_valid = Dataset.from_pandas(lindsey_valid, preserve_index=False)
richard_train = Dataset.from_pandas(richard_train, preserve_index=False)
richard_test = Dataset.from_pandas(richard_test, preserve_index=False)
richard_valid = Dataset.from_pandas(richard_valid, preserve_index=False)

## Train Models for Each Person

In [14]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import accuracy_score

In [15]:
# Load a SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# Run model training and eval
models = {}
for ds_name, train_ds, test_ds in [
    ("lindsey", lindsey_train, lindsey_test),
    ("richard", richard_train, lindsey_test),
]:  
    # Create trainer
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=2,
        num_iterations=20,
        num_epochs=1,
        column_mapping={"abstract_text": "text", "include/exclude": "label"},
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    models[ds_name] = trainer.model
    trainer.model.save_pretrained(f"trained-soft-search-transformer-{ds_name}")
    
    # Print stats and predictions
    print(ds_name)
    print("training accuracy:", metrics["accuracy"])

Applying column mapping to training dataset
***** Running training *****
  Num examples = 10400
  Num epochs = 1
  Total optimization steps = 5200
  Total train batch size = 2
Epoch:   0%|                                                                                                                                                            | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                                     | 0/5200 [00:00<?, ?it/s][A
Iteration:   0%|                                                                                                                                             | 1/5200 [00:00<51:44,  1.67it/s][A
Iteration:   0%|                                                                                                                                             | 2/5200 [00:00<39:52,  2.17it/s][A
Iteration:   0%|                                   

lindsey
training accuracy: 0.6871794871794872


***** Running training *****
  Num examples = 13640
  Num epochs = 1
  Total optimization steps = 6820
  Total train batch size = 2
Epoch:   0%|                                                                                                                                                            | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                                     | 0/6820 [00:00<?, ?it/s][A
Iteration:   0%|                                                                                                                                             | 1/6820 [00:00<50:37,  2.24it/s][A
Iteration:   0%|                                                                                                                                             | 2/6820 [00:00<50:11,  2.26it/s][A
Iteration:   0%|                                                                               

In [18]:
# Run validation accuracy
for ds_name, valid_ds in [
    ("lindsey", lindsey_valid),
    ("richard", richard_valid),
]:
    print(ds_name)
    model = models[ds_name]
    preds = model(valid_ds["abstract_text"])
    # print("predictions from validation set:", preds)
    # print("ground truth for validation set:", valid_ds["include/exclude"])
    print("validation accuracy:", accuracy_score(valid_ds["include/exclude"], preds))
    print("-" * 80)

lindsey
validation accuracy: 0.7908163265306123
--------------------------------------------------------------------------------
richard
validation accuracy: 0.6484375
--------------------------------------------------------------------------------
