# Training Toy SetFit Models for NSF Award Abstract Software Prediction

Quick notebook which uses only a sample of our data, merges in the current annotations from Lindsey and Richard, gets the NSF award abstract texts, and then trains a model with SetFit.

Larger example to come soon^tm.

In [None]:
import pandas as pd

## Setup

1. Read a sample of the "NSF + GitHub Linked" data (output from Eva's script)
2. Read Lindsey's labelled GitHub Repos for Software Classification
3. Read Richard's labelled GitHub Repos for Software Classification
4. Join the datasets together and drop any rows where the `include/exclude` label is NA

In [None]:
# Load the sample of NSF awards that have been linked to GitHub repositories
linked_nsf_github_sample = pd.read_parquet(
    path="/Users/eva/Downloads/linked-github-nsf-results.parquet",
)

In [None]:
# Load Lindsey's labelled GitHub repos, keep only the label and the repo link,
# and tag every row with the annotator's name
lindsey_coded_repos = (
    pd.read_csv(
        "/Users/eva/Downloads/all-github-search-results-duplicates-removed - Lindsey.csv",
    )
    .loc[:, ["include/exclude", "link"]]
    .assign(annotator="lindsey")
)

In [None]:
# Load Richard's labelled GitHub repos, keep only the label and the repo link,
# and tag every row with the annotator's name
richard_coded_repos = (
    pd.read_csv(
        "/Users/eva/Downloads/all-github-search-results-duplicates-removed - Richard.csv",
    )
    .loc[:, ["include/exclude", "link"]]
    .assign(annotator="richard")
)

In [None]:
# Attach each annotator's labels to the linked sample by matching the sample's
# `github_link` against the annotated repo `link` (one joined frame per annotator)
data_lindsey, data_richard = (
    linked_nsf_github_sample.join(coded_repos.set_index("link"), on="github_link")
    for coded_repos in (lindsey_coded_repos, richard_coded_repos)
)

# Stack both annotators' frames and keep only the rows that actually have a label
data = pd.concat([data_lindsey, data_richard]).dropna(subset=["include/exclude"])
data.head()

## Quick Value Counts

In [None]:
# Count of each include/exclude label among Lindsey's annotations
data.loc[data["annotator"] == "lindsey", "include/exclude"].value_counts()

In [None]:
# Count of each include/exclude label among Richard's annotations
data.loc[data["annotator"] == "richard", "include/exclude"].value_counts()

## Get NSF Award Abstracts

In [None]:
from typing import Dict, Union

import requests
from tqdm.contrib.concurrent import thread_map

from soft_search.constants import NSFFields

In [None]:
def _get_abstract_text(
    award_id: int,
    timeout: float = 30.0,
) -> Dict[str, Union[int, str]]:
    """
    Fetch the abstract text of a single NSF award from the NSF awards API.

    Parameters
    ----------
    award_id: int
        The NSF award id to look up.
    timeout: float
        Seconds to wait on the API before giving up.
        Default: 30.0

    Returns
    -------
    Dict[str, Union[int, str]]
        A record with the ``award_id`` that was passed in and the
        ``abstract_text`` read from the first award in the API response.

    Raises
    ------
    requests.HTTPError
        The API answered with an error (4xx / 5xx) status code.
    requests.Timeout
        The API did not answer within ``timeout`` seconds.
    """
    # Without a timeout a single stalled connection blocks its worker thread
    # forever, which in turn keeps the surrounding thread_map from finishing.
    response = requests.get(
        f"https://api.nsf.gov/"
        f"services/v1/awards/{award_id}.json"
        f"?printFields={NSFFields.abstractText}",
        timeout=timeout,
    )

    # Fail with a clear HTTP error rather than an opaque JSON / key error
    response.raise_for_status()

    return {
        "award_id": award_id,
        "abstract_text": (
            response.json()["response"]["award"][0][NSFFields.abstractText]
        ),
    }

# Fetch the abstract once per distinct award id, running the requests
# through `thread_map`, and collect the returned records into a frame
abstract_texts = pd.DataFrame(
    thread_map(_get_abstract_text, data["nsf_award_id"].unique()),
)

# Attach each abstract to its rows by matching `nsf_award_id` to `award_id`
data = data.join(
    abstract_texts.set_index("award_id"),
    on="nsf_award_id",
)
data.head()

## Prep Data for Training

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Set up data splits of train=0.6 test=0.2 valid=0.2
#
# Each annotator's data is split in two steps:
#   1. hold out 0.4 of the rows (leaving train=0.6)
#   2. cut the held-out rows in half (test=0.2, valid=0.2 of the total)
# Both steps stratify on the label so every split keeps the label balance.

# select only the columns we need
subset_data = data[["annotator", "abstract_text", "include/exclude"]]

# lindsey
lindsey_data = subset_data.loc[subset_data.annotator == "lindsey"].drop(
    columns=["annotator"]
)
lindsey_train, lindsey_test_and_valid = train_test_split(
    lindsey_data,
    # 0.4 held out -> 0.6 train (0.6 here would leave only 0.4 for training)
    test_size=0.4,
    stratify=lindsey_data["include/exclude"],
)
lindsey_test, lindsey_valid = train_test_split(
    lindsey_test_and_valid,
    test_size=0.5,
    stratify=lindsey_test_and_valid["include/exclude"],
)

# richard
richard_data = subset_data.loc[subset_data.annotator == "richard"].drop(
    columns=["annotator"]
)
richard_train, richard_test_and_valid = train_test_split(
    richard_data,
    # 0.4 held out -> 0.6 train (0.6 here would leave only 0.4 for training)
    test_size=0.4,
    stratify=richard_data["include/exclude"],
)
richard_test, richard_valid = train_test_split(
    richard_test_and_valid,
    test_size=0.5,
    stratify=richard_test_and_valid["include/exclude"],
)

In [None]:
# Swap every pandas split for a Hugging Face `Dataset` built from it, rebinding
# the same names. `preserve_index=False` is passed for every split so the
# pandas index is left out of the resulting dataset.
(
    lindsey_train,
    lindsey_test,
    lindsey_valid,
    richard_train,
    richard_test,
    richard_valid,
) = (
    Dataset.from_pandas(split_frame, preserve_index=False)
    for split_frame in (
        lindsey_train,
        lindsey_test,
        lindsey_valid,
        richard_train,
        richard_test,
        richard_valid,
    )
)

## Train Models for Each Person

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

In [None]:
# Load a SetFit model from Hub
# The pretrained "paraphrase-mpnet-base-v2" sentence-transformers checkpoint is
# loaded once here; this one `model` object is what the training cell below
# hands to its trainer.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

In [None]:
# One (train, test, valid) triple per annotator; each annotator's model must be
# evaluated against that same annotator's held-out splits.
# `valid_ds` is unpacked for completeness but is not used by the trainer yet.
for train_ds, test_ds, valid_ds in [
    (lindsey_train, lindsey_test, lindsey_valid),
    (richard_train, richard_test, richard_valid),
]:
    # Create trainer
    # NOTE(review): the same `model` object is passed on every iteration, so
    # once training is enabled and the `break` below is removed, the second
    # annotator's run would presumably start from the first annotator's
    # fine-tuned weights — confirm whether a fresh model per annotator is wanted.
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_iterations=20,
        num_epochs=1,
        # Rename our columns to the "text" / "label" names given to SetFit
        column_mapping={"abstract_text": "text", "include/exclude": "label"},
    )

    # Train and evaluate
    # (left disabled in this toy notebook; the `break` stops after the first
    # annotator's trainer has been constructed)
    # trainer.train()
    # metrics = trainer.evaluate()
    break