# Data Cleaning and Understanding

### Import and Install Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mannwhitneyu
from tqdm import tqdm

import nltk
import os
import pickle
from nltk.tokenize import TreebankWordTokenizer



### Create the manual word tokenizer

In [2]:
# Define your text
text = "This is a test sentence."

# Path to punkt pickle file
nltk_data_path = '/Users/ryanseely/nltk_data'
punkt_path = os.path.join(nltk_data_path, 'tokenizers', 'punkt', 'english.pickle')

# Load sentence tokenizer manually from file
with open(punkt_path, 'rb') as f:
    sentence_tokenizer = pickle.load(f)

# Tokenize
sentences = sentence_tokenizer.tokenize(text)
word_tokenizer = TreebankWordTokenizer()
tokens = [word_tokenizer.tokenize(sent) for sent in sentences]

# Flatten token list
flat_tokens = [token for sublist in tokens for token in sublist]
print(flat_tokens)

# WE HAVE TO MANUALLY CREATE word_tokenize()

def word_tokenize_manual(text):
    # Path to your punkt file
    nltk_data_path = '/Users/ryanseely/nltk_data'
    punkt_path = os.path.join(nltk_data_path, 'tokenizers', 'punkt', 'english.pickl‌​e')

    # Load sentence tokenizer manually
    with open(punkt_path, 'rb') as f:
        sentence_tokenizer = pickle.load(f)

    # Tokenize into sentences
    sentences = sentence_tokenizer.tokenize(text)

    # Tokenize into words using TreebankWordTokenizer
    word_tokenizer = TreebankWordTokenizer()
    tokens = [word_tokenizer.tokenize(sent) for sent in sentences]

    # Flatten and return token list
    return [token for sublist in tokens for token in sublist]

def word_tokenize_manual(text):
    sentences = sentence_tokenizer.tokenize(text)
    return [token for sent in sentences for token in word_tokenizer.tokenize(sent)]

['This', 'is', 'a', 'test', 'sentence', '.']


### Read in Data

Create a function that pulls in each award dataset and creates a pandas dataframe called `awards_data`

- This functions also adds a `Year` variable with the year awarded for each award

Then it creates a dataframe for each year called `awards_xxxx`, where the x's represent the year

In [3]:
# Build pipeline that imports data from year of ranges from github and add a year string variable to each observation for each year, then makes each year its own dataframe

def load_and_create_award_data(start_year=2016, end_year=2025):
    base_url = "https://raw.githubusercontent.com/ryanpseely/reu_nsf/main/awards_{}.csv"
    all_data = []

    for year in range(start_year, end_year + 1):
        try:
            url = base_url.format(year)
            df = pd.read_csv(url, encoding='latin1')
            df['Year'] = str(year)
            all_data.append(df)
            globals()[f"awards_{year}"] = df  # Assign as variable
        except Exception as e:
            print(f"❌ Failed to load {year}: {e}")

    if not all_data:
        raise ValueError("No data loaded. Please check the URLs or years.")

    full_df = pd.concat(all_data, ignore_index=True)
    return full_df


In [4]:
awards_data = load_and_create_award_data(2016, 2025)

### How many awards each year?

In [5]:
# Count the number of awards for each year
award_counts = {
    "2016": len(awards_2016),
    "2017": len(awards_2017),
    "2018": len(awards_2018),
    "2019": len(awards_2019),
    "2020": len(awards_2020),
    "2021": len(awards_2021),
    "2022": len(awards_2022),
    "2023": len(awards_2023),
    "2024": len(awards_2024),
    "2025": len(awards_2025) 
}

# Print the counts neatly
print("Year-wise Award Counts:")
for year, count in award_counts.items():
    print(f"{year}: {count}")

# Check that yearly count matches with total count
total_count = len(awards_data)

sum_of_yearly_counts = (len(awards_2016) + len(awards_2017) + len(awards_2018) + len(awards_2019) + len(awards_2020) + len(awards_2021) + len(awards_2022) + len(awards_2023) + len(awards_2024) + len(awards_2025))
print(f"Total count from individual years: {sum_of_yearly_counts}")
print(f"Total count from all years combined: {total_count}")

Year-wise Award Counts:
2016: 1599
2017: 1631
2018: 1306
2019: 1845
2020: 1793
2021: 1620
2022: 1745
2023: 1458
2024: 1542
2025: 798
Total count from individual years: 15337
Total count from all years combined: 15337


### Remove NSF Award Statement repeated in most of the abstracts

In [6]:
boilerplate = (
    "This award reflects NSF's statutory mission and has been deemed worthy of support "
    "through evaluation using the Foundation's intellectual merit and broader impacts review criteria."
)

# Count occurrences across all years
total_count = awards_data["Abstract"].str.contains(boilerplate, na=False).sum()
counts_by_year = (
    awards_data["Abstract"]
    .str.contains(boilerplate, na=False)
    .groupby(awards_data["Year"])
    .sum()
)

# Count each year's occurrences
for year, count in counts_by_year.items():
    print(f"{year}: {count}")

# remove the boilerplate text from the Abstract column
awards_data["Abstract"] = awards_data["Abstract"].str.replace(boilerplate, "", regex=False)

# count how many are remaining and print
remaining_counts = (
    awards_data["Abstract"]
    .str.contains(boilerplate, na=False)
    .groupby(awards_data["Year"])
    .sum()
)

for year, count in remaining_counts.items():
    print(f"{year} remaining: {count}")

# boilerplate removed from all abstracts

2016: 0
2017: 0
2018: 1053
2019: 1722
2020: 1726
2021: 1592
2022: 1727
2023: 1455
2024: 1539
2025: 797
2016 remaining: 0
2017 remaining: 0
2018 remaining: 0
2019 remaining: 0
2020 remaining: 0
2021 remaining: 0
2022 remaining: 0
2023 remaining: 0
2024 remaining: 1
2025 remaining: 0


In [7]:
import nltk
nltk.download('punkt', force=True)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')




from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import re, string, nltk

lemmatizer = WordNetLemmatizer()

# Helper to convert POS tags to WordNet format
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

[nltk_data] Downloading package punkt to /Users/ryanseely/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ryanseely/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ryanseely/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ryanseely/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/ryanseely/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [8]:
# build text processing pipeline - just for TF-IDF, i should not apply this on the entire dataset 
import inflect
p = inflect.engine()

# ---- Text Cleaning Helpers ----
def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def collapse_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def lemmatize_text(text):
    tokens = word_tokenize_manual(text)
    tagged = pos_tag(tokens)
    lemmatized = []

    for word, tag in tagged:
        wn_tag = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, wn_tag) if wn_tag else word
        if tag.startswith('N') and p.singular_noun(lemma):
            lemma = p.singular_noun(lemma)
        lemmatized.append(lemma)

    return ' '.join(lemmatized)

# ---- Full Preprocessing Function ----
def preprocess_abstract_column(df, col='Abstract'):
    before = len(df)
    df = df.dropna(subset=[col]).copy()
    after = len(df)
    df[col] = df[col].apply(lambda x: lemmatize_text(
        collapse_whitespace(
            remove_numbers(
                remove_punctuation(
                    clean_html(x)
                )
            )
        )
    ))
    return df, before, after

In [9]:
awards_data_cleaned, before, after = preprocess_abstract_column(awards_data, col='Abstract')

In [10]:
awards_data_cleaned.to_csv("awards_data_cleaned.csv", index=False)

In [11]:
print("Before (original row count):", before)
print("After (remaining with abstract):", after)
print("Dropped rows:", before - after)

Before (original row count): 15337
After (remaining with abstract): 15290
Dropped rows: 47


In [12]:
for i, abstract in enumerate(awards_data_cleaned['Abstract'][:5]):
    print(f"Abstract {i+1}:", abstract)
    print()

Abstract 1: Earth Science on Volcanic Island ESVI be a new Research Experience for Undergraduate REU Site host by the Department of Geology and Geophysic School of Ocean and Earth Science and Technology at the University of Hawaii at Mânoa Oceanic island formation evolution and sustainability be unify theme that a group of undergraduate student will explore during an exciting week program The Hawaiian Island be of volcanic origin be windows into the Earth interior Built by magma that have ascend roughly a hundred kilometer to reach the surface the volcano subsequently experience deformation erosion flank collapse and eventually sink below sea level Oceanic island be also site of intense biological evolution and be ecological niche in a vast ocean The University of Hawaii be uniquely position to take advantage of these phenomenon and our new program will provide cuttingedge STEM research opportunity for motivated undergraduate student in particular traditionally underrepresented group s

In [13]:
# Search for text I don't want - plurals, br, etc.
full_text = ' '.join(awards_data_cleaned['Abstract'].tolist())

word_to_count = 'fellowships'
count = full_text.lower().split().count(word_to_count.lower())

print(f"The word '{word_to_count}' appears {count} times.")

# There is still a problem with plurals. Fellowship appears 985 times, fellowships appears 3 times. This is an improvement however, from adding the plural remover to the lemmatizer

The word 'fellowships' appears 3 times.


In [14]:
from collections import Counter
word_counts = Counter(' '.join(awards_data_cleaned['Abstract']).lower().split())
word_counts.most_common(10)

[('the', 317063),
 ('and', 262396),
 ('of', 216169),
 ('to', 165399),
 ('be', 127324),
 ('in', 126783),
 ('a', 116758),
 ('will', 86929),
 ('this', 63383),
 ('for', 62808)]