# Assistance Listings test

Let's use our stored (pickled) model to identify categories for assistance listings (see fpi_create_model).

## Key Imports

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

## Define a function that will apply our estimators and use the probability to sort the categories

In [2]:
def predict_categories(text, estimators):
    """
    Score a piece of text against every category estimator.

    Each estimator's ``predict_proba`` is called on the single text, and the
    value at row 0, column 1 of its result is used as that category's
    probability.

    Returns a list of (category, probability) tuples ordered from the highest
    probability to the lowest.
    """
    scored = [
        (name, model.predict_proba([text])[0][1])
        for name, model in estimators.items()
    ]
    # Negating the probability gives a descending order while keeping ties
    # in the estimators' original order
    scored.sort(key=lambda pair: -pair[1])
    return scored
        

## Now load and test the estimators

In [3]:
import pickle

# Load the pickled estimators (see fpi_create_model); predict_categories expects
# a dictionary mapping each category to its estimator.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open("fpi_estimators.pkl", "rb") as estimators_file:
    estimators = pickle.load(estimators_file)

## Now import the Assistance Listings

In [4]:
# Read the Assistance Listings CSV file into a dataframe.
# 'Program Number' is an identifier such as "10.030", not a quantity: read it as a
# string so trailing zeros survive (parsed as a float, 10.030 is written back out as 10.03).
labeled_data = pd.read_csv(
    'AssistanceListings_USASpendingGov_PUBLIC_WEEKLY_20220101.csv',
    encoding='cp1252',
    usecols=['Program Title', 'Program Number', 'Federal Agency (030)', 'Objectives (050)'],
    dtype={'Program Number': str},
)

# Get the distinct categories, in order of first appearance
categories = (
    pd.read_csv('Federal_Program_Inventory_Pilot_Data.csv', usecols=['Category'])['Category']
    .drop_duplicates()
    .tolist()
)

# Join Program Title, Federal Agency and Objectives into a single text field.
# Missing values are replaced with empty strings first, because ' '.join raises a
# TypeError when a row contains NaN.
text_columns = ['Program Title', 'Federal Agency (030)', 'Objectives (050)']
labeled_data['text'] = labeled_data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

labeled_data

Unnamed: 0,Program Title,Program Number,Federal Agency (030),Objectives (050),text
0,Agricultural Research Basic and Applied Research,10.001,"AGRICULTURAL RESEARCH SERVICE, AGRICULTURE, DE...","To make agricultural research discoveries, eva...",Agricultural Research Basic and Applied Resear...
1,"Plant and Animal Disease, Pest Control, and An...",10.025,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To protect U.S. agriculture from economically ...,"Plant and Animal Disease, Pest Control, and An..."
2,Wildlife Services,10.028,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To reduce damage caused by mammals and birds a...,Wildlife Services ANIMAL AND PLANT HEALTH INSP...
3,Avian Influenza Indemnity Program,10.029,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",The Animal and Plant Health Inspection Service...,Avian Influenza Indemnity Program ANIMAL AND P...
4,Indemnity Program,10.030,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",Animal and Plant Health Inspection Service adm...,Indemnity Program ANIMAL AND PLANT HEALTH INSP...
...,...,...,...,...,...
3302,Food for Peace Emergency Program (EP),98.008,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...","To improve access, availability and utilizatio...",Food for Peace Emergency Program (EP) AGENCY F...
3303,John Ogonowski Farmer-to-Farmer Program,98.009,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",Improve global food production and marketing b...,John Ogonowski Farmer-to-Farmer Program AGENC...
3304,Denton Program,98.010,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",To put the empty space on U.S. Military transp...,Denton Program AGENCY FOR INTERNATIONAL DEVELO...
3305,Global Development Alliance,98.011,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",The Global Development Alliance (GDA) business...,Global Development Alliance AGENCY FOR INTERNA...


In [5]:
# Score the combined text of every listing against all category estimators
category_probabilities = []
for listing_text in labeled_data['text']:
    category_probabilities.append(predict_categories(listing_text, estimators))

category_probabilities[1:3]

[[('Native American', 0.2554974011879961),
  ('Broadband', 0.23586597466623016),
  ('Homelessness', 0.03067793481331821),
  ('STEM Education', 0.02423854847282818),
  ('HIV/AIDS', 0.005351769501626025),
  ('Global Health', 0.0016331950064983891),
  ('Economic Development', 0.0),
  ('Opioid Epidemic Response', 0.0),
  ('Workforce Development', 0.0),
  ('Flood Risk', 0.0),
  ('A.I. R&D/Quantum R&D', 0.0),
  ('Transportation Infrastructure', 0.0)],
 [('Global Health', 0.22268559373837138),
  ('Native American', 0.21911731063746404),
  ('Homelessness', 0.03067793481331821),
  ('STEM Education', 0.024342464353996),
  ('HIV/AIDS', 0.005351769501626025),
  ('Broadband', 0.0),
  ('Economic Development', 0.0),
  ('Opioid Epidemic Response', 0.0),
  ('Workforce Development', 0.0),
  ('Flood Risk', 0.0),
  ('A.I. R&D/Quantum R&D', 0.0),
  ('Transportation Infrastructure', 0.0)]]

In [6]:
# Turn each listing's list of (category, probability) pairs into a dictionary.
# dict() accepts both a list of pairs and an existing dictionary, so re-running this
# cell is harmless (unpacking the pairs by hand fails once the entries are already dicts).
category_probabilities = [dict(pairs) for pairs in category_probabilities]

category_probabilities[1:3]

[{'Native American': 0.2554974011879961,
  'Broadband': 0.23586597466623016,
  'Homelessness': 0.03067793481331821,
  'STEM Education': 0.02423854847282818,
  'HIV/AIDS': 0.005351769501626025,
  'Global Health': 0.0016331950064983891,
  'Economic Development': 0.0,
  'Opioid Epidemic Response': 0.0,
  'Workforce Development': 0.0,
  'Flood Risk': 0.0,
  'A.I. R&D/Quantum R&D': 0.0,
  'Transportation Infrastructure': 0.0},
 {'Global Health': 0.22268559373837138,
  'Native American': 0.21911731063746404,
  'Homelessness': 0.03067793481331821,
  'STEM Education': 0.024342464353996,
  'HIV/AIDS': 0.005351769501626025,
  'Broadband': 0.0,
  'Economic Development': 0.0,
  'Opioid Epidemic Response': 0.0,
  'Workforce Development': 0.0,
  'Flood Risk': 0.0,
  'A.I. R&D/Quantum R&D': 0.0,
  'Transportation Infrastructure': 0.0}]

In [7]:
# Append one probability column per category to the data frame,
# following the order of `categories`
for category_name in categories:
    column_values = []
    for probabilities in category_probabilities:
        column_values.append(probabilities[category_name])
    labeled_data[category_name] = column_values

labeled_data

Unnamed: 0,Program Title,Program Number,Federal Agency (030),Objectives (050),text,Broadband,Economic Development,Opioid Epidemic Response,STEM Education,Workforce Development,Native American,Flood Risk,A.I. R&D/Quantum R&D,Global Health,Homelessness,HIV/AIDS,Transportation Infrastructure
0,Agricultural Research Basic and Applied Research,10.001,"AGRICULTURAL RESEARCH SERVICE, AGRICULTURE, DE...","To make agricultural research discoveries, eva...",Agricultural Research Basic and Applied Resear...,0.000000,0.0,0.0,0.508370,0.0,1.000000,0.0,0.663985,0.000000,0.030678,0.005352,0.0
1,"Plant and Animal Disease, Pest Control, and An...",10.025,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To protect U.S. agriculture from economically ...,"Plant and Animal Disease, Pest Control, and An...",0.235866,0.0,0.0,0.024239,0.0,0.255497,0.0,0.000000,0.001633,0.030678,0.005352,0.0
2,Wildlife Services,10.028,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To reduce damage caused by mammals and birds a...,Wildlife Services ANIMAL AND PLANT HEALTH INSP...,0.000000,0.0,0.0,0.024342,0.0,0.219117,0.0,0.000000,0.222686,0.030678,0.005352,0.0
3,Avian Influenza Indemnity Program,10.029,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",The Animal and Plant Health Inspection Service...,Avian Influenza Indemnity Program ANIMAL AND P...,0.004658,0.0,0.0,0.025314,0.0,0.353651,0.0,0.000000,0.000000,0.030678,0.005352,0.0
4,Indemnity Program,10.030,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",Animal and Plant Health Inspection Service adm...,Indemnity Program ANIMAL AND PLANT HEALTH INSP...,0.029459,0.0,0.0,0.034794,0.0,0.514101,0.0,0.000000,0.000000,0.030678,0.005352,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3302,Food for Peace Emergency Program (EP),98.008,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...","To improve access, availability and utilizatio...",Food for Peace Emergency Program (EP) AGENCY F...,0.031118,0.0,0.0,0.035133,0.0,0.109998,0.0,0.000000,0.951102,0.030678,0.005352,0.0
3303,John Ogonowski Farmer-to-Farmer Program,98.009,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",Improve global food production and marketing b...,John Ogonowski Farmer-to-Farmer Program AGENC...,0.000000,0.0,0.0,0.062621,0.0,0.000000,0.0,0.000000,1.000000,0.030678,0.005352,0.0
3304,Denton Program,98.010,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",To put the empty space on U.S. Military transp...,Denton Program AGENCY FOR INTERNATIONAL DEVELO...,0.000000,0.0,0.0,0.039510,0.0,0.000000,0.0,0.000000,0.742560,0.030678,0.005352,0.0
3305,Global Development Alliance,98.011,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",The Global Development Alliance (GDA) business...,Global Development Alliance AGENCY FOR INTERNA...,0.000000,0.0,0.0,0.056494,0.0,0.000000,0.0,0.095189,1.000000,0.030678,0.005352,0.0


In [8]:
# Save the listings together with their category probabilities, without the index column
output_path = 'assistance_listings_categorized.csv'
labeled_data.to_csv(output_path, index=False)