# Federal Project Inventory Model Use
Let's use our stored (pickled) model to identify the categories of a given program text (see fpi_create_model).

## Key Imports

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

## Define a function that will apply our estimators and use the probability to sort the categories

In [2]:
def predict_categories(text,estimators):
    """
    Score a piece of text against every per-category estimator.

    Each estimator in the `estimators` dictionary (category name -> estimator) is asked
    for `predict_proba([text])`; the value at index 1 of the first row is taken as the
    score for that category.

    Returns a list of (category, score) tuples ordered from highest to lowest score.
    Categories with equal scores keep the order in which they appear in `estimators`.
    """
    scored = [
        (name, model.predict_proba([text])[0][1])
        for name, model in estimators.items()
    ]
    # list.sort is stable, so ties stay in dictionary order
    scored.sort(key=lambda pair: -pair[1])
    return scored
        

## Now load and test the estimators

In [3]:
import pickle

# NOTE: unpickling can execute arbitrary code - only load estimator files you created or trust.
# The context manager closes the file handle as soon as the estimators are loaded
# (the bare open() call left it open until garbage collection).
with open("fpi_estimators.pkl", "rb") as estimators_file:
    estimators = pickle.load(estimators_file)

In [4]:
# Example: a rural broadband loan program description
predict_categories(
    "Area Development Base Program Provides access to broadband by funding "
    "for the cost of constructing, improving, and acquiring facilities and "
    "equipment for broadband service in rural communities of 20,000 "
    "inhabitants or less. Direct loans are made at the cost of money to the "
    "Treasury for the life of the facilities financed.",
    estimators,
)

[('Broadband', 1.0),
 ('Native American', 0.19973434838156898),
 ('Transportation Infrastructure', 0.1289747674224514),
 ('Opioid Epidemic Response', 0.06096071424030308),
 ('Workforce Development', 0.04613192362603553),
 ('Homelessness', 0.023516401253228236),
 ('HIV/AIDS', 0.023380235332656832),
 ('Economic Development', 0.0),
 ('STEM Education', 0.0),
 ('Flood Risk', 0.0),
 ('A.I. R&D/Quantum R&D', 0.0),
 ('Global Health', 0.0)]

In [5]:
# Example: free-form text written to probe the "Native American" and "Homelessness" estimators
predict_categories(
    "I have no idea how to convince this system to mark this text as native "
    "american, but I'm guessing. What happens if I also discuss tribal issues "
    "and housing? Homelessness and poverty is a concern with native american "
    "populations. Housing is important.",
    estimators,
)

[('Homelessness', 1.0),
 ('Native American', 0.9803061857199871),
 ('Opioid Epidemic Response', 0.06096071424030308),
 ('Workforce Development', 0.04613192362603553),
 ('HIV/AIDS', 0.01612885199563541),
 ('Broadband', 0.01319442303341195),
 ('Economic Development', 0.0),
 ('STEM Education', 0.0),
 ('Flood Risk', 0.0),
 ('A.I. R&D/Quantum R&D', 0.0),
 ('Global Health', 0.0),
 ('Transportation Infrastructure', 0.0)]

### Let's find out what contributed to the "HIV/AIDS" Category

In [6]:
# Pick out the single estimator stored under the "HIV/AIDS" key so its steps can be inspected below.
estimator = estimators["HIV/AIDS"]

In [7]:
# Pair every feature kept by the 'select' step with the coefficient the classifier learned for it.
vectorizer = estimator.named_steps['vectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only for older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()
mask = estimator.named_steps['select'].get_support()
# `keep` rather than `bool`, which shadowed the builtin inside the comprehension
new_features = [feature for keep, feature in zip(mask, features) if keep]
nf = pd.DataFrame({'features': new_features, 'coef': estimator.named_steps['classifier'].coef_[0]})
# Largest positive coefficients first
nf.sort_values(['coef'], ascending=False)

Unnamed: 0,features,coef
443,hiv,7.015168
71,aids,5.393709
826,rwhap,4.818558
154,categories,2.469945
596,minority,2.177972
...,...,...
948,tribal,-2.360657
373,federal,-3.006603
737,program,-3.036754
83,american,-3.139768


## Define a function that lists the features found in a string

In [8]:
from collections import Counter

def why_category(text,estimators,category):
    """
    Explain a category's score by listing which of its model features occur in the text.

    Parameters:
        text: the program text to inspect.
        estimators: dictionary mapping category name -> estimator exposing `named_steps`
            with 'vectorizer', 'select' and 'classifier' entries.
        category: key of the estimator to explain.

    Returns a dictionary mapping each feature found in the text to
    [number of occurrences, classifier coefficient], ordered from the largest to the
    smallest product of the two. An empty dictionary is returned when `category`
    is not a key of `estimators`.

    Raises ValueError when the number of selected features does not match the number
    of classifier coefficients.

    Matching is done on whitespace-separated, lower-cased words with ',' and '.' removed.
    Multi-word features are matched against runs of consecutive words.
    """
    if category not in estimators:
        return {}
    steps = estimators[category].named_steps

    vectorizer = steps['vectorizer']
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()
    mask = steps['select'].get_support()
    selected = [name for keep, name in zip(mask, vocabulary) if keep]
    coefficients = steps['classifier'].coef_[0]
    if len(selected) != len(coefficients):
        raise ValueError(
            "selected features (%d) and classifier coefficients (%d) differ in length"
            % (len(selected), len(coefficients))
        )

    # Side effect kept from the original implementation: raise pandas' row display limit.
    pd.options.display.max_rows = 500

    # Remove simple punctuation, make lowercase, and split into words
    words = text.replace(',', '').replace('.', '').lower().split()

    # Counting word runs (instead of str.count on " feature ") also catches adjacent repeats
    # such as "research research", which share a delimiter space and were counted once.
    ngram_counts = {}  # run length -> Counter of space-joined word runs of that length
    results = {}
    for feature, coef in zip(selected, coefficients):
        feature_words = str(feature).split()
        size = len(feature_words)
        if size == 0:
            continue
        if size not in ngram_counts:
            ngram_counts[size] = Counter(
                " ".join(words[start:start + size])
                for start in range(len(words) - size + 1)
            )
        freq = ngram_counts[size][" ".join(feature_words)]
        if freq > 0:
            results[feature] = [freq, float(coef)]

    # Return results sorted by product of freq and coefficient
    ranked = sorted(results.items(),
                    key=lambda item: item[1][0] * item[1][1],
                    reverse=True)
    return dict(ranked)

In [9]:
# Example: which "Native American" features appear in an agricultural research description?
wc = why_category(
    "Agricultural Research Basic and Applied Research AGRICULTURAL RESEARCH "
    "SERVICE, AGRICULTURE, DEPARTMENT OF To make agricultural research "
    "discoveries, evaluate alternative ways of attaining research goals, and "
    "provide scientific technical information.",
    estimators,
    "Native American",
)
wc

{'make': [1, 3.8542492734462956],
 'basic': [1, -0.25983401117225263],
 'information': [1, -0.4854195842491149],
 'ways': [1, -0.5474837089922611],
 'research': [5, -2.8147870263424366]}