# Use LLM/VD Model on Assistance Listings

In [1]:
import sys

# Record the interpreter version this notebook was executed with
print(sys.version)

3.10.11 (main, Apr  5 2023, 14:15:30) [GCC 7.5.0]


## Connect to OpenAI (ChatGPT) and the Pinecone Vector Database (VD)

In [2]:
import os
import openai

# The key comes from the OPENAI_API_KEY environment variable; when that is
# unset or empty the literal placeholder below is used instead
# (keys are issued from the top-right dropdown on the OpenAI website)
openai.api_key = os.environ.get("OPENAI_API_KEY") or "OPENAI_API_KEY"

# openai.Engine.list()  # check we have authenticated

In [4]:
# Connect to Pinecone with PINECONE_API_KEY and PINECONE_ENVIRONMENT

import pinecone
from tqdm import tqdm

# Pinecone credentials are read from the environment; when a variable is
# unset or empty the literal placeholder string is used instead.
api_key = os.getenv("PINECONE_API_KEY") or "PINECONE_API_KEY"
# find your environment next to the api key in pinecone console
env = os.getenv("PINECONE_ENVIRONMENT") or "PINECONE_ENVIRONMENT"

# The keyword must be spelled `environment` for the value above to be used.
pinecone.init(api_key=api_key, environment=env)

# Embed one throwaway sentence: the size of the returned vector is the
# embedding length produced by this model

embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    engine=embed_model,
    input=["This is sample test that will determine the length"],
)

embedding_length = len(res["data"][0]["embedding"])
embedding_length

1536

In [5]:
# Open the existing Pinecone index by name and display its statistics

index_name = "openai-fpi-categorization"

# handle used for all later queries against the index
index = pinecone.Index(index_name)
# stats as reported by the index itself
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 711}},
 'total_vector_count': 711}

## Now import the Assistance Listings

In [59]:
import pandas as pd

# Load only the listing columns we need; every value is kept as a string and
# empty cells stay empty strings rather than becoming NaN

listing_columns = [
    'Program Title',
    'Program Number',
    'Federal Agency (030)',
    'Objectives (050)',
    'Beneficiary Eligibility (082)',
]

labeled_data = pd.read_csv(
    'AssistanceListings_USASpendingGov_PUBLIC_WEEKLY_20230520.csv',
    encoding='cp1252',
    dtype=str,
    keep_default_na=False,
    usecols=listing_columns,
)

# Build one human-readable text field per listing for the LLM: program name,
# agency, program description (the objectives) and beneficiaries, one per line
labeled_data['text'] = (
    "Program Name: " + labeled_data['Program Title']
    + "\nAgency: " + labeled_data['Federal Agency (030)']
    + "\nProgram Description: " + labeled_data['Objectives (050)']
    + "\nBeneficiaries: " + labeled_data['Beneficiary Eligibility (082)']
)

labeled_data

Unnamed: 0,Program Title,Program Number,Federal Agency (030),Objectives (050),Beneficiary Eligibility (082),text
0,Agricultural Research Basic and Applied Research,10.001,"AGRICULTURAL RESEARCH SERVICE, AGRICULTURE, DE...","To make agricultural research discoveries, eva...",Usually nonprofit institutions of higher educa...,Program Name: Agricultural Research Basic and ...
1,"Plant and Animal Disease, Pest Control, and An...",10.025,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To protect U.S. agriculture from economically ...,"Farmers, ranchers, agriculture producers, Stat...","Program Name: Plant and Animal Disease, Pest C..."
2,Wildlife Services,10.028,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",To reduce damage caused by mammals and birds a...,"States, local jurisdictions, U.S. Territorial ...",Program Name: Wildlife Services\nAgency: ANIMA...
3,Avian Influenza Indemnity Program,10.029,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",The Animal and Plant Health Inspection Service...,Poultry owners and contract growers.,Program Name: Avian Influenza Indemnity Progra...
4,Indemnity Program,10.030,"ANIMAL AND PLANT HEALTH INSPECTION SERVICE, AG...",Animal and Plant Health Inspection Service adm...,,Program Name: Indemnity Program\nAgency: ANIMA...
...,...,...,...,...,...,...
3499,Food for Peace Emergency Program (EP),98.008,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...","To improve access, availability and utilizatio...","Private, non-profit institutions/organizations...",Program Name: Food for Peace Emergency Program...
3500,John Ogonowski Farmer-to-Farmer Program,98.009,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",Improve global food production and marketing b...,"Farms, cooperatives, farmers associations, agr...",Program Name: John Ogonowski Farmer-to-Farmer ...
3501,Denton Program,98.010,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",To put the empty space on U.S. Military transp...,"PVOs, NGOs and International Organizations (IO...",Program Name: Denton Program\nAgency: AGENCY F...
3502,Global Development Alliance,98.011,"AGENCY FOR INTERNATIONAL DEVELOPMENT, AGENCY F...",The Global Development Alliance (GDA) business...,"Beneficiaries are foreign governments, foreign...",Program Name: Global Development Alliance\nAge...


In [27]:
# Get the categories: the distinct values of the 'Category' column of the
# pilot inventory, as a plain list of strings
category_column = pd.read_csv(
    'Federal_Program_Inventory_Pilot_Data.csv',
    encoding='cp1252',
    usecols=['Category'],
)['Category']
# drop_duplicates keeps the first occurrence, so the list follows file order
categories = category_column.drop_duplicates().tolist()
categories

['Broadband',
 'Economic Development',
 'Opioid Epidemic Response',
 'STEM Education',
 'Workforce Development',
 'Native American',
 'Flood Risk',
 'A.I. R&D/Quantum R&D',
 'Global Health',
 'Homelessness',
 'HIV/AIDS',
 'Transportation Infrastructure']

In [60]:
# Spot-check the listing at index label 400: its program number, then the
# combined text field
print(labeled_data.loc[400, 'Program Number'])
print(labeled_data.loc[400, 'text'])

10.751
Program Name: Rural Energy Savings Program (RESP)
Agency: RURAL UTILITIES SERVICE, AGRICULTURE, DEPARTMENT OF
Program Description: The Rural Energy Savings Program (RESP) provides loans to entities who provide energy efficiency 
services in rural areas that agree to make affordable loans to help Qualified consumers implement cost- effective, energy efficiency measures. This program, authorized by Congress in the 2014 Farm Bill, helps to build a cleaner and more sustainable domestic energy sector for future generations. RESP will help lower energy bills for rural families and businesses and will reduce barriers to 
investment in energy efficiency projects or activities.
Beneficiaries: N/A


## Define the Categorization Query

In [65]:
import json


def _parse_json_reply(reply):
    """Decode the model's reply as JSON, tolerating text around the object.

    The reply is first parsed as-is. If that fails, the span from the first
    '{' to the last '}' is parsed instead, which copes with replies wrapped in
    code fences or surrounded by extra prose.

    Raises:
        json.JSONDecodeError: if neither attempt yields valid JSON.
    """
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        start, end = reply.find("{"), reply.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(reply[start:end + 1])


# Gets the categories
# openai, index, categories, embed_model are global
def llm_vd_get_categories_4(text, temperature=0, top_k=5):
    """Ask GPT-4 to categorize one program, using Pinecone matches as examples.

    The text is embedded with `embed_model`, the `top_k` nearest entries are
    fetched from `index`, and their stored text and categories are placed in
    the prompt as worked examples ahead of the program to categorize.

    Args:
        text: Program description to categorize.
        temperature: Sampling temperature passed to the chat completion.
        top_k: Number of nearest-neighbor matches to request from the index.

    Returns:
        The value decoded from the model's JSON reply. The prompt asks for an
        object with 'clues', 'reasoning', 'categories' and 'confidence', but
        the reply is not validated against that shape.

    Raises:
        json.JSONDecodeError: if no JSON can be decoded from the reply.
    """
    # First get the related hits from Pinecone
    qe = openai.Embedding.create(input=[text], engine=embed_model)
    res = index.query(qe['data'][0]['embedding'], top_k=top_k, include_metadata=True)

    # Rendered once; the list of choices appears twice in the prompt
    category_choices = "['" + "','".join(categories) + "']"

    # Construct the query
    query = ("You are natural language processing categorization tool.\n" +
             "Given examples of nearest-neighbor US grant programs and their categories. You will assign " +
             "zero to many categories to a new grant program.\n" +
             "The category choices are " + category_choices + "\n" +
             "The following are pre-categorized programs and their known categories:\n\n")
    for match in res['matches']:
        query += (match['metadata']['text'] + "\n\n" +
                  "Categories: " + ', '.join(match['metadata']['categories']) + "\n\n" +
                  "---" + "\n\n")
    query += ("Please use the above categorized programs to assign categories to the following program description: \n\n" +
              "Program text: " + text + "\n\n" +
              "Output a JSON object (do not output any other text) with four entries: \n" +
              " 1) 'clues' First, list CLUES (i.e. Keywords, phrases, contextual information, semantic relations, semantic meaning, tones, references) that support the categorization\n" +
              " 2) 'reasoning' Second, deduce the diagnostic REASONING process from premises (i.e., clues, input) that supports the categorization\n" +
              " 3) 'categories' an array of strings with your categories based upon the clues and reasoning (the choices are " + category_choices + ")\n" +
              " 4) 'confidence' a number from 0 to 100 representing your confidence (100 being the most confident)\n\n")
    chat = openai.ChatCompletion.create(
        model='gpt-4',
        temperature=temperature,
        messages=[{"role": "user", "content": query}]
    )
    return _parse_json_reply(chat['choices'][0]['message']['content'])

In [66]:
# Try the categorizer on the listing at index label 400
llm_vd_get_categories_4(labeled_data.loc[400, 'text'], temperature=0)

{'clues': ['Rural Energy Savings Program',
  'provides loans',
  'energy efficiency services',
  'rural areas',
  'cost-effective, energy efficiency measures',
  'lower energy bills for rural families and businesses',
  'reduce barriers to investment in energy efficiency projects or activities'],
 'reasoning': "The program is designed to provide loans for energy efficiency services in rural areas. This is similar to the 'Rural Economic Development Loans' and 'Rural Economic Development Grants' programs, which also provide funding for rural areas and are categorized under 'Economic Development'. The focus on energy efficiency also aligns with the 'Energy Efficiency and Renewable Energy (EERE)' program, which falls under the 'A.I. R&D/Quantum R&D' category. However, the RESP program does not mention any involvement of Native American communities or STEM education, unlike the EERE program.",
 'categories': ['Economic Development', 'A.I. R&D/Quantum R&D'],
 'confidence': 85}

## Sample for Model Validation

Here we select a fixed sample of Assistance Listings to categorize and review.

In [67]:
import random

# Number of listings in the validation sample
sample_count = 25

# The ids are written out literally so the same listings are used on every
# run; the commented-out lines show how a fresh random sample would be drawn
# sample_ids = random.sample(range(0, len(labeled_data['text'])), sample_count)
# sample_ids.sort()
sample_ids = [
    81, 182, 400, 680, 957,
    1049, 1278, 1361, 1458, 1522,
    1658, 1711, 1745, 1924, 2078,
    2433, 2719, 2760, 2784, 2891,
    2932, 2994, 3358, 3402, 3437,
]
print(sample_ids)

[81, 182, 400, 680, 957, 1049, 1278, 1361, 1458, 1522, 1658, 1711, 1745, 1924, 2078, 2433, 2719, 2760, 2784, 2891, 2932, 2994, 3358, 3402, 3437]


In [68]:
# Start from the results saved by an earlier run when the file exists,
# otherwise from an empty list
results_path = "found_categories_al.json"
found_categories = []
if os.path.exists(results_path):
    with open(results_path, "r") as infile:
        found_categories = json.load(infile)

In [69]:
import time

# Give up on a sample after this many failures in a row
MAX_CONSECUTIVE_ERRORS = 3
# Pause between retries, in seconds
RETRY_DELAY_SECONDS = 5

# The following is a reentrant loop so that we can calculate the categories
# in batches: samples whose position is already covered by found_categories
# are skipped, and the results file is rewritten after every new sample.
# NOTE(review): resuming assumes found_categories is in sample_ids order.
error_count = 0
for i, key in enumerate(sample_ids):
    if i < len(found_categories):
        continue
    while True:
        try:
            # The temperature rises by 0.1 per consecutive failure --
            # presumably so a retry does not reproduce the same bad reply
            chat_result = llm_vd_get_categories_4(labeled_data['text'][key], error_count * 0.1)
            error_count = 0
            break
        except Exception as e:
            print(e)
            error_count += 1
            if error_count >= MAX_CONSECUTIVE_ERRORS:
                # Chain the last failure so its traceback is not lost
                raise RuntimeError(f"{MAX_CONSECUTIVE_ERRORS} errors in a row") from e
            time.sleep(RETRY_DELAY_SECONDS)
    chat_result["key"] = key  # Record key just in case
    chat_result["Program Number"] = labeled_data['Program Number'][key]  # Program number of the same listing
    print(f"{i} {key}")
    found_categories.append(chat_result)
    with open("found_categories_al.json", "w") as outfile:
        json.dump(found_categories, outfile)

0 81
1 182
2 400
3 680
4 957
5 1049
6 1278
7 1361
8 1458
9 1522
10 1658
11 1711
12 1745
13 1924
14 2078
15 2433
16 2719
17 2760
18 2784
19 2891
20 2932
21 2994
22 3358
23 3402
24 3437
