In [48]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search
from bs4 import BeautifulSoup as bs
import requests

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-4o-mini"

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", [Enter your OpenAI API Key Here]))


## RAG

In [49]:
snap_dict = {
    'A': ['273.1', '273.2'], 
    'B': ['273.3', '273.4'],
    'C': ['273.5', '273.6', '273.7'],
    'D': ['273.8', '273.9', '273.10', '273.11'],
    'E': ['273.12', '273.13', '273.14', '273.15'],
    'F': ['273.16', '273.17', '273.18']
}

snap_text = pd.DataFrame()
for key, value in snap_dict.items(): 
    for i in value:
        print(key, i)
        URL = f"https://www.ecfr.gov/api/versioner/v1/full/2024-07-30/title-7.xml?subtitle=B&chapter=II&subchapter=C&part=273&subpart={key}&section={i}"
        response = requests.get(URL)
        xml_reg_text = response.content
        sectioned_text = pd.read_xml(xml_reg_text)
        snap_text = pd.concat([snap_text, sectioned_text], axis = 0)


A 273.1
A 273.2
B 273.3
B 273.4
C 273.5
C 273.6
C 273.7
D 273.8
D 273.9
D 273.10
D 273.11
E 273.12
E 273.13
E 273.14
E 273.15
F 273.16
F 273.17
F 273.18


In [50]:
snap_text = snap_text.rename(columns={"HEAD": "title", "P": "text", "I":"section_header"})
snap_text.head(10)

Unnamed: 0,title,text,section_header,TYPE,CITA,HED,PSPACE,E,FP,SU,...,width,DIV,BORDER,DEEP,HTYPE,POSITION,ROTATION,SPAN,STRIP,img
0,§ 273.1 Household concept.,,,,,,,,,,...,,,,,,,,,,
1,,(a),General household definition.,,,,,,,,...,,,,,,,,,,
2,,(1) An individual living alone;,,,,,,,,,...,,,,,,,,,,
3,,"(2) An individual living with others, but cust...",,,,,,,,,...,,,,,,,,,,
4,,(3) A group of individuals who live together a...,,,,,,,,,...,,,,,,,,,,
5,,(b),Required household combinations.,,,,,,,,...,,,,,,,,,,
6,,(i) Spouses;,,,,,,,,,...,,,,,,,,,,
7,,(ii) A person under 22 years of age who is liv...,,,,,,,,,...,,,,,,,,,,
8,,(iii) A child (other than a foster child) unde...,,,,,,,,,...,,,,,,,,,,
9,,(2),Elderly and disabled persons.,,,,,,,,...,,,,,,,,,,


In [52]:
sectioned_text = snap_text[['title', 'section_header', 'text']]
sectioned_text = sectioned_text.fillna('')
sectioned_text['combined_text'] = sectioned_text['title'] + sectioned_text['section_header'] + sectioned_text['text']
sectioned_text.head(10)

Unnamed: 0,title,section_header,text,combined_text
0,§ 273.1 Household concept.,,,§ 273.1 Household concept.
1,,General household definition.,(a),General household definition.(a)
2,,,(1) An individual living alone;,(1) An individual living alone;
3,,,"(2) An individual living with others, but cust...","(2) An individual living with others, but cust..."
4,,,(3) A group of individuals who live together a...,(3) A group of individuals who live together a...
5,,Required household combinations.,(b),Required household combinations.(b)
6,,,(i) Spouses;,(i) Spouses;
7,,,(ii) A person under 22 years of age who is liv...,(ii) A person under 22 years of age who is liv...
8,,,(iii) A child (other than a foster child) unde...,(iii) A child (other than a foster child) unde...
9,,Elderly and disabled persons.,(2),Elderly and disabled persons.(2)


In [53]:
user_entered_text ={
    'title': ['', '', '', '', '', '', '', ''],
    'section_header': ['', '', '', '', '', '', '', ''],
    'text': ['', '', '', '', '', '', '', '']
,
    'combined_text': 
["""
    
Net Monthly Income Eligibility Standards
(100 Percent of Poverty Level)
Household Size of 1 : $1,074 
Household Size of 2 : $1,452 
Household Size of 3 : $1,830 
Household Size of 4 : $2,209 
Household Size of 5 : $2,587 
Household Size of 6 : $2,965 
Household Size of 7 : $3,344 
Household Size of 8 : $3,722 
Each additional member : $379 
"""
, 
"""
2022 Gross Monthly Income Eligibility Standards
(130 Percent of Poverty Level)
Household Size of 1 : $1,396 
Household Size of 2 : $1,888 
Household Size of 3 : $2,379 
Household Size of 4 : $2,871 
Household Size of 5 : $3,363 
Household Size of 6 : $3,855 
Household Size of 7 : $4,347 
Household Size of 8 : $4,839 
Each additional member : $492 
"""
, 
    """ Maximum Monthly SNAP allotment for FY 2022: 
     People in Household: 1, Maximum Monthly Allotment: $250.
     People in Household: 2, Maximum Monthly Allotment: $459.
     People in Household: 3, Maximum Monthly Allotment: $658.
     People in Household: 4, Maximum Monthly Allotment: $835.
     People in Household: 5, Maximum Monthly Allotment: $992.
     People in Household: 6, Maximum Monthly Allotment: $1,190.
     People in Household: 7, Maximum Monthly Allotment: $1,316.
     People in Household: 8, Maximum Monthly Allotment: $1,504.
    Each additional person beyond 8 people, add $188 to the Maximum Monthly Allotment.
    """
, 
""" Standard Deductions 2022: 
Household Size 1 to 2 : $177
Household Size 3 : $177
Household Size 4 : $184
Household Size 5 : $215
Household Size 6+: $246   
 
Maximum Shelter Deduction 2022: $597

Homeless Shelter Deduction 2022: $159.73

Maximum Resource Limits 2022: 
Households with at least one member who is age 60 or older or is disabled: $3,750
All other households: $2,500
"""
,

 """To determine a SNAP household’s monthly net income, there are certain allowable deductions that are subtracted from the household’s gross monthly income. Some of the deduction amounts are set by the Federal government and are subject to change each October 1.

20 percent deduction is applied to earned income (e.g., money from a job or self-employment).
A standard deduction is applied to all SNAP households. 
A dependent care deduction when needed for work, training, or education.
Legally owed child support payments.
Excess shelter costs that are more than half of the household’s income after all other deductions have been applied.   In addition to the standard, a household can claim the following allowable shelter costs:
Rent or mortgage payments and interest, or Taxes on the home
Households with a person(s) age 60 years or older or a person with a disability can also deduct out-of-pocket medical expenses of more than $35. Payments made by insurance or someone else cannot be counted.
"""
, 
""" Resource Limits
Resources are things like bank accounts, money market funds, certificates of deposit, and stock and bonds. 
Some things never count, such as the value of the home. 
If a resident applying for SNAP is categorically eligible, there is no limit on resources.

If you apply for SNAP, the District will review your information to determine if a resource limit applies.
Resource limits are set by the Federal government and are subject to change each October 1.
"""
, 
""" Certification Period

Households approved for SNAP are assigned a certification period.  
The certification period is the length of time your household will receive benefits.  In some cases, a certification period may end early if there is a change in circumstances that makes the household ineligible (e.g., receiving an increase in earned or unearned income).

Most SNAP households will receive a 12-month certification period.  
SNAP households that meet ESAP (Elderly Simplified Application Project) criteria will receive a 36-month certification period. 
Households that meet ESAP criteria are households in which: All adults are age 60 or older and/or disabled, and Do not have earned income. 
"""
, 
"""
If the SNAP household receives Temporary Assistance for Needy Families (TANF) cash benefits, 
TANF non-cash benefits, or Supplemental Security Income (SSI), the SNAP household is considered categorically eligible for SNAP.  
Categorical eligibility for SNAP means the household has already been determined eligible for another means-tested program.  
Most District residents applying for SNAP are determined categorically eligible.
""" ]
}

annual_limits = pd.DataFrame.from_dict(user_entered_text)

In [54]:
sectioned_text = pd.concat([annual_limits, sectioned_text], axis = 0)
sectioned_text.head(5)

Unnamed: 0,title,section_header,text,combined_text
0,,,,\n \nNet Monthly Income Eligibility Standar...
1,,,,\n2022 Gross Monthly Income Eligibility Standa...
2,,,,Maximum Monthly SNAP allotment for FY 2022: \...
3,,,,Standard Deductions 2022: \nHousehold Size 1 ...
4,,,,To determine a SNAP household’s monthly net in...


In [55]:
EMBEDDING_MODEL = "text-embedding-3-small"
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# print(len(sectioned_text['combined_text'].to_list()))
snap_text_list = [text for text in sectioned_text['combined_text'].to_list() if len(text) > 0]
# print(len(snap_text_list))

embeddings = []
for batch_start in range(0, len(snap_text_list), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = snap_text_list[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response.data):
        assert i == be.index  # double check embeddings are in same order as input
    batch_embeddings = [e.embedding for e in response.data]
    embeddings.extend(batch_embeddings)

df = pd.DataFrame({"text": snap_text_list, "embedding": embeddings})

Batch 0 to 999
Batch 1000 to 1999
Batch 2000 to 2999


## Ask

In [56]:
# save document chunks and embeddings

SAVE_PATH = "~/Desktop/SNAP/embedded_regs.csv"

df.to_csv(SAVE_PATH, index=False)


In [57]:
# download pre-chunked text and pre-computed embeddings
# this file is ~200 MB, so may take a minute depending on your connection speed
embeddings_path = SAVE_PATH

df = pd.read_csv(embeddings_path)
df.head()

Unnamed: 0,text,embedding
0,\n \nNet Monthly Income Eligibility Standar...,"[0.017097212374210358, -0.003910659346729517, ..."
1,\n2022 Gross Monthly Income Eligibility Standa...,"[0.009045927785336971, -0.012394016608595848, ..."
2,Maximum Monthly SNAP allotment for FY 2022: \...,"[0.025717277079820633, 0.001359197311103344, 0..."
3,Standard Deductions 2022: \nHousehold Size 1 ...,"[0.03339914605021477, -0.01860125921666622, 0...."
4,To determine a SNAP household’s monthly net in...,"[0.018247514963150024, 0.012250678613781929, 0..."


In [58]:
# convert embeddings from CSV str type back to list type
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [59]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


In [138]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    # encoding = tiktoken.encoding_for_model(model)
    encoding = tiktoken.get_encoding("o200k_base")
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = """
        Use the below Federal Regs on SNAP eligibility to assess applications. 
        If you need more information, or to clarify an ask, ask for the information or clarification that you need.
        If the answer cannot be found in the articles, write I don't know the answer to that.

        Make sure you calculate the appropriate deductions, resource limits, and certification period. Earned income is wages. 
        If you don't have the specific information required to calculate the appropriate deduction, resource limit, or certification period, ask for the required information. 
        For example, if the deduction is: A dependent care deduction when needed for work, training, or education. Make sure that you have the applicant information related to any work, training or education. Otherwise, ask the applicant for this information.
        
        Finally, after you independently calculate the household eligibility benefits for SNAP, compare your household benefits calculation to 
        Total shelter costs only include rent and not utilities. 
        List the final calculated benefit using this syntax: {Final Calculated Benefit: }
        
        """
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\n Article section:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    response_df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You assess a SNAP application's eligibility for benefits"},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response.choices[0].message.content
    # print(response_message)
    
    return response_message


In [None]:
dc_applications = pd.read_csv('snap_stories.csv')
dc_apps = dc_applications.rename(columns={"Unnamed: 0": "index", "0": "snap_story"})
dc_apps.head(5)

Unnamed: 0,index,snap_story
0,0,1 individual(s) listed in this SNAP applicatio...
1,1,1 individual(s) listed in this SNAP applicatio...
2,2,1 individual(s) listed in this SNAP applicatio...
3,3,3 individual(s) listed in this SNAP applicatio...
4,4,1 individual(s) listed in this SNAP applicatio...


In [140]:
responses_df = pd.DataFrame({'question': pd.Series(dtype='str'),
                   'answer': pd.Series(dtype='str')})

for i in range(len(dc_apps['snap_story'])): 
    individual_response = {}
    individual_response['question'] = [dc_apps['snap_story'].iloc[i]]
    individual_response['answer'] = [ask(dc_apps['snap_story'].iloc[i])]
    # print(individual_response)
    
    ir_df = pd.DataFrame.from_dict(individual_response)
    responses_df = pd.concat([ir_df, responses_df], axis = 0)

In [155]:
responses_df.head(5)
df_reverse_responses = responses_df.iloc[::-1, ::]

df_reverse_responses = df_reverse_responses.reset_index()
df_reverse_responses.head(10)

Unnamed: 0,index,question,answer
0,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
1,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
2,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
3,0,3 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
4,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
5,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
6,0,4 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
7,0,3 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...
8,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the individ...
9,0,1 individual(s) listed in this SNAP applicatio...,To assess the SNAP application for the househo...


In [154]:
qc_output = pd.read_csv('~/snap/snap_qc_results.csv')
qc_output = qc_output.rename(columns={"Unnamed: 0": "index", "0": "benefit_result"})
qc_output.head(10)

Unnamed: 0,index,benefit_result
0,0,1 individual(s) listed in this SNAP applicatio...
1,1,1 individual(s) listed in this SNAP applicatio...
2,2,1 individual(s) listed in this SNAP applicatio...
3,3,3 individual(s) listed in this SNAP applicatio...
4,4,1 individual(s) listed in this SNAP applicatio...
5,5,1 individual(s) listed in this SNAP applicatio...
6,6,4 individual(s) listed in this SNAP applicatio...
7,7,3 individual(s) listed in this SNAP applicatio...
8,8,1 individual(s) listed in this SNAP applicatio...
9,9,1 individual(s) listed in this SNAP applicatio...


In [156]:
merged_qa = pd.merge(df_reverse_responses, qc_output, how = 'inner', left_index=True, right_index=True)
print(len(df_reverse_responses))
print(len(qc_output))
print(len(merged_qa))

merged_qa_csv = merged_qa[['question', 'answer', 'benefit_result']]

259
259
259


In [None]:
benefit_calcs = merged_qa_csv["benefit_result"].str.split(":", expand=True)
benefit_calcs = benefit_calcs[1].str.split(' ', expand=True)
benefit_calcs = benefit_calcs[1].str.split('$', expand=True)
benefit_calcs = benefit_calcs[1]
benefit_calcs.head(5)

0    150
1    250
2    250
3    571
4     20
Name: 1, dtype: object

In [203]:
llm_calcs = merged_qa_csv["answer"].str.split("Final Calculated Benefit:", expand=True)
llm_calcs = llm_calcs.fillna('')
llm_calcs['combined'] = llm_calcs[1] + llm_calcs[2]
llm_calcs = llm_calcs['combined'].str.strip("{$}") 
# llm_calcs = llm_calcs[1].str.split('', expand=True)
# llm_calcs = llm_calcs[1]
llm_calcs.head(5)

0        238
1     $250}.
2     { $250
3       $658
4         16
Name: combined, dtype: object

In [168]:
pd.merge(benefit_calcs, llm_calcs, how = 'inner', left_index=True, right_index=True)

Unnamed: 0,0_x,1_x,0_y,1_y,2
0,1 individual(s) listed in this SNAP applicatio...,$150 ***,To assess the SNAP application for the househo...,238,
1,1 individual(s) listed in this SNAP applicatio...,$250 ***,To assess the SNAP application for the househo...,$250}.,
2,1 individual(s) listed in this SNAP applicatio...,$250 ***,To assess the SNAP application for the househo...,{,$250
3,3 individual(s) listed in this SNAP applicatio...,$571 ***,To assess the SNAP application for the househo...,$658,
4,1 individual(s) listed in this SNAP applicatio...,$20 ***,To assess the SNAP application for the househo...,16,
...,...,...,...,...,...
254,1 individual(s) listed in this SNAP applicatio...,$250 ***,To assess the SNAP application for the househo...,{223,
255,2 individual(s) listed in this SNAP applicatio...,$236 ***,To assess the SNAP application for the househo...,$459,
256,1 individual(s) listed in this SNAP applicatio...,$250 ***,To assess the SNAP application based on the pr...,$250,
257,2 individual(s) listed in this SNAP applicatio...,$148 ***,To assess the SNAP application for the househo...,250,


In [148]:
merged_qa_csv.to_csv('merged_qa_data.csv')

## Ask & Answer

In [None]:
ask("What households are categorically eligible for SNAP?")

In [None]:
ask("What does it mean to be categorically eligible?")

In [None]:
ask("What households are categorically eligible for SNAP? Explain it to me like I'm 5")

In [None]:
ask("How is the shelter deduction calculated for households experiencing homelessness?")

# Traditional shelter expense has a max. - changes as part of cost of living (COLA adjustments)
# Regs not updated to match new adjustments - COLA / barrier to policy to code because there are conflicts in source document and other documents.
# Statute trumps Regs / Farm Bill -> Statute -> Regs -> FNS policy guidance (usually more timely than Regs) -> State policy manuals 
# (Farm Bill usually re-authorized every 4 yrs)

# Data Matches - death, prisoner - verification process

In [None]:
ask("How is the shelter deduction calculated for households experiencing homelessness? Explain this simply using an example.")

In [None]:
ask('What are household eligibility guidelines?')

In [None]:
ask("What are the different pieces of information needed to assess a household's eligibility?")

In [None]:
ask("What are the different pieces of information needed to assess a household's eligibility? Explain it to me like I'm 5")

In [None]:
ask("How do I compute how much a household gets in SNAP benefits?")

In [None]:
ask("""What is the maximum monthly SNAP allotment for a household size?""")

In [None]:
# Issues: 
# Makes assumptions - 'earns income' -> Doesn't clarify gross v. net income.
# No common sense checks - takes what someone says at face value. We would have to explicitly specify the types of checks to make.
# No memory - can't refer to past quesitons / input. 

ask("""A five person household earns $0 a month, but they have an address. Do they qualify for SNAP and if so, how much would they receive in benefits?""")

In [None]:
ask("If I make $20,000 do I qualify for SNAP?")

In [None]:
ask("If I make $20,000 gross income do I qualify for SNAP? My household size is 2.")

In [None]:
ask("I make $20,000 gross income per year. Do I qualify for SNAP? My household size is 2.")

In [None]:
ask("How do I calculate net income?")

In [None]:
ask("My net income is 18,000 per year. My household size is 2. Do I qualify for SNAP? If so, how much do I qualify for?")