In [2]:
from IPython.display import display, HTML

# Inject a CSS rule so that long lines inside the notebook's <pre> output
# areas wrap instead of overflowing horizontally.
display(HTML('''
    <style>
        .output_area pre {
            white-space: pre-wrap;
        }
    </style>
'''))

import json
import pandas as pd
import time
from openai import OpenAI
import os
import ast
import numpy as np
import pdb
from dotenv import load_dotenv
from pprint import pprint
from openai import RateLimitError, AuthenticationError, APIConnectionError, APIError

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI client with API key from environment variable.
# os.environ.get returns None (rather than raising) when OPENAI_API_KEY is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_response(user_content:str, system_content:str=None, assistant_content:str=None, model:str="gpt-3.5-turbo"):
    """Send a single chat-completion request and return the raw response.

    Args:
        user_content: Text of the user message (always sent).
        system_content: Optional system prompt. When given it is placed
            first in the message list, ahead of the user message.
        assistant_content: Optional assistant message, appended after the
            user message.
        model: Name of the chat model to query.

    Returns:
        The object returned by ``client.chat.completions.create``, or
        ``None`` when the request fails. Failures are printed rather than
        raised, so callers must check for ``None`` before reading
        ``.choices``.
    """
    # Build the conversation with the system prompt leading, which is the
    # conventional position for instructions that frame the whole exchange.
    messages = []
    if system_content is not None:
        messages.append({"role": "system", "content": system_content})

    messages.append({"role": "user", "content": user_content})

    if assistant_content is not None:
        messages.append({"role": "assistant", "content": assistant_content})

    try:
        return client.chat.completions.create(
            model=model,
            messages=messages
        )
    except RateLimitError as e:
        print(f"Rate limit exceeded: {e}")
    except APIConnectionError as e:
        print(f"API connection error: {e}")
    except AuthenticationError as e:
        print(f"Authentication error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # Every failure path ends up here; make the "no result" outcome explicit.
    return None


def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made.
    The embedding of the first (and only) input item is returned. If the call
    fails, the error is printed and the function falls through, yielding
    ``None``.
    """
    try:
        single_line = text.replace("\n", " ")
        result = client.embeddings.create(input=[single_line], model=model)
        first_item = result.data[0]
        return first_item.embedding
    except RateLimitError as e:
        print(f"Rate limit exceeded: {e}")
    except APIConnectionError as e:
        print(f"API connection error: {e}")
    except AuthenticationError as e:
        print(f"Authentication error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [3]:
# Baseline question used for the plain (no retrieved context) calls below
question = "What temperature should I set my house on vacation so the pipes don't freeze?"
question

"What temperature should I set my house on vacation so the pipes don't freeze?"

In [4]:
# Plain call with only the user message. get_response prints the error and
# returns None when the request fails, so `response` may be None here.
response = get_response(user_content=question)
response

ChatCompletion(id='chatcmpl-AJsLSfuk2ehXqRvQsYB3ZNd4YFWow', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='It is recommended to set your thermostat to a minimum of 55°F (13°C) to prevent the pipes from freezing while you are away on vacation. This will help ensure that your home remains at a safe temperature to prevent any damage to your plumbing system.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1729300498, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=52, prompt_tokens=23, total_tokens=75, completion_tokens_details=CompletionTokensDetails(audio_tokens=None, reasoning_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0)))

In [5]:
# Pull the generated text out of the first choice (index 0) of the response
response.choices[0].message.content

'It is recommended to set your thermostat to a minimum of 55°F (13°C) to prevent the pipes from freezing while you are away on vacation. This will help ensure that your home remains at a safe temperature to prevent any damage to your plumbing system.'

In [6]:
# System prompt asking the model to pitch its answer at a young audience
system_content = "You are an assistant who is helping answer questions. Please answer as if you are talking to a 8 year old."

In [7]:
# Same question as before, this time with the system prompt supplied as well
response = get_response(user_content=question, system_content=system_content)
response

ChatCompletion(id='chatcmpl-AJsLTeN1aYVKOzvCI4Y2quR96qMRh', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Good question! You should set your house temperature to around 55 degrees Fahrenheit when you're on vacation to make sure the pipes don't freeze. This way, your house will stay warm enough to keep everything safe while you're away.", refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1729300499, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=47, prompt_tokens=51, total_tokens=98, completion_tokens_details=CompletionTokensDetails(audio_tokens=None, reasoning_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0)))

In [8]:
# Generated text of the first choice for the system-prompted request
response.choices[0].message.content

"Good question! You should set your house temperature to around 55 degrees Fahrenheit when you're on vacation to make sure the pipes don't freeze. This way, your house will stay warm enough to keep everything safe while you're away."

Now we will add Retrieval-Augmented Generation (RAG).

In [9]:
# Placeholder question: it is reassigned further down, before any embedding
# of the question is computed. The following cells build a small in-memory
# vector store from the insurance JSON data (service agents and claims).
question = "Do you have parking?"
question

'Do you have parking?'

In [10]:
# Load the raw JSON data; the context manager guarantees the file handle is
# closed once parsing finishes (the bare open() call previously leaked it).
with open("data/insurance-rag.json") as json_file:
    json_data = json.load(json_file)

In [11]:
def format_service_agent(agent):
    """Describe one service-agent record as a single comma-separated line.

    ``agent`` is indexed by key for the id, name, list of expertise areas
    (joined with ", ") and the four experience figures.
    """
    segments = [
        f"Agent ID: {agent['agent_id']}",
        f"Name: {agent['name']}",
        f"Expertise: {', '.join(agent['area_of_expertise'])}",
        f"Experience: Health - {agent['experience_handling_health_claims']} years",
        f"Auto - {agent['experience_handling_auto_claims']} years",
        f"Property - {agent['experience_handling_property_claims']} years",
        f"Overall - {agent['overall_experience']} years.",
    ]
    return ", ".join(segments)

def format_claim(claim):
    """Describe one claim record as a single comma-separated line.

    ``claim`` is indexed by key for the claim and customer identifiers,
    policy details, incident details, amount and claimant feedback.
    """
    segments = [
        f"Claim ID: {claim['claim_id']}",
        f"Customer: {claim['customer_name']} ({claim['customer_id']})",
        f"Insurance Type: {claim['insurance_type']}",
        f"Policy Number: {claim['policy_number']}",
        f"Incident Date: {claim['incident_date']}",
        f"Location: {claim['incident_location']}",
        f"Description: {claim['incident_description']}",
        f"Claim Type: {claim['claim_type']}",
        f"Claim Amount: ${claim['claim_amount']}",
        f"Feedback: {claim['claimant_feedback']}",
    ]
    return ", ".join(segments)

# Turn every raw record into its one-line text description
formatted_agents = list(map(format_service_agent, json_data['service_agents']))
formatted_claims = list(map(format_claim, json_data['claims']))

In [12]:
# Wrap each list of formatted strings in its own single-column ('text')
# DataFrame; agents and claims are kept in separate frames.
df_agents = pd.DataFrame({'text': formatted_agents})
df_claims = pd.DataFrame({'text': formatted_claims})

# Optional: stack both frames into one (currently disabled)
# df_combined = pd.concat([df_agents, df_claims], ignore_index=True)

# Display the DataFrame
# df_combined.head()

In [13]:
# Preview the first rows of the formatted agent text
df_agents.head()

Unnamed: 0,text
0,"Agent ID: a1, Name: John Smith, Expertise: hea..."
1,"Agent ID: a2, Name: Emily Johnson, Expertise: ..."
2,"Agent ID: a3, Name: Michael Davis, Expertise: ..."
3,"Agent ID: a4, Name: Jessica Williams, Expertis..."
4,"Agent ID: a5, Name: David Brown, Expertise: he..."


In [14]:
# Preview the first rows of the formatted claim text
df_claims.head()

Unnamed: 0,text
0,"Claim ID: C001, Customer: John Doe (cust001), ..."
1,"Claim ID: C002, Customer: John Doe (cust001), ..."
2,"Claim ID: C003, Customer: John Doe (cust001), ..."
3,"Claim ID: C004, Customer: Jane Smith (cust002)..."
4,"Claim ID: C005, Customer: Jane Smith (cust002)..."


In [15]:
# Embed every agent description (one API call per row). get_embedding
# returns None for a row whose request failed.
df_agents['embeddings'] = df_agents['text'].apply(get_embedding)

# Make sure the target directory exists before writing; otherwise the save
# fails on a fresh checkout after the embedding calls have already been made.
os.makedirs("output", exist_ok=True)
df_agents.to_csv("output/df_agents.csv", index=False)
df_agents.to_pickle("output/df_agents.pkl")
df_agents.head()

Unnamed: 0,text,embeddings
0,"Agent ID: a1, Name: John Smith, Expertise: hea...","[-0.01269555278122425, 0.030452121049165726, 0..."
1,"Agent ID: a2, Name: Emily Johnson, Expertise: ...","[-0.006803390569984913, 0.01536078006029129, 0..."
2,"Agent ID: a3, Name: Michael Davis, Expertise: ...","[-0.006611726246774197, 0.03933609649538994, 0..."
3,"Agent ID: a4, Name: Jessica Williams, Expertis...","[-0.00781966932117939, 0.03978852555155754, 0...."
4,"Agent ID: a5, Name: David Brown, Expertise: he...","[-0.01266653835773468, 0.044551748782396317, 0..."


In [16]:
# Embed every claim description (one API call per row). get_embedding
# returns None for a row whose request failed.
df_claims['embeddings'] = df_claims['text'].apply(get_embedding)

# Make sure the target directory exists before writing; otherwise the save
# fails on a fresh checkout after the embedding calls have already been made.
os.makedirs("output", exist_ok=True)
df_claims.to_csv("output/df_claims.csv", index=False)
df_claims.to_pickle("output/df_claims.pkl")
df_claims.head()

Unnamed: 0,text,embeddings
0,"Claim ID: C001, Customer: John Doe (cust001), ...","[-0.03872453793883324, 0.007137414999306202, -..."
1,"Claim ID: C002, Customer: John Doe (cust001), ...","[-0.0468469113111496, 0.0003111716650892049, -..."
2,"Claim ID: C003, Customer: John Doe (cust001), ...","[-0.022890795022249222, 0.016941990703344345, ..."
3,"Claim ID: C004, Customer: Jane Smith (cust002)...","[-0.025329919531941414, -0.0008615886326879263..."
4,"Claim ID: C005, Customer: Jane Smith (cust002)...","[-0.010141157545149326, -0.026256050914525986,..."


In [29]:
# The retrieval query and its embedding vector
question = "Most experienced agent for auto claims"
question_embedding = get_embedding(question)
# Show the query next to the first ten components of its embedding
(question, question_embedding[:10], "...")

('Most experienced agent for auto claims',
 [-0.015063775703310966,
  0.013776155188679695,
  0.01597757078707218,
  0.0563507042825222,
  -0.03563801199197769,
  0.0013196379877626896,
  -0.04372371360659599,
  0.008660286664962769,
  -0.01729288138449192,
  -0.005077793728560209],
 '...')

In [30]:
def fn(page_embedding):
    """Score one stored embedding against the current question.

    Returns the dot product of ``page_embedding`` with the notebook-level
    ``question_embedding``, which is looked up each time the function runs.
    """
    score = np.dot(page_embedding, question_embedding)
    return score

# NOTE(review): despite the column name, this stores the dot product returned
# by fn, not a distance — presumably a higher value means a closer match;
# confirm before sorting on it.
df_agents['distance'] = df_agents['embeddings'].apply(fn)
df_agents.head()

Unnamed: 0,text,embeddings,distance
0,"Agent ID: a1, Name: John Smith, Expertise: hea...","[-0.01269555278122425, 0.030452121049165726, 0...",0.501658
1,"Agent ID: a2, Name: Emily Johnson, Expertise: ...","[-0.006803390569984913, 0.01536078006029129, 0...",0.447491
2,"Agent ID: a3, Name: Michael Davis, Expertise: ...","[-0.006611726246774197, 0.03933609649538994, 0...",0.513587
3,"Agent ID: a4, Name: Jessica Williams, Expertis...","[-0.00781966932117939, 0.03978852555155754, 0....",0.455304
4,"Agent ID: a5, Name: David Brown, Expertise: he...","[-0.01266653835773468, 0.044551748782396317, 0...",0.489026
