In [None]:
import os

import numpy as np
import pandas as pd

import cv2
import zipfile
from pdf2image import convert_from_path
import hashlib


In [None]:
from openai import OpenAI
from pinecone import Pinecone
from dotenv import load_dotenv

In [None]:
# Unpack the zipped electricity bills into <cwd>/data/pdf/.
cur_dir = os.getcwd()
zip_file_path = 'Electricity_bills.zip'

# The trailing '' component makes os.path.join end the path with the
# platform's own separator instead of a hard-coded Windows backslash (which
# would create a directory literally named "data\" on POSIX systems).
data_dir = os.path.join(cur_dir, 'data', '')
pdf_dir = os.path.join(data_dir, 'pdf', '')

# makedirs creates data/ and data/pdf/ in one call and does nothing when
# they already exist, so the cell can be re-run safely.
os.makedirs(pdf_dir, exist_ok=True)

with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    zip_file.extractall(pdf_dir)

In [None]:
# Render the first page of every extracted PDF bill to data/jpeg/<name>.jpeg.
jpeg_dir = os.path.join(data_dir, 'jpeg', '')
os.makedirs(jpeg_dir, exist_ok=True)

for pdf in os.listdir(pdf_dir):
    pdf_path = os.path.join(pdf_dir, pdf)
    stem, extension = os.path.splitext(pdf)

    # The archive may contain sub-folders or non-PDF files; skip them instead
    # of handing them to the PDF renderer.
    if not os.path.isfile(pdf_path) or extension.lower() != '.pdf':
        continue

    # Only the first page is saved, so only the first page is rendered.
    pages = convert_from_path(pdf_path, first_page=1, last_page=1)
    pages[0].save(os.path.join(jpeg_dir, stem + '.jpeg'), 'JPEG')

In [None]:
# Load variables from a local .env file (if any) and build the OpenAI client.
load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Fail with an actionable message instead of the TypeError raised by
    # assigning None into os.environ.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

client = OpenAI(api_key=api_key)

In [None]:
def vision_embed_file(file_name, multi_modal_model='gpt-4.1-mini', embedding_model='text-embedding-3-small',
                      prompt='what\'s in this image?'):
    """Caption an image with a multi-modal model and embed the caption.

    The file is uploaded through ``client.files.create`` with purpose
    ``"vision"``, the multi-modal model is asked ``prompt`` about the uploaded
    image, and the text it returns is embedded with ``embedding_model``.

    Args:
        file_name: Path of the image file to upload.
        multi_modal_model: Model name used to describe the image.
        embedding_model: Model name used to embed the generated caption.
        prompt: Question sent alongside the image. Defaults to the generic
            question this function has always used; pass a more specific
            instruction to steer what the caption contains.

    Returns:
        dict with three keys: ``'image_caption'`` (the model's text output),
        ``'file_id'`` (id of the uploaded file) and ``'embedding'`` (the
        caption's embedding vector).
    """
    with open(file_name, "rb") as file_content:
        file_id = client.files.create(file=file_content, purpose="vision").id

    response = client.responses.create(
        model=multi_modal_model,
        input=[{
            'role': 'user',
            'content': [
                {'type': 'input_text', 'text': prompt},
                {'type': 'input_image', 'file_id': file_id},
            ],
        }],
    )
    caption = response.output_text

    embedding_object = client.embeddings.create(input=caption, model=embedding_model)
    vector = embedding_object.data[0].embedding

    return {'image_caption': caption, 'file_id': file_id, 'embedding': vector}



In [None]:
def get_embeddings(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent; the first embedding of the response is returned.
    """
    single_line = " ".join(text.split("\n"))
    reply = client.embeddings.create(input=single_line, model=model)
    first_item = reply.data[0]
    return first_item.embedding

In [None]:
# Connect to Pinecone with the key from the environment and open the index
# used throughout the notebook; the last line displays its current stats.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index('retrieval-augmented-generation')
index.describe_index_stats()

In [None]:
# Destructive: asks Pinecone to delete everything (delete_all=True) so the
# notebook starts from an empty store.
# NOTE(review): because the index is emptied here, the index.fetch duplicate
# check in the embedding cell below can only match vectors upserted after this
# point — confirm the reset is meant to run on every execution.
index.delete(delete_all=True)
print(" Pinecone index has been reset.")

In [10]:
def hash_file(filepath):
    """Return the SHA-256 hex digest of the bytes stored at ``filepath``."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()

# Caption + embed every JPEG bill that is not already stored in the index and
# write the results to data/embeds.csv (columns: id, values, metadata).
records = []
seen_ids = set()  # content hashes already handled during this run

# sorted() only makes the processing order reproducible between runs.
for jpeg in sorted(os.listdir(jpeg_dir)):
    file_path = os.path.join(jpeg_dir, jpeg)

    # The SHA-256 of the file content is used as the vector id, so identical
    # files always map to the same id.
    vector_id = hash_file(file_path)

    # index.fetch only sees vectors that were upserted earlier; two identical
    # files inside this same run must be caught locally, otherwise both are
    # embedded and end up in the CSV under the same id.
    if vector_id in seen_ids:
        print(f"Skipping duplicate: {jpeg}")
        continue
    seen_ids.add(vector_id)

    res = index.fetch(ids=[vector_id])
    if vector_id in res.vectors:
        print(f"Skipping duplicate: {jpeg}")
        continue

    embedding = vision_embed_file(file_path)
    records.append({
        'id': vector_id,
        'values': embedding['embedding'],
        'metadata': {'caption': embedding['image_caption']},
    })

# Build the frame once from the collected records instead of growing it row
# by row; the explicit column list keeps the header even when nothing is new.
df = pd.DataFrame(records, columns=['id', 'values', 'metadata'])
df.to_csv(os.path.join(data_dir, 'embeds.csv'), index=False)


KeyboardInterrupt: 

In [None]:
def augmented_query(user_query, embed_model='text-embedding-3-small',k=5):
    """Build the prompt text: retrieved contexts first, the query last.

    The contexts returned by ``get_context`` are joined with a dashed divider,
    then one more divider and the query are appended.
    """
    divider = "\n\n--------------------------\n\n"
    contexts, query = get_context(user_query, embed_model=embed_model, k=k)
    context_block = divider.join(contexts)
    return context_block + divider + query

In [None]:
def _parse_embedding(raw):
    """Turn the CSV text form of a vector (e.g. "[0.1, -0.2]") into a float array."""
    tokens = raw.replace("[", '').replace("]", '').split(',')
    # Skipping blank tokens lets an empty vector ("[]") parse to an empty
    # array instead of failing on float('').
    return np.array([float(token) for token in tokens if token.strip()], dtype=float)


def prepare_DF(df):
    """Convert a frame read back from embeds.csv into upsert-ready columns.

    Args:
        df: DataFrame whose ``values`` column holds vectors as text
            ("[v1, v2, ...]") and whose ``metadata`` column holds Python
            literal text (e.g. "{'caption': '...'}"). An ``'Unnamed: 0'``
            column, if present, is dropped.

    Returns:
        A new DataFrame (the input is left untouched) in which ``values``
        contains numpy float arrays and ``metadata`` contains the evaluated
        Python objects.
    """
    import ast

    if 'Unnamed: 0' in df.columns:
        prepared = df.drop('Unnamed: 0', axis=1)
    else:
        print('Unnamed Not Found')
        # Copy so the parsing below never rewrites the caller's frame.
        prepared = df.copy()

    prepared['values'] = prepared['values'].apply(_parse_embedding)
    # literal_eval only accepts Python literals, so the stored text is never
    # executed as code.
    prepared['metadata'] = prepared['metadata'].apply(ast.literal_eval)
    return prepared

In [None]:
# Read the saved embeddings back from disk and parse the text columns.
embeds_csv_path = os.path.join(data_dir, 'embeds.csv')
index_df = prepare_DF(pd.read_csv(embeds_csv_path))

In [None]:
# One plain tuple per row, in column order, without the DataFrame index;
# this list is what the next cell passes to index.upsert.
# Assumes the columns are exactly (id, values, metadata) — TODO confirm
# embeds.csv carries no extra columns.
upsert_vectors = list(index_df.itertuples(index=False, name=None))

In [None]:
# Let the client split the upload into batches of 100 vectors instead of
# sending everything in one request, so the cell keeps working as the number
# of bills grows. NOTE(review): tune the batch size to the request-size
# limits of your Pinecone plan.
index.upsert(vectors=upsert_vectors, batch_size=100)

In [None]:
# Display the index statistics again — presumably to confirm that the upsert
# in the previous cell took effect.
index.describe_index_stats()

In [None]:
# Sample question used by the next two cells to exercise embedding + query.
sample_text = "What bills are due?"

In [None]:
# Embed the sample question with get_embeddings' default embedding model.
test_embeddings = get_embeddings(sample_text)

In [None]:
# Ask the index for the top 5 matches to the sample embedding and include
# each match's stored metadata; the response is shown as the cell output.
index.query(vector=test_embeddings, top_k=5,include_metadata=True)

In [None]:
def get_context(query, embed_model = 'text-embedding-3-small',k=5,index=index):
    """Retrieve the captions of the ``k`` best index matches for ``query``.

    The query is embedded with ``embed_model`` and sent to ``index``; the
    ``caption`` entry of each match's metadata is collected in match order.

    Returns:
        (captions, query) — the list of captions and the unchanged query.
    """
    query_vector = get_embeddings(query, model=embed_model)
    response = index.query(vector=query_vector, top_k=k, include_metadata=True)

    captions = []
    for match in response['matches']:
        captions.append(match['metadata']['caption'])
    return captions, query

In [None]:
# Try the retrieval step on its own; the cell output is the
# (contexts, query) tuple returned by get_context.
get_context("Tell me about Avg. monthly usage")

In [None]:
# Build and print the full augmented prompt (retrieved captions followed by
# the question) to inspect what would be sent to the chat model.
# NOTE(review): "useage" looks like a typo for "usage" — confirm whether the
# misspelling is intentional before changing the query text.
my_question = "Tell me about useage?"
augg = augmented_query(my_question)
print(augg)

In [None]:
def ask_gpt_response(system_prompt, user_prompt, model='gpt-5-chat-latest'):
    """Send a developer prompt plus a user prompt to ``model``.

    Returns:
        (output_text, response) — the text of the reply and the full response
        object returned by ``client.responses.create``.
    """
    messages = [
        {"role": "developer", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.responses.create(model=model, input=messages)
    return response.output_text, response

In [None]:
import textwrap

In [None]:
# System prompt for the bill assistant. A plain string is used because the
# text contains no placeholders to interpolate.
primer = """
You are a knowledgeable assistant specialized in answering questions about electric utility bills. 
You provide accurate and clear explanations based solely on the bill details and information provided above each question. 
If the information is not sufficient to answer the question, respond truthfully with, "I don't know."
"""

In [None]:
# ask_gpt_response returns (answer_text, raw_response); print only the answer
# so the output is readable text rather than the repr of the whole tuple.
answer, _raw_response = ask_gpt_response(
    system_prompt=primer,
    user_prompt=augmented_query('Give me some analysis of recent bills?'),
)
print(answer)

In [None]:
# NOTE(review): this repeats the ask_gpt_response definition from an earlier
# cell; whichever definition runs last is the one in effect.
def ask_gpt_response(system_prompt, user_prompt, model='gpt-5-chat-latest'):
    """Send a developer prompt plus a user prompt to ``model``.

    Returns:
        (output_text, response) — the text of the reply and the full response
        object returned by ``client.responses.create``.
    """
    conversation = [
        {"role": "developer", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = client.responses.create(model=model, input=conversation)
    return response.output_text, response

# System prompt for the bill assistant (same text as the earlier primer
# cell). A plain string is used because there is nothing to interpolate.
primer = """
You are a knowledgeable assistant specialized in answering questions about electric utility bills. 
You provide accurate and clear explanations based solely on the bill details and information provided above each question. 
If the information is not sufficient to answer the question, respond truthfully with, "I don't know."
"""

# ask_gpt_response returns (answer_text, raw_response); print only the answer
# so the output is readable text rather than the repr of the whole tuple.
answer, _raw_response = ask_gpt_response(
    system_prompt=primer,
    user_prompt=augmented_query('Give me some analysis of recent bills?'),
)
print(answer)