In [26]:
import os

import numpy as np
import pandas as pd

import cv2
import zipfile
from pdf2image import convert_from_path

In [32]:
from openai import OpenAI
from pinecone import Pinecone
from dotenv import load_dotenv

In [2]:
# Workspace layout: <cwd>/data/pdf/ receives the bills extracted from the archive.
# The trailing '' component makes os.path.join append the platform's own
# separator, so the paths still end with a separator (as before) without
# hard-coding a Windows backslash, which would be part of the directory *name*
# on POSIX systems.
cur_dir = os.getcwd()
zip_file_path = 'Electricity_bills.zip'
data_dir = os.path.join(cur_dir, 'data', '')
pdf_dir = os.path.join(data_dir, 'pdf', '')

# makedirs creates data/ and data/pdf/ in one call and is safe to re-run.
os.makedirs(pdf_dir, exist_ok=True)

# Unpack every member of the archive into the PDF directory.
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    zip_file.extractall(pdf_dir)

In [10]:
# Render the first page of every extracted PDF bill to a JPEG image.
# The trailing '' component keeps a platform-appropriate trailing separator.
jpeg_dir = os.path.join(data_dir, 'jpeg', '')
os.makedirs(jpeg_dir, exist_ok=True)

# Sorted listing keeps the processing order stable between runs.
for pdf in sorted(os.listdir(pdf_dir)):
    stem, extension = os.path.splitext(pdf)
    # The archive may contain sub-directories or non-PDF files; skip them
    # instead of letting the converter fail on them.
    if extension.lower() != '.pdf':
        continue

    # Only page 1 is saved, so only page 1 is rendered.
    pages = convert_from_path(os.path.join(pdf_dir, pdf), first_page=1, last_page=1)
    if not pages:
        print('No pages rendered for ' + pdf)
        continue

    pages[0].save(os.path.join(jpeg_dir, stem + '.jpeg'), 'JPEG')

In [33]:
# Read variables from a local .env file (if present), then make sure the
# OpenAI key is available before constructing the client. Re-assigning
# os.environ['OPENAI_API_KEY'] from os.getenv was a no-op when the key exists
# and raised an opaque TypeError (None is not a str) when it does not.
load_dotenv()
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        'OPENAI_API_KEY is not set; export it or add it to a .env file.'
    )
client = OpenAI()

In [14]:
def vision_embed_file(file_name, multi_modal_model='gpt-4.1-mini', embedding_model='text-embedding-3-small'):
    """Caption an image file and embed the caption.

    The file is uploaded with ``client.files.create`` (purpose "vision"), the
    returned file id is sent to ``client.responses.create`` together with the
    question "what's in this image?", and the response's ``output_text`` is
    embedded with ``client.embeddings.create``.

    Args:
        file_name: Path of the image file to upload.
        multi_modal_model: Model name passed to ``client.responses.create``.
        embedding_model: Model name passed to ``client.embeddings.create``.

    Returns:
        dict with the keys ``'image_caption'`` (the response text),
        ``'file_id'`` (id of the uploaded file) and ``'embedding'``
        (``data[0].embedding`` of the embeddings response).
    """
    # Upload the raw bytes; `client` is the notebook-level API client.
    with open(file_name, "rb") as image_handle:
        uploaded = client.files.create(file=image_handle, purpose="vision")
        file_id = uploaded.id

    # One user message: the text question followed by the uploaded image.
    question_part = {'type': 'input_text', 'text': 'what\'s in this image?'}
    image_part = {'type': 'input_image', 'file_id': file_id}
    response = client.responses.create(
        model=multi_modal_model,
        input=[{'role': 'user', 'content': [question_part, image_part]}],
    )
    caption = response.output_text

    # Embed the caption text rather than the image itself.
    embedding_response = client.embeddings.create(input=caption, model=embedding_model)

    return {
        'image_caption': caption,
        'file_id': file_id,
        'embedding': embedding_response.data[0].embedding,
    }



In [66]:
# Caption and embed every JPEG, then persist the results as a CSV with the
# columns id / values / metadata.
# Rows are collected in a list and the frame is built once, instead of growing
# the DataFrame row by row with a manual counter.
records = []
for jpeg in sorted(os.listdir(jpeg_dir)):  # sorted: stable order between runs
    # Skip anything that is not a JPEG so stray files are never uploaded.
    if os.path.splitext(jpeg)[1].lower() not in ('.jpeg', '.jpg'):
        continue
    file_path = os.path.join(jpeg_dir, jpeg)
    embedding = vision_embed_file(file_path)
    records.append({
        'id': embedding['file_id'],
        'values': embedding['embedding'],
        'metadata': {'caption': embedding['image_caption']},
    })

# Passing `columns` keeps the three columns even when no image was processed.
df = pd.DataFrame(records, columns=['id', 'values', 'metadata'])

df.to_csv(os.path.join(data_dir, 'embeds.csv'), index=False)

In [67]:
# Connect to Pinecone with the key from the environment, open the target
# index, and display its current statistics as the cell output.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index('retrieval-augmented-generation')
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3,
 'vector_type': 'dense'}

In [68]:
def prepare_DF(df):
    """Turn the CSV-serialised embeddings frame back into Python objects.

    Args:
        df: DataFrame with a ``'values'`` column holding bracketed,
            comma-separated numbers as text (e.g. ``"[0.1, 0.2]"``) and a
            ``'metadata'`` column holding Python-literal text (e.g.
            ``"{'caption': '...'}"``). An ``'Unnamed: 0'`` column, if present,
            is dropped.

    Returns:
        A new DataFrame (the input is not modified) in which ``'values'``
        contains float ``numpy`` arrays and ``'metadata'`` contains the
        evaluated literals.

    Raises:
        ValueError: If a ``'values'`` token is not a number, or a
            ``'metadata'`` cell is not a valid Python literal.
    """
    import ast  # kept as a function-local import, as in the original cell

    def _parse_vector(text):
        # Strip the list brackets, then read each comma-separated token as a
        # float; blank tokens are ignored so "[]" yields an empty array
        # instead of failing on float('').
        tokens = text.replace('[', '').replace(']', '').split(',')
        return np.array([float(token) for token in tokens if token.strip()], dtype=float)

    # Explicit column check instead of a bare `except:` around drop(), which
    # also hid unrelated errors. The message is kept for the no-column case.
    if 'Unnamed: 0' in df.columns:
        prepared = df.drop('Unnamed: 0', axis=1)
    else:
        print('Unnamed Not Found')
        prepared = df.copy()  # never write into the caller's frame

    prepared['values'] = prepared['values'].apply(_parse_vector)
    prepared['metadata'] = prepared['metadata'].apply(ast.literal_eval)
    return prepared

In [69]:
# Reload the persisted embeddings and restore the parsed 'values' and
# 'metadata' columns.
embeds_csv_path = os.path.join(data_dir, 'embeds.csv')
index_df = prepare_DF(pd.read_csv(embeds_csv_path))

Unnamed Not Found


In [70]:
# Build (id, values, metadata) tuples for the upsert call.
# Columns are selected by name so the tuple layout does not depend on the
# column order of the CSV, and each vector is converted to a plain list of
# Python floats rather than being passed as a NumPy array.
upsert_vectors = [
    (record_id, [float(value) for value in values], metadata)
    for record_id, values, metadata in index_df[['id', 'values', 'metadata']].itertuples(index=False, name=None)
]

In [71]:
# Send the prepared tuples in `upsert_vectors` to the Pinecone index; the
# call's return value is displayed as the cell output.
index.upsert(vectors=upsert_vectors)

{'upserted_count': 20}

In [72]:
# Query the index statistics again after the upsert so the vector counts can
# be compared with the earlier describe_index_stats() output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 23}},
 'total_vector_count': 23,
 'vector_type': 'dense'}