In [2]:
# ChatOpenAI

In [3]:
from dotenv import dotenv_values
import openai
import re
import httpx
import os
from openai import OpenAI
from langchain_core.messages import AIMessage, SystemMessage, HumanMessage

In [4]:
# Load settings from the local .env file; later cells read their API keys from this mapping.
config = dotenv_values(".env")

# Raw OpenAI SDK client, authenticated with the OPEN_AI_KEY entry of the .env file.
client = OpenAI(api_key=config["OPEN_AI_KEY"])

In [4]:
from langchain.chat_models import init_chat_model

# LangChain chat model used by every model.invoke() cell below; it reuses the OPEN_AI_KEY loaded into `config`.
model = init_chat_model("gpt-4.1-mini", model_provider="openai", api_key=config["OPEN_AI_KEY"])

In [46]:

# Single call whose entire input is this one human message.
model.invoke([HumanMessage(content="Hi! I'm Bob")])

AIMessage(content='Hello, Bob! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 11, 'prompt_tokens': 11, 'total_tokens': 22, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_c064fdde7c', 'finish_reason': 'stop', 'logprobs': None}, id='run--258fc923-09ed-4f07-bec0-9f8243200855-0', usage_metadata={'input_tokens': 11, 'output_tokens': 11, 'total_tokens': 22, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [47]:
# Separate call with a plain string prompt; the earlier "I'm Bob" message is not part of this input,
# and the recorded reply shows the model does not know the name.
model.invoke("What's my name?")


AIMessage(content='I don’t know your name based on our current conversation. How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 11, 'total_tokens': 30, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_c064fdde7c', 'finish_reason': 'stop', 'logprobs': None}, id='run--52f909d5-1adc-433f-a020-0bb3f5eea596-0', usage_metadata={'input_tokens': 11, 'output_tokens': 19, 'total_tokens': 30, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [48]:
# Same question wrapped in a HumanMessage; again no earlier turns are passed in.
model.invoke([HumanMessage(content="What's my name?")])


AIMessage(content="I don't have access to your personal information unless you share it with me. What would you like me to call you?", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 11, 'total_tokens': 35, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_c064fdde7c', 'finish_reason': 'stop', 'logprobs': None}, id='run--a5225aa9-f01e-4204-9dba-06ef98520ed6-0', usage_metadata={'input_tokens': 11, 'output_tokens': 24, 'total_tokens': 35, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [52]:
# This is a SIMPLE example of how we can store chat history


In [49]:

# Pass earlier turns explicitly as a list of messages so the model sees them in this call.
model.invoke(
    [
        HumanMessage(content="Hi! I'm Bob"),
        # Assistant turn left out of the history for this run:
        # AIMessage(content="Hello Bob! How can I assist you today?"),
        HumanMessage(content="Are you able to process image frames from a video?")
    ]
)

AIMessage(content="Hi Bob! I can analyze and interpret images you share with me, including individual frames extracted from a video. However, I don't have the capability to process video files directly or handle continuous video streams. If you extract specific frames from your video and upload them as images, I can certainly help analyze those! Let me know how you'd like to proceed.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 70, 'prompt_tokens': 26, 'total_tokens': 96, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_c064fdde7c', 'finish_reason': 'stop', 'logprobs': None}, id='run--717fb43c-f755-4b28-a423-7874a5310d35-0', usage_metadata={'input_tokens': 26, 'output_tokens': 70, 'total_tokens': 96, 'input_token_det

In [54]:
# This is an example of how we can process images and use them in LangChain messages


In [5]:
def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video and return them as base64-encoded JPEG strings.

    Relies on the notebook-level ``cv2`` and ``base64`` imports being executed
    before this function is called.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        seconds_per_frame: Sampling interval in seconds; must be greater than 0.

    Returns:
        list[str]: One base64 string (decoded as UTF-8) per sampled frame, in
        playback order. Empty when no frame could be read.

    Raises:
        ValueError: If ``seconds_per_frame`` is not greater than 0.
    """
    if seconds_per_frame <= 0:
        raise ValueError("seconds_per_frame must be greater than 0")

    base64Frames = []
    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Clamp the stride to one frame: when fps * seconds_per_frame truncates
        # to 0 the position would never advance and the loop would never end.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        # Visit frame indices 0, stride, 2*stride, ... below total_frames - 1.
        for curr_frame in range(0, total_frames - 1, frames_to_skip):
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            encoded, buffer = cv2.imencode(".jpg", frame)
            if not encoded:
                # Skip a frame that could not be encoded instead of storing junk.
                continue
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Release the capture handle even if reading or encoding raised.
        video.release()

    print(f"Extracted {len(base64Frames)} frames")
    return base64Frames



In [6]:
# process_video() looks up cv2 and base64 at call time, so these imports must run before the call below.
import cv2
import base64
# Sample one frame per second from the local clip.
video_path = "pushup.MOV"
base64Frames = process_video(video_path, seconds_per_frame=1)

Extracted 21 frames


In [10]:
# System instruction plus one multimodal human turn: a text prompt followed by
# every sampled frame embedded as an inline JPEG data URL.
messages = [
    SystemMessage(content="You are generating a video summary. Please provide a summary of the video. Respond in Markdown."),
    HumanMessage(
        content=[
            {"type": "text", "text": "Please analyze the frames from this video and tell me what is happening."},
        ]
        + [
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}}
            for frame in base64Frames
        ]
    ),
]

In [11]:
# Send the system prompt and the frame-carrying human message to the chat model.
model.invoke(messages)

AIMessage(content="These frames depict a sequence where a person performs an exercise or workout routine involving a transition from standing to a crawling-like position and then moving into a plank or push-up position. Here's what is happening step-by-step:\n\n1. The person starts in a standing position with knees bent.\n2. They prepare for movement by slightly leaning forward.\n3. They bend down further, placing their hands closer to the floor.\n4. The person places their hands on the floor while maintaining a bent-knee position.\n5-11. The person shifts their weight forward, moving the body into a plank position, possibly transitioning between a crouch and a push-up/plank.\n12-14. The person appears to be in a full plank or push-up position.\n15. The person shifts back or finishes the plank position.\n16-17. The person moves back to a crouched position.\n18-19. The person stands back up and faces the camera with a slight smile.\n\nOverall, it looks like an exercise involving dynamic

In [None]:
# We can try this approach to remove any heavy HumanMessage

In [45]:
# Same video-summary conversation, followed by a short text-only human turn.
messages = [
    SystemMessage(content="You are generating a video summary. Please provide a summary of the video. Respond in Markdown."),
    HumanMessage(
        content=[
            dict(type="text", text="Please analyze the frames from this video and tell me what is happening."),
            *(
                dict(type="image_url", image_url=dict(url=f"data:image/jpeg;base64,{frame}"))
                for frame in base64Frames
            ),
        ]
    ),
    HumanMessage(content="My name is Allison. What is your name?"),
]

In [49]:
# Removing the heavy image message
# Rebuild the list in place rather than calling remove() while iterating:
# removing during iteration shifts the remaining items left, so the element
# right after a removed one is never visited (two consecutive multimodal
# messages would leave the second one behind). Slice assignment keeps the same
# list object, so any other reference to `messages` sees the filtered result.
messages[:] = [msg for msg in messages if not isinstance(msg.content, list)]

In [56]:
# Pinecone POC


In [5]:
# Pinecone POC

from pinecone import Pinecone, ServerlessSpec

# Resolve the key from the process environment first, then fall back to the
# .env values in `config`, which is where the notebook's other API keys come from.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or config.get("PINECONE_API_KEY")

# Export the resolved key so later cells that build Pinecone-backed LangChain
# objects without an explicit key can find it.
# NOTE(review): assumes langchain_pinecone reads PINECONE_API_KEY from the environment; confirm.
if pinecone_api_key:
    os.environ.setdefault("PINECONE_API_KEY", pinecone_api_key)

pc = Pinecone(api_key=pinecone_api_key)

In [29]:
from pinecone import Pinecone, ServerlessSpec
import os

# Name of the Pinecone index used by the following cells, and the vector size it is created with.
index_name = "test-index-2"
openai_embedding_dimension = 1536


# Two separate approaches to creating an index on Pinecone

# Approach 1 (disabled): create the index through create_index_for_model with an
# embedding model and a field_map from "text" to "chunk_text".
# if not pc.has_index(index_name):
#     pc.create_index_for_model(
#         name=index_name,
#         cloud="aws",
#         region="us-east-1",
#         embed={
#             "model":"llama-text-embed-v2",
#             "field_map":{"text": "chunk_text"}
#         }
#     )

# Approach 2 (active): create a plain index with an explicit dimension and cosine
# metric, only when an index with this name does not exist yet.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=openai_embedding_dimension,  # Set the dimension manually
        metric="cosine",  # Specify the similarity metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [30]:
# Create a sample dataset

records = [
    { "_id": "rec1", "chunk_text": "The Eiffel Tower was completed in 1889 and stands in Paris, France.", "category": "history" },
    { "_id": "rec2", "chunk_text": "Photosynthesis allows plants to convert sunlight into energy.", "category": "science" },
    { "_id": "rec3", "chunk_text": "Albert Einstein developed the theory of relativity.", "category": "science" },
    { "_id": "rec4", "chunk_text": "The mitochondrion is often called the powerhouse of the cell.", "category": "biology" },
    { "_id": "rec5", "chunk_text": "Shakespeare wrote many famous plays, including Hamlet and Macbeth.", "category": "literature" },
    { "_id": "rec6", "chunk_text": "Water boils at 100°C under standard atmospheric pressure.", "category": "physics" },
    { "_id": "rec7", "chunk_text": "The Great Wall of China was built to protect against invasions.", "category": "history" },
    { "_id": "rec8", "chunk_text": "Honey never spoils due to its low moisture content and acidity.", "category": "food science" },
    { "_id": "rec9", "chunk_text": "The speed of light in a vacuum is approximately 299,792 km/s.", "category": "physics" },
    { "_id": "rec10", "chunk_text": "Newton's laws describe the motion of objects.", "category": "physics" },
    { "_id": "rec11", "chunk_text": "The human brain has approximately 86 billion neurons.", "category": "biology" },
    { "_id": "rec12", "chunk_text": "The Amazon Rainforest is one of the most biodiverse places on Earth.", "category": "geography" },
    { "_id": "rec13", "chunk_text": "Black holes have gravitational fields so strong that not even light can escape.", "category": "astronomy" },
    { "_id": "rec14", "chunk_text": "The periodic table organizes elements based on their atomic number.", "category": "chemistry" },
    { "_id": "rec15", "chunk_text": "Leonardo da Vinci painted the Mona Lisa.", "category": "art" },
    { "_id": "rec16", "chunk_text": "The internet revolutionized communication and information sharing.", "category": "technology" },
    { "_id": "rec17", "chunk_text": "The Pyramids of Giza are among the Seven Wonders of the Ancient World.", "category": "history" },
    { "_id": "rec18", "chunk_text": "Dogs have an incredible sense of smell, much stronger than humans.", "category": "biology" },
    { "_id": "rec19", "chunk_text": "The Pacific Ocean is the largest and deepest ocean on Earth.", "category": "geography" },
    { "_id": "rec20", "chunk_text": "Chess is a strategic game that originated in India.", "category": "games" },
    { "_id": "rec21", "chunk_text": "The Statue of Liberty was a gift from France to the United States.", "category": "history" },
    { "_id": "rec22", "chunk_text": "Coffee contains caffeine, a natural stimulant.", "category": "food science" },
    { "_id": "rec23", "chunk_text": "Thomas Edison invented the practical electric light bulb.", "category": "inventions" },
    { "_id": "rec24", "chunk_text": "The moon influences ocean tides due to gravitational pull.", "category": "astronomy" },
    { "_id": "rec25", "chunk_text": "DNA carries genetic information for all living organisms.", "category": "biology" },
    { "_id": "rec26", "chunk_text": "Rome was once the center of a vast empire.", "category": "history" },
    { "_id": "rec27", "chunk_text": "The Wright brothers pioneered human flight in 1903.", "category": "inventions" },
    { "_id": "rec28", "chunk_text": "Bananas are a good source of potassium.", "category": "nutrition" },
    { "_id": "rec29", "chunk_text": "The stock market fluctuates based on supply and demand.", "category": "economics" },
    { "_id": "rec30", "chunk_text": "A compass needle points toward the magnetic north pole.", "category": "navigation" },
    { "_id": "rec31", "chunk_text": "The universe is expanding, according to the Big Bang theory.", "category": "astronomy" },
    { "_id": "rec32", "chunk_text": "Elephants have excellent memory and strong social bonds.", "category": "biology" },
    { "_id": "rec33", "chunk_text": "The violin is a string instrument commonly used in orchestras.", "category": "music" },
    { "_id": "rec34", "chunk_text": "The heart pumps blood throughout the human body.", "category": "biology" },
    { "_id": "rec35", "chunk_text": "Ice cream melts when exposed to heat.", "category": "food science" },
    { "_id": "rec36", "chunk_text": "Solar panels convert sunlight into electricity.", "category": "technology" },
    { "_id": "rec37", "chunk_text": "The French Revolution began in 1789.", "category": "history" },
    { "_id": "rec38", "chunk_text": "The Taj Mahal is a mausoleum built by Emperor Shah Jahan.", "category": "history" },
    { "_id": "rec39", "chunk_text": "Rainbows are caused by light refracting through water droplets.", "category": "physics" },
    { "_id": "rec40", "chunk_text": "Mount Everest is the tallest mountain in the world.", "category": "geography" },
    { "_id": "rec41", "chunk_text": "Octopuses are highly intelligent marine creatures.", "category": "biology" },
    { "_id": "rec42", "chunk_text": "The speed of sound is around 343 meters per second in air.", "category": "physics" },
    { "_id": "rec43", "chunk_text": "Gravity keeps planets in orbit around the sun.", "category": "astronomy" },
    { "_id": "rec44", "chunk_text": "The Mediterranean diet is considered one of the healthiest in the world.", "category": "nutrition" },
    { "_id": "rec45", "chunk_text": "A haiku is a traditional Japanese poem with a 5-7-5 syllable structure.", "category": "literature" },
    { "_id": "rec46", "chunk_text": "The human body is made up of about 60% water.", "category": "biology" },
    { "_id": "rec47", "chunk_text": "The Industrial Revolution transformed manufacturing and transportation.", "category": "history" },
    { "_id": "rec48", "chunk_text": "Vincent van Gogh painted Starry Night.", "category": "art" },
    { "_id": "rec49", "chunk_text": "Airplanes fly due to the principles of lift and aerodynamics.", "category": "physics" },
    { "_id": "rec50", "chunk_text": "Renewable energy sources include wind, solar, and hydroelectric power.", "category": "energy" }
]

In [18]:
# Create a sample dataset
# NOTE: this rebinds `records`, replacing the 50-row sample list defined in the previous cell
# with two business-related entries; cells below see whichever definition ran last.

records = [
    { "_id": "rec51", "chunk_text": "Maka Projects is a holding company that owns multiple multi-billion dollar companies including Chedr, Gymogul and Coach Central.", "category": "business" },
    { "_id": "rec52", "chunk_text": "Palantir is a software company specializing in data analytics and integration platforms, primarily serving government and commercial clients. Its technology, including the platforms Gotham and Foundry, helps organizations analyze and integrate data from disparate sources to improve decision-making.", "category": "business" }
]

In [19]:
# Format records

from langchain_core.documents import Document

# Build `documents` from scratch on every run. The previous version appended to
# a list whose initialisation was commented out, so it raised NameError on a
# fresh kernel and added duplicates each time the cell was re-run.
# Each record's 'chunk_text' becomes the page_content to embed; the original
# '_id' and the 'category' are kept as metadata.
# NOTE: only the current `records` list is converted; to index several record
# sets, concatenate them before running this cell.
documents = [
    Document(
        page_content=record["chunk_text"],
        metadata={
            "_id": record["_id"],
            "category": record["category"],
        },
    )
    for record in records
]

In [39]:
# UTIL FUNCTION

# Target the index
dense_index = pc.Index(index_name)

# Upsert the records into a namespace
# NOTE(review): the active index-creation cell uses create_index with a fixed dimension rather than
# create_index_for_model, and the text search on this index further down records an
# "Integrated inference is not configured for this index" error; confirm that upsert_records
# (raw text records, no vectors) is valid for this index.
dense_index.upsert_records("example-namespace", records)

In [33]:
# Import here so the cell runs on a fresh kernel: the notebook's only other
# import of OpenAIEmbeddings sits in a later cell.
from langchain_openai import OpenAIEmbeddings

# OpenAI embedding model used to vectorise documents for the Pinecone index.
model_name = "text-embedding-ada-002"

embeddings = OpenAIEmbeddings(model=model_name, api_key=config['OPEN_AI_KEY'])

In [37]:
# 3. Use PineconeVectorStore to generate embeddings and add documents
# The store is bound to an existing index and add_documents() is called with
# the prepared Document objects to write them to that index.
# Import here so the cell runs on a fresh kernel: the notebook's only other
# import of PineconeVectorStore sits in a later cell.
from langchain_pinecone import PineconeVectorStore

print("Generating embeddings and upserting documents to Pinecone...")

index_name = "test-index-2"

vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name=index_name
)

vectorstore.add_documents(documents)
print("Documents successfully added to the Pinecone index.")

Generating embeddings and upserting documents to Pinecone...
Documents successfully added to the Pinecone index.


In [17]:
# Wait for the upserted vectors to be indexed
# (fixed 10-second pause; nothing here checks that indexing has actually finished)
import time
time.sleep(10)

# View stats for the index
stats = dense_index.describe_index_stats()
print(stats)

In [40]:
# Define the query
query = "Famous historical structures and monuments"

# Search the dense index
# The query is sent as raw text under "inputs", not as a vector.
# NOTE(review): the recorded run of this cell failed with HTTP 400 "Integrated inference is not
# configured for this index"; presumably because the index was created with create_index rather
# than create_index_for_model. Confirm before relying on this cell.
results = dense_index.search(
    namespace="example-namespace",
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    }
)

# Print the results
# One line per hit: id, score rounded to 2 decimals, category and chunk text, padded into columns.
for hit in results['result']['hits']:
        print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | category: {hit['fields']['category']:<10} | text: {hit['fields']['chunk_text']:<50}")

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 20 Oct 2025 07:56:52 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-pinecone-api-version': '2025-04', 'x-envoy-upstream-service-time': '0', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Integrated inference is not configured for this index"},"status":400}


In [13]:
# Search the dense index and rerank results
# Same text query as the plain search, with a rerank step over the "chunk_text" field
# using the "bge-reranker-v2-m3" model, keeping the top 10 results.
# NOTE(review): depends on `query` and `dense_index` from earlier cells; the recorded output has
# execution count 13, lower than the cells above it, so it comes from an earlier kernel state.
reranked_results = dense_index.search(
    namespace="example-namespace",
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 10,
        "rank_fields": ["chunk_text"]
    }   
)

# Print the reranked results
for hit in reranked_results['result']['hits']:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['chunk_text']}, category: {hit['fields']['category']}")
    

id: rec1, score: 0.11, text: The Eiffel Tower was completed in 1889 and stands in Paris, France., category: history
id: rec38, score: 0.06, text: The Taj Mahal is a mausoleum built by Emperor Shah Jahan., category: history
id: rec7, score: 0.06, text: The Great Wall of China was built to protect against invasions., category: history
id: rec21, score: 0.02, text: The Statue of Liberty was a gift from France to the United States., category: history
id: rec17, score: 0.02, text: The Pyramids of Giza are among the Seven Wonders of the Ancient World., category: history
id: rec26, score: 0.01, text: Rome was once the center of a vast empire., category: history
id: rec15, score: 0.01, text: Leonardo da Vinci painted the Mona Lisa., category: art
id: rec5, score: 0.0, text: Shakespeare wrote many famous plays, including Hamlet and Macbeth., category: literature
id: rec47, score: 0.0, text: The Industrial Revolution transformed manufacturing and transportation., category: history
id: rec50, sco

In [3]:
# LangChain x Pinecone

In [36]:
# %env PINECONE_API_KEY=

# %env PINECONE_API_KEY

In [11]:
from langchain_openai import OpenAIEmbeddings

# OpenAI embedding model, authenticated with the key loaded from .env.
model_name = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=model_name, openai_api_key=config["OPEN_AI_KEY"])

In [29]:
from langchain_pinecone import PineconeEmbeddings

# Alternative embedding backend hosted by Pinecone.
# NOTE(review): this rebinds both `model_name` and `embeddings`, replacing the OpenAI embeddings
# defined in the previous cell; any vector store built afterwards uses this model. Confirm its
# output dimension matches the target index before mixing the two.
model_name = 'llama-text-embed-v2'  
embeddings = PineconeEmbeddings(  
    model=model_name
)


In [7]:
from langchain_pinecone import PineconeVectorStore  

# Bind a vector store to the "developer-quickstart-py" index using whichever `embeddings`
# object was defined last. The recorded run raised NameError because no embeddings cell had
# been executed in that kernel, so run one of the embeddings cells first.
# NOTE: this rebinds `vectorstore`, replacing the store created for "test-index-2" earlier.
vectorstore = PineconeVectorStore(index_name="developer-quickstart-py", embedding=embeddings)

  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


NameError: name 'embeddings' is not defined

In [41]:
query = "What is the Eiffel Tower"
# Ask the vector store for the three documents closest to the query.
vectorstore.similarity_search(query, k=3)

[Document(id='c105f698-e587-4edd-a31b-439fd34ae2f9', metadata={'_id': 'rec21', 'category': 'history'}, page_content='The Statue of Liberty was a gift from France to the United States.'),
 Document(id='a213c387-c497-4f81-a814-4891c9881dd5', metadata={'_id': 'rec17', 'category': 'history'}, page_content='The Pyramids of Giza are among the Seven Wonders of the Ancient World.'),
 Document(id='b7f08b56-c8af-4ebb-a9c9-7075cfb4ab81', metadata={'_id': 'rec1', 'category': 'history'}, page_content='The Eiffel Tower was completed in 1889 and stands in Paris, France.')]

In [55]:
from langchain_openai import ChatOpenAI  
from langchain.chains import RetrievalQA  
# completion llm  
# Separate chat model for question answering, with temperature 0.0.
llm = ChatOpenAI(  
    openai_api_key=config["OPEN_AI_KEY"],  
    model_name='gpt-3.5-turbo',  
    temperature=0.0  
)  
# "stuff" chain that answers using documents returned by the vector store's retriever.
# NOTE(review): the notebook's closing comment says retrieval from Pinecone is not working, and
# the recorded answer alone does not show which documents were retrieved; verify the retriever.
qa = RetrievalQA.from_chain_type(  
    llm=llm,  
    chain_type="stuff",  
    retriever=vectorstore.as_retriever()  
)  
# Reuses the `query` string defined in the similarity-search cell.
qa.invoke(query)  


{'query': 'What is the Eiffel Tower',
 'result': 'The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was completed in 1889 and is considered a global cultural icon of France.'}

In [16]:
# Not retrieving from Pinecone successfully