In [22]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [23]:
# Cell 1: Imports and API key variables

# Install necessary packages
!pip install openai pinecone-client

# All necessary imports
from openai import OpenAI
from pinecone import Pinecone
from pinecone import ServerlessSpec

#Additional imports
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
import numpy as np



In [24]:
# Cell 2: Variable definitions
OPENAI_API_KEY = 'openai_api_key'
PINECONE_API_KEY='pinecone_api_key'
PINECONE_CLOUD = 'aws'
PINECONE_REGION = 'us-east-1'

# Initialize OpenAI client with the provided API key
client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

MAX_TOKENS = 225
PINECONE_INDEX_NAME = 'chat-history-69'

In [25]:
# Cell 3: Define the conversation history
history = [
"1: User: Hi there! How are you doing today? | Bot: Hello! I'm doing great, thank you! How can I assist you today?",
"2: User: What's the weather like today in New York? | Bot: Today in New York, it's sunny with a slight chance of rain.",
"3: User: Great! Do you have any good lunch suggestions? | Bot: Sure! How about trying a new salad recipe?",
"4: User: That sounds healthy. Any specific recipes? | Bot: You could try a quinoa salad with avocado and chicken.",
"5: User: Sounds delicious! I'll try it. What about dinner? | Bot: For dinner, you could make grilled salmon with vegetables.",
"6: User: Thanks for the suggestions! Any dessert ideas? | Bot: How about a simple fruit salad or yogurt with honey?",
"7: User: Perfect! Now, what are some good exercises? | Bot: You can try a mix of cardio and strength training exercises.",
"8: User: Any specific recommendations for cardio? | Bot: Running, cycling, and swimming are all excellent cardio exercises.",
"9: User: I'll start with running. Can you recommend any books? | Bot: 'Atomic Habits' by James Clear is a highly recommended book.",
"10: User: I'll check it out. What hobbies can I take up? | Bot: You could explore painting, hiking, or learning a new instrument.",
"11: User: Hiking sounds fun! Any specific trails? | Bot: There are great trails in the Rockies and the Appalachian Mountains.",
"12: User: I'll plan a trip. What about indoor activities? | Bot: Indoor activities like reading, cooking, or playing board games.",
"13: User: Nice! Any good board games? | Bot: Settlers of Catan and Ticket to Ride are both excellent choices.",
"14: User: I'll try them out. Any movie recommendations? | Bot: 'Inception' and 'The Matrix' are must-watch movies.",
"15: User: I love those movies! Any TV shows? | Bot: 'Breaking Bad' and 'Stranger Things' are very popular.",
"16: User: Great choices! What about podcasts? | Bot: 'How I Built This' and 'The Daily' are very informative.",
"17: User: Thanks! What are some good travel destinations? | Bot: Paris, Tokyo, and Bali are amazing travel spots.",
"18: User: I'll add them to my list. Any packing tips? | Bot: Roll your clothes to save space and use packing cubes.",
"19: User: That's helpful! What about travel insurance? | Bot: Always get travel insurance for safety and peace of mind.",
"20: User: Thanks for the tips! Any last advice? | Bot: Just enjoy your journey and make the most out of your experiences."
]

In [26]:
def cluster_conversations(history, n_clusters=3, linkage='average'):
    """
    Cluster conversations using Agglomerative Clustering and append cluster names to each conversation entry.

    Args:
        history (list of str): List of conversation strings.
        n_clusters (int): Number of clusters to find.
        linkage (str): Linkage criterion for clustering.

    Returns:
        list of str: Updated conversation history with cluster labels.
    """
    # Load the SpaCy model
    nlp = spacy.load("en_core_web_md")

    # Generate embeddings for each history item
    history_embeddings = np.array([nlp(entry).vector for entry in history])

    # Calculate pairwise cosine similarity between each conversation
    similarity_matrix = cosine_similarity(history_embeddings)

    # Convert similarity to distance (since clustering algorithms work on distances)
    distance_matrix = 1 - similarity_matrix

    # Perform Agglomerative Clustering with 'precomputed' metric and average linkage
    clustering = AgglomerativeClustering(
        n_clusters=n_clusters,
        affinity='precomputed',  # Set affinity to precomputed since we're passing a distance matrix
        linkage=linkage  # Use the chosen linkage criterion
    )
    clustering.fit(distance_matrix)

    # Assign clusters to each conversation entry
    labels = clustering.labels_

    # Append cluster names to the history entries
    for i, label in enumerate(labels):
        history[i] = f"{history[i]} | Cluster: Cluster {label + 1}"

    return history

history = cluster_conversations(history)

In [27]:
# Cell 4: Function to add embeddings to Pinecone
def add_embeddings_to_pinecone(history, index_name='chat-history-69'):
    """Add embeddings to Pinecone without batch processing"""

    # Create index if it doesn't exist
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=3072,
            metric='dotproduct',
            spec=ServerlessSpec(
                cloud=PINECONE_CLOUD,
                region=PINECONE_REGION
            )
        )

    index = pc.Index(index_name)

    # Process all messages
    vectors = []

    for msg in history:
        msg_num = int(msg.split(':')[0])

        # Extract user message and bot response
        parts = msg.split(" | ")
        user_msg = parts[0].split(": User: ")[1]
        bot_msg = parts[1].split("Bot: ")[1]
        cluster_name = parts[2].split(": ")[1]

        # Split user message by period and get the second part if it exists
        user_message_split = user_msg.split(".")
        if len(user_message_split) > 1:
            user_msg = user_message_split[1]

        # Create embedding for combined message
        combined_text = f"{cluster_name}{user_msg}"

        # Get embedding directly using OpenAI API
        response = client.embeddings.create(
            model="text-embedding-3-large",
            input= combined_text
        )
        embedding = response.data[0].embedding

        vectors.append({
            'id': f'msg_{msg_num}',
            'values': embedding,
            'metadata': {
                'message_num': msg_num,
                'user_message': user_msg,
                'bot_message': bot_msg,
                'full_text': msg
            }
        })

    # Upsert all vectors in one call
    index.upsert(vectors=vectors)

    # Get index stats
    index_stats = index.describe_index_stats()
    print(f"Index '{index_name}' stats: {index_stats}")

# Add embeddings to Pinecone without batching
add_embeddings_to_pinecone(history)

Index 'chat-history-69' stats: {'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}


In [28]:
# Cell 5: Defining the RAG mechanism
def retrieve_relevant_history(query, index_name='chat-history-69'):
    """Retrieve relevant history based on semantic similarity to query"""
    index = pc.Index(index_name)

    # Get query embedding
    query_embedding = client.embeddings.create(
            model="text-embedding-3-large",
            input = query
        ).data[0].embedding

    # Search with increased top_k and lower threshold
    results = index.query(
        vector=query_embedding,
        top_k=3,
        include_metadata=True,
        include_values=False
    )

    # Filter and sort results
    relevant_messages = []
    for match in results.matches:
        msg_num = match.metadata['message_num']
        relevant_messages.append({
            'message_num': msg_num,
            'text': match.metadata['bot_message'],
            'score': match.score
        })

    # Sort by message number
    #relevant_messages.sort(key=lambda x: x['message_num'])
    return relevant_messages

In [29]:
# Cell 6: Function to prepare the prompt
def prepare_prompt(test_prompt, history, index_name='chat-history-69'):
    """Prepare prompt with improved context integration"""
    # Retrieve relevant messages
    relevant_messages = retrieve_relevant_history(test_prompt, index_name=index_name)

    # Format context string
    context = "\nRelevant history:\n"
    context_refs = []

    for msg in relevant_messages:
        context += f"{msg['text']}\n"
        context_refs.append(str(msg['message_num']))

    # Combine prompt with context
    final_prompt = f"""Your a friendly chatbot and instructed to answer the user question based solely on the context and continue the good flow of conversation.
                        {context}\nCurrent query: {test_prompt}"""

    return final_prompt, context_refs

In [30]:
# Cell 7: Function to test the prompt
def test_final_prompt():
    """Test the prompt with improved response handling"""
    final_test_prompt = "Do you think it will help me stay fit?"

    prepared_prompt, context_refs = prepare_prompt(final_test_prompt, history)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant. Reference the relevant history when responding. You must answer the user question based mainly on the context provided. If you are not sure of the answer just say I don't know, don't hallucinate."
            },
            {"role": "user", "content": prepared_prompt}
        ],
        max_tokens=MAX_TOKENS,
        temperature=0.7
    )

    print(f"Final Test Prompt: {final_test_prompt}\n")
    print(f"Context Referred: Messages {', '.join(context_refs)}\n")
    print(f"Final Response: {response.choices[0].message.content}")

# Run the test
test_final_prompt()

Final Test Prompt: Do you think it will help me stay fit?

Context Referred: Messages 8.0, 7.0, 10.0

Final Response: Yes, incorporating a mix of cardio and strength training exercises, along with activities like hiking, can definitely help you stay fit. It's important to find activities you enjoy so you can maintain a consistent routine.
