# Retrieval Augmented Generation (RAG) and Vector Databases

In [3]:
#%pip install getenv openai==1.12.0
%pip install datetime

Defaulting to user installation because normal site-packages is not writeable
Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Downloading DateTime-5.5-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading zope.interface-6.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.3/247.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zope.interface, datetime
Successfully installed datetime-5.5

In [1]:
import os
import pandas as pd
import numpy as np
from ollama import Client
from dotenv import load_dotenv
load_dotenv()

client = Client(
    host = os.environ['OLLAMA_ENDPOINT']
    )
model = "mistral" # Change the model accordingly

## Creating our Knowledge base

Creating a Azure Cosmos DB database


In [3]:
pip install azure-cosmos

Note: you may need to restart the kernel to use updated packages.


In [4]:
## create your cosmoss db on Azure CLI using the following commands
## az login
## az group create -n <resource-group-name> -l <location>
## az cosmosdb create -n <cosmos-db-name> -r <resource-group-name>
## az cosmosdb list-keys -n <cosmos-db-name> -g <resource-group-name>

## Once done navigate to data explorer and create a new database and a new container


In [2]:
#from azure.cosmos import CosmosClient

# Initialize Cosmos Client
#url = os.getenv('COSMOS_DB_ENDPOINT')
#key = os.getenv('COSMOS_DB_KEY')
#client = CosmosClient(url, credential=key)

# Select database
#database_name = 'rag-cosmos-db'
#database = client.get_database_client(database_name)

# Select container
#container_name = 'data'
#container = database.get_container_client(container_name)

import psycopg2

database = os.environ["POSTGRES_SERVICE_URL"]

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host='localhost',
    database='vectordb',
    user='postgres',
    password='password'
)

# Create a cursor object
cur = conn.cursor()

# Create a table with pgvector type
cur.execute('CREATE TABLE my_table (id SERIAL PRIMARY KEY,name TEXT,feature_vector VECTOR(128));')

# Commit changes and close the cursor
conn.commit()
cur.close()

DuplicateTable: relation "my_table" already exists


In [5]:
import pandas as pd

# Initialize an empty DataFrame
df = pd.DataFrame(columns=['path', 'text'])


# splitting our data into chunks
data_paths= ["data/frameworks.md", "data/own_framework.md", "data/perceptron.md"]

for path in data_paths:
    with open(path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Append the file path and text to the DataFrame
    df = df._append({'path': path, 'text': file_content}, ignore_index=True)

df.head()

Unnamed: 0,path,text
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...
1,data/own_framework.md,# Introduction to Neural Networks. Multi-Layer...
2,data/perceptron.md,# Introduction to Neural Networks: Perceptron\...


In [6]:
def split_text(text, max_length, min_length):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) < max_length and len(' '.join(current_chunk)) > min_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []

    # If the last chunk didn't reach the minimum length, add it anyway
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Assuming analyzed_df is a pandas DataFrame and 'output_content' is a column in that DataFrame
splitted_df = df.copy()
splitted_df['chunks'] = splitted_df['text'].apply(lambda x: split_text(x, 400, 300))

splitted_df

Unnamed: 0,path,text,chunks
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,[# Neural Network Frameworks As we have learne...
1,data/own_framework.md,# Introduction to Neural Networks. Multi-Layer...,[# Introduction to Neural Networks. Multi-Laye...
2,data/perceptron.md,# Introduction to Neural Networks: Perceptron\...,[# Introduction to Neural Networks: Perceptron...


In [7]:
# Assuming 'chunks' is a column of lists in the DataFrame splitted_df, we will split the chunks into different rows
flattened_df = splitted_df.explode('chunks')

flattened_df.head()

Unnamed: 0,path,text,chunks
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke..."


## Converting our text to embeddings

Converting out text  to embeddings, and storing them in our database in chunks

In [12]:
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_KEY") 
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") 
openai.api_version = "2023-07-01-preview"



In [13]:
from openai import OpenAI
client = OpenAI(api_key=os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"))

In [8]:
def create_embeddings(text, model=model):
    # Create embeddings for each document chunk
    embeddings = client.embeddings(model=model, prompt = text)
    return embeddings['embedding']

#embeddings for the first chunk
create_embeddings(flattened_df['chunks'].iloc[0])

[-0.8921346068382263,
 8.52659797668457,
 -3.0948145389556885,
 -2.9111342430114746,
 10.425239562988281,
 -5.898595809936523,
 -5.830853462219238,
 4.965948581695557,
 2.139979839324951,
 -0.7197437882423401,
 -1.6024450063705444,
 8.824053764343262,
 -4.8895263671875,
 0.3656989634037018,
 3.70275616645813,
 -9.930574417114258,
 6.048618316650391,
 -1.2650094032287598,
 7.475633144378662,
 6.2945332527160645,
 3.152017116546631,
 -2.9245221614837646,
 -5.490932464599609,
 5.333406925201416,
 -5.793164253234863,
 -1.7865945100784302,
 2.2408745288848877,
 -1.04669988155365,
 0.20598436892032623,
 3.2339589595794678,
 5.0343194007873535,
 -1.582205057144165,
 -2.825143337249756,
 -0.3908259868621826,
 -1.6198183298110962,
 3.430479049682617,
 -9.541641235351562,
 -1.6861480474472046,
 -7.946754455566406,
 10.17497444152832,
 2.904057264328003,
 -0.14150166511535645,
 -1.014401912689209,
 0.9184861779212952,
 -2.9481093883514404,
 2.119143486022949,
 -0.5214941501617432,
 -0.98495286703

In [9]:
cat = create_embeddings("cat")
cat

[-3.0885839462280273,
 -0.9665231704711914,
 -3.7123684883117676,
 -7.576166152954102,
 -5.440977573394775,
 -2.8460888862609863,
 -0.9820046424865723,
 -0.9961654543876648,
 5.756304740905762,
 0.12076100707054138,
 -3.511246681213379,
 -3.8843882083892822,
 -6.7444748878479,
 1.785901665687561,
 8.157567024230957,
 3.8194568157196045,
 2.216707944869995,
 -1.1053752899169922,
 1.7403157949447632,
 -7.9841437339782715,
 -5.2744574546813965,
 -6.830099582672119,
 -1.6660008430480957,
 16.30977439880371,
 0.9059739708900452,
 -0.5734777450561523,
 -1.2666869163513184,
 -2.3585450649261475,
 8.5783052444458,
 6.178630352020264,
 -3.8375163078308105,
 0.601824939250946,
 -1.9278286695480347,
 0.2557612657546997,
 -0.392887681722641,
 -1.808225393295288,
 -7.630749702453613,
 1.226311206817627,
 -6.0335774421691895,
 10.246954917907715,
 3.666661500930786,
 0.7611777782440186,
 0.18485736846923828,
 -4.3145904541015625,
 2.8262486457824707,
 -6.521684646606445,
 -0.7981764674186707,
 3.166

In [10]:
# create embeddings for the whole data chunks and store them in a list

embeddings = []
for chunk in flattened_df['chunks']:
    embeddings.append(create_embeddings(chunk))

# store the embeddings in the dataframe
flattened_df['embeddings'] = embeddings

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...,"[-0.8921346068382263, 8.52659797668457, -3.094..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...,"[0.2993680536746979, 6.695285797119141, -0.604..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...,"[-0.13819558918476105, 8.018620491027832, 0.93..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...,"[2.900969982147217, 5.173274993896484, -0.6027..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke...","[-2.3177270889282227, -5.472762107849121, -5.5..."


# Retrieval

Vector search and similiarity between our prompt and the database

### Creating an search index and reranking

In [11]:
from sklearn.neighbors import NearestNeighbors

embeddings = flattened_df['embeddings'].to_list()

# Create the search index
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(embeddings)

# To query the index, you can use the kneighbors method
distances, indices = nbrs.kneighbors(embeddings)

# Store the indices and distances in the DataFrame
flattened_df['indices'] = indices.tolist()
flattened_df['distances'] = distances.tolist()

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings,indices,distances
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...,"[-0.8921346068382263, 8.52659797668457, -3.094...","[0, 5, 32, 27, 39]","[0.0, 344.7489186561398, 392.5099876298633, 39..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...,"[0.2993680536746979, 6.695285797119141, -0.604...","[1, 6, 37, 21, 27]","[0.0, 415.62151564591255, 424.5846534253134, 4..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...,"[-0.13819558918476105, 8.018620491027832, 0.93...","[2, 34, 31, 6, 3]","[0.0, 390.68556193111334, 400.27496385066115, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...,"[2.900969982147217, 5.173274993896484, -0.6027...","[3, 11, 4, 2, 6]","[0.0, 313.17887449093337, 402.17040979096464, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke...","[-2.3177270889282227, -5.472762107849121, -5.5...","[4, 11, 3, 6, 5]","[0.0, 382.3149493003382, 402.17040979096464, 4..."


In [13]:
# Your text question
question = "what is a perceptron?"

# Convert the question to a query vector
query_vector = create_embeddings(question)  # You need to define this function

# Find the most similar documents
distances, indices = nbrs.kneighbors([query_vector])

index = []
# Print the most similar documents
for i in range(3):
    index = indices[0][i]
    for index in indices[0]:
        print(flattened_df['chunks'].iloc[index])
        print(flattened_df['path'].iloc[index])
        print(flattened_df['distances'].iloc[index])
    else:
        print(f"Index {index} not found in DataFrame")

in our model, in which case the input vector would be a vector of size N. A perceptron is a **binary classification** model, i.e. it can distinguish between two classes of input data. We will assume that for each input vector x the output of our perceptron would be either +1 or -1, depending on the class.
data/perceptron.md
[0.0, 367.75447237096495, 368.6303934346619, 390.3733640086858, 394.0609895636897]
can calculate derivatives as: * &part;&lagran;/&part;w<sub>2</sub> = (&part;&lagran;/&part;&sigma;)(&part;&sigma;/&part;z<sub>2</sub>)(&part;z<sub>2</sub>/&part;w<sub>2</sub>) * &part;&lagran;/&part;w<sub>1</sub> = (&part;&lagran;/&part;&sigma;)(&part;&sigma;/&part;z<sub>2</sub>)(&part;z<sub>2</sub>/&part;&alpha;)(&part;&alpha;/&part;z<sub>1</sub>)(&part;z<sub>1</sub>/&part;w<sub>1</sub>)
data/own_framework.md
[0.0, 314.1944643242463, 385.406289484829, 390.3733640086858, 403.0711126839644]
**multi-class classification** in addition to two-class * solve **regression problems** in addit

## Putting it all together to answer a question

In [22]:
import os
import openai

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")

In [14]:
user_input = "what is a perceptron?"

def chatbot(user_input):
    # Convert the question to a query vector
    query_vector = create_embeddings(user_input)

    # Find the most similar documents
    distances, indices = nbrs.kneighbors([query_vector])

    # add documents to query  to provide context
    history = []
    for index in indices[0]:
        history.append(flattened_df['chunks'].iloc[index])

    # combine the history and the user input
    history.append(user_input)

    # create a message object
    messages=[
        {"role": "system", "content": "You are an AI assiatant that helps with AI questions."},
        {"role": "user", "content": history[-1]}
    ]

    options = {
        "temperature":0.7,
        "num_predict":800,
    }
    # use chat completion to generate a response
    response = client.chat(
        model=model,
        options=options,
        messages=messages
    )

    return response['message']['content']

chatbot(user_input)

## Testing and evaluation

A basic example of how you can use Mean Average Precision (MAP) to evaluate the responses of your model based on their relevance.

In [25]:
from sklearn.metrics import average_precision_score

# Define your test cases
test_cases = [
    {
        "query": "What is a perceptron?",
        "relevant_responses": ["A perceptron is a type of artificial neuron.", "It's a binary classifier used in machine learning."],
        "irrelevant_responses": ["A perceptron is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is machine learning?",
        "relevant_responses": ["Machine learning is a method of data analysis that automates analytical model building.", "It's a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention."],
        "irrelevant_responses": ["Machine learning is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is deep learning?",
        "relevant_responses": ["Deep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled.", "It's a type of machine learning."],
        "irrelevant_responses": ["Deep learning is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is a neural network?",
        "relevant_responses": ["A neural network is a series of algorithms that endeavors to recognize underlying relationships in a set of data through a process that mimics the way the human brain operates.", "It's a type of machine learning."],
        "irrelevant_responses": ["A neural network is a type of fruit.", "It's a type of car."]
    }
]

# Initialize the total average precision
total_average_precision = 0

# Test the RAG application
for test_case in test_cases:
    query = test_case["query"]
    relevant_responses = test_case["relevant_responses"]
    irrelevant_responses = test_case["irrelevant_responses"]

    # Generate a response using your RAG application
    response = chatbot(query) 

    # Create a list of all responses and a list of true binary labels
    all_responses = relevant_responses + irrelevant_responses
    true_labels = [1] * len(relevant_responses) + [0] * len(irrelevant_responses)

    # Create a list of predicted scores based on whether the response is the generated response
    predicted_scores = [1 if resp == response else 0 for resp in all_responses]

    # Calculate the average precision for this query
    average_precision = average_precision_score(true_labels, predicted_scores)

    # Add the average precision to the total average precision
    total_average_precision += average_precision

# Calculate the mean average precision
mean_average_precision = total_average_precision / len(test_cases)

In [26]:
mean_average_precision

0.5