In [2]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import pickle
import time
import faiss
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import json 

file=open('data/train.json','r')

with file:
    data=json.load(file)

In [5]:
len(data)


76560

In [6]:
data[0].keys()

dict_keys(['document', 'messages', 'answers'])

In [7]:
data[0]['messages']

[{'role': 'user',
  'content': 'How many Americans are part of the federal food assistance program? '}]

In [7]:
import ollama 

def create_vector_store(document,model_name,sep='\n'):
    documents=list()
    vector_store=dict()
    file=document.split(sep)
    for chunk in file:
        if chunk!='\n':
            documents.append(chunk)
    for index,chunk in enumerate(documents):
        vector=ollama.embeddings(model=model_name,prompt=chunk)
        vector_store.update({index:vector['embedding']})
    return vector_store,documents

In [43]:
doc_vector,documents=create_vector_store(data[0]['document'],'all-minilm',sep='.')

In [8]:
import numpy as np

embeddings_small=np.load('data/small_embeddings.npy')
queries_small=np.load('data/small_queries.npy')

In [9]:
embeddings_large=np.load('data/large_embeddings.npy')
queries_large=np.load('data/large_queries.npy')

In [10]:
embeddings_small.shape

(76560, 384)

In [7]:
queries_large.shape

(76560, 512)

In [9]:
import time
import sklearn as sk
from sklearn.neighbors import KDTree

t1=time.time()
tree_small = KDTree(embeddings_small, leaf_size=400)
t2=time.time()
print('KD Tree build time : {0:3f}[s]'.format(t2-t1))

KD Tree build time : 0.518381[s]


In [10]:
t1=time.time()
tree_large = KDTree(embeddings_large, leaf_size=400)
t2=time.time()
print('KD Tree build time : {0:3f}[s]'.format(t2-t1))

KD Tree build time : 0.977605[s]


In [11]:
queries_large[:100].shape

(100, 512)

In [27]:
t1=time.time()
ind = tree_small.query(queries_small[:1], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 1 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 1 docs : 0.046069[s]


In [28]:
t1=time.time()
ind = tree_large.query(queries_large[:1], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 1 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 1 docs : 0.045902[s]


In [36]:
t1=time.time()
ind = tree_small.query(queries_small[:100], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 100 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 100 docs : 2.179351[s]


In [30]:
t1=time.time()
ind = tree_large.query(queries_large[:100], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 100 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 100 docs : 3.069783[s]


In [25]:
t1=time.time()
ind = tree_small.query(queries_small[:1000], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 1000 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 1000 docs : 21.101429[s]


In [26]:
t1=time.time()
ind = tree_large.query(queries_large[:1000], k=1,return_distance=False,dualtree=True)
t2=time.time()
print('KD Tree query time 1000 docs : {0:3f}[s]'.format(t2-t1))

KD Tree query time 1000 docs : 30.423240[s]


In [31]:
import ollama
from ollama import chat

In [32]:
data[0]['messages'][0]['content']

'How many Americans are part of the federal food assistance program? '

In [33]:
data[0]['answers']

['31 million']

In [37]:
messages=list()
answers=list()
questions=list()
for ux,ix in enumerate(ind):
  question=data[ux]['messages'][0]['content']
  text=data[ix[0]]['document']
  messages.append({
    'role': 'user',
    'content': f'Respond the following question : "{question}" based on the following text : "{text}"',
  })
  questions.append(question)
  answers.append(data[ux]['answers'])

In [38]:
num=0
for q,a,m in zip(questions,answers,messages):
    if num==10:
        break
    response = chat('phi3', messages=[m])
    print(f'Question : {q}')
    print(f'Real Answers : {a}')
    print(f'Response : {response['message']['content']}')
    num+=1
    print('----------------')

Question : How many Americans are part of the federal food assistance program? 
Real Answers : ['31 million']
Response : The provided text does not directly state the number of Americans who are part of the federal food assistance program. However, it mentions about 425,000 more students participating in the National School Lunch Program and the School Breakfast Program. To obtain an accurate count of how many Americans are part of the entire federal food assistance program (which includes programs like SNAP - Supplemental Nutrition Assistance Program), further research beyond this text would be required.
----------------
Question : How much did Sean Callebs live on? 
Real Answers : ['$176']
Response : The text does not provide information on how much Sean Callebs, a former co-host of "Live! With Regis and Kelly," lived on financially after his tenure on the show ended in January when he announced his retirement. The details provided are related to Philbin's last episode and potential 

In [None]:
embeddings_small.shape

In [8]:
import faiss 

d=embeddings_small.shape[1]
# Number of clusters used for faiss. Select a value 4*sqrt(N) to 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
n_clusters = 16*np.sqrt(embeddings_small.shape[1])
t1=time.time()
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, n_clusters, faiss.METRIC_INNER_PRODUCT)
print(index.is_trained)
index.add(embeddings_small)
t2=time.time()
print(index.ntotal)
print('FAISS build time : {0:3f}[s]'.format(t2-t1))

True
76560
FAISS build time : 0.038509[s]


In [13]:
print(index.ntotal)

76560


In [10]:
import time 

t4=time.time()
D, I = index.search(queries_small[:1], k=1)
t5=time.time()
print('FAISS query time 1 docs : : {0:3f}[s]'.format(t5-t4))

FAISS query time 1 docs : : 0.008810[s]


In [1]:
queries_small[:10]

NameError: name 'queries_small' is not defined

In [15]:
t4=time.time()
D, I = index.search(queries_small[:2], k=1)
t5=time.time()
print('FAISS query time 1 docs : : {0:3f}[s]'.format(t5-t4))

: 