### Uncomment and run the following cells if you work on Google Colab :) Don't forget to change your runtime type to GPU!

In [None]:
 # Clone the vector_engine repository from GitHub into the current working directory.
 ! git clone https://github.com/kstathou/vector_engine

In [None]:
 cd vector_engine

In [None]:
 ! pip install -r requirements.txt

In [None]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

In [28]:
# Autoreload mode 2: reload all modules before each cell executes, so edits to
# local modules are picked up without restarting the kernel.
%autoreload 2
# Used to import data from local.
import pandas as pd

# Used to create the dense document vectors.
import torch
from sentence_transformers import SentenceTransformer

# Used to create and store the Faiss index.
import faiss
import numpy as np
import pickle
from pathlib import Path

# Used to do vector searches and display the results.
from vector_engine.utils import vector_search, id2details

In [29]:
# Load the transcription table from CSV and copy the row index into an
# explicit `id` column.
df = pd.read_csv('/content/transcribed_audios.csv').assign(id=lambda frame: frame.index)

In [30]:
# Preview the first three rows of the table.
df.head(3)

Unnamed: 0,audio_file,chunk_num,transcription,begin_secs,end_secs,id
0,podcasts/Ep8-JOW-LEAN_FINAL-with-NEWCREDITSMAS...,0,I'M STEVE STROGATS AND THIS IS THE JOY OF GOR ...,0.0,9.951313,0
1,podcasts/Ep8-JOW-LEAN_FINAL-with-NEWCREDITSMAS...,1,THAT TAKES YOU INTO SOME OF THE BIGGEST UNANSW...,9.951313,19.902625,1
2,podcasts/Ep8-JOW-LEAN_FINAL-with-NEWCREDITSMAS...,2,HOW MUCH MATH CAN A COMPUTER DO AND COULD COMP...,19.902625,29.853938,2


In [31]:
# Report how many distinct values the `id` column holds.
print(f"transcripts: {len(df['id'].unique())}")

transcripts: 687


In [None]:
# Load the pretrained sentence-level DistilBERT model by name.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# When CUDA is usable, move the model to the GPU; otherwise leave it in place.
if torch.cuda.is_available():
    model = model.to("cuda")
# Show which device the model ended up on.
print(model.device)

In [None]:
# Encode every transcription into a dense vector, showing a progress bar.
transcriptions = df["transcription"].tolist()
embeddings = model.encode(transcriptions, show_progress_bar=True)

In [34]:
# Print the shape of the embedding matrix.
# NOTE(review): the message says "abstract", but the vectors are built from
# df.transcription - consider rewording the label.
print(f'Shape of the vectorised abstract: {embeddings.shape}')

Shape of the vectorised abstract: (687, 768)


In [35]:
# Step 1: Faiss works on C-contiguous float32 matrices. Convert in a single
# vectorised call instead of rebuilding the array row by row in Python.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Step 2: Instantiate a flat L2 index sized to the embedding dimension
flat_index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Wrap the flat index in IndexIDMap so vectors are stored under our own IDs
index = faiss.IndexIDMap(flat_index)

# Step 4: Add vectors and their IDs (Faiss expects 64-bit integer IDs)
index.add_with_ids(embeddings, df.id.values.astype("int64"))

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 687


In [36]:
# Search the index with the stored vector of row 600, shaped as a one-row
# batch, and ask for its 10 nearest neighbours. D and I are the two arrays
# returned by the search (printed further down as L2 distances and IDs).
query_vector = embeddings[600].reshape(1, -1)
D, I = index.search(query_vector, k=10)

### Searching the index
The index we built will perform a k-nearest-neighbour search. We have to provide the number of neighbours to be returned. 

Let's query the index with a transcription chunk from our dataset and retrieve the 10 most relevant chunks. **The first result must be the query itself!**


In [41]:
# Free-text search query; the surrounding newlines are part of the string.
user_query = "\nmathematics\n"

In [42]:
# vector_search wraps the whole query pipeline. Its four arguments are:
# the list of queries, the sentence-level transformer, the Faiss index and
# the number of results to return.
D, I = vector_search([user_query], model, index, num_results=10)

# Flatten both result arrays to plain lists for printing.
distances = D.flatten().tolist()
neighbour_ids = I.flatten().tolist()
print(f'L2 distance: {distances}\n\nMAG paper IDs: {neighbour_ids}')

L2 distance: [211.16189575195312, 237.3350372314453, 243.3343505859375, 245.60272216796875, 252.97988891601562, 256.58837890625, 258.6418151855469, 260.4259033203125, 261.7326965332031, 271.40216064453125]

MAG paper IDs: [145, 44, 186, 34, 62, 551, 7, 33, 56, 187]


In [43]:
# Fetch the `transcription` text for each ID returned by the search.
id2details(df, I, 'transcription')

[['IS PROVING THERMS THAT THE MODERN MAPETITION IS USING SO RIGHT NOW IT SEEMS THAT MATHEMATICIANS ARE TEACHING MATH TO'],
 ["GRHYTHMS WHICH TRANSLATE THIS MATHEMATIC DOWN INTO THE AXIMS ALL IN THE SAME SYSTEM IT'S GIVEN LEAD A BIT OF AN EDGE WHEN IT COMES TO MATHEMATICS OVER THE OTHER THEORY RIVERS"],
 ["SOLUTION TO IT MATH IS HARD THIS IS THE PROBLEM  IS HARD IN SEVERAL DIFFERENT DISTINCT WAYS ONLY ONE THING WE'RE DOING RIGHT NOW IS WITH TEACHING THE COMPUTER SCIENTIS"],
 ['AND HE PROPOSED A SETTING UP OF A DATA BASE OF A MATHEMATICAL THEOREMS JUST THE STATEMENTS OF MATHEMATICAL THEOREMS A ALL S'],
 ["URNS MATHEMATICS INTO A PUZZLE GAME AND E AMATHEMASCAL THERUM IS ELEVEL OF THAT PUZZLED GAME AND IF YOU STATE THE THERUM THEN YOU'VE MADE THE LEVEL AND Y"],
 ["ER GLADTO TALK ABOUT IT WAN'T YOU KNOW WHAT'S HAPPENING AT THE FRONTIERS OF MATH PHYSICS COMPUTER SCIENCE AND B"],
 ["THEY'RE BUILDING A LIBRARY OF MATHKNOWLEDGE FOR LEAN TO REFERENCE TO HELP HUMAN MATHEMATICIANS CHECK THEIR PROO

In [44]:
# Fetch the `id` column for each returned ID - presumably a sanity check that
# the lookup keys line up with the values in I.
id2details(df, I, 'id')

[[145], [44], [186], [34], [62], [551], [7], [33], [56], [187]]