In [1]:
import datasets
from sentence_transformers import InputExample
from tqdm.auto import tqdm
from sentence_transformers import datasets as datasets_dup
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses

import pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Loading Model 
model = SentenceTransformer('/home/rishabh/GitHub/Knowledge-Graph/model-checkpoint')

In [3]:
### Loading the dataset
squad_dev = datasets.load_dataset('squad_v2', split='validation')
print('-'*50, 'Loading Dataset', '-'*50, sep='\n')
print(squad_dev[0])


### Pre-Processing Data
print('-'*50, 'Processing Dataset', '-'*50, sep='\n')
unique_contexts = []
unique_ids = []

# make list of IDs that represent only first instance of
# each context
for row in squad_dev:
    if row['context'] not in unique_contexts:
        unique_contexts.append(row['context'])
        unique_ids.append(row['id'])

# now filter out any samples that aren't included in unique IDs
squad_dev = squad_dev.filter(lambda x: True if x['id'] in unique_ids else False)
print(squad_dev)
# print(squad_dev[0])

Found cached dataset squad_v2 (/home/rishabh/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


--------------------------------------------------
Loading Dataset
--------------------------------------------------
{'id': '56ddde6b9a695914005b9628', 'title': 'Normans', 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.', 'question': 'In what country is Normandy located?', 'answers': {'text'

Loading cached processed dataset at /home/rishabh/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d/cache-ec1d0790507cf203.arrow


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1204
})


In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

In [7]:

# convert the squad_dev data into a pandas DataFrame
df = pd.DataFrame(squad_dev)
df.head()

Unnamed: 0,id,title,context,question,answers
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'Franc..."
1,56dddf4066d3e219004dad5f,Normans,"The Norman dynasty had a major political, cult...",Who was the duke in the battle of Hastings?,"{'text': ['William the Conqueror', 'William th..."
2,56dde0379a695914005b9636,Normans,"The English name ""Normans"" comes from the Fren...",What is the original meaning of the word Norman?,"{'text': ['Viking', 'Norseman, Viking', 'Norse..."
3,56dde0ba66d3e219004dad75,Normans,"In the course of the 10th century, the initial...",When was the Duchy of Normandy founded?,"{'text': ['911', '911', '911'], 'answer_start'..."
4,56dde1d966d3e219004dad8d,Normans,"Before Rollo's arrival, its populations did no...",Who upon arriving gave the original viking set...,"{'text': ['Rollo', 'Rollo', 'Rollo'], 'answer_..."


In [8]:
# compute the embeddings for each context using the model
embeddings = np.array([model.encode([c]).squeeze() for c in df['context']])
df['embedding'] = list(embeddings)
df.head()

Unnamed: 0,id,title,context,question,answers,embedding
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'Franc...","[-0.003194691, 0.25825036, -0.02514003, 0.0068..."
1,56dddf4066d3e219004dad5f,Normans,"The Norman dynasty had a major political, cult...",Who was the duke in the battle of Hastings?,"{'text': ['William the Conqueror', 'William th...","[-0.003741815, 0.04705006, 0.011801047, 0.0031..."
2,56dde0379a695914005b9636,Normans,"The English name ""Normans"" comes from the Fren...",What is the original meaning of the word Norman?,"{'text': ['Viking', 'Norseman, Viking', 'Norse...","[0.02605035, 0.19965032, 0.09081283, 0.0789576..."
3,56dde0ba66d3e219004dad75,Normans,"In the course of the 10th century, the initial...",When was the Duchy of Normandy founded?,"{'text': ['911', '911', '911'], 'answer_start'...","[0.09164262, 0.12337525, -0.0092817675, 0.0851..."
4,56dde1d966d3e219004dad8d,Normans,"Before Rollo's arrival, its populations did no...",Who upon arriving gave the original viking set...,"{'text': ['Rollo', 'Rollo', 'Rollo'], 'answer_...","[0.027183965, 0.15210478, -0.024414279, -3.006..."


In [18]:
# compute the cosine similarity between each embedding and the query
def querydb(query):
    query_embedding = np.array(model.encode([query]).squeeze())

    df['cosine_sim'] = df['embedding'].apply(lambda x: 1 - cosine(x, query_embedding))

    # sort the DataFrame by cosine similarity and return the top k results
    k = 5
    result = df.sort_values('cosine_sim', ascending=False).head(k)

    # format the result as a list of dictionaries
    # output = []
    # for i, row in result.iterrows():
    #     output.append({
    #         'id': row['title'],
    #         # 'encoding': row['embedding'].tolist(),
    #         'metadata': {'text': row['context']}
    #     })
    # print(output)

    return result

In [19]:
query = "When were the Normans in Normandy?"
res = querydb(query)

[{'id': 'Normans', 'metadata': {'text': 'The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Sarac

In [14]:
query = "How many outputs are expected for each input in a function problem"
querydb(query).head()

Unnamed: 0,id,title,context,question,answers,embedding,cosine_sim
47,56e19724cd28a01900c679f6,Computational_complexity_theory,A function problem is a computational problem ...,A function problem is an example of what?,"{'text': ['a computational problem', 'a comput...","[0.08159484, -0.14719279, -0.043740097, 0.1237...",0.760304
42,56e17a7ccd28a01900c679a1,Computational_complexity_theory,A computational problem can be viewed as an in...,What is the name given to the input string of ...,"{'text': ['problem instance', 'a problem insta...","[0.021695478, -0.2981989, -0.054458123, 0.0954...",0.644326
48,56e1a0dccd28a01900c67a2e,Computational_complexity_theory,It is tempting to think that the notion of fun...,How can function problems typically be restated?,"{'text': ['decision problems', 'as decision pr...","[0.055308133, -0.29686043, -0.035305742, 0.126...",0.618741
50,56e1a564cd28a01900c67a48,Computational_complexity_theory,"If the input size is n, the time taken can be ...",Whose thesis states that the solution to a pro...,"{'text': ['Cobham's thesis', 'Cobham's', 'Cobh...","[-0.022643154, -0.4174587, -0.05235702, 0.1231...",0.573708
45,56e190bce3433e1400422fc8,Computational_complexity_theory,Decision problems are one of the central objec...,What kind of problems are one of the main topi...,"{'text': ['Decision problems', 'Decision probl...","[0.06764113, -0.19279698, -0.034374814, 0.1273...",0.571173


In [15]:
query = "Who used Islamic, Lombard, etc construction techniques in the Mediterranean?"
querydb(query).head()

Unnamed: 0,id,title,context,question,answers,embedding,cosine_sim
32,56de4b074396321400ee2793,Normans,"In England, the period of Norman architecture ...",What architecture type came after Norman in En...,"{'text': ['Early Gothic', 'Early Gothic', 'Ear...","[-0.10846567, 0.124409616, 0.020839885, -0.000...",0.578037
36,56de51244396321400ee27ef,Normans,"In Britain, Norman art primarily survives as s...",What is the most important type of Norman art ...,"{'text': ['mosaics', 'mosaics', 'mosaics'], 'a...","[-0.012012731, 0.17504457, -0.03161043, 0.0854...",0.516022
31,56de4a89cffd8e1900b4b7bd,Normans,Norman architecture typically stands out as a ...,What is the Norman architecture idiom?,"{'text': ['Romanesque', 'Romanesque', 'Romanes...","[-0.043765236, -0.00012196721, 0.006531468, -0...",0.410432
35,56de4c324396321400ee27ab,Normans,By far the most famous work of Norman art is t...,What kind of needlework was used in the creati...,"{'text': ['embroidery', 'embroidery', 'embroid...","[0.02596968, 0.010251664, -0.038997218, 0.0409...",0.362928
778,57287c142ca10214002da3d0,Yuan_dynasty,The Yuan undertook extensive public works. Amo...,What astronomer worked for Kublai?,"{'text': ['Guo Shoujing', 'Guo Shoujing', 'Guo...","[0.0374789, 0.24047974, -0.007026191, 0.057039...",0.348381
