In [1]:
import numpy as np
import pandas as pd

In [2]:
# Parse ama.txt into rows of [tag, question, answer].
# The first character of each line decides its role:
#   '#' -> sets the current tag
#   'Q' -> sets the current question
#   'A' -> sets the answer and emits a row with the current tag/question
# Lines with any other first character are ignored.
amaData = []
tag = Q = A = None
with open("ama.txt", "rt") as amafile:
    for line in amafile:
        prefix, text = line[:1], line[1:].strip()
        if prefix == '#':
            tag = text
        elif prefix == 'Q':
            Q = text
        elif prefix == 'A':
            A = text
            # A row is only emitted once its answer has been read.
            amaData.append([tag, Q, A])
    

In [3]:
# One row per parsed [tag, question, answer] triple.
df = pd.DataFrame(
    data=amaData,
    columns=["tag", "question", "answer"],
)

In [4]:
# Give every row a 1-based integer id derived from its index label;
# these ids are what the search index stores alongside each embedding.
df = df.assign(id=df.index + 1)
df

Unnamed: 0,tag,question,answer,id
0,Course-Related,"In any of the quizzes under this program, if t...",The number of attempts is not counted for Phas...,1
1,Course-Related,Are there any hands-on projects during the Pha...,There are no projects in Phase 1. The projects...,2
2,Course-Related,Is it compulsory to know how Flask or Django w...,Technical questions can be answered by your pe...,3
3,Course-Related,As I was going through the program I noticed t...,You're correct in that there is no project req...,4
4,Course-Related,"Hi I can't find email,what tools are we suppos...",Hey!!You can find the list of necessary tools ...,5
...,...,...,...,...
101,Miscellaneous,Will this slack workspace remain after phase 1?,This Slack space will not be in use after Phas...,102
102,Miscellaneous,Can we get the highlight of the mid term feedb...,While we might not be able to disclose every m...,103
103,Miscellaneous,What metrics do you use to evaluate the overal...,'% of students from the whole cohort that comp...,104
104,Miscellaneous,Will there be future SUSE scholarships?,We currently don’t have confirmation of future...,105


## Try sentence encoder

In [5]:
import torch
from sentence_transformers import SentenceTransformer

import pickle

In [6]:
import faiss

In [7]:
# Load the pretrained sentence encoder by name.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Move to the GPU only when CUDA is present; otherwise keep whatever
# device the library selected on its own.
if torch.cuda.is_available():
    model = model.to("cuda")

print(model.device)


cpu


In [8]:
# Each entry is embedded as the text "<question>.<answer>".
# Vectorised column concatenation replaces the row-wise DataFrame.apply:
# it yields the same strings, avoids a Python call per row, and still
# produces a list when the frame has no rows.
question_answer = (df["question"] + "." + df["answer"]).to_list()
embeddings = model.encode(question_answer, show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Collect the encoder output into a single float32 array before indexing.
embeddings = np.asarray(embeddings, dtype="float32")


In [10]:
# L2-distance index sized to the width of one embedding vector.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

In [11]:
# Wrap the current index in an ID map so vectors can be stored under
# caller-supplied ids. NOTE: this rebinds `index`; re-running this cell
# on its own wraps the already-wrapped index again.
index = faiss.IndexIDMap(index)


In [12]:
# Store every embedding under the matching value of df["id"].
# NOTE(review): re-running this cell without rebuilding `index` presumably
# adds the same vectors a second time -- confirm before re-executing.
index.add_with_ids(embeddings, df["id"].to_numpy())


In [26]:
def vector_search(query, model, index, num_results=10):
    """Encode ``query`` with ``model`` and search ``index`` for the nearest vectors.

    Args:
        query (str or iterable of str): A single query string, or several
            query strings. A bare string is treated as ONE query; it is not
            split into characters.
        model: Encoder exposing ``encode(list_of_str)`` that returns one
            vector per input string (a ``SentenceTransformer`` in this
            notebook).
        index: Search index exposing ``search(vectors, k=...)`` (a FAISS
            index in this notebook).
        num_results (int): Number of results to request per query.

    Returns:
        tuple: ``(D, I)`` exactly as returned by ``index.search``. With the
        FAISS index built in this notebook, ``D`` holds the distances and
        ``I`` the stored ids, one row per query.
    """
    # list("some text") would yield individual characters, each of which
    # would then be encoded as its own query, so wrap a bare string instead.
    if isinstance(query, str):
        queries = [query]
    else:
        queries = list(query)

    # The index is searched with a float32 matrix, one row per query.
    vectors = np.asarray(model.encode(queries), dtype="float32")
    D, I = index.search(vectors, k=num_results)
    return D, I


def id2details(df, I, column=None):
    """Return the rows of ``df`` whose ``id`` appears in the first row of ``I``.

    Rows come back in the order the ids appear in ``I[0]`` and keep their
    original index labels. Ids with no matching row (for example a ``-1``
    placeholder) contribute nothing.

    Args:
        df (pandas.DataFrame): Frame with an ``id`` column.
        I: 2-D array-like of ids; only ``I[0]`` is used.
        column: Optional column label (or list of labels) to return instead
            of whole rows.

    Returns:
        pandas.DataFrame or pandas.Series: The matching rows (or just
        ``column`` of them). Empty, but with the same columns/name, when
        nothing matches or ``I[0]`` is empty.
    """
    selection = df if column is None else df[column]

    matches = [selection[df["id"] == idx] for idx in I[0]]
    # Drop misses up front: pd.concat raises on an empty list, so the
    # "nothing found" case is answered with an explicit empty slice.
    matches = [match for match in matches if len(match)]
    if not matches:
        return selection.iloc[0:0]
    return pd.concat(matches)

### Testing queries

In [27]:
# Sample query; the surrounding newlines are part of the string.
user_query = "\nwhat are student led initiatives?\n"

In [28]:
# Run the sample query as a one-element batch and keep the top 10 hits.
D, I = vector_search(
    [user_query],
    model=model,
    index=index,
    num_results=10,
)

In [29]:
# Show the full rows for the returned ids, in the order they were returned.
id2details(df, I, column=None)

Unnamed: 0,tag,question,answer,id
23,Student-Led Events/Initiatives,Organising Fun activities comes under Student ...,"Yes, you may organize fun events as you see fit",24
50,Student Advocate,Is applying for student advocate counted as pa...,We are only taking into consideration those wh...,51
94,Miscellaneous,Can we have some activity to get to know peopl...,"If you have an idea of how to go about this, ...",95
22,Student-Led Events/Initiatives,"Hello, if I'd like to create a Student Initiat...",You can have the volunteer call based on your ...,23
34,Study Teams,What are the Study Teams?,You might want to check https://sites.google.c...,35
80,Community Participation,On what basis will Slack community participati...,Your participation in Slack is not necessarily...,81
21,Student-Led Events/Initiatives,"Hello, will there be a channel created for stu...",We are already thinking about this but one of ...,22
30,Student-Led Events/Initiatives,"Regarding student-led initiatives, for Project...",Hey!! No need to have a live demo. You are wel...,31
32,Study Teams,Will students form groups by themselves?,"Through Study Teams, scholars will be able to ...",33
47,Student Advocate,Where may I read more about the Student Advoca...,"We will be launching Student Advocates Friday,...",48
