In [29]:
import pandas as pd

# Load the retrieval results spreadsheet (path is relative to the kernel's working directory).
df = pd.read_excel('RAG/results/retrieval_file_7.xlsx')

# Keep the `questions` column under its own name.
questions = df.questions

In [64]:
# Import pysqlite3 and re-register it in `sys.modules` under the name `sqlite3`,
# so any later `import sqlite3` in this kernel resolves to pysqlite3.
# Statement order matters: the swap has to happen before `import chromadb`.
# NOTE(review): presumably needed because the system SQLite is older than Chroma accepts — confirm.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
import chromadb
# No path is passed, so the client falls back to chromadb's default persistence location.
client = chromadb.PersistentClient()

In [70]:
# Fetch the existing collection by name and display its metadata.
collection = client.get_collection('collection_16')
collection.metadata

{'hnsw:space': 'ip'}

In [76]:
# Name of the embedding model used to encode the query.
# NOTE(review): presumably the same model that was used to index `collection_16` — confirm.
model_name = 'BAAI/bge-base-en-v1.5'

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer(model_name)

question = 'What is the median time until death in MERS-COV?'

# Encode the question and query the collection with that single embedding.
# Only document texts are requested (`include=['documents']`); the trailing `[0]`
# picks the hit list belonging to the one query embedding that was sent.
# `n_results` is not passed, so the number of hits is whatever `collection.query` defaults to.
query_embedding = model.encode(question)
retrieved_docs = collection.query(query_embeddings=[query_embedding.tolist()], include=['documents'])['documents'][0]

In [77]:
retrieved_docs

['recovered in culture beyond four hours under any conditions  . Aerosol experiments found MERS-CoV viability only decreased 7 % at low RH at 20 °C. In comparison, influenza A virus decreased by 95 %  . MERS-CoV survival is inferior to that previously demonstrated for SARS-CoV  . For context,',
 'of MERS-CoV has exceeded 830, with at least 288 associated deaths.62 The majority of cases have involved patients with comorbidities   and are predominately males   with a median age of 47.63,64 Fewer than 25% of patients have reported contact with animals including dromedary camels, which have',
 'of Saudi Arabia. We found that MERS-CoV infection was found significantly in people aged between 41 and 60 years and was reported most commonly during the summer season. The odds of infection among males were found to be twice as high as that of females with suspected cases. During the study',
 'required intensive care and 3 died. Of those who died, 2 were exposed to MERS-CoV in the 3rd trimester, a

In [79]:
from sentence_transformers import CrossEncoder

# Rerank step: score every (question, document) pair with a cross-encoder.
# This rebinds the global `model`, which previously held the SentenceTransformer.
model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
scores = model.predict([(question, doc) for doc in retrieved_docs])
scores

array([-0.44695026,  6.8437347 , -0.97145253, -2.6003458 ,  1.294803  ,
       -3.7943788 , -2.285297  ,  0.26899815,  4.5836077 , -7.642929  ],
      dtype=float32)

In [55]:
# Show the retrieved document with the highest cross-encoder score.
# NOTE(review): this cell's execution count is lower than the reranking cell's, so the stored output may be stale — re-run to confirm.
retrieved_docs[scores.argmax()]

'also no data on the safety in humans, reducing enthusiasm for baculovirus as a vaccine vector for influenza at this time. Newcastle disease virus   is a single-stranded, negative-sense RNA virus that causes disease in poultry. NDV has a number of appealing qualities as a vaccine vector. As an avian virus, there is little or no preexisting immunity to NDV in humans and NDV propagates to high titers in both chicken eggs and cell culture. As a paramyxovirus, there is no DNA phase in the virus'

In [80]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


from functools import lru_cache


@lru_cache(maxsize=None)
def _load_flan_t5(model_name):
    """Load the tokenizer and model for ``model_name`` once and reuse them on later calls."""
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return tokenizer, model


def generate(q, doc)->str:
    """Answer question ``q`` from the context ``doc`` with ``google/flan-t5-base``.

    Args:
        q: The question text.
        doc: The context passage the answer should be based on.

    Returns:
        The decoded model output with special tokens stripped.
    """
    # Same prompt text as before, with the line breaks and indentation that the
    # original triple-quoted literal contained spelled out explicitly.
    input_text = (
        f"answer the question based on this context: {doc} \n"
        f"    question: {q}\n"
        "    answer: "
    )

    # The tokenizer and weights do not depend on the inputs, so they are loaded
    # once and cached instead of being re-read on every call.
    tokenizer, model = _load_flan_t5("google/flan-t5-base")
    encoded = tokenizer(input_text, return_tensors="pt")

    generated = model.generate(**encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [81]:
# Generate an answer using the top-scoring reranked document as the context.
generate(q=question, doc=retrieved_docs[scores.argmax()])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'47.63'

In [22]:
from langchain_community.embeddings import FakeEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma

def get_vectorstore(chroma_collection_name = 'collection_1',
                    distance_fn = 'cosine',
                    embedding_model = 'sentence-transformers/msmarco-MiniLM-L-6-v3'):
    """Build a LangChain ``Chroma`` wrapper around a persistent Chroma collection.

    Args:
        chroma_collection_name: Name of the Chroma collection to wrap.
        distance_fn: Value passed as the collection's ``hnsw:space`` metadata.
            NOTE(review): presumably only takes effect when the collection is
            first created — confirm against the Chroma wrapper.
        embedding_model: Hugging Face model name used to embed queries.

    Returns:
        The configured ``Chroma`` vector store. Its embedding model name,
        collection name and collection metadata are printed as a sanity check.
    """
    # connect to Chroma client
    client = chromadb.PersistentClient()

    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}
    # Use the `embedding_model` argument here; the previous code read the
    # unrelated global `model_name`, so the parameter was silently ignored.
    emb_model = HuggingFaceEmbeddings(model_name=embedding_model,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs)

    # Langchain Chroma wrapper
    langchain_chroma = Chroma(client=client,
                              collection_name=chroma_collection_name,
                              embedding_function=emb_model,
                              collection_metadata={"hnsw:space": distance_fn})
    print(f"{langchain_chroma._embedding_function.model_name=}")
    print(f"{langchain_chroma._collection.name=}")
    print(f"{langchain_chroma._collection.metadata=}")
    return langchain_chroma

# Build the vector store with every argument left at its default.
vector_store = get_vectorstore()

langchain_chroma._embedding_function.model_name='sentence-transformers/msmarco-MiniLM-L-6-v3'
langchain_chroma._collection.name='collection_1'
langchain_chroma._collection.metadata={'hnsw:space': 'cosine'}


In [25]:
# Retrieve the 4 nearest chunks for `text`, with no metadata filter.
# NOTE(review): `text` is not assigned in any visible cell, so it must come from earlier kernel state — confirm.
vector_store.similarity_search(query=text, k=4, filter=None)

[Document(page_content='political will in low- and middle-income countries to commit to immunization as a priority, social marketing to individuals and communities, strengthening health systems and promoting relevant local research and development innovations  . Maternal vaccination to prevent disease in the youngest infants has been shown to be effective for tetanus, influenza and pertussis  . Influenza vaccination during pregnancy is safe, provides reasonable maternal protection against influenza, and also protects infants for a limited period from confirmed influenza infection  . However as antibody levels drop sharply after birth, infant protection does not persist much beyond 8 weeks  . Recently respiratory syncytial virus vaccination in pregnancy has been shown to be safe and immunogenic, and a phase-3', metadata={'source': 'data/covid/PMC5608782.xml.json.txt'}),
 Document(page_content='Vaccination remains most economical and effective means against respiratory diseases caused by

In [11]:
import pandas as pd

# Load the evaluation metrics table.
# NOTE(review): the displayed frame has 'Unnamed: 0.1' and 'Unnamed: 0' columns, which look like
# index columns that were written into the CSV — confirm, and drop them at write time if unintended.
df = pd.read_csv('metrics.csv', index_col=None)

In [12]:
df

Unnamed: 0.1,Unnamed: 0,bleu_bleu,bleu_precisions,bleu_brevity_penalty,bleu_length_ratio,bleu_translation_length,bleu_reference_length,rouge_rouge1,rouge_rouge2,rouge_rougeL,rouge_rougeLsum,meteor_meteor,sts
0,0,0.000000,"[0.16666666666666666, 0.0, 0.0, 0.0]",5.881647e-02,0.260870,6,23,0.066667,0.000000,0.066667,0.066667,0.046948,0.258555
1,1,0.074274,"[1.0, 1.0, 1.0, 1.0]",7.427358e-02,0.277778,5,18,0.434783,0.380952,0.434783,0.434783,0.298204,0.728408
2,2,0.000000,"[1.0, 0.0, 0.0, 0.0]",1.000000e+00,1.000000,1,1,1.000000,0.000000,1.000000,1.000000,0.500000,0.922458
3,3,0.000000,"[0.25, 0.0, 0.0, 0.0]",6.065307e-01,0.666667,4,6,0.181818,0.000000,0.181818,0.181818,0.086207,0.312877
4,4,0.000000,"[1.0, 1.0, 0.0, 0.0]",5.043477e-07,0.064516,2,31,0.160000,0.086957,0.160000,0.160000,0.071293,0.414881
...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,282,0.000000,"[1.0, 0.0, 0.0, 0.0]",1.000000e+00,1.000000,1,1,1.000000,0.000000,1.000000,1.000000,0.500000,0.974265
283,283,0.000000,"[0.0, 0.0, 0.0, 0.0]",2.635971e-01,0.428571,6,14,0.000000,0.000000,0.000000,0.000000,0.000000,0.209577
284,284,0.000000,"[0.0, 0.0, 0.0, 0.0]",6.144212e-06,0.076923,1,13,0.000000,0.000000,0.000000,0.000000,0.000000,0.125022
285,285,0.000000,"[0.0, 0.0, 0.0, 0.0]",1.000000e+00,4.000000,4,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.004810


In [85]:
from transformers import AutoModelForQuestionAnswering

# Load the flan-t5 checkpoint through the question-answering auto class and show its name.
# The load warning printed for this cell reports `qa_outputs.bias` / `qa_outputs.weight` as newly
# initialized, i.e. the QA head is untrained and needs fine-tuning before it can be used for predictions.
model = AutoModelForQuestionAnswering.from_pretrained('google/flan-t5-base')
model.name_or_path

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'google/flan-t5-base'

In [11]:
import uuid

def get_new_row(aa):
    """Create a fresh record: a random UUID4 string under ``'dd'`` and ``aa`` under ``'aa'``."""
    row_id = str(uuid.uuid4())
    return {'dd': row_id, 'aa': aa}

In [36]:
# JSON Lines file that insert() appends to and get() reads from (relative to the working directory).
path = 'texxxxst.jsonl'

In [13]:
import json

def insert(row):
    """Append ``row`` as one JSON-encoded line to the file named by the global ``path``."""
    with open(path, 'a') as sink:
        serialized = json.dumps(row)
        sink.write(f"{serialized}\n")

In [37]:
# Append one freshly generated row (aa=99) to the file.
insert(get_new_row(99))

In [19]:
def get():
    """Read every record from the JSON Lines file named by the global ``path``.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.
    """
    with open(path, mode='r') as f:
        # Blank lines (for example a trailing empty line) are skipped rather
        # than passed to json.loads, which would raise on an empty string.
        data = [json.loads(line) for line in f if line.strip()]
    # The previous version built `data` but never returned it, so callers got None.
    return data


In [20]:
# Bind the result of get() as `db`, the in-memory copy that update() scans.
db = get()

In [23]:
# Add one row to the in-memory list only; this statement does not write to the file at `path`.
db.append({
    'dd':'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 99
})

In [24]:
def update(index, key, value):
    """Append an updated version of the row whose ``'dd'`` equals ``index``.

    The matching entry in the global ``db`` list is copied, ``key`` is set to
    ``value`` on the copy, and the copy is written via ``insert``. Neither
    ``db`` nor the matched entry is modified.

    Args:
        index: The ``'dd'`` identifier of the row to update.
        key: Field to set on the new version of the row.
        value: Value to store under ``key``.

    Raises:
        KeyError: If no entry in ``db`` has ``'dd' == index``. Previously this
            case failed with an UnboundLocalError on ``row``.
    """
    # Scan from the end so the last matching entry in `db` is used, which is
    # the entry the original forward loop ended up with.
    match = next((e for e in reversed(db) if e['dd'] == index), None)
    if match is None:
        raise KeyError(f"no row with dd={index!r}")

    row = match.copy()
    row[key] = value
    insert(row)

In [25]:
db

[{'dd': 'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 23},
 {'dd': 'eb3140d5-afbd-4178-bb5b-3fbb51f3c32b', 'aa': 99},
 {'dd': 'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 99}]

In [30]:
# Append a new version of the row with this id, with 'aa' set to 'xxxx'.
update('b66bb06a-07de-4f9b-94bd-2d46b02e23cb','aa','xxxx')

In [31]:
# Rebind `db` to the result of get() so it reflects what is now in the file.
db = get()

In [32]:
db

[{'dd': 'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 23},
 {'dd': 'eb3140d5-afbd-4178-bb5b-3fbb51f3c32b', 'aa': 99},
 {'dd': 'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 876},
 {'dd': 'b66bb06a-07de-4f9b-94bd-2d46b02e23cb', 'aa': 'xxxx'}]

In [33]:
x = {}
def test(**kwa):
    """Copy every keyword argument into the module-level dict ``x``, overwriting existing keys."""
    x.update(kwa)

In [34]:
# Store a=1 and b=2 in the global dict `x`.
test(a=1, b=2)

In [35]:
x

{'a': 1, 'b': 2}