https://towardsdatascience.com/12-rag-pain-points-and-proposed-solutions-43709939a28c

https://www.mixedbread.ai/blog/mxbai-rerank-v1

https://medium.com/@vinitgela/decoding-raft-and-raft-datasetpack-by-llamaindex-5be9d912f992

https://arxiv.org/pdf/2403.10131.pdf

https://github.com/ShishirPatil/gorilla/blob/main/raft/raft.py

https://medium.com/@ud.chandra/instruction-fine-tuning-llama-2-with-pefts-qlora-method-d6a801ebb19

In [1]:
import pandas as pd
from typing import Literal, Any
import argparse
from openai import OpenAI

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
import json
import random
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader

#from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.llms import Ollama
from tqdm import tqdm


In [None]:
#!pip install pandas datasets langchain sentence_transformers tqdm openai langchain_experimental langchain_openai

In [2]:
llm = Ollama(model='mistral')
embeddings = SentenceTransformer("all-MiniLM-L6-v2")
llm

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Ollama(model='mistral')

In [3]:
data = pd.read_csv("reddit_dot_scores_quality.csv", index_col="Unnamed: 0")
data.shape

(10482, 11)

In [4]:
data = data.loc[data['title_query_falcon_dot'] > 0.8]
data.shape

(73, 11)

In [5]:
text = ''.join(data['falcon_summary'].values.tolist())
len(text)

22709

In [6]:
question_loader = DataFrameLoader(data, page_content_column="falcon_summary")
question_data = question_loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=400,
                                          chunk_overlap=20)
chunks = splitter.split_documents(question_data)

chunks = [chunk.page_content for chunk in chunks]

In [7]:
len(chunks)

87

In [8]:
chunk_size  = 512
num_distract_docs = 4
num_chunks = len(text) / chunk_size 
num_chunks

44.353515625

In [9]:
text_splitter = SemanticChunker(OpenAIEmbeddings(openai_api_key='''input key here'''), number_of_chunks=num_chunks)
chunks = text_splitter.create_documents([text])
chunks = [chunk.page_content for chunk in chunks]

In [10]:
def generate_instructions_gen(chunk: Any, x: int = 5) -> list[str]:
    """
    Generates `x` questions / use cases for `chunk`. Used when the input document is of general types 
    `pdf`, `json`, or `txt`.
    """
    
    response = llm.invoke(f"""You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), 
    generate {x} example questions a user could ask and would be answered using information from the chunk. 
    For example, if the given context was a Wikipedia paragraph about the United States, 
    an example question could be 'How many states are in the United States?'
    The questions should be able to be answered in a few words or less. Include only the questions in your response.
                        
    {str(chunk)}""")

    queries = response
    #queries = [strip_str(q) for q in queries]
    #queries = [q for q in queries if any(c.isalpha() for c in q)]
    queries = [q.split('.')[1].strip() for q in queries.split('\n')]

    return queries

def encode_question_gen(question: str, chunk: Any) -> list[str]:
    """
    Encode multiple prompt instructions into a single string for the general case.
    """
    
    #prompts = []
        
    prompt = """
        You are a helpful question answerer who can provide an answer given a question and relevant context.
        Question: {question}\nContext: {context}\n
        Answer this question using the information given in the context above. Here is things to pay attention to: 
        - First provide step-by-step reasoning on how to answer the question. 
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. 
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succint.
    """.format(question=question, context=str(chunk))
    return prompt

def generate_label(question: str, context: Any): #-> str | None:
    """
    Generates the label / answer to `question` using `context` and GPT-4.
    """
    question = encode_question_gen(question, context)
    response = llm(question)
    return response

def add_chunk_to_dataset(
    chunks: list[str], 
    chunk: str, 
    x: int = 5, 
    num_distract: int = 3, 
    p: float = 1.0
) -> None:
    """
    Given a chunk, create {Q, A, D} triplets and add them to the dataset.
    """
    global ds
    i = chunks.index(chunk)
    #print("Generating instructions...")
    qs = generate_instructions_gen(chunk, x)
    for q in qs:
        datapt = {
            "id": None,
            "type": None,
            "question": None,
            "context": None,
            "oracle_context": None,
            "cot_answer": None
        }

        datapt["id"] = f"seed_task_{0 if not ds else ds.num_rows}"
        datapt["type"] =  "general"
        datapt["question"] = q

        # add num_distract distractor docs
        docs = [chunk]
        indices = list(range(0, len(chunks)))
        indices.remove(i)
        for j in random.sample(indices, num_distract):
            docs.append(chunks[j])
        # decides whether to add oracle document
        oracle = random.uniform(0, 1) < p
        if not oracle:
            docs[0] = chunks[random.sample(indices, 1)[0]]
        random.shuffle(docs)

        d = {
            "title": [],
            "sentences": []
        }

        d["title"].append(["placeholder_title"]*(num_distract+1))
        d["sentences"].append(docs)
        datapt["context"] = d
        datapt["oracle_context"] = chunk

        # add answer to q
        datapt["cot_answer"] = generate_label(q, chunk) 

        # construct model instruction 
        context = ""
        for doc in docs:
            context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
        context += q
        datapt["instruction"] = context

        # add to dataset
        if not ds:
            # init ds
            datapt["id"] = [datapt["id"]]
            datapt["type"] = [datapt["type"]]
            datapt["question"] = [datapt["question"]]
            datapt["context"] = [datapt["context"]]
            datapt["oracle_context"] = [datapt["oracle_context"]]
            datapt["cot_answer"] = [datapt["cot_answer"]]
            datapt["instruction"] = [datapt["instruction"]]
            print(datapt)
            ds = Dataset.from_dict(datapt)
        else:
            ds = ds.add_item(datapt)

## example of synthetic dataset

In [12]:
print("chunk example")
print(chunks[0],"\n")

print("Generative questions")
q = generate_instructions_gen(chunks[0], 5)
print(q,"\n")

print("CoT answers")
cot = generate_label(q, chunks[0])
print(cot,"\n")

chunk example
You can definitely go into Lisbon and enjoy a meal, walk around for a couple hours before your boarding time . The metro gets you into Lisbon from the airport in like minutes . If you have lots of time, i would definitely try to go to Lisbon and get something to eat and walk around a bit, better than spending six hours on the airport .Otherwise your extension will be automatically cancelled when HR cancel your work permit . Youll need to leave the country immediately you can get a day extension to deal with this, but need to apply for that in person at any immigration office . If youre still on the books, you should be able to renew it as usual .The idea of an Osaka Expo and the size of its budget have become too expensive, and countries are now withdrawing from the competition . This is a good article about Mexico also making a hard decision .You can just bring a cat directly through immigration in Amsterdam without any issue, but you will need to fly to Oslo first unles

  warn_deprecated(


 To answer the first question, "How long does it take to get from Lisbon airport to the city center by metro?", according to the context provided, the metro gets you into Lisbon from the airport in just a few minutes.
##begin_quote## If you have lots of time, I would definitely try to go to Lisbon and get something to eat and walk around a bit, better than spending six hours on the airport. ##end_quote##

Therefore, the answer is: <ANSWER>: It takes only a few minutes by metro to reach Lisbon city center from the airport. 



In [13]:
ds = None
for chunk in tqdm(chunks):
    add_chunk_to_dataset(chunks, chunk, 5, num_distract_docs)


  0%|          | 0/45 [00:00<?, ?it/s]

{'id': ['seed_task_0'], 'type': ['general'], 'question': ['How long does it take to get from Lisbon airport to the city center via metro?'], 'context': [{'title': [['placeholder_title', 'placeholder_title', 'placeholder_title', 'placeholder_title', 'placeholder_title']], 'sentences': [['Its not the best or biggest part of my trip, but like good food, it adds to it . For me its about spending time with myself, zero expectations of meeting others or making friends, thats how I am in real life too . The thing I see on this sub is that I can always meet up with them later or never .My biggest gripe with solo travel is that you have full control when you want to be among people. And you can nope out whenever you want. Welcome to the club!, I had a friend who complained about everything, left star reviews on Google, and constantly complained about his feet getting tired. I swear to god, this type of person is the most horrible to travel with.', 'Big nightclubs. Good access to boat tours.', '

100%|██████████| 45/45 [21:45:51<00:00, 1741.15s/it]   


* 12am: 29%
* 7am: 64%
    * Took 7 hours to complete 35%
* 11:00am: 80%
* 3pm: 96%
* ~4pm completed 100%

In [27]:
ds #226 rows only

Dataset({
    features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction'],
    num_rows: 226
})

In [15]:
ds[0]

{'id': 'seed_task_0',
 'type': 'general',
 'question': 'How long does it take to get from Lisbon airport to the city center via metro?',
 'context': {'sentences': [['Its not the best or biggest part of my trip, but like good food, it adds to it . For me its about spending time with myself, zero expectations of meeting others or making friends, thats how I am in real life too . The thing I see on this sub is that I can always meet up with them later or never .My biggest gripe with solo travel is that you have full control when you want to be among people. And you can nope out whenever you want. Welcome to the club!, I had a friend who complained about everything, left star reviews on Google, and constantly complained about his feet getting tired. I swear to god, this type of person is the most horrible to travel with.',
    'Big nightclubs. Good access to boat tours.',
    'The homeowner lives out of the country most of the year, so I am the guest contact . When they have responded enou

In [29]:
ds.to_json('reddit_falcon_raft.json')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1585574

https://shashi-vish.medium.com/how-to-upload-dataset-on-huggingface-7b6ce68f1ea0

# DO NOT HIBERNATE - CLOSE LID 