https://towardsdatascience.com/12-rag-pain-points-and-proposed-solutions-43709939a28c

https://www.mixedbread.ai/blog/mxbai-rerank-v1

https://medium.com/@vinitgela/decoding-raft-and-raft-datasetpack-by-llamaindex-5be9d912f992

https://arxiv.org/pdf/2403.10131.pdf

https://github.com/ShishirPatil/gorilla/blob/main/raft/raft.py

https://medium.com/@ud.chandra/instruction-fine-tuning-llama-2-with-pefts-qlora-method-d6a801ebb19

In [40]:
import pandas as pd
from typing import Literal, Any
import argparse
from openai import OpenAI

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
import json
import random
from langchain_experimental.text_splitter import SemanticChunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader

#from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.llms import Ollama
from tqdm import tqdm


In [2]:
!pip install pandas datasets langchain sentence_transformers tqdm openai langchain_experimental langchain_openai

Collecting pandas
  Downloading pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting langchain
  Downloading langchain-0.1.13-py3-none-any.whl.metadata (13 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting mu

In [22]:
llm = Ollama(model='mistral')
embeddings = SentenceTransformer("all-MiniLM-L6-v2")
llm

Ollama(model='mistral')

In [15]:
data = pd.read_csv("new_reddit.csv",index_col="Unnamed: 0")
data.shape

(16289, 8)

In [34]:
text = ''.join(data['cleaned_comments'].values.tolist())
len(text)


53043321

In [45]:
question_loader = DataFrameLoader(data, page_content_column="cleaned_comments")
question_data = question_loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=400,
                                          chunk_overlap=20)
chunks = splitter.split_documents(question_data)

chunks = [chunk.page_content for chunk in chunks]

In [46]:
len(chunks)

147401

In [43]:
chunk_size  = 512
num_distract_docs = 4
num_chunks = len(text) / chunk_size 
num_chunks

103600.236328125

In [None]:
openai_key = '''input key here'''

In [41]:

text_splitter = SemanticChunker(OpenAIEmbeddings(openai_api_key=openai_key), number_of_chunks=num_chunks)
chunks = text_splitter.create_documents([text])
chunks = [chunk.page_content for chunk in chunks]

In [20]:

def generate_instructions_gen(chunk: Any, x: int = 5) -> list[str]:
    """
    Generates `x` questions / use cases for `chunk`. Used when the input document is of general types 
    `pdf`, `json`, or `txt`.
    """
    
    response = llm.invoke(f"""You are a synthetic question-answer pair generator. Given a chunk of context about some topic(s), 
    generate {x} example questions a user could ask and would be answered using information from the chunk. 
    For example, if the given context was a Wikipedia paragraph about the United States, 
    an example question could be 'How many states are in the United States?'
    The questions should be able to be answered in a few words or less. Include only the questions in your response.
                        
    {str(chunk)}""")

    queries = response
    #queries = [strip_str(q) for q in queries]
    #queries = [q for q in queries if any(c.isalpha() for c in q)]
    queries = [q.split('.')[1].strip() for q in queries.split('\n')]

    return queries 

def encode_question_gen(question: str, chunk: Any) -> list[str]:
    """
    Encode multiple prompt instructions into a single string for the general case.
    """
    
    #prompts = []
        
    prompt = """
        You are a helpful question answerer who can provide an answer given a question and relevant context.
        Question: {question}\nContext: {context}\n
        Answer this question using the information given in the context above. Here is things to pay attention to: 
        - First provide step-by-step reasoning on how to answer the question. 
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context. 
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succint.
    """.format(question=question, context=str(chunk))
    return prompt

def generate_label(question: str, context: Any) -> str | None:
    """
    Generates the label / answer to `question` using `context` and GPT-4.
    """
    question = encode_question_gen(question, context)
    response = llm(question)
    return response

def add_chunk_to_dataset(
    chunks: list[str], 
    chunk: str, 
    x: int = 5, 
    num_distract: int = 3, 
    p: float = 1.0
) -> None:
    """
    Given a chunk, create {Q, A, D} triplets and add them to the dataset.
    """
    global ds
    i = chunks.index(chunk)
    #print("Generating instructions...")
    qs = generate_instructions_gen(chunk, x)
    for q in qs:
        datapt = {
            "id": None,
            "type": None,
            "question": None,
            "context": None,
            "oracle_context": None,
            "cot_answer": None
        }

        datapt["id"] = f"seed_task_{0 if not ds else ds.num_rows}"
        datapt["type"] =  "general"
        datapt["question"] = q

        # add num_distract distractor docs
        docs = [chunk]
        indices = list(range(0, len(chunks)))
        indices.remove(i)
        for j in random.sample(indices, num_distract):
            docs.append(chunks[j])
        # decides whether to add oracle document
        oracle = random.uniform(0, 1) < p
        if not oracle:
            docs[0] = chunks[random.sample(indices, 1)[0]]
        random.shuffle(docs)

        d = {
            "title": [],
            "sentences": []
        }

        d["title"].append(["placeholder_title"]*(num_distract+1))
        d["sentences"].append(docs)
        datapt["context"] = d
        datapt["oracle_context"] = chunk

        # add answer to q
        datapt["cot_answer"] = generate_label(q, chunk) 

        # construct model instruction 
        context = ""
        for doc in docs:
            context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
        context += q
        datapt["instruction"] = context

        # add to dataset
        if not ds:
            # init ds
            datapt["id"] = [datapt["id"]]
            datapt["type"] = [datapt["type"]]
            datapt["question"] = [datapt["question"]]
            datapt["context"] = [datapt["context"]]
            datapt["oracle_context"] = [datapt["oracle_context"]]
            datapt["cot_answer"] = [datapt["cot_answer"]]
            datapt["instruction"] = [datapt["instruction"]]
            print(datapt)
            ds = Dataset.from_dict(datapt)
        else:
            ds = ds.add_item(datapt)

## example of synthetic dataset

In [23]:
print("chunk example")
print(chunks[0],"\n")

print("Generative questions")
q = generate_instructions_gen(chunks[0], 5)
print(q,"\n")

print("CoT answers")
cot = generate_label(q, chunks[0])
print(cot,"\n")

chunk example
I live in Vancouver as well and booking a flight to Colombia recently I noticed its much cheaper to get to MEX on a layover Annoying,Are you checking bags If so,dont do this Itll cause all sorts of problems for you,the airline,and the other passengers on the flight,Just go up to the counter for your next flight and tell them you have diarrhea and you wont be boarding Theyll be happy to use your 

Generative questions


ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f6d1c4735b0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [123]:
ds = None
for chunk in tqdm(chunks):
    add_chunk_to_dataset(chunks, chunk, 5, num_distract_docs)

  0%|          | 0/67 [00:00<?, ?it/s]

{'id': ['seed_task_0'], 'type': ['general'], 'question': ['What should I say at the check-in counter if I want to skip my connecting flight in Mexico?'], 'context': [{'title': [['placeholder_title', 'placeholder_title', 'placeholder_title', 'placeholder_title', 'placeholder_title']], 'sentences': [['bags must be reported,Silent please,Normal I like listening to the announcements,it’s part of the airport experience for me,There are so many announcements in Melbourne internatjonal and I think on balance I am not a fan and would just rather screens,I recently experienced this in Dubai and liked it Just keep an eye on the screens,phones are also sending you push notifications nowadays when', 'that everything was there,including all her money We asked how this was possible amongst such poverty that we saw This could be triple someones annual salary Our guide said our people are not jealous of you It is an honor to help you enjoy your stay All true When someone asks me about what was the bes

  6%|▌         | 4/67 [03:33<55:56, 53.27s/it]  


KeyboardInterrupt: 

In [124]:
ds

Dataset({
    features: ['id', 'type', 'question', 'context', 'oracle_context', 'cot_answer', 'instruction'],
    num_rows: 22
})

In [125]:
ds[0]

{'id': 'seed_task_0',
 'type': 'general',
 'question': 'What should I say at the check-in counter if I want to skip my connecting flight in Mexico?',
 'context': {'sentences': [['bags must be reported,Silent please,Normal I like listening to the announcements,it’s part of the airport experience for me,There are so many announcements in Melbourne internatjonal and I think on balance I am not a fan and would just rather screens,I recently experienced this in Dubai and liked it Just keep an eye on the screens,phones are also sending you push notifications nowadays when',
    'that everything was there,including all her money We asked how this was possible amongst such poverty that we saw This could be triple someones annual salary Our guide said our people are not jealous of you It is an honor to help you enjoy your stay All true When someone asks me about what was the best part of my trip,I always have to include this Botswana proud,Everyone says to skip Naples,or to',
    'I live in Van

In [126]:
ds[6]

{'id': 'seed_task_6',
 'type': 'general',
 'question': 'Can an experienced skiplagger still fly one-way without being banned?',
 'context': {'sentences': [['happy to use your seat for standbyers,its only a problem if they dont know if youre going to show up or not Signed,an experienced skiplagger,Wouldn’t be banned,but the practice for many airlines now is if you miss one and don’t rebook nearly immediately,they will cancel the rest of your return flights,so best to do with one way bookings,I did it exactly once International flight,originating in',
    'Waiters in Paris weren’t rude at all People were friendly and helpful I felt happy trying out my French and we all enjoyed laughing about my struggles 😂,I picked Marseille as my base in southern France out of convenience to get to other towns for day trips but after reading opinions on Reddit I was expecting a dodgy ass,crime ridden town Found a beautiful city with a beautiful port and beautiful',
    'its like an open museum with tons