In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from string import Template # For generating prompt template

import os
import gc # grabage collector
# We need to install the sentence-transformers package and use its embeddings to query the FAISS index.
# `cp` copies files, groups of files, or directories.
# The -r option tells cp to copy directories recursively, and the -f option forces the copy by replacing destination files that cannot be overwritten in place.

!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers

#installing faiss package for reading faiss wikipedia index
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# as per wikipedia faiss index https://www.kaggle.com/datasets/jjinho/wikipedia-2023-07-faiss-index
import faiss
from faiss import write_index, read_index


import ctypes
# Handle to the C library (libc.so.6); later cells call libc.malloc_trim(0) right
# after deleting large DataFrames and running the garbage collector.
libc = ctypes.CDLL("libc.so.6")

# Installing the langchain package; we will use langchain's recursive character text splitter.
!pip install langchain --no-index --find-links=file:///kaggle/input/llm-pkg/
from langchain.text_splitter import RecursiveCharacterTextSplitter



from tqdm.auto import tqdm

In [None]:
# Load the competition test set into df_final (later cells read its 'prompt' and 'A'..'E' columns).
# The commented-out line is the equivalent load of a local training file.
#df_train = pd.read_csv("./train.csv")
df_final = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
# Preview the first five rows.
df_final.head(5)

**READING WIKIPEDIA FILES TO FIND CONTEXT**

In [None]:
# PART 1 - Searching Wikipedia Titles

In [None]:
# Load the prebuilt Wikipedia FAISS index from disk. It is searched further down with
# the question embeddings to find candidate Wikipedia entries.
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Sentence-embedding model used to embed the prompts (the questions, "q") so that
# relevant Wikipedia documents can be searched for in the FAISS index.
from sentence_transformers import SentenceTransformer

SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
MAX_LENGTH = 384  # assigned to model.max_seq_length below
BATCH_SIZE = 32   # passed as batch_size= to model.encode
DEVICE = 0        # passed as device= to model.encode

# .half() converts the model to half precision, which reduces its memory footprint:
# half-precision numbers take half the memory of single-precision ones.
model = SentenceTransformer(SIM_MODEL, device='cuda').half()
model.max_seq_length = MAX_LENGTH

In [None]:
# Embed every question; normalize_embeddings=True asks for unit-length vectors.
prompt_embeddings = model.encode(
    df_final['prompt'].values,
    batch_size=BATCH_SIZE,
    device=DEVICE,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
# Move the tensor off the GPU and into NumPy. Because `model` was cast with .half(),
# the embeddings are float16, whereas FAISS works on contiguous float32 matrices, so
# cast explicitly instead of relying on the FAISS Python wrapper to convert.
prompt_embeddings = np.ascontiguousarray(
    prompt_embeddings.detach().cpu().numpy(), dtype=np.float32
)
# For each question, fetch the 3 nearest index entries: their scores and their
# positions in the index.
search_score, search_index = sentence_index.search(prompt_embeddings, 3)

In [None]:

# The FAISS index and the question embeddings are not needed past this point;
# drop both references and run the garbage collector to avoid memory pressure.
del sentence_index, prompt_embeddings
_ = gc.collect()

In [None]:
# PART 2 - Fetching relevant text of Wikipedia documents

In [None]:
# Load the Wikipedia lookup table, keeping only each article's 'id' and the name of
# the parquet 'file' that holds its text. Rows are selected from it below with .loc,
# using the positions returned by the FAISS search.
df_wiki = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet",
                     columns=['id', 'file'])

In [None]:
# For every question, pull its retrieved rows out of df_wiki and tag them with the
# question's position (prompt_id), giving one small frame per question.
wikipedia_file_data = [
    df_wiki.loc[hit_rows].assign(prompt_id=question_no)
    for question_no, (_score, hit_rows) in tqdm(
        enumerate(zip(search_score, search_index)), total=len(search_score)
    )
]

In [None]:
# Stack the per-question frames into one table, keep the three columns of interest,
# drop duplicate rows, and order by file then id with a fresh 0..n-1 index.
wikipedia_file_data = (
    pd.concat(wikipedia_file_data)
    .reset_index(drop=True)
    .loc[:, ['id', 'prompt_id', 'file']]
    .drop_duplicates()
    .sort_values(['file', 'id'])
    .reset_index(drop=True)
)

In [None]:
# Preview the deduplicated (id, prompt_id, file) table.
wikipedia_file_data.head(5)


In [None]:
# df_wiki is not referenced again in the cells below; drop it and collect garbage.
del df_wiki
_ = gc.collect()
# Ask the C library to trim its heap, presumably so freed memory goes back to the OS
# (see the malloc_trim man page). The return value is shown as the cell output.
libc.malloc_trim(0)

In [None]:
import os

WIKI_PATH = "/kaggle/input/wikipedia-20230701"
wiki_files = os.listdir(WIKI_PATH)  # NOTE(review): not referenced again in the cells shown

# Read every parquet file named in wikipedia_file_data once, keeping only the rows
# whose id was retrieved for some question.
files_to_read = wikipedia_file_data.file.unique()
wiki_text_data = []

for file in tqdm(files_to_read, total=len(files_to_read)):
    in_this_file = wikipedia_file_data['file'] == file
    # ids are compared as strings against the 'id' column of the text file
    _id = list(map(str, wikipedia_file_data.loc[in_this_file, 'id'].tolist()))

    _df = pd.read_parquet(f"{WIKI_PATH}/{file}", columns=['id', 'text'])
    _df_temp = _df[_df['id'].isin(_id)].copy()
    wiki_text_data.append(_df_temp)

    # Release the full file contents before moving on to the next file.
    del _df
    _ = gc.collect()
    libc.malloc_trim(0)

# One table of (id, text) with duplicate rows removed and a fresh index.
wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
_ = gc.collect()

In [None]:
# Attach the article text to every (id, prompt_id, file) row by joining on 'id',
# then report how many rows the join produced.
context_df = pd.merge(wikipedia_file_data, wiki_text_data, on='id')
print(context_df.shape[0])

In [None]:
# Preview the retrieved rows joined with their article text.
context_df.head()


In [None]:
# Split each article text in context_df into overlapping chunks and store the list
# of chunks for every row in a new 'split' column.

chunk_size = 1000
chunk_overlap = 100

r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

split_text = [r_splitter.split_text(article) for article in context_df['text']]
context_df['split'] = split_text


In [None]:
# Preview context_df, which now carries the 'split' column of text chunks.
context_df.head()

PREPARING THE PROMPT

In [None]:
# Prompt sent to the LLM. $q is the question, $a..$e are the five answer options and
# $t is the retrieved context chunk; all seven are filled in via Template.substitute.
# The instruction wording was corrected (spelling/grammar) so the model receives
# clean English; the placeholders and layout are unchanged.
text  = """
Given below is a question labelled as 'Q' : and 5 possible answers to the question labelled as 'A':,'B':,'C':,'D':,'E':.
Your task is to predict the top 3 most likely answers to the question.
Your output should consist of 3 letters from A,B,C,D,E. The first letter should indicate the most likely answer, followed by the 2nd most likely answer, followed by the 3rd most likely answer.
Use the context labelled as 'T' for any relevant information.

'Q' : $q

'A' : $a
'B' : $b
'C' : $c
'D' : $d
'E' : $e

'T' : $t


"""

template = Template(text)

In [None]:
def _pick_most_similar_chunk(question, chunks):
    """Return the entry of `chunks` that is closest to `question` in embedding space.

    Both sides are embedded with the module-level `model`, L2-normalised, and
    compared with a FAISS flat L2 index (on unit vectors the smallest L2 distance
    is the highest cosine similarity). Returns "" when `chunks` is empty.
    """
    if not chunks:
        return ""

    # FAISS works on contiguous float32 matrices; `model` was cast to half precision
    # earlier in the notebook, so cast the embeddings explicitly.
    chunk_vectors = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    faiss.normalize_L2(chunk_vectors)
    index = faiss.IndexFlatL2(chunk_vectors.shape[1])
    index.add(chunk_vectors)

    # Encoding a one-element list gives a (1, dim) matrix, the shape search() expects.
    query_vector = np.ascontiguousarray(model.encode([question]), dtype=np.float32)
    faiss.normalize_L2(query_vector)

    _, nearest = index.search(query_vector, 1)
    return chunks[nearest[0, 0]]


def format_dataframe(df, context_df):
    """Build one LLM prompt per question row.

    Args:
        df: DataFrame with the columns 'prompt', 'A', 'B', 'C', 'D' and 'E'.
        context_df: DataFrame with a 'prompt_id' column (the 0-based position of
            the question in `df`) and a 'split' column holding a list of text
            chunks per retrieved article.

    Returns:
        A list of strings, one filled-in `template` per row of `df`, in row order.

    For each question the chunks of *all* retrieved articles are pooled and the
    single most similar chunk is used as context. (Previously only the first
    article in file/id sort order was considered, discarding the other hits.)
    A question without any context gets an empty context string instead of
    raising.
    """
    # Group the chunks by question once instead of re-filtering context_df per row.
    chunks_by_prompt = {}
    for prompt_id, chunks in zip(context_df['prompt_id'], context_df['split']):
        chunks_by_prompt.setdefault(prompt_id, []).extend(chunks)

    final_prompt = []
    for i in range(len(df)):
        # prompt_id is positional, so address rows by position rather than by label.
        row = df.iloc[i]
        question = row['prompt']
        chunk = _pick_most_similar_chunk(question, chunks_by_prompt.get(i, []))

        final_prompt.append(
            template.substitute(
                q=question,
                a=row['A'],
                b=row['B'],
                c=row['C'],
                d=row['D'],
                e=row['E'],
                t=chunk,
            )
        )

    return final_prompt




In [None]:
# Build the list of filled-in prompts, one per row of df_final.
model_prompt = format_dataframe(df_final,context_df)

PROMPTING THE MODEL TO GET RESPONSE

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Local directory holding the FLAN-T5 checkpoint used to answer the questions.
llm = '/kaggle/input/flan-t5/pytorch/small/2'

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

tokenizer = T5Tokenizer.from_pretrained(llm)
model_llm = T5ForConditionalGeneration.from_pretrained(llm, local_files_only=True)
model_llm = model_llm.to(device)


In [None]:
# Smoke test: run only the first prompt through the LLM and print the decoded output.

inputs = tokenizer(model_prompt[0], return_tensors="pt")
inputs = inputs.to(device)
outputs = model_llm.generate(**inputs)
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(answer)

**SUBMISSION**

In [None]:
def _normalize_prediction(raw_answer, labels="ABCDE", top_k=3):
    """Turn free-form model output into `top_k` distinct, space-separated labels.

    Only stand-alone tokens made up entirely of label letters count (e.g. "B",
    "ABC", "A,"), so capital letters inside ordinary words are ignored. Labels
    keep the order in which the model produced them; if fewer than `top_k` were
    found, the remaining labels are appended in `labels` order so the result is
    always a valid guess list such as "B A C".
    """
    ranked = []
    for token in raw_answer.replace(",", " ").split():
        token = token.strip(".:;'\"()[]")
        if token and all(ch in labels for ch in token):
            for ch in token:
                if ch not in ranked:
                    ranked.append(ch)
    ranked += [label for label in labels if label not in ranked]
    return " ".join(ranked[:top_k])


submission = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/sample_submission.csv', index_col='id')

# Predictions are written by id label 0..n-1 (the position of each prompt). Fail
# early if one of those ids is absent, because .loc assignment would otherwise
# silently append new rows to the submission.
missing_ids = [i for i in range(len(model_prompt)) if i not in submission.index]
if missing_ids:
    raise ValueError(
        f"sample_submission.csv has no row for prompt ids {missing_ids[:5]} "
        f"({len(missing_ids)} missing in total)"
    )

# `prompt_text` rather than `text`, so the template source string is not overwritten.
for i, prompt_text in enumerate(model_prompt):
    inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
    outputs = model_llm.generate(**inputs)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The submission file needs space-separated labels, not raw generated text.
    submission.loc[i, 'prediction'] = _normalize_prediction(answer[0])

submission.to_csv('submission.csv')