<a href="https://www.kaggle.com/code/rahulsmahajan/starter-bert-wiki-context-0-528?scriptVersionId=143263051" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from string import Template # For generating prompt template

import os
import gc # garbage collector
# Install sentence-transformers from the attached dataset; its embeddings are used to query the faiss index.
# `cp` copies files or directories. -r copies directories recursively; -f forces the copy by
# removing a destination file that cannot be opened and trying again.
# NOTE(review): the package is copied to /kaggle/working before installing, presumably because pip needs a writable source tree — confirm.

!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers

# Install the faiss package from a local wheel; it is needed to read the Wikipedia faiss index
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# Index source: https://www.kaggle.com/datasets/jjinho/wikipedia-2023-07-faiss-index
import faiss
from faiss import write_index, read_index


import ctypes
# Handle to the C library, used below to call malloc_trim(0) after each gc.collect()
libc = ctypes.CDLL("libc.so.6")

# Install langchain from local package files only (--no-index); its recursive text splitter is used below
!pip install langchain --no-index --find-links=file:///kaggle/input/llm-pkg/
from langchain.text_splitter import RecursiveCharacterTextSplitter



from tqdm.auto import tqdm

In [None]:
import pandas as pd
# Read the training questions and the competition test questions from CSV
#df_train = pd.read_csv("./train.csv")
df_train = pd.read_csv("/kaggle/input/additional-train-data-for-llm-science-exam/6000_train_examples.csv")
df_test = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
#df_extra = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv')
#df_train.head(5)

In [None]:
# Expose the original row position as an explicit 'id' column and keep only the
# first 2000 rows. The explicit .copy() makes df_train an independent frame, so
# the later in-place prompt rewrite in format_dataframe does not write into a
# slice of the full table (which triggers pandas' SettingWithCopyWarning).
df_train = (
    df_train.reset_index()
    .rename(columns={'index': 'id'})
    .iloc[:2000]
    .copy()
)


## READING WIKIPEDIA FILES TO FIND CONTEXT

In [None]:
# PART 1 - Searching Wikipedia Titles

In [None]:
# Load the Wikipedia faiss index; it is searched below with the prompt embeddings
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Set up the sentence encoder that embeds the prompts (queries) used to search for relevant Wikipedia documents
from sentence_transformers import SentenceTransformer
SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'
# DEVICE is passed as the `device` argument of model.encode below
DEVICE = 0
# MAX_LENGTH is assigned to model.max_seq_length; BATCH_SIZE is the encode batch size
MAX_LENGTH = 384
BATCH_SIZE = 32

model = SentenceTransformer(SIM_MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half() # convert the model to half precision, which reduces its memory footprint compared with single precision

In [None]:
# Embed the training prompts with normalised embeddings, bring the tensor back
# to host memory as a NumPy array, then query the index for 5 results per prompt.
prompt_embeddings_train = (
    model.encode(
        df_train['prompt'].values,
        batch_size=BATCH_SIZE,
        device=DEVICE,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .detach()  # drop any autograd history before converting
    .cpu()
    .numpy()
)
search_score_train, search_index_train = sentence_index.search(prompt_embeddings_train, 5)

In [None]:
# The training embeddings are not used again after the search above, so release them
del prompt_embeddings_train
_ = gc.collect() # garbage collector: frees up memory held by unreachable objects
# Ask the C allocator to release free heap memory
libc.malloc_trim(0)

In [None]:
# Same retrieval as for the training prompts, applied to the test prompts:
# embed, convert to a NumPy array on the host, and query 5 results per prompt.
prompt_embeddings_test = (
    model.encode(
        df_test['prompt'].values,
        batch_size=BATCH_SIZE,
        device=DEVICE,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .detach()  # drop any autograd history before converting
    .cpu()
    .numpy()
)
search_score_test, search_index_test = sentence_index.search(prompt_embeddings_test, 5)

In [None]:

# Neither the faiss index nor the test embeddings are needed from here on;
# dropping them avoids memory issues in the following steps.
del sentence_index, prompt_embeddings_test

_ = gc.collect()  # garbage collector: frees up memory
libc.malloc_trim(0)

In [None]:
# PART 2 - Fetching the relevant text of the Wikipedia documents

In [None]:
import os
# getting wikipedia documents 
def wiki_context(search_score, search_index, wiki_path="/kaggle/input/wikipedia-20230701"):
    """Fetch the Wikipedia article text for the index hits of every prompt.

    Args:
        search_score: Per-prompt scores from the index search. Only its
            length is used (progress-bar total, and pairing with
            ``search_index``); the score values themselves are not read.
        search_index: One sequence of row labels per prompt, looked up with
            ``.loc`` in the ``wiki_2023_index.parquet`` table.
        wiki_path: Directory containing ``wiki_2023_index.parquet`` and the
            article parquet files named in its ``file`` column.

    Returns:
        DataFrame with columns ``id``, ``prompt_id``, ``file`` and ``text``.
        ``prompt_id`` is the position of the prompt within ``search_index``.
        An empty frame with these columns is returned when there are no
        prompts.
    """
    df_wiki = pd.read_parquet(f"{wiki_path}/wiki_2023_index.parquet",
                              columns=['id', 'file'])

    # One small frame per prompt: the index rows of its hits, tagged with the
    # prompt's position.
    hits = []
    for prompt_pos, (_, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):
        _df = df_wiki.loc[idx].copy()
        _df['prompt_id'] = prompt_pos
        hits.append(_df)

    # The per-prompt frames are copies, so the index table can be released
    # before the article files are loaded.
    del df_wiki
    _ = gc.collect()
    libc.malloc_trim(0)

    if not hits:
        return pd.DataFrame(columns=['id', 'prompt_id', 'file', 'text'])

    wikipedia_file_data = (
        pd.concat(hits)
        .reset_index(drop=True)[['id', 'prompt_id', 'file']]
        .drop_duplicates()
        .sort_values(['file', 'id'])
        .reset_index(drop=True)
    )

    # Collect the wanted article ids per file once, instead of re-filtering
    # the whole table for every file. sort=False keeps first-appearance order,
    # which is already sorted by file.
    ids_by_file = wikipedia_file_data.groupby('file', sort=False)['id'].agg(list)

    wiki_text_data = []
    for file, ids in tqdm(ids_by_file.items(), total=len(ids_by_file)):
        # Ids are compared as strings against the article file's 'id' column.
        wanted = [str(i) for i in ids]
        _df = pd.read_parquet(f"{wiki_path}/{file}", columns=['id', 'text'])
        wiki_text_data.append(_df[_df['id'].isin(wanted)].copy())

        # Drop the full file before loading the next one to limit peak memory.
        del _df
        _ = gc.collect()
        libc.malloc_trim(0)

    wiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)
    return wikipedia_file_data.merge(wiki_text_data, on='id')

In [None]:
# del df_wiki
# _ = gc.collect()
# libc.malloc_trim(0)

In [None]:
# Article text for the search hits of every training prompt
context_df_train = wiki_context(search_score_train,search_index_train)


In [None]:
# Article text for the search hits of every test prompt
context_df_test = wiki_context(search_score_test,search_index_test)

In [None]:
# Split the Wikipedia text of every context row into overlapping chunks

chunk_size = 1200
chunk_overlap = 200

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Iterate over the column values directly: this avoids label-based .loc[i]
# lookups, which are only correct while the frame has a 0..n-1 index.
split_text = [r_splitter.split_text(text) for text in context_df_train['text']]
context_df_train['split'] = split_text


In [None]:
def format_dataframe(df, context_df):
    """Prefix every prompt in ``df`` with its most relevant Wikipedia chunk.

    For each row, the text chunks of all context rows belonging to that
    prompt are embedded, the single chunk nearest to the prompt embedding is
    selected, and the prompt is rewritten as ``"<chunk> ### <prompt>"``. The
    chunk part is empty when no context exists for the prompt.

    Args:
        df: Frame with ``id`` and ``prompt`` columns. It is modified in place.
        context_df: Frame with ``prompt_id`` and ``split`` columns, where each
            ``split`` value is a list of text chunks. Context rows are matched
            to prompts by ``prompt_id == id``.

    Returns:
        The same ``df`` object, with its ``prompt`` column rewritten.
    """
    # A fresh encoder is loaded on every call rather than relying on the
    # notebook-global `model`, which is rebound to the BERT classifier later.
    model = SentenceTransformer(SIM_MODEL, device='cuda')
    model.max_seq_length = MAX_LENGTH
    model = model.half()  # half precision to reduce the memory footprint

    # Gather the chunks of each prompt once, instead of filtering the whole
    # context frame again for every row of df. Row order is preserved.
    chunks_by_prompt = {}
    for prompt_id, chunks in zip(context_df['prompt_id'], context_df['split']):
        chunks_by_prompt.setdefault(prompt_id, []).extend(chunks)

    new_prompts = []
    for q, idx in zip(df['prompt'], df['id']):
        chunk = ''
        text = chunks_by_prompt.get(idx, [])
        if text:
            # faiss works on contiguous float32 arrays; the half-precision
            # encoder may hand back float16, so convert explicitly.
            vectors = np.ascontiguousarray(model.encode(text), dtype=np.float32)
            index = faiss.IndexFlatL2(vectors.shape[1])
            faiss.normalize_L2(vectors)
            index.add(vectors)

            # Encoding a one-element list yields the (1, dim) query matrix.
            _vector = np.ascontiguousarray(model.encode([q]), dtype=np.float32)
            faiss.normalize_L2(_vector)

            # Only the single nearest chunk is used.
            _, ann = index.search(_vector, k=1)
            chunk = text[ann[0, 0]]

        new_prompts.append(str(chunk) + ' ### ' + q)

    # Write by column name rather than by a hard-coded column position, so the
    # result does not depend on 'prompt' being the second column.
    df['prompt'] = new_prompts
    return df

In [None]:
# Rewrite the training prompts as "<chunk> ### <prompt>"; format_dataframe returns the frame it was given
model_train_df = format_dataframe(df_train,context_df_train)

## TRAINING THE MODEL

In [None]:
#model_train[["prompt", "context", "A", "B", "C", "D", "E","answer"]].to_csv("./train_context.csv", index=False)
#model_train_df = pd.read_csv("train_context.csv")
#model_train_df.index = list(range(len(model_train_df)))
#model_train_df['id'] = list(range(len(model_train_df)))
#model_train_df['context'] = model_train_df['context'].apply(lambda x: str(x))
#model_train_df["prompt"] = model_train_df["context"] + " #### " +  model_train_df["prompt"]
#model_train_df['answer'] = 'B'

In [None]:
from transformers import AutoTokenizer,AutoModelForMultipleChoice,Trainer,TrainingArguments
# For convenience we'll turn our pandas Dataframe into a Dataset
from datasets import Dataset
from transformers import AutoTokenizer

# The path of the model checkpoint we want to use
model_dir = '/kaggle/input/huggingface-bert/bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
#tokenizer(truncation=True,max_length=512,text_target='None')
# NOTE: this rebinds the global `model` (previously the SentenceTransformer encoder) to the multiple-choice classifier
model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
# NOTE(review): eval mode is set here although the model is trained via trainer.train() below;
# presumably the Trainer switches the model back to training mode — confirm.
model.eval()


In [None]:
# Hold out the tail of the data for evaluation: the first 65% of the rows are
# used for training and the remaining rows for evaluation.
total_examples = len(model_train_df)
split_size = int(0.65 * total_examples)
train_ds = Dataset.from_pandas(model_train_df.iloc[:split_size])
eval_ds = Dataset.from_pandas(model_train_df.iloc[split_size:])

In [None]:
# Lookup tables between the option letters (A, B, C, D, E) and their integer
# positions, in both directions.
options = 'ABCDE'
indices = list(range(5))

option_to_index = dict(zip(options, indices))
index_to_option = dict(zip(indices, options))

def preprocess(example):
    """Tokenize one example as five (prompt, option) pairs and add its label.

    Args:
        example: Mapping with a ``prompt`` entry, one entry per option letter
            in ``options``, and an ``answer`` entry holding the correct letter.

    Returns:
        The tokenizer output for the five pairs, with an extra ``label`` entry
        set to the integer index of the correct option.
    """
    # AutoModelForMultipleChoice expects question/answer pairs, so the prompt
    # is repeated once for each of the five options.
    prompts = [example['prompt']] * 5
    answers = [str(example[letter]) for letter in options]

    # Convert the text pairs into token ids, truncated to 256 tokens per pair.
    encoded = tokenizer(prompts, answers, padding=True, truncation=True, max_length=256)
    encoded['label'] = option_to_index[example['answer']]
    return encoded


In [None]:
# Tokenize one example at a time (batched=False) and drop the listed raw columns once they have been encoded
tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
tokenized_eval_ds = eval_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

In [None]:
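# The following data collator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# pads the tokenized question/option pairs at batch time. Note that with the defaults defined below
# (padding='max_length', max_length=512) every sequence is padded to 512 tokens rather than to the
# longest sequence in the batch; pass padding=True to get per-batch dynamic padding.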
from datasets import Dataset
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice features into padded per-choice tensors.

    Each feature is a mapping whose values (other than the label) are lists
    with one entry per answer choice. The entries are flattened to one
    sequence per (example, choice), padded with ``tokenizer.pad`` and reshaped
    to ``(batch_size, num_choices, -1)``. The labels are returned under the
    ``labels`` key as an int64 tensor.
    """

    # Tokenizer whose ``pad`` method performs the padding
    tokenizer: PreTrainedTokenizerBase
    # Padding strategy forwarded to ``tokenizer.pad``
    padding: Union[bool, str, PaddingStrategy] = 'max_length'
    # Target length forwarded to ``tokenizer.pad``
    max_length: Optional[int] = 512
    # Forwarded to ``tokenizer.pad`` unchanged
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Build one padded batch from a list of tokenized examples.

        Args:
            features: Non-empty list of mappings, each holding per-choice
                lists (for example ``input_ids``) plus a ``label`` or
                ``labels`` entry.

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, -1)`` plus a
            ``labels`` tensor with one entry per example.
        """
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        # Read the labels without popping them, so the caller's feature dicts
        # stay intact and can be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # One dict per (example, choice), in example-major order. Built in a
        # single pass instead of concatenating lists with sum(..., []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened above.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


In [None]:
# The arguments here are selected to run quickly; feel free to play with them.
import shutil
#shutil.rmtree('finetuned_bert')
# model_dir is rebound here: it now names the Trainer output directory, not the pretrained checkpoint
model_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=model_dir,
    # run evaluation at the end of every epoch
    evaluation_strategy="epoch",
#    save_strategy="epoch",
#    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    # disable reporting to external experiment trackers
    report_to='none'
)

In [None]:
# The Trainer fits on tokenized_train_ds and evaluates on the separate held-out split
# tokenized_eval_ds. The data collator is created with its default settings.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [None]:

# Free as much memory as possible before training starts
_ = gc.collect() # garbage collector: frees up memory
libc.malloc_trim(0)

In [None]:
# Fine-tune the model with the settings in training_args.
# NOTE(review): the original comment estimated "about a minute"; that is unverified for this data size and epoch count — confirm.
trainer.train()

In [None]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
import numpy as np
def predictions_to_map_output(predictions):
    """Convert per-option scores into space-separated top-3 option letters.

    Args:
        predictions: 2-D array of scores with one row per question and one
            column per answer option.

    Returns:
        1-D array with one string per row, holding the letters of the three
        highest-scoring options, best first (for example ``"B A D"``).
    """
    # Negating the scores makes argsort rank the highest score first.
    ranked = np.argsort(-predictions)
    top_three = ranked[:, :3]

    # Map every column index to its option letter, then join each row.
    to_letter = np.vectorize(index_to_option.get)
    letters = to_letter(top_three)
    return np.apply_along_axis(' '.join, 1, letters)

## INFERENCING ON TEST DF

In [None]:
# The training frames and tokenized datasets are not used again; release
# them before the test inputs are built.
del df_train, context_df_train, model_train_df, tokenized_train_ds, tokenized_eval_ds

_ = gc.collect()  # garbage collector: frees up memory
libc.malloc_trim(0)

In [None]:
# Split the Wikipedia text of every test context row into overlapping chunks,
# using the same chunk settings as for the training context.
chunk_size = 1200
chunk_overlap = 200

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Iterate over the column values directly: this avoids label-based .loc[i]
# lookups, which are only correct while the frame has a 0..n-1 index.
split_text = [r_splitter.split_text(text) for text in context_df_test['text']]
context_df_test['split'] = split_text

In [None]:
# Rewrite the test prompts as "<chunk> ### <prompt>"; format_dataframe modifies df_test in place and returns it
test_df = format_dataframe(df_test,context_df_test)

In [None]:
#model_test[["prompt", "context", "A", "B", "C", "D", "E"]].to_csv("./test_context.csv", index=False)
#test_df = pd.read_csv("test_context.csv")
#test_df.index = list(range(len(test_df)))
#test_df['id'] = list(range(len(test_df)))
#test_df['context'] = test_df['context'].apply(lambda x: str(x))
#test_df["prompt"] = test_df["context"] + " #### " +  test_df["prompt"]
# Placeholder label: preprocess() reads example['answer'], so the test frame is given a constant value here
test_df['answer'] = 'B'

In [None]:
# Convert the test frame to a Dataset and tokenize it the same way as the training data
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

In [None]:
# Here we'll generate our "real" predictions on the test set.
# The per-option scores are read from test_predictions.predictions below.
test_predictions = trainer.predict(tokenized_test_ds)

In [None]:
# Take an explicit copy of the id column so that adding the prediction column
# does not write into a selection of test_df (avoids pandas'
# SettingWithCopyWarning).
submission_df = test_df[['id']].copy()
submission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)

submission_df.head()

In [None]:
# Once we write our submission file we're good to submit!
# Any submission.csv left over from a previous run is removed before the new one is written.
if os.path.exists('submission.csv'):
    os.remove('submission.csv')
submission_df.to_csv('submission.csv', index=False)