In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from string import Template # For generating prompt template

import os
import gc # grabage collector
# Install sentence-transformers from the attached Kaggle dataset; its embeddings are used to query the FAISS index.
# `cp` copies files or directories: -r copies directories recursively, and -f forces the copy by
# replacing destination files that cannot be opened for writing.

!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers

#installing faiss package for reading faiss wikipedia index
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# as per wikipedia faiss index https://www.kaggle.com/datasets/jjinho/wikipedia-2023-07-faiss-index
import faiss
from faiss import write_index, read_index


import ctypes
# Handle to the C library; later cells call libc.malloc_trim(0) after deleting large objects.
libc = ctypes.CDLL("libc.so.6")

# Install langchain offline from the attached wheels; only its RecursiveCharacterTextSplitter is used.
!pip install langchain --no-index --find-links=file:///kaggle/input/llm-pkg/
from langchain.text_splitter import RecursiveCharacterTextSplitter



from tqdm.auto import tqdm

In [None]:
import pandas as pd
# Reading the csv file
#df_train = pd.read_csv("./train.csv")
#df_train = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/train.csv")

#df_train = pd.read_csv("/kaggle/input/additional-train-data-for-llm-science-exam/6000_train_examples.csv")
# Competition test set; its `prompt` column is embedded below to retrieve Wikipedia context.
df_test = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
#df_extra = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv')
#df_train.head(5)

In [None]:
# df_train.reset_index(inplace=True)
# df_train.rename(columns={'index':'id'},inplace=True)
# df_train = df_train


## READING WIKIPEDIA FILES TO FIND CONTEXT

In [None]:
# PART 1 - Searching Wikipedia Titles

In [None]:
# Load the prebuilt Wikipedia FAISS index that the prompt embeddings are searched against.
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Sentence encoder used to embed the prompts for the Wikipedia FAISS lookup.
from sentence_transformers import SentenceTransformer
SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'  # local copy of the encoder weights
DEVICE = 0  # passed as `device` to model.encode() below
MAX_LENGTH = 384  # assigned to the encoder's max_seq_length
BATCH_SIZE = 16  # batch size used by model.encode() below

model = SentenceTransformer(SIM_MODEL, device='cuda')
model.max_seq_length = MAX_LENGTH
model = model.half() # convert the encoder to half precision to reduce its memory footprint

In [None]:
# prompt_embeddings_train = model.encode(df_train['prompt'].values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
# prompt_embeddings_train = prompt_embeddings_train.detach().cpu().numpy() # detach to remove gradients.
# search_score_train, search_index_train = sentence_index.search(prompt_embeddings_train, 5)

In [None]:
# del prompt_embeddings_train
# _ = gc.collect() # garbage collector..frees up memmory
# libc.malloc_trim(0)

In [None]:
# Embed every test prompt (normalised embeddings) and retrieve its 5 nearest entries from the Wikipedia index.
prompt_embeddings_test = model.encode(df_test['prompt'].values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
prompt_embeddings_test = prompt_embeddings_test.detach().cpu().numpy() # detach from autograd and move to host memory as a NumPy array
search_score_test, search_index_test = sentence_index.search(prompt_embeddings_test, 5)

In [None]:

del sentence_index # the index is no longer needed once the search results exist; dropping it avoids memory pressure

del prompt_embeddings_test
_ = gc.collect() # run the garbage collector to reclaim the objects deleted above
libc.malloc_trim(0) # ask the C allocator to hand freed heap memory back to the OS

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# PART 2 - Fetching the relevant text of the retrieved Wikipedia documents

In [None]:
import os
# getting wikipedia documents 
def wiki_context(search_score, search_index, wiki_path="/kaggle/input/wikipedia-20230701"):
    """Fetch the Wikipedia article text behind every FAISS hit of every prompt.

    Args:
        search_score: Per-prompt scores from the FAISS search. Only used to pair
            with ``search_index`` and to size the progress bar.
        search_index: Per-prompt collections of row labels, looked up with
            ``.loc`` in the Wikipedia index table.
        wiki_path: Directory holding ``wiki_2023_index.parquet`` and the article
            parquet files that table refers to.

    Returns:
        DataFrame with the columns ``id``, ``prompt_id``, ``file`` and ``text``,
        one row per (prompt, article) match. ``prompt_id`` is the position of
        the prompt within ``search_index``. Empty search results give an empty
        frame with those columns.
    """
    # pd.concat raises on an empty list, so answer "no prompts" explicitly.
    if min(len(search_score), len(search_index)) == 0:
        return pd.DataFrame(columns=['id', 'prompt_id', 'file', 'text'])

    df_wiki = pd.read_parquet(f"{wiki_path}/wiki_2023_index.parquet",
                              columns=['id', 'file'])

    # One frame per prompt: the index rows of its hits, tagged with the prompt position.
    per_prompt_hits = []
    for prompt_pos, (_score, hit_labels) in tqdm(enumerate(zip(search_score, search_index)),
                                                 total=len(search_score)):
        hits = df_wiki.loc[hit_labels].copy()
        hits['prompt_id'] = prompt_pos
        per_prompt_hits.append(hits)

    wikipedia_file_data = pd.concat(per_prompt_hits).reset_index(drop=True)
    # Sorting by file groups the ids so that each article file is opened only once below.
    wikipedia_file_data = (
        wikipedia_file_data[['id', 'prompt_id', 'file']]
        .drop_duplicates()
        .sort_values(['file', 'id'])
        .reset_index(drop=True)
    )

    # The index table is not needed any more; release it before the article files are loaded.
    del df_wiki, per_prompt_hits
    _ = gc.collect()
    libc.malloc_trim(0)

    wiki_text_parts = []
    files = wikipedia_file_data['file'].unique()
    for file in tqdm(files, total=len(files)):
        # Ids are compared as strings against the article file's `id` column
        # (presumably stored as strings there — verify against the dataset).
        wanted_ids = [
            str(i)
            for i in wikipedia_file_data.loc[wikipedia_file_data['file'] == file, 'id'].tolist()
        ]
        articles = pd.read_parquet(f"{wiki_path}/{file}", columns=['id', 'text'])
        matched = articles[articles['id'].isin(wanted_ids)].copy()

        # Each article file is large; free it before reading the next one.
        del articles
        _ = gc.collect()
        libc.malloc_trim(0)
        wiki_text_parts.append(matched)

    wiki_text_data = pd.concat(wiki_text_parts).drop_duplicates().reset_index(drop=True)
    context_df = wikipedia_file_data.merge(wiki_text_data, on='id')
    return context_df

In [None]:
# del df_wiki
# _ = gc.collect()
# libc.malloc_trim(0)

In [None]:
# The train-side FAISS search cell above is commented out, so build the train
# contexts only when its results actually exist (avoids a NameError on Run All).
if 'search_score_train' in globals() and 'search_index_train' in globals():
    context_df_train = wiki_context(search_score_train, search_index_train)
else:
    print('Train search results not found; skipping train context retrieval.')


In [None]:
# Fetch the Wikipedia article text behind each test prompt's FAISS hits.
context_df_test = wiki_context(search_score_test,search_index_test)

In [None]:
# Split the retrieved train articles into overlapping chunks (chunk_size / chunk_overlap below).

chunk_size = 2400
chunk_overlap = 400

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# context_df_train only exists when the train retrieval was actually run, so
# skip the chunking instead of failing with a NameError on Run All.
if 'context_df_train' in globals():
    # One list of chunks per retrieved article, in row order.
    split_text = [r_splitter.split_text(article) for article in context_df_train['text']]
    context_df_train['split'] = split_text
else:
    print('context_df_train not found; skipping train chunking.')


In [None]:
import re
def format_dataframe(df, context_df):
    """Fill ``df['context']`` with the retrieved chunk most similar to each prompt.

    For every row of ``df``, the chunks of all articles retrieved for that
    prompt (rows of ``context_df`` whose ``prompt_id`` equals the row's ``id``)
    are embedded, and the single chunk nearest to the prompt embedding is kept.
    Every character other than ASCII letters, digits, spaces, newlines and dots
    is stripped from it. Rows without any chunk get an empty string.

    Args:
        df: Question frame with ``prompt``, ``id`` and ``context`` columns.
            It is modified in place.
        context_df: Frame with a ``prompt_id`` column and a ``split`` column
            holding a list of text chunks per row.

    Returns:
        The same ``df`` object, with ``context`` filled in.

    Raises:
        KeyError: If ``df`` has no ``context`` column.
    """
    model = SentenceTransformer(SIM_MODEL, device='cuda')
    model.max_seq_length = 384
    model = model.half()  # half precision to reduce the encoder's memory footprint

    # Resolve the target column by name instead of relying on it being the 9th column.
    context_col = df.columns.get_loc('context')

    for i in range(len(df)):
        q = df.iloc[i]['prompt']
        idx = df.iloc[i]['id']
        chunk = ''

        # All chunks of every article retrieved for this prompt, flattened into one list.
        text_rel = context_df.loc[context_df['prompt_id'] == idx, 'split']
        text = [piece for pieces in text_rel for piece in pieces]
        if text:
            vectors = model.encode(text)
            index = faiss.IndexFlatL2(vectors.shape[1])
            # Vectors are L2-normalised before being added, and so is the query below.
            faiss.normalize_L2(vectors)
            index.add(vectors)

            search_vector = model.encode(q)
            _vector = np.array([search_vector])
            faiss.normalize_L2(_vector)

            # Keep only the single nearest chunk.
            distances, ann = index.search(_vector, k=1)
            chunk = text[ann[0, 0]]
            # Raw string: same character class as before, without the invalid "\." escape.
            chunk = re.sub(r'[^a-zA-Z0-9 \n\.]', '', chunk)

        df.iloc[i, context_col] = str(chunk)

    # Only the encoder is deleted explicitly. The per-row locals may never have
    # been bound (no row had any chunk), and `del` on them raised
    # UnboundLocalError in that case; they are released on return anyway.
    del model
    _ = gc.collect()
    libc.malloc_trim(0)

    return df

In [None]:
# df_train['context'] = ''

In [None]:
# model_train_df = format_dataframe(df_train,context_df_train)

In [None]:
# model_train_df.to_csv("context_6k_2400_400.csv")

## TRAINING THE MODEL

In [None]:

import torch, gc
# Reclaim Python garbage and cached GPU memory before the classifier is loaded below.
gc.collect()
torch.cuda.empty_cache()

In [None]:
#model_train[["prompt", "context", "A", "B", "C", "D", "E","answer"]].to_csv("./train_context.csv", index=False)
#model_train_df = pd.read_csv("train_context.csv")
#model_train_df.index = list(range(len(model_train_df)))
#model_train_df['id'] = list(range(len(model_train_df)))
#model_train_df['context'] = model_train_df['context'].apply(lambda x: str(x))
#model_train_df["prompt"] = model_train_df["context"] + " #### " +  model_train_df["prompt"]
#model_train_df['answer'] = 'B'

In [None]:
import os
# NOTE(review): CUDA has already been used earlier in this notebook (the sentence encoder was
# created on 'cuda'), so setting CUDA_VISIBLE_DEVICES here may have no effect on this process — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer


# Run version; used in the checkpoint directory and the saved-model directory names below.
VER=10
# TRAIN WITH SUBSET OF 60K
# NOTE(review): NUM_TRAIN_SAMPLES is not referenced anywhere else in the visible notebook.
NUM_TRAIN_SAMPLES = 2048  # previously 1_024
# PARAMETER EFFICIENT FINE TUNING
# PEFT REQUIRES 1XP100 GPU NOT 2XT4
USE_PEFT = False
# NUMBER OF LAYERS TO FREEZE 
# DEBERTA LARGE HAS TOTAL OF 24 LAYERS
FREEZE_LAYERS = 18
# BOOLEAN TO FREEZE EMBEDDINGS
FREEZE_EMBEDDINGS = True
# LENGTH OF CONTEXT PLUS QUESTION ANSWER (passed as max_length to the tokenizer in preprocess)
MAX_INPUT = 256
# HUGGING FACE MODEL: local checkpoint directory; the hub id 'microsoft/deberta-v3-large' was used before
MODEL = "/kaggle/input/debertav3model/LLMQAModel"
# Note: this rebinds `model`, which held the sentence encoder in the retrieval section.
model = AutoModelForMultipleChoice.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Ordered 65% / 35% train / evaluation split (no shuffling).
# NOTE(review): model_train_df is only created in commented-out cells of this notebook;
# it must be loaded or built before this cell can run — confirm the intended source.
total_examples = len(model_train_df)
split_size = int(0.65 * total_examples)
train_part, eval_part = model_train_df[:split_size], model_train_df[split_size:]
train_ds = Dataset.from_pandas(train_part)
eval_ds = Dataset.from_pandas(eval_part)

In [None]:
# Answer letter -> class index ('A' -> 0 ... 'E' -> 4) and the reverse lookup.
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """Tokenise one question into five (context, question + option) text pairs.

    The special tokens are written into the text literally (``[CLS]``,
    ``[SEP]``), so the tokenizer is called with ``add_special_tokens=False``.
    Only the context side is truncated (``truncation='only_first'``) and every
    pair is padded to ``MAX_INPUT`` tokens.

    Args:
        example: Mapping with the keys ``context``, ``prompt``, ``A`` to ``E``
            and ``answer`` (one of the letters ``A`` to ``E``).

    Returns:
        The tokenizer output for the five pairs, plus a ``label`` entry holding
        the class index of ``answer``.
    """
    # str() guards against a missing context (e.g. NaN read back from a CSV),
    # matching how the prompt and the options are already coerced below.
    first_sentence = ["[CLS] " + str(example['context'])] * 5
    second_sentences = [
        " #### " + str(example['prompt']) + " [SEP] " + str(example[option]) + " [SEP]"
        for option in 'ABCDE'
    ]
    tokenized_example = tokenizer(first_sentence, second_sentences, truncation='only_first',
                                  max_length=MAX_INPUT, padding='max_length', add_special_tokens=False)
    tokenized_example['label'] = option_to_index[example['answer']]

    return tokenized_example

@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice features into padded ``(batch, choices, ...)`` tensors.

    Each feature maps tokenizer field names to one list per answer choice and
    carries its target under ``label`` or ``labels``. The features are flattened
    to one entry per (example, choice), padded together by the tokenizer, and
    reshaped back to ``(batch_size, num_choices, -1)``.

    The fields ``padding``, ``max_length`` and ``pad_to_multiple_of`` are
    forwarded unchanged to ``tokenizer.pad``.
    """
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Return a dict of batched tensors with the targets under ``labels``."""
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read the labels without popping them, so the caller's feature dicts
        # stay intact and the same features can be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        # One flat entry per (example, choice), example-major; a single
        # comprehension avoids the quadratic `sum(list_of_lists, [])`.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Tokenise both splits; the raw text columns are dropped once preprocess has consumed them.
_raw_columns = ('prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context')
tokenized_train_ds = train_ds.map(preprocess, remove_columns=list(_raw_columns))
tokenized_eval_ds = eval_ds.map(preprocess, remove_columns=list(_raw_columns))

In [None]:
# NOTE PEFT REQUIRES US TO USE 1XP100 NOT 2XT4. I'M NOT SURE WHY.
if USE_PEFT:
    !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl

In [None]:
# Optional LoRA setup, only executed when USE_PEFT is enabled in the config cell.
if USE_PEFT:
    print('We are using PEFT.')
    from peft import LoraConfig, get_peft_model, TaskType
    # LoRA targets the query_proj / value_proj modules; classifier and pooler are listed in modules_to_save.
    peft_config = LoraConfig(
        r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.2, 
        bias="none", inference_mode=False, 
        target_modules=["query_proj", "value_proj"],
        modules_to_save=['classifier','pooler'],
    )
    # Wrap the loaded model and report how many parameters remain trainable.
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

In [None]:
# Exclude the embeddings and the first FREEZE_LAYERS encoder layers from training.
if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    model.deberta.embeddings.requires_grad_(False)
if FREEZE_LAYERS>0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    for frozen_layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        frozen_layer.requires_grad_(False)

In [None]:
def map_at_3(predictions, labels):
    """Mean average precision at 3 for single-label rows.

    Args:
        predictions: 2-D scores, one row per example and one column per class.
        labels: True class index of each example.

    Returns:
        The mean over all rows of ``1 / rank`` of the true class within the
        three highest-scoring classes, counting 0 when it is not among them.
    """
    scores = np.array(predictions)
    top3 = np.argsort(-1 * scores, axis=1)[:, :3]
    total = 0
    for ranked, truth in zip(top3, labels):
        for rank, choice in zip([1, 2, 3], ranked):
            if truth == choice:
                total += 1 / rank
    return total / len(predictions)

def compute_metrics(p):
    """Trainer metric hook: MAP@3 of an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (per-class scores) and ``label_ids``
            (true class indices), both supporting ``.tolist()``.

    Returns:
        ``{"map@3": score}``.
    """
    scores = p.predictions.tolist()
    gold = p.label_ids.tolist()
    return {"map@3": map_at_3(scores, gold)}


In [None]:
# Fine-tuning configuration. Logging, evaluation and checkpointing all happen every 50 steps,
# and the checkpoint with the best `map@3` (the key returned by compute_metrics) is reloaded at the end.
training_args = TrainingArguments(
    warmup_ratio=0.1, 
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    report_to='none',
    output_dir = f'./checkpoints_{VER}',
    overwrite_output_dir=True,
    fp16=True,
    gradient_accumulation_steps=8,  # 2 samples x 8 accumulation steps = 16 samples per device per optimizer step
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='map@3',
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    save_total_limit=4,
)

In [None]:
# Fine-tune with the multiple-choice collator; the evaluation split is scored with MAP@3.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    compute_metrics = compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()
# Persist the trained model; the next cell reloads it from this directory.
trainer.save_model(f'model_v{VER}')

In [None]:
# Drop the training objects and reload the saved weights into a fresh model for inference.
del model, trainer
if USE_PEFT:
    # Rebuild the LoRA-wrapped model, then restore the saved state dict into it.
    model = AutoModelForMultipleChoice.from_pretrained(MODEL)
    model = get_peft_model(model, peft_config)
    checkpoint = torch.load(f'model_v{VER}/pytorch_model.bin')
    model.load_state_dict(checkpoint)
else:
    model = AutoModelForMultipleChoice.from_pretrained(f'model_v{VER}')
# Prediction-only Trainer: no training arguments, collator or metric function are passed here.
trainer = Trainer(model=model)

In [None]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
import numpy as np
def precision_at_k(r, k):
    """Fraction of hits among the first ``k`` entries of the relevance list ``r``.

    ``k`` must be non-zero and no larger than ``len(r)`` (checked with asserts).
    """
    assert k <= len(r)
    assert k != 0
    hits = 0
    for flag in r[:k]:
        hits += int(flag)
    return hits / k

def MAP_at_3(predictions, true_items):
    """Mean average precision at 3 over whitespace-separated prediction strings.

    Args:
        predictions: Sequence of strings such as ``"B A D"``, best guess first.
        true_items: Sequence of the correct answer for each row.

    Returns:
        The sum over all rows of precision-at-rank for each correct guess among
        the first three, divided by the number of rows.
    """
    n_rows = len(predictions)
    total = 0.0
    for row in range(n_rows):
        guesses = predictions[row].split()
        target = true_items[row]
        hits = [1 if guess == target else 0 for guess in guesses]
        # Only the first three guesses count towards the score.
        for pos in range(min(len(guesses), 3)):
            total += precision_at_k(hits, pos + 1) * hits[pos]
    return total / n_rows

In [None]:
# Persist the training frame (with its retrieved contexts) for reuse.
# NOTE(review): model_train_df is only created in commented-out cells of this notebook — confirm it exists here.
model_train_df.to_csv("context_train.csv")

## INFERENCING ON TEST DF

In [None]:
# Training artefacts are not needed for inference. Some of these names may never
# have been created (their cells are commented out), so drop only the ones that
# exist instead of failing with a NameError on a bare `del`.
for _stale_name in ('df_train', 'context_df_train', 'model_train_df',
                    'tokenized_train_ds', 'tokenized_eval_ds'):
    globals().pop(_stale_name, None)

_ = gc.collect() # run the garbage collector to reclaim the objects released above
libc.malloc_trim(0)

In [None]:
# Chunk the retrieved test articles with a smaller window than the train setting above.
chunk_size, chunk_overlap = 1200, 200

r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# One list of chunks per retrieved article, in row order.
split_text = [r_splitter.split_text(article) for article in context_df_test['text']]
context_df_test['split'] = split_text

In [None]:
# Reload the test questions and attach the best-matching retrieved chunk to each one.
df_test = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv")
# Every row gets the dummy label 'B' so that preprocess(), which reads example['answer'], can run on the test set.
df_test['answer'] = 'B'
# Placeholder column that format_dataframe fills with the retrieved context.
df_test['context']= ' '
# format_dataframe returns the frame it was given, so test_df and df_test are the same object.
test_df = format_dataframe(df_test,context_df_test)

In [None]:
# Inspect the test questions together with their retrieved contexts.
test_df

In [None]:
#model_test[["prompt", "context", "A", "B", "C", "D", "E"]].to_csv("./test_context.csv", index=False)
#test_df = pd.read_csv("test_context.csv")
#test_df.index = list(range(len(test_df)))
#test_df['id'] = list(range(len(test_df)))
#test_df['context'] = test_df['context'].apply(lambda x: str(x))
#test_df["prompt"] = test_df["context"] + " #### " +  test_df["prompt"]


In [None]:
# Wrap the test frame as a Dataset and tokenise it with the same preprocess function used for training.
_raw_columns = ('prompt', 'A', 'B', 'C', 'D', 'E', 'answer', 'context')
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, remove_columns=list(_raw_columns))

In [None]:

# Score the test set and keep the three highest-scoring answer letters per question.
test_predictions = trainer.predict(tokenized_test_ds).predictions
predictions_as_ids = np.argsort(-test_predictions, axis=1)
answer_letters = np.array(list('ABCDE'))
predictions_as_answer_letters = answer_letters[predictions_as_ids]
predictions_as_string = [' '.join(top3) for top3 in predictions_as_answer_letters[:, :3]]
test_df['prediction'] = predictions_as_string

In [None]:
# NOTE(review): test_df['answer'] was set to the constant 'B' above, so this score is measured
# against that placeholder rather than real labels — confirm this is intended.
m = MAP_at_3(test_df.prediction.values, test_df.answer.values)
print( 'CV MAP@3 =',m )

In [None]:
# The following function gets the indices of the highest scoring answers for each row
# and converts them back to our answer format (A, B, C, D, E)
import numpy as np
def predictions_to_map_output(predictions):
    """Convert per-class scores into ``"X Y Z"`` strings of the top three answers.

    Args:
        predictions: 2-D NumPy array of scores, one row per question and one
            column per answer option.

    Returns:
        1-D NumPy string array with, per row, the letters of the three
        highest-scoring options separated by spaces, best first. An input with
        zero rows gives an empty array (the previous ``np.vectorize`` based
        version raised on empty input).
    """
    # Negate so that argsort (ascending) lists the highest scores first.
    top_answer_indices = np.argsort(-predictions)[:, :3]
    return np.array(
        [' '.join(index_to_option.get(choice) for choice in row) for row in top_answer_indices],
        dtype=str,
    )


In [None]:
# Take an explicit copy: adding a column to the `test_df[['id']]` selection
# directly triggers pandas' SettingWithCopyWarning.
submission_df = test_df[['id']].copy()
submission_df['prediction'] = predictions_to_map_output(test_predictions)

submission_df.head()

In [None]:
# Write the submission file, removing any stale copy from a previous run first.
if os.path.exists('submission.csv'):
    os.remove('submission.csv')
# index=False keeps the row index out of the CSV.
submission_df.to_csv('submission.csv', index=False)