# Initial Setup

In [None]:
# Install every dependency from files attached as Kaggle inputs; all paths below are local.
# Note: the dataset directory is named "faiss-gpu-173" but the wheel inside it is faiss_gpu 1.7.2.
!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
# sentence-transformers is copied into the working directory and installed from that copy.
# NOTE(review): presumably because the input directory is read-only and the install writes build files — confirm.
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers
!pip install -U /kaggle/working/sentence-transformers
!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl
# The remaining wheels are installed with --no-index --no-deps: no package index lookup, no dependency resolution.
!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

In [None]:
import os
# Restrict CUDA to devices 0 and 1; this is set before faiss / sentence_transformers are imported below.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Passed as `max_length` to the tokenizer in preprocess_long_context.
MAX_INPUT = 256

import pandas as pd
import numpy as np
import faiss
from faiss import write_index, read_index

from sentence_transformers import SentenceTransformer

import gc

# Generate WIKI Context Using FAISS

In [None]:
# keep_default_na=False: with pandas' default NA parsing, a prompt or answer whose
# text is literally "None", "NA", "null", "nan", ... is read as a float NaN, and the
# string concatenations performed on these columns later raise TypeError.
test_df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv", keep_default_na=False)

In [None]:
test_df.head()

In [None]:
# Retrieval / encoding settings.
use_long_context = True
device = 'cuda'
max_length = 384
batch_size = 16
# Long-context mode retrieves more pages per prompt.
num_top_pages = 5 if use_long_context else 3

In [None]:
# Sentence-embedding model, loaded from a local Kaggle input directory onto `device`.
sentence_transformer_model = '/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2'
model = SentenceTransformer(sentence_transformer_model, device=device)
# Input length cap for the encoder, taken from the config cell (max_length = 384).
model.max_seq_length = max_length
# Switch the model to half precision.
# NOTE(review): embeddings returned by encode() are then presumably float16 — confirm faiss accepts them as-is.
model = model.half()

In [None]:
# Load the prebuilt FAISS index from the Kaggle input dataset; it is queried once below and then deleted.
sentence_index = read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Embed every test prompt (normalized), then move the tensor to host memory as a NumPy array.
prompt_embeddings = (
    model.encode(
        test_df['prompt'].values,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .detach()
    .cpu()
    .numpy()
)

In [None]:
prompt_embeddings.shape

In [None]:
# For each prompt embedding, fetch the `num_top_pages` nearest entries of the FAISS index.
search_score, search_index = sentence_index.search(prompt_embeddings, num_top_pages)
# Neither the index nor the query embeddings are used after this point; release them.
del sentence_index, prompt_embeddings
gc.collect()

In [None]:
search_score[0], search_index[0]

In [None]:
# Load only the `id` and `file` columns of the page index; rows are looked up below by the labels FAISS returned.
wiki_source_df = pd.read_parquet("/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet", columns=['id', 'file'])

In [None]:
wiki_source_df.shape

In [None]:
# One frame per prompt: the id/file rows of its retrieved pages, tagged with the
# prompt's position in the search results.
prompt_to_wiki_file_dfs = [
    wiki_source_df.loc[page_labels].assign(orig_prompt_id=prompt_pos)
    for prompt_pos, page_labels in enumerate(search_index)
]
prompt_to_wiki_final_df = pd.concat(prompt_to_wiki_file_dfs).reset_index(drop=True)
# The full page index is no longer needed.
del wiki_source_df
gc.collect()

In [None]:
prompt_to_wiki_final_df.head()

In [None]:
# Load one parquet file from the dataset directory to inspect its column dtypes and first rows.
# `wiki_base_path` is reused below when the per-file texts are loaded.
wiki_base_path = "/kaggle/input/wikipedia-20230701"
wiki_test_df = pd.read_parquet(f"{wiki_base_path}/a.parquet")
print(wiki_test_df.dtypes)
wiki_test_df.head()

In [None]:
# The sample frame was only loaded for inspection; release it.
del wiki_test_df
gc.collect()

In [None]:
# Fetch the text of every retrieved page, reading each source parquet file once.
wiki_text_dfs = []
unique_wiki_files = prompt_to_wiki_final_df['file'].unique()
for unique_wiki_file in unique_wiki_files:
    # Ids of the retrieved pages that live in this file, cast to str before matching.
    # NOTE(review): presumably the text files store `id` as strings — confirm against the parquet schema.
    wiki_ids = [str(x) for x in prompt_to_wiki_final_df.loc[prompt_to_wiki_final_df['file']==unique_wiki_file]['id'].values]
    cur_wiki_text_df = pd.read_parquet(f"{wiki_base_path}/{unique_wiki_file}", columns=['id', 'text'])
    final_wiki_text_df = cur_wiki_text_df.loc[cur_wiki_text_df['id'].isin(wiki_ids)].copy()
    wiki_text_dfs.append(final_wiki_text_df)
    # Drop the full file contents before the next file is read.
    del cur_wiki_text_df
    gc.collect()
# Combine the per-file selections, removing exact duplicate rows.
wiki_text_df_final = pd.concat(wiki_text_dfs).drop_duplicates().reset_index(drop=True)

In [None]:
wiki_text_df_final.shape

In [None]:
wiki_text_df_final.head(5)

In [None]:
from collections.abc import Iterable
from tqdm.auto import tqdm
import blingfire as bf
def process_documents(documents: Iterable[str],
                      document_ids: Iterable,
                      split_sentences: bool = True,
                      filter_len: int = 3,
                      disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Turn raw documents into a DataFrame of text spans.

    Every document is first wrapped into one whole-document row by
    `sectionize_documents`. When `split_sentences` is set, those rows are
    then broken into sentences by `sentencize`.

    :param documents: Iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param split_sentences: Flag to determine whether to split the documents into sentences
    :param filter_len: Minimum character length of a sentence (otherwise filter out)
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    sections = sectionize_documents(documents, document_ids, disable_progress_bar)

    # Without sentence splitting the whole-document rows are the final result.
    if not split_sentences:
        return sections

    return sentencize(sections.text.values,
                      sections.document_id.values,
                      sections.offset.values,
                      filter_len,
                      disable_progress_bar)


def sectionize_documents(documents: Iterable[str],
                         document_ids: Iterable,
                         disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Wrap each document into a single row covering its whole text.

    No actual sectioning is performed: every document yields exactly one row
    whose `offset` is `(0, len(document))`.

    :param documents: Sized iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`,
             sorted by `document_id` then `offset`; an empty frame with those
             columns when there are no documents
    """
    processed_documents = [
        {'document_id': document_id, 'text': document, 'offset': (0, len(document))}
        for document_id, document in tqdm(zip(document_ids, documents),
                                          total=len(documents),
                                          disable=disable_progress_bar)
    ]

    # Naming the columns keeps the schema when there are no rows, so callers that
    # access `.text` / `.document_id` / `.offset` do not fail on empty input.
    _df = pd.DataFrame(processed_documents, columns=['document_id', 'text', 'offset'])
    if _df.empty:
        return _df
    return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)


def sentencize(documents: Iterable[str],
               document_ids: Iterable,
               offsets: Iterable[tuple[int, int]],
               filter_len: int = 3,
               disable_progress_bar: bool = False) -> pd.DataFrame:
    """
    Split documents into sentences. Can be used with `sectionize_documents`
    to further split documents into more manageable pieces. Takes in offsets
    so that each sentence's position is expressed relative to the start of
    the original document.

    Documents that fail during splitting are skipped (best effort); a single
    warning reporting how many were skipped is emitted at the end.

    :param documents: Sized iterable containing documents which are strings
    :param document_ids: Iterable containing document unique identifiers
    :param offsets: Iterable of (start, end) tuples, one per document
    :param filter_len: Sentences of at most this many offset units are dropped
    :param disable_progress_bar: Flag to disable tqdm progress bar
    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`
    """
    import warnings  # local import keeps this function self-contained within its cell

    document_sentences = []
    skipped_documents = 0
    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets),
                                              total=len(documents),
                                              disable=disable_progress_bar):
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            for o in sentence_offsets:
                if o[1]-o[0] > filter_len:
                    sentence = document[o[0]:o[1]]
                    # Shift by the section start so the offset refers to the source document.
                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])
                    document_sentences.append({
                        'document_id': document_id,
                        'text': sentence,
                        'offset': abs_offsets,
                    })
        except Exception:
            # Only ordinary errors are swallowed: a bare `except:` would also trap
            # KeyboardInterrupt / SystemExit and make the loop impossible to interrupt.
            skipped_documents += 1
            continue

    if skipped_documents:
        warnings.warn(f"sentencize: skipped {skipped_documents} document(s) that raised during sentence splitting")

    # Naming the columns keeps the schema even when no sentence survived filtering.
    return pd.DataFrame(document_sentences, columns=['document_id', 'text', 'offset'])

In [None]:
# Split the retrieved page texts into sentences (split_sentences defaults to True); document ids are the page ids.
processed_wiki_text_data = process_documents(wiki_text_df_final['text'].values, wiki_text_df_final['id'].values)

In [None]:
processed_wiki_text_data.shape

In [None]:
processed_wiki_text_data.head(5)

In [None]:
# Embed every candidate sentence (normalized) and bring the result back as a NumPy array.
wiki_data_embeddings = (
    model.encode(
        processed_wiki_text_data.text,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .detach()
    .cpu()
    .numpy()
)

In [None]:
wiki_data_embeddings.shape

In [None]:
gc.collect()

In [None]:
# Query text for sentence retrieval: the prompt followed by all five answer options, space-separated.
test_df['prompt_answer_text'] = [
    " ".join(row_values)
    for row_values in test_df[['prompt', 'A', 'B', 'C', 'D', 'E']].itertuples(index=False, name=None)
]

In [None]:
test_df.head(5)

In [None]:
# Embed the prompt+answers text (normalized) and convert to a NumPy array; the shape is displayed as a check.
prompt_answer_embeddings = (
    model.encode(
        test_df['prompt_answer_text'].values,
        batch_size=batch_size,
        device=device,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    .detach()
    .cpu()
    .numpy()
)
prompt_answer_embeddings.shape

In [None]:
processed_wiki_text_data.head(5)

In [None]:
# Number of retrieved sentences stitched into each prompt's context.
num_context_sentences = 20 if use_long_context else 5
context_texts = []

# Iterate by position: `orig_prompt_id` and the rows of `prompt_answer_embeddings`
# are both positional, so this stays correct even if test_df has a non-default index.
for prompt_id in range(len(test_df)):
    # Sentences belonging to the pages that were retrieved for this prompt.
    prompt_page_ids = prompt_to_wiki_final_df.loc[prompt_to_wiki_final_df['orig_prompt_id'] == prompt_id, 'id'].values
    context_sent_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(prompt_page_ids)].index.values

    # No candidate sentence at all: emit an empty context instead of failing on an empty lookup.
    if len(context_sent_indices) == 0:
        context_texts.append('')
        continue

    # Small exact index over this prompt's candidate sentences only.
    context_sent_index = faiss.index_factory(wiki_data_embeddings.shape[1], "Flat")
    context_sent_index.add(wiki_data_embeddings[context_sent_indices])

    # Query with this prompt's embedding only (searching all prompts on every iteration
    # made the loop quadratic), and never ask for more neighbours than there are candidates:
    # FAISS pads missing results with -1, which `.iloc` would read as "last sentence".
    num_neighbours = min(num_context_sentences, len(context_sent_indices))
    _, nearest_positions = context_sent_index.search(prompt_answer_embeddings[prompt_id:prompt_id + 1], num_neighbours)

    # Positions returned by FAISS index into the candidate subset, in ranked order.
    candidate_sentences = processed_wiki_text_data.loc[context_sent_indices, 'text']
    context = " ".join(candidate_sentences.iloc[position] for position in nearest_positions[0])
    context_texts.append(context.strip())

In [None]:
# Attach the retrieved context to each prompt; context_texts holds one entry per test_df row, in row order.
test_df['context'] = context_texts

In [None]:
# Keep only the columns used for inference and persist them; the inference section re-reads this CSV.
test_df_with_context = test_df[['prompt', 'context', 'A', 'B', 'C', 'D', 'E']]
test_df_with_context.to_csv('./test_df_with_context.csv', index=False)

In [None]:
pd.read_csv('./test_df_with_context.csv').head(5)

# Direct Inference on Question Answering Model

In [None]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

In [None]:
# Pick the checkpoint that matches the context mode chosen in the retrieval config.
model_path = (
    '/kaggle/input/fine-tuned-open-book-model/model_v2'
    if use_long_context
    else '/kaggle/input/llm-science-run-context-2'
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Rebinds `model`, which until this cell referred to the SentenceTransformer encoder.
model = AutoModelForMultipleChoice.from_pretrained(model_path).cuda()
model.eval()

In [None]:
# Maximum number of context characters kept in front of the prompt (short-context input).
num_chars_in_context = 1750
# keep_default_na=False: the CSV round trip would otherwise turn an empty context, or an
# answer whose text is literally "None"/"NA"/"null", into a float NaN, which breaks the
# string slicing/concatenation below and in the preprocessing functions.
test_df_with_context = pd.read_csv('./test_df_with_context.csv', keep_default_na=False)
test_df_with_context['id'] = np.arange(len(test_df_with_context))
# Truncated context, a ' #### ' separator, then the prompt.
test_df_with_context['prompt_with_context'] = (
    test_df_with_context['context'].str[:num_chars_in_context]
    + ' #### '
    + test_df_with_context['prompt']
)
# Placeholder label: the test set has no ground truth, but the preprocessing and collator expect the field.
test_df_with_context['label'] = 0
test_df_with_context.head()

In [None]:
def preprocess_function(examples):
    """
    Tokenize one example as five (prompt-with-context, option) pairs.

    Reads `prompt_with_context`, the answer columns `A`-`E` and `label` from
    `examples`, and uses the module-level `tokenizer` with truncation enabled.

    :param examples: Mapping holding a single row of the test data
    :return: Tokenizer output for the five pairs, with `label` copied over
    """
    option_columns = 'ABCDE'
    # The same first segment is paired with each of the five options.
    contexts = [examples["prompt_with_context"] for _ in option_columns]
    candidates = [examples[column] for column in option_columns]
    encoded = tokenizer(contexts, candidates, truncation=True)
    encoded['label'] = examples['label']
    return encoded

def preprocess_long_context(example):
    """
    Tokenize one example for the long-context model.

    Special tokens are written into the text by hand and the tokenizer is
    called with `add_special_tokens=False`. Only the first segment (the
    context) is truncated, so the prompt and option are kept whole within
    `MAX_INPUT`.
    NOTE(review): assumes the tokenizer's special tokens are literally
    "[CLS]" and "[SEP]" — confirm for the loaded checkpoint.

    :param example: Mapping holding `context`, `prompt`, `A`-`E` and `label`
    :return: Tokenizer output for the five pairs, with `label` copied over
    """
    option_columns = 'ABCDE'
    context_segment = "[CLS] " + example['context']
    first_sentence = [context_segment for _ in option_columns]

    second_sentences = []
    for option in option_columns:
        second_sentences.append(" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]")

    tokenized_example = tokenizer(
        first_sentence,
        second_sentences,
        truncation='only_first',
        max_length=MAX_INPUT,
        add_special_tokens=False,
    )
    tokenized_example['label'] = example['label']
    return tokenized_example

In [None]:
# Ids of the tokenizer's CLS and SEP tokens, displayed for inspection.
cls_token_id, sep_token_id = tokenizer.cls_token_id, tokenizer.sep_token_id
cls_token_id, sep_token_id

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature holds, per key, one sequence per answer choice.
    The choices are flattened into individual sequences, padded together with
    `tokenizer.pad`, and reshaped back to (batch, choices, length).
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Collate a list of per-example feature dicts into one padded batch.

        :param features: Non-empty list of dicts; each has a `label` (or `labels`)
                         entry plus tokenizer fields holding one sequence per choice
        :return: Dict of tensors shaped (batch, choices, length) plus an int64 `labels` tensor
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts are left untouched.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice), example-major, built in a single pass
        # (summing lists of lists is quadratic in the batch size).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choices, length) layout.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader
# Select the columns and the preprocessing routine that match the context mode.
if use_long_context:
    dataset_columns = ['id', 'prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'label']
    preprocess_fn = preprocess_long_context
else:
    dataset_columns = ['id', 'prompt_with_context', 'A', 'B', 'C', 'D', 'E', 'label']
    preprocess_fn = preprocess_function

# Tokenize row by row; the raw columns are dropped so only tokenizer output and label remain.
tokenized_test_dataset = Dataset.from_pandas(test_df_with_context[dataset_columns]).map(
    preprocess_fn,
    remove_columns=dataset_columns,
)
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
# One example per batch, in original order, so predictions line up with the rows of test_df_with_context.
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)

In [None]:
import torch
# Run the multiple-choice model over every batch and collect the logits as NumPy arrays.
output_logits = []
with torch.no_grad():
    for batch in test_dataloader:
        # Move every tensor of the batch to the GPU the model lives on.
        batch = {k: v.cuda() for k, v in batch.items()}
        output_logits.append(model(**batch).logits.cpu().detach().numpy())

In [None]:
# Number of ranked options written per question.
top_k_predictions = 3
# Join the per-batch logits along the batch axis. Unlike stack(...).squeeze(), this keeps
# a 2-D (rows, options) array even for a single-row test set and tolerates batches of
# unequal size. `output_logits` itself is left untouched so the cell can be re-run safely.
all_logits = np.concatenate(output_logits, axis=0)
# Option indices per row, best (highest logit) first.
ranked_option_indices = np.argsort(-all_logits, axis=1)
option_list = np.array('A B C D E'.split())
final_predictions = [' '.join(option_list[row[:top_k_predictions]]) for row in ranked_option_indices]
test_df_with_context['prediction'] = final_predictions
test_df_with_context[['id', 'prediction']].to_csv('submission.csv', index=False)