In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kaggle-llm-science-exam/sample_submission.csv
/kaggle/input/kaggle-llm-science-exam/train.csv
/kaggle/input/kaggle-llm-science-exam/test.csv


In [2]:
import pandas as pd
from string import Template
from pathlib import Path

import warnings
warnings.simplefilter("ignore")

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

data_path = Path('/kaggle/input/kaggle-llm-science-exam')

In [3]:
llm = 'google/flan-t5-base'


device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained(llm).to(device)
tokenizer = T5Tokenizer.from_pretrained(llm)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
test = pd.read_csv(data_path / 'test.csv', index_col='id')
test.head()

Unnamed: 0_level_0,prompt,A,B,C,D,E
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...


In [5]:
preamble = \
    'Answer the following question by outputting the letters A, B, C, D, and E '\
    'in order of the most likely to be correct to the to least likely to be correct.'

template = Template('$preamble\n\n$prompt\n\nA) $a\nB) $b\nC) $c\nD) $d\nE) $e')

In [6]:
def format_input(df, idx):
    
    prompt = df.loc[idx, 'prompt']
    a = df.loc[idx, 'A']
    b = df.loc[idx, 'B']
    c = df.loc[idx, 'C']
    d = df.loc[idx, 'D']
    e = df.loc[idx, 'E']

    input_text = template.substitute(
        preamble=preamble, prompt=prompt, a=a, b=b, c=c, d=d, e=e)
    
    return input_text

In [7]:
print(format_input(test, 0))

Answer the following question by outputting the letters A, B, C, D, and E in order of the most likely to be correct to the to least likely to be correct.

Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed "missing baryonic mass" discrepancy in galaxy clusters?

A) MOND is a theory that reduces the observed missing baryonic mass in galaxy clusters by postulating the existence of a new form of matter called "fuzzy dark matter."
B) MOND is a theory that increases the discrepancy between the observed missing baryonic mass in galaxy clusters and the measured velocity dispersions from a factor of around 10 to a factor of about 20.
C) MOND is a theory that explains the missing baryonic mass in galaxy clusters that was previously considered dark matter by demonstrating that the mass is in the form of neutrinos and axions.
D) MOND is a theory that reduces the discrepancy between the observed missing baryonic mass in galaxy cl

In [8]:
inputs = tokenizer(format_input(test, 0), return_tensors="pt").to(device)
outputs = model.generate(**inputs)
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(answer)

['A']


## Post-processing

In [9]:
def post_process(predictions):
    valid = set(['A', 'B', 'C', 'D', 'E'])
    # If there are no valid choices, return something and hope for partial credit
    if set(predictions).isdisjoint(valid):
        final_pred = 'A B C D E'
    else:
        final_pred = []
        for prediction in predictions:
            if prediction in valid:
                final_pred += prediction
        # add remaining letters
        to_add = valid - set(final_pred)
        final_pred.extend(list(to_add))
        # put in space-delimited format
        final_pred = ' '.join(final_pred)
        
    return final_pred

In [10]:
submission = pd.read_csv(
    data_path / 'sample_submission.csv', index_col='id')

for idx in test.index:
    inputs = tokenizer(format_input(test, idx), return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    submission.loc[idx, 'prediction'] = post_process(answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


In [11]:
submission.head()

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
0,A B C D E
1,A B C D E
2,C B D E A
3,A B C D E
4,A B C D E


In [12]:
submission.to_csv('submission.csv')