In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install -q accelerate ujson

# 설치 해도 에러 뜨면 세션 다시 시작

In [10]:
import torch
from torch.cuda.amp import autocast
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import json
import pandas as pd
import csv
import gc
import importlib.util
from tqdm import tqdm

In [11]:
# PATHS, FLAGS, HYPERPARAMETERS

TESTING = True

BATCH_SIZE = 64

QUESTION_FILE_PATH = '/content/drive/My Drive/sogang-nlp-rag/qa_train.json' if TESTING else '/content/drive/My Drive/sogang-nlp-rag/qa_test.json'
PREDICTION_FILE_PATH = '/content/drive/My Drive/sogang-nlp-rag/submission.csv'
EVALUATION_FILE_PATH = '/content/drive/My Drive/sogang-nlp-rag/evaluate.py'

NUM_QUESTIONS = 300 if TESTING else None

HF_TOKEN = # YOUR_HF_READ_TYPE_TOKEN

In [15]:
# RuntimeError: cutlassF: no kernel found to launch!
# https://github.com/Lightning-AI/litgpt/issues/327#issuecomment-1664674460
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# OSError: We couldn't connect to 'https://huggingface.co' to load this file
# https://github.com/huggingface/diffusers/issues/6223#issuecomment-2141411382

# Login to HuggingFace with a token
login(token=HF_TOKEN)
# Load the tokenizer and model from the HuggingFace model hub
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Load questions from a JSON file
with open(QUESTION_FILE_PATH, 'r') as f:
    questions = json.load(f)
# Limit the number of questions if NUM_QUESTIONS is specified
questions = questions[:NUM_QUESTIONS] if NUM_QUESTIONS else questions

# Process questions in batches
results = []
for i in tqdm(range(0, len(questions), BATCH_SIZE)):
    batch_questions = questions[i:i+BATCH_SIZE]

    batch_input_texts = []
    for item in batch_questions:
        question = item["question"]
        input_text = f"""
The following are questions and answers about random facts searchable on wikipedia.
**Question:** In what year the the venue that Marcia White is president of open?
**Answer:** (1966)
**Question:** What country is home to the sports club loaning Bruno Paulista to Vasco da Gama?
**Answer:** (Portugal)
**Question:** Southern Air featured Ray Stevens, Minnie Pearl and what other Southern comedian?
**Answer:** (Jerry Clower)
**Question:** {question}
**Answer:** ("""
        batch_input_texts.append(input_text)

    # Tokenize the batch of input texts
    batch_input_ids = tokenizer(batch_input_texts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to("cuda")

    # Generate answers for the batch using mixed precision
    with autocast():
        batch_outputs = model.generate(**batch_input_ids, max_new_tokens=256)

    # Decode and process the generated answers for the batch
    for output, input_text in zip(batch_outputs, batch_input_texts):
        answer = tokenizer.decode(output, skip_special_tokens=True)
        answer = answer.split("**Answer:** (")[-1].strip()
        answer = answer.split(")")[0]
        answer = answer.replace('\n', ' ')
        query = input_text.replace('\n', ' ')
        results.append({"queries": query, "sentences": answer})

    # Clear variables and GPU cache to free up memory
    del batch_input_texts
    del batch_input_ids
    del batch_outputs
    gc.collect()
    torch.cuda.empty_cache()

# Convert the results to a DataFrame
df = pd.DataFrame(results)
df['id'] = df.index
df = df[['id', 'sentences', 'queries']]
# Save the DataFrame to a CSV file
df.to_csv(PREDICTION_FILE_PATH, index=False, sep=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:17<00:00,  3.43s/it]


In [16]:
# If TESTING is enabled, load the evaluation module
if TESTING:
    # Dynamically load the evaluation module from a file
    spec = importlib.util.spec_from_file_location("evaluate", EVALUATION_FILE_PATH)
    evaluate = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(evaluate)

    # Extract functions from the evaluation module
    normalize_answer = evaluate.normalize_answer
    f1_score = evaluate.f1_score
    exact_match_score = evaluate.exact_match_score
    update_answer = evaluate.update_answer
    update_sp = evaluate.update_sp

    def eval(PREDICTION_FILE_PATH, QUESTION_FILE_PATH):
        # Read the predictions from the CSV file
        with open(PREDICTION_FILE_PATH, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            prediction = [{'queries': row['queries'], 'sentences': row['sentences']} for row in reader]

        # Read the questions and answers from the JSON file
        with open(QUESTION_FILE_PATH, encoding='utf-8') as f:
            answer = json.load(f)

        metrics = {'em': 0, 'f1': 0, 'prec': 0, 'recall': 0}

        # Update metrics for each prediction
        for id, gold in enumerate(answer[:len(prediction)]):
            update_answer(metrics, prediction[id]['sentences'], gold['answer'])

        metrics = {k: v/len(prediction) for k, v in metrics.items()}

        print(metrics)
        print((2 * metrics['em'] + 1 * metrics['f1']) / 3)

    eval(PREDICTION_FILE_PATH, QUESTION_FILE_PATH)

{'em': 0.12, 'f1': 0.14419519369519368, 'prec': 0.14720370370370373, 'recall': 0.14454629629629628}
0.12806506456506456


In [17]:
df = pd.read_csv(PREDICTION_FILE_PATH)

# Add double quotes around each entry in the 'sentences' column
df['sentences'] = '"' + df['sentences'].astype(str) + '"'
# Add double quotes around each entry in the 'queries' column
df['queries'] = '"' + df['queries'].astype(str) + '"'

# Save the modified DataFrame back to the CSV file
df.to_csv(PREDICTION_FILE_PATH, index=False, sep=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

In [18]:
from google.colab import files

files.download(PREDICTION_FILE_PATH)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>