In [1]:
import torch
import pandas as pd
from transformers import pipeline

In [2]:
# CONFIG
# Hugging Face model id passed to the text-generation pipeline.
MODEL = "unsloth/Llama-3.2-3B-Instruct"

# Accelerator selection in priority order: MPS first, then CUDA, and CPU
# when neither backend reports itself as available.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

## Train Data

In [3]:
# Augment the ARC-Challenge train split: generate reasoning text for every
# question with the model, append it to the question, and save the frame as
# a local parquet file.

# Import the data
splits = {'train': 'ARC-Challenge/train-00000-of-00001.parquet', 'test': 'ARC-Challenge/test-00000-of-00001.parquet', 'validation': 'ARC-Challenge/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/allenai/ai2_arc/" + splits["train"])

# Extract the questions
questions = df["question"].copy()

# Render each row's choices as text, one "Choice <label>: <text>" line per
# option. `map` keeps the frame's index, so this does not rely on the rows
# being labelled 0..n-1 the way `choices[idx]` inside a range loop would.
choices = df["choices"].map(
    lambda row: "Choices:\n" + "".join(
        f"Choice {label}: {text}\n" for label, text in zip(row["label"], row["text"])
    )
)

# Load the model pipeline
pipe = pipeline("text-generation", model=MODEL, device=DEVICE)

# Construct the reasoning prompts
prompts = [q + "\n" + c + "The step by step behind the correct answer is" for q, c in zip(questions, choices)]

# Run the model on the prompts.
# `return_full_text=False` makes the pipeline return only the newly generated
# continuation. Splitting the full text on the prompt afterwards is fragile:
# if the model echoes the prompt, everything before the last echo is lost.
# NOTE(review): no seed is set here; if the model's generation config
# samples, the reasoning will differ between runs.
results = pipe(
    prompts,
    max_new_tokens=128,
    truncation=True,
    batch_size=80,
    return_full_text=False,
)

# Extract the reasoning (first returned sequence for each prompt)
reasoning = [out[0]['generated_text'].strip() for out in results]
assert len(reasoning) == len(questions), "expected exactly one generation per question"

# Update the questions with reasoning (aligned on the questions' own index)
questions = questions + "\nReasoning: " + pd.Series(reasoning, index=questions.index)

# Update the df
df['question'] = questions

# Save the df
df.to_parquet('train-00000-of-00001.parquet')

## Test Data

In [3]:
# Augment the ARC-Challenge test split: generate reasoning text for every
# question with the model, append it to the question, and save the frame as
# a local parquet file.

# Import the data
splits = {'train': 'ARC-Challenge/train-00000-of-00001.parquet', 'test': 'ARC-Challenge/test-00000-of-00001.parquet', 'validation': 'ARC-Challenge/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/allenai/ai2_arc/" + splits["test"])

# Extract the questions
questions = df["question"].copy()

# Render each row's choices as text, one "Choice <label>: <text>" line per
# option. `map` keeps the frame's index, so this does not rely on the rows
# being labelled 0..n-1 the way `choices[idx]` inside a range loop would.
choices = df["choices"].map(
    lambda row: "Choices:\n" + "".join(
        f"Choice {label}: {text}\n" for label, text in zip(row["label"], row["text"])
    )
)

# Load the model pipeline (re-created here so the cell can run on its own)
pipe = pipeline("text-generation", model=MODEL, device=DEVICE)

# Construct the reasoning prompts
prompts = [q + "\n" + c + "The step by step behind the correct answer is" for q, c in zip(questions, choices)]

# Run the model on the prompts.
# `return_full_text=False` makes the pipeline return only the newly generated
# continuation. Splitting the full text on the prompt afterwards is fragile:
# if the model echoes the prompt, everything before the last echo is lost.
# NOTE(review): no seed is set here; if the model's generation config
# samples, the reasoning will differ between runs.
results = pipe(
    prompts,
    max_new_tokens=128,
    truncation=True,
    batch_size=80,
    return_full_text=False,
)

# Extract the reasoning (first returned sequence for each prompt)
reasoning = [out[0]['generated_text'].strip() for out in results]
assert len(reasoning) == len(questions), "expected exactly one generation per question"

# Update the questions with reasoning (aligned on the questions' own index)
questions = questions + "\nReasoning: " + pd.Series(reasoning, index=questions.index)

# Update the df
df['question'] = questions

# Save the df
df.to_parquet('test-00000-of-00001.parquet')

## Validation Data

In [None]:
# Augment the ARC-Challenge validation split: generate reasoning text for
# every question with the model, append it to the question, and save the
# frame as a local parquet file.

# Import the data
splits = {'train': 'ARC-Challenge/train-00000-of-00001.parquet', 'test': 'ARC-Challenge/test-00000-of-00001.parquet', 'validation': 'ARC-Challenge/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/allenai/ai2_arc/" + splits["validation"])

# Extract the questions
questions = df["question"].copy()

# Render each row's choices as text, one "Choice <label>: <text>" line per
# option. `map` keeps the frame's index, so this does not rely on the rows
# being labelled 0..n-1 the way `choices[idx]` inside a range loop would.
choices = df["choices"].map(
    lambda row: "Choices:\n" + "".join(
        f"Choice {label}: {text}\n" for label, text in zip(row["label"], row["text"])
    )
)

# Load the model pipeline (re-created here so the cell can run on its own)
pipe = pipeline("text-generation", model=MODEL, device=DEVICE)

# Construct the reasoning prompts
prompts = [q + "\n" + c + "The step by step behind the correct answer is" for q, c in zip(questions, choices)]

# Run the model on the prompts.
# `return_full_text=False` makes the pipeline return only the newly generated
# continuation. Splitting the full text on the prompt afterwards is fragile:
# if the model echoes the prompt, everything before the last echo is lost.
# NOTE(review): no seed is set here; if the model's generation config
# samples, the reasoning will differ between runs.
results = pipe(
    prompts,
    max_new_tokens=128,
    truncation=True,
    batch_size=80,
    return_full_text=False,
)

# Extract the reasoning (first returned sequence for each prompt)
reasoning = [out[0]['generated_text'].strip() for out in results]
assert len(reasoning) == len(questions), "expected exactly one generation per question"

# Update the questions with reasoning (aligned on the questions' own index)
questions = questions + "\nReasoning: " + pd.Series(reasoning, index=questions.index)

# Update the df
df['question'] = questions

# Save the df
df.to_parquet('validation-00000-of-00001.parquet')