# Libraries

In [None]:
# Install/upgrade the third-party packages used by this notebook (quiet mode).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
# NOTE(review): only PyPDF2.PdfReader is used below; confirm the separate `PdfReader` package is really needed.
%pip install -q -U langchain-huggingface langchain-community langgraph langchain-groq
%pip install -q -U evaluate PdfReader PyPDF2 bitsandbytes openai peft openpyxl wandb

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/764.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m757.8/764.4 kB[0m [31m26.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.4/764.4 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from openai import OpenAI
import pandas as pd, numpy as np, openpyxl, wandb, random

import json, PyPDF2, peft, torch, evaluate, time, requests
from groq import Groq
from io import StringIO
from datasets import Dataset
from huggingface_hub import notebook_login

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    default_data_collator,
)

from peft import LoraConfig, get_peft_model
from typing import Optional, List, Dict

In [None]:
# API keys are read from the environment so that no secret is stored in the notebook.
# Both lookups raise KeyError early if the variable is missing.
token = os.environ["HF_TOKEN"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

client = OpenAI(api_key=OPENAI_API_KEY)

# Static chunking configuration passed to the vector store.
chunking_strategy = {
    "type": "static",
    "static": {
        "max_chunk_size_tokens": 800,  # reduce size to ensure better context integrity
        "chunk_overlap_tokens": 400,   # increase overlap to maintain context across chunks
    },
}

# Create a vector store called "Scientific articles"
vector_store = client.vector_stores.create(name="Scientific articles",
                                           chunking_strategy=chunking_strategy)

# Data extraction

In [None]:
# Download two shared Google Drive folders with gdown; stdout and stderr are discarded,
# so a failed download is silent here and only shows up later as missing files.
!gdown --folder "https://drive.google.com/drive/u/0/folders/1-66VrFbR6OgwNiNcCoOZ0FNeLy7b5ITC" > /dev/null 2>&1
# NOTE(review): --remaining-ok presumably tolerates gdown's per-folder file limit — confirm no files are skipped.
!gdown --remaining-ok --folder "https://drive.google.com/drive/u/0/folders/1zurB2MibVbrhiobBe5oyvxcK6n5hdbB_" > /dev/null 2>&1

In [None]:
# Root directory that is walked below to collect the article files.
assistant_data = '/content/lab/files'

In [None]:
# Walk `assistant_data` and collect every visible, non-auxiliary file into `file_paths`.
print("Files to upload:")
file_paths = []
for folder, subfolders, names in os.walk(assistant_data):
    # Pruning in place stops os.walk from descending into hidden directories.
    subfolders[:] = [sub for sub in subfolders if not sub.startswith('.')]
    # Keep everything except hidden files and notebook/config/text files.
    wanted = [
        name for name in names
        if not (name.startswith('.') or name.endswith(('.ipynb', '.yaml', '.txt')))
    ]
    for name in wanted:
        print(f"\t- {name}")
        file_paths.append(os.path.join(folder, name))

print(f"\nTotal files found: {len(file_paths)}")

Files to upload:
	- Crevecoeur - 2008 - Investigating the effects of a kindergarten vocabulary intervention on the word learning of English-.pdf
	- ED435985.pdf
	- Reese et al. - 2010 - Maternal Elaborative Reminiscing Increases Low-Income Children's Narrative Skills Relative to Dialog.pdf
	- Chow et al. - 2008 - Dialogic reading and morphology training in Chinese children Effects on language and literacy..pdf
	- Blom-Hoffman et al. - 2007 - Instructing Parents to Use Dialogic Reading Strategies with Preschool Children Impact of a Video-Ba.pdf
	- Elmonayer - 2013 - Promoting phonological awareness skills of Egyptian kindergarteners through dialogic reading.pdf
	- Valdez-Menchaca and Whitehurst - Accelerating Language Development Through Picture Book Reading A Systematic Extension to Mexican Da.pdf
	- Vaquero - 2014 - An Exploratory Study of a Shared-Book Reading Intervention Involving Spanish-Speaking Latino Familie.pdf
	- eric.ed.gov.html
	- BOIT_umass_0118D_10512.pdf
	- Chacko et al.

# Data Preprocessing

In [None]:
import unicodedata

def remove_accents(input_str):
    """Return `input_str` with combining marks (accents) removed.

    The string is first decomposed with NFKD normalisation, then every
    character that Unicode classifies as a combining mark is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return "".join(base_chars)

In [None]:
import re

def format(study_name_string):
    """
    Normalise a study label (or file name) and cut it right after its last year.

    Dots, commas, parentheses and ' - ' separators are removed, '&' becomes
    'and', and accents are stripped. The result is then truncated at the end
    of the last standalone four-digit number, which is taken to be the year.

    Args:
        study_name_string (str): The input string potentially containing a year.

    Returns:
        str: The normalised string cut after the year, or the whole normalised
        string if no four-digit number is found. Surrounding whitespace is
        stripped in both cases.
    """
    # Applied strictly in this order.
    substitutions = (
        ('.', ''),
        (',', ''),
        (' - ', ' '),
        (')', ''),
        ('(', ''),
        ('&', 'and'),
    )
    normalised = study_name_string
    for old, new in substitutions:
        normalised = normalised.replace(old, new)
    normalised = remove_accents(normalised)

    # Walk all standalone four-digit numbers; the loop variable ends on the last one.
    last_year = None
    for last_year in re.finditer(r'\b\d{4}\b', normalised):
        pass

    if last_year is None:
        return normalised.strip()
    return normalised[:last_year.end()].strip()

In [None]:
# Read the Excel file
df = pd.read_excel('/content/table/data.xlsx')

# Make sure the output directory exists (presumably meant for per-study CSV exports;
# nothing is written to it in the cells shown here).
output_dir = '/data/grouped_csv'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Group the rows by 'Study_ID' (no CSV is written in this cell).
grouped = df.groupby('Study_ID')

In [None]:
# The matching below keys every group on its 'Study' name, so the column is mandatory.
# Fail loudly here instead of printing and then crashing on the next line.
if 'Study' not in df.columns:
    raise KeyError("Error: 'Study' column not found in the DataFrame.")
# Ensure 'Study' column is string type for consistent comparison
df['Study'] = df['Study'].astype(str)


# Map each normalised study name to the rows of its Study_ID group.
# The first 'Study' value of a group is used as that group's name.
study_to_group_map = {}
for study_id, group_df in grouped:
    if not group_df['Study'].empty:
        study_name_from_df = format(group_df['Study'].iloc[0])
        study_to_group_map[study_name_from_df] = group_df

# Pair every discovered file with the group whose normalised name equals the
# normalised file name; files without an exact match are reported and skipped.
matched_pairs = []

for file_path in file_paths:
    filename = os.path.basename(file_path)
    cleaned_filename_study_name = format(filename)

    matching_group_df = study_to_group_map.get(cleaned_filename_study_name)

    if matching_group_df is not None:
        matched_pairs.append((file_path, matching_group_df))
    else:
        print(f"  - No matching DataFrame group found for cleaned filename '{cleaned_filename_study_name}' (from file '{filename}')")

print(f"\nFound {len(matched_pairs)} matched pairs.")

  - No matching DataFrame group found for cleaned filename 'ED435985pdf' (from file 'ED435985.pdf')
  - No matching DataFrame group found for cleaned filename 'Chow et al 2008' (from file 'Chow et al. - 2008 - Dialogic reading and morphology training in Chinese children Effects on language and literacy..pdf')
  - No matching DataFrame group found for cleaned filename 'Valdez-Menchaca and Whitehurst Accelerating Language Development Through Picture Book Reading A Systematic Extension to Mexican Dapdf' (from file 'Valdez-Menchaca and Whitehurst - Accelerating Language Development Through Picture Book Reading A Systematic Extension to Mexican Da.pdf')
  - No matching DataFrame group found for cleaned filename 'ericedgovhtml' (from file 'eric.ed.gov.html')
  - No matching DataFrame group found for cleaned filename 'BOIT_umass_0118D_10512pdf' (from file 'BOIT_umass_0118D_10512.pdf')
  - No matching DataFrame group found for cleaned filename 'Chacko et al 2018' (from file 'Chacko et al. - 20

In [None]:
# The first three matched (file, table) pairs become `train`; all remaining pairs become `test`.
# Both are slice copies, so reordering `matched_pairs` afterwards does not change them.
train = matched_pairs[:3]
test = matched_pairs[3:]

In [None]:
# For every training pair: upload the PDF and build a user message that pairs
# the document name with the CSV rendering of its extracted table.
ground_truth_messages = []

for file_path, group_df in train:
    print(f"  - Uploading ground truth PDF: {os.path.basename(file_path)}")
    # Context manager closes the handle once the upload call returns.
    with open(file_path, "rb") as pdf_handle:
        file = client.files.create(
            file=pdf_handle, purpose="assistants"
        )

    # df_data_text = group_df.to_markdown(index=False)
    df_data_text = group_df.to_csv(index=False)

    message_content = f"Here is a document ({os.path.basename(file_path)}) \
    and the data extracted from it:\n\n{df_data_text}"

    ground_truth_messages.append({
        "role": "user",
        "content": message_content,
    })


  - Uploading ground truth PDF: Crevecoeur - 2008 - Investigating the effects of a kindergarten vocabulary intervention on the word learning of English-.pdf
  - Uploading ground truth PDF: Reese et al. - 2010 - Maternal Elaborative Reminiscing Increases Low-Income Children's Narrative Skills Relative to Dialog.pdf
  - Uploading ground truth PDF: Blom-Hoffman et al. - 2007 - Instructing Parents to Use Dialogic Reading Strategies with Preschool Children Impact of a Video-Ba.pdf


In [None]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at `path`, joined by newlines.

    A page for which text extraction returns nothing contributes an empty string.
    """
    page_texts = []
    for page in PyPDF2.PdfReader(path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else '')
    return '\n'.join(page_texts)

# Fine tuning for Classification


## Preprocessing

In [None]:
def build_dataframe_from_pairs(matched_pairs):
    """Flatten (pdf_path, table) pairs into one DataFrame row per table row.

    Each output row carries the PDF path, the row's 'Allocation' and
    'Experimenter' values, and the full extracted text of the PDF. The text is
    extracted once per PDF and repeated on all of that PDF's rows.
    """
    rows = []
    for source_pdf, study_table in matched_pairs:
        full_text = extract_text_from_pdf(source_pdf)
        rows.extend(
            dict(
                pdf_path=source_pdf,
                allocation=entry['Allocation'],
                experimenter=entry['Experimenter'],
                text=full_text,
            )
            for _, entry in study_table.iterrows()
        )
    return pd.DataFrame(rows)

class ClassificationDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs column-wise encodings with per-sample labels.

    `encodings` maps a field name to an indexable of per-sample values;
    `labels` is an indexable of per-sample labels of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# 1. Build DataFrame
# A dedicated, seeded RNG makes the in-place shuffle reproducible on every run,
# so the row order (and any split derived from it) is stable.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(matched_pairs)
df = build_dataframe_from_pairs(matched_pairs)

# Random - 1, Non-random - 0
alloc_map = {'Random': 1, 'Non-random': 0}
# 'teacher': 0, 'parent': 1, 'researcher': 2, 'combined': 3
expt_map = {'Teacher': 0, 'Parent': 1, 'Researcher': 2, 'Combined - Teacher and Parent': 3}

# Apply the mapping; any value outside the map (including missing values)
# gets the sentinel label -1.
alloc_labels = [alloc_map.get(a, -1) for a in df['allocation']]
expt_labels = [expt_map.get(e, -1) for e in df['experimenter']]

df['allocation_labels'] = alloc_labels
df['experimenter_labels'] = expt_labels
df.head()

Unnamed: 0,pdf_path,allocation,experimenter,text,allocation_labels,experimenter_labels
0,/content/lab/files/6782/Coyne et al. - 2010 - ...,Non-random,Researcher,Journal of Research on Educational Eﬀectivenes...,0,2
1,/content/lab/files/6782/Coyne et al. - 2010 - ...,Non-random,Researcher,Journal of Research on Educational Eﬀectivenes...,0,2
2,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...,1,1
3,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...,1,1
4,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...,1,1


In [None]:
# 2. Split the dataframe for allocation and experimenter tasks
# Rows carrying the sentinel label -1 (unmapped value) are dropped for BOTH tasks:
# -1 is not a valid class index for the classification loss.
SPLIT_SEED = 42  # fixed seed so the train/test partition is reproducible

df_alloc = df[["text", "allocation_labels"]].rename(columns={"allocation_labels": "label"})
df_alloc = df_alloc[df_alloc['label'] != -1].copy()
df_expt = df[["text", "experimenter_labels"]].rename(columns={"experimenter_labels": "label"})
df_expt = df_expt[df_expt['label'] != -1].copy()

# NOTE(review): every row of a PDF carries the same text, so a row-level random split
# can place the same document in both train and test — consider splitting by PDF.

# Create Hugging Face datasets for each task
dataset_alloc = Dataset.from_pandas(df_alloc)
dataset_alloc = dataset_alloc.train_test_split(test_size=0.1, seed=SPLIT_SEED)

if len(df_expt) > 1:
    dataset_expt = Dataset.from_pandas(df_expt)
    dataset_expt = dataset_expt.train_test_split(test_size=0.1, seed=SPLIT_SEED)
else:
    print("Warning: Not enough data for the experimenter task after filtering invalid labels. Skipping experimenter training.")
    dataset_expt = None

## Llama

In [None]:
import os
import torch
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
)

In [None]:
# Create the offload directory.
# NOTE(review): this rebinds `output_dir`, which earlier pointed at '/data/grouped_csv'.
output_dir = '/content/offload'
os.makedirs(output_dir, exist_ok=True)
print(f"Directory '{output_dir}' created successfully.")

Directory '/content/offload' created successfully.


In [None]:
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForSequenceClassification

# Alternative checkpoint kept for reference:
# MODEL_NAME = "meta-llama/Llama-3.1-8B"
MODEL_NAME = "meta-llama/Llama-Prompt-Guard-2-22M"


# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=token)
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    token=token,

    torch_dtype="auto",
    low_cpu_mem_usage=True,
    device_map="auto",
    offload_folder="offload",

    # Set num_labels to 4 for the multi-class experimenter task.
    # NOTE(review): the same 4-way head is later also trained on the binary allocation task.
    num_labels=4,
    # The checkpoint ships a 2-way head; allow it to be re-initialised with 4 outputs.
    ignore_mismatched_sizes=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-Prompt-Guard-2-22M and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 384]) in the checkpoint and torch.Size([4, 384]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize(example):
    """Tokenize a batch of rows and attach their class labels.

    Args:
        example: batch dict with a "text" list and a "label" list
            (the function is mapped with batched=True).

    Returns:
        The tokenizer output (truncated to 512 tokens, padded to the longest
        sequence in the batch) with the labels stored under "labels".
    """
    tokenized = tokenizer(example["text"],
                          truncation=True,
                          padding=True,
                          max_length=512)

    tokenized["labels"] = torch.tensor(example["label"])
    return tokenized


tokenized_dataset_alloc = dataset_alloc.map(tokenize, batched=True)
# `dataset_expt` is None when too few experimenter rows survived filtering;
# keep None instead of crashing on `.map`.
tokenized_dataset_expt = (
    dataset_expt.map(tokenize, batched=True) if dataset_expt is not None else None
)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/169 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [None]:
# LoRA adapter configuration for the sequence classifier.
# task_type=SEQ_CLS makes PEFT keep the classification head trainable. Without it only
# the LoRA matrices train, and the 4-way head (re-initialised at load time because the
# checkpoint head has 2 outputs) would stay frozen at its random initial values.
lora_config = LoraConfig(
    task_type=peft.TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model_lora = get_peft_model(base_model, lora_config)

In [None]:
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")


def _weighted_scores(preds, labels):
    """Return weighted-average precision, recall and F1 for predicted vs. reference ids."""
    return {
        "precision": precision.compute(predictions=preds, references=labels, average="weighted", zero_division=0)["precision"],
        "recall": recall.compute(predictions=preds, references=labels, average="weighted", zero_division=0)["recall"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }


def compute_metrics_alloc(eval_pred):
    """Metrics for the allocation task.

    Args:
        eval_pred: (logits, labels) pair of arrays supplied by the Trainer.

    Returns:
        dict with weighted "precision", "recall" and "f1", computed only on
        samples whose label is not the sentinel -1.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # Filter out samples with label -1
    valid_indices = labels != -1
    return _weighted_scores(preds[valid_indices], labels[valid_indices])


def compute_metrics_expt(eval_pred):
    """Metrics for the multi-class experimenter task.

    Args:
        eval_pred: (logits, labels) pair of arrays supplied by the Trainer.

    Returns:
        dict with weighted "precision", "recall" and "f1" over all samples.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return _weighted_scores(preds, labels)

In [None]:
# Checkpoint/output directory for the allocation training run.
output_dir_alloc = '/content/outputs_alloc'
os.makedirs(output_dir_alloc, exist_ok=True)
print(f"Directory '{output_dir_alloc}' created successfully.")

Directory '/content/outputs_alloc' created successfully.


## Allocation prediction

In [None]:
# Training configuration for the allocation (Random / Non-random) classifier.
# Effective train batch size per device = 4 * 8 accumulation steps.
training_args = TrainingArguments(
    output_dir=output_dir_alloc,
    run_name="llama_alloc",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
)

# NOTE(review): `base_model` is passed here although `model_lora` was built from it;
# confirm which object is meant to be trained and saved.
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_dataset_alloc["train"],
    eval_dataset=tokenized_dataset_alloc["test"],
    compute_metrics=compute_metrics_alloc,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)


  trainer = Trainer(


In [None]:
import wandb

# Never hardcode the API key: read it from the WANDB_API_KEY environment variable.
# With key=None, wandb falls back to its own lookup (stored credentials or a prompt).
# NOTE(review): the key previously committed in this cell must be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mremizova-ann[0m ([33mremizova-ann-uga[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Run the allocation training and report how long it took.
# NOTE(review): time.process_time() measures CPU time of this process, not wall-clock
# time, so the printed figure is not the elapsed duration of the run —
# time.perf_counter() would report elapsed time.
c0 = time.process_time()

trainer.train()

elapsed_cpu = time.process_time() - c0
print(f"All samples processed in {elapsed_cpu:.2f} seconds.")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.1557,2.11985,0.0,0.0,0.0
2,1.1693,1.994678,0.0,0.0,0.0
3,1.2074,1.835234,0.0,0.0,0.0
4,1.0113,1.700736,0.0,0.0,0.0
5,0.9154,1.541484,0.0,0.0,0.0
6,0.8471,1.42791,0.0,0.0,0.0
7,0.8658,1.398512,0.0,0.0,0.0
8,0.8341,1.38319,0.0,0.0,0.0
9,0.8337,1.377109,0.0,0.0,0.0
10,0.8075,1.375332,0.0,0.0,0.0


All samples processed in 92.42 seconds.


In [None]:
def save_predictions_to_csv(preds, inputs, csv_path, labels=None, id2label=None, extra_columns=None):
    """
    Write classification predictions to `csv_path` and return them as a DataFrame.

    Args:
        preds (np.ndarray or list): Predicted label IDs.
        inputs (list[str]): Input texts, aligned with `preds`.
        csv_path (str): Destination of the CSV file (written without the index).
        labels (np.ndarray or list, optional): True label IDs, aligned with `preds`.
        id2label (dict[int,str], optional): Label ID -> label name. When omitted,
            every ID seen in `preds`/`labels` is named by its string form.
        extra_columns (dict[str,list], optional): Additional columns (name -> values),
            each aligned with `preds`.
    Returns:
        pd.DataFrame: One row per prediction with 'input', 'prediction_id',
        'prediction_label', then 'true_label_id'/'true_label' when `labels`
        is given, then any extra columns.
    """
    pred_ids = np.array(preds)
    true_ids = None if labels is None else np.array(labels)

    # Fallback naming: each observed ID maps to its own string representation.
    if id2label is None:
        seen_ids = set(pred_ids.tolist())
        if true_ids is not None:
            seen_ids |= set(true_ids.tolist())
        id2label = {label_id: str(label_id) for label_id in sorted(seen_ids)}

    def _row(pos):
        """Assemble the output record for the prediction at position `pos`."""
        predicted = pred_ids[pos]
        row = {
            'input': inputs[pos],
            'prediction_id': int(predicted),
            'prediction_label': id2label.get(predicted, str(predicted)),
        }
        if true_ids is not None:
            actual = int(true_ids[pos])
            row['true_label_id'] = actual
            row['true_label'] = id2label.get(actual, str(actual))
        if extra_columns:
            for column_name, column_values in extra_columns.items():
                row[column_name] = column_values[pos]
        return row

    # Create DataFrame and save
    frame = pd.DataFrame([_row(pos) for pos in range(len(pred_ids))])
    frame.to_csv(csv_path, index=False)
    return frame

In [None]:
# Example usage: predict on the allocation test split and export the result.
test_dataset = tokenized_dataset_alloc["test"]
pred_output = trainer.predict(test_dataset)
logits, labels = pred_output.predictions, pred_output.label_ids
# Predicted class = index of the largest logit per sample.
preds = np.argmax(logits, axis=1)
inputs = test_dataset['text']
# Label names come from the model config.
id2label = trainer.model.config.id2label
# NOTE(review): this rebinds `df`, replacing the feature table built earlier.
df = save_predictions_to_csv(preds, inputs, 'predictions.csv', labels=labels, id2label=id2label)

In [None]:
# Map the model's generic label names to the allocation categories.
# NOTE(review): names outside this mapping are displayed as NaN.
label_mapping = {'LABEL_0': 'Non-random', 'LABEL_1': 'Random'}

# Build the display table without mutating `df`, so the cell can be re-run safely
# (dropping columns from `df` in place made a second run fail with KeyError).
output_df = pd.DataFrame({
    'input': df['input'],
    'true_label': df['true_label'].map(label_mapping),
    'prediction': df['prediction_label'].map(label_mapping),
})

# Display the new DataFrame
display(output_df)

Unnamed: 0,input,true_label,prediction
0,"Early Childhood Research Quarterly, 13, No., 2...",Random,Non-random
1,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...,Random,Non-random
2,Journal of Applied School Psychology\nISSN: 15...,Random,Non-random
3,Journal of Research on Educational Eﬀectivenes...,Random,Non-random
4,Discussing stories: On how a dialogic reading\...,Random,Non-random
5,Discussing stories: On how a dialogic reading\...,Random,Non-random
6,"Early Childhood Research Quarterly, 13, No., 2...",Random,Non-random
7,Mother–child joint writing and storybook readi...,Random,Non-random
8,Discussing stories: On how a dialogic reading\...,Random,Non-random
9,EARLY EDUCATION AND DEVELOPMENT\nISSN: 1040-92...,Random,Non-random


## Experimenter prediction

In [None]:
# Checkpoint/output directory for the experimenter training run.
output_dir_expt = '/content/outputs_expt'
os.makedirs(output_dir_expt, exist_ok=True)
print(f"Directory '{output_dir_expt}' created successfully.")

Directory '/content/outputs_expt' created successfully.


In [None]:
# Training configuration for the experimenter classifier. It uses the same
# hyper-parameters as the allocation run; only the output directory and run name differ.
# NOTE(review): this rebinds `training_args` from the allocation section.
training_args = TrainingArguments(
    output_dir=output_dir_expt,
    run_name="llama_expt",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
)

In [None]:
# NOTE(review): `base_model` is the same object the allocation Trainer was given, so this
# run appears to start from whatever state that training left it in — confirm this is intended.
trainer_expt = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_dataset_expt["train"],
    eval_dataset=tokenized_dataset_expt["test"],
    compute_metrics=compute_metrics_expt,  # Use the correct metric function for multi-class
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
)

# NOTE(review): time.process_time() measures CPU time of this process, not wall-clock time.
c0 = time.process_time()

trainer_expt.train()

elapsed_cpu = time.process_time() - c0
print(f"All samples processed in {elapsed_cpu:.2f} seconds.")

  trainer_expt = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4627,1.399639,0.069252,0.263158,0.109649
2,0.5034,1.379873,0.069252,0.263158,0.109649
3,0.467,1.372546,0.47807,0.631579,0.52322
4,0.4828,1.362936,0.366029,0.578947,0.448493
5,0.4668,1.351194,0.373819,0.578947,0.449282
6,0.4378,1.343149,0.373819,0.578947,0.449282
7,0.4473,1.33502,0.342105,0.526316,0.403509
8,0.4845,1.330507,0.342105,0.526316,0.403509
9,0.5105,1.328009,0.342105,0.526316,0.403509
10,0.5293,1.327276,0.342105,0.526316,0.403509


All samples processed in 88.61 seconds.


# Fine Tuning for Prompting

## Preprocessing

In [None]:
def build_dataframe_from_pairs(matched_pairs):
    """Expand (pdf_path, table) pairs into a DataFrame with one row per table row.

    Columns: 'pdf_path', 'allocation', 'experimenter' and 'text', where 'text'
    is the PDF's full extracted text, repeated on each of its rows.
    NOTE(review): a function with this name is already defined in the
    classification preprocessing section; this definition shadows it.
    """
    def _records_for(pdf_path, table_df):
        # Extract once per PDF, then reuse the text for each of its rows.
        text = extract_text_from_pdf(pdf_path)
        return [
            {
                'pdf_path': pdf_path,
                'allocation': row['Allocation'],
                'experimenter': row['Experimenter'],
                'text': text,
            }
            for _, row in table_df.iterrows()
        ]

    records = [
        record
        for pdf_path, table_df in matched_pairs
        for record in _records_for(pdf_path, table_df)
    ]
    return pd.DataFrame(records)

# Rebuild the feature table (without label columns) for the prompting experiments.
df = build_dataframe_from_pairs(matched_pairs)
df.head()

Unnamed: 0,pdf_path,allocation,experimenter,text
0,/content/lab/files/6782/Coyne et al. - 2010 - ...,Non-random,Researcher,Journal of Research on Educational Eﬀectivenes...
1,/content/lab/files/6782/Coyne et al. - 2010 - ...,Non-random,Researcher,Journal of Research on Educational Eﬀectivenes...
2,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...
3,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...
4,/content/lab/files/7006/Vaquero - 2014 - An Ex...,Random,Parent,\n AN EXPLORATORY STUDY OF A SHARED-BOOK REA...


In [None]:
# Recreate the dataset from the dataframe before splitting
dataset = Dataset.from_pandas(df, preserve_index=False)

# Fixed seed so the 80/20 partition is identical on every run.
# NOTE(review): every row of a PDF carries the same text, so a row-level split can put
# the same document in both train and test — consider splitting by 'pdf_path'.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

print("Training dataset size:", len(train_dataset))
print("Testing dataset size:", len(test_dataset))

Training dataset size: 160
Testing dataset size: 40


In [None]:
# Hyper-parameters for the prompt-based fine-tuning run.
# NOTE(review): MODEL_NAME is rebound here; it previously named the classification checkpoint.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
OUTPUT_DIR = "./llama3-extraction-lora"   # output directory for this run
BATCH_SIZE = 1             # adjust to your GPU memory
GRAD_ACCUM_STEPS = 8       # simulate larger batch
LR = 2e-5
EPOCHS = 3
MAX_INPUT_LENGTH = 2048    # max tokens for the tokenized prompt
MAX_TARGET_LENGTH = 256    # max tokens for the tokenized target

In [None]:
# Instruction prompt; `{text}` is replaced by the document text via str.format.
# Doubled braces render as literal braces. Each backslash continuation is preceded by a
# space so the joined sentence keeps its word boundaries ("researcher, but", "parent, then").
PROMPT_TEMPLATE = """You have several examples of pdfs and their corresponding csv tables.
A new pdf is attached.
Extract the following information **only** from this document:
### Allocation
Were the students allocated randomly to the experimental group? Random / Non-random
{{Random, Non-random}}
### Experimenter
Who delivered the experimental intervention to the students/pupils? \
If the teacher or the parent was trained by the researcher, \
but the intervention was actually delivered by the teacher or the parent, \
then the researcher is not the experimenter.
{{teacher, parent, researcher, combined:teacher and parent}}

### Document:
{text}

### Output:
"""

In [None]:
def make_prompt(example: Dict[str, str]) -> Dict[str, str]:
    """
    Build the (prompt, target) text pair for one row.

    Args:
        example: row with at least 'text', 'allocation' and 'experimenter'.

    Returns:
        dict with
          'input_text'  = PROMPT_TEMPLATE.format(text=example['text'])
          'target_text' = a JSON array holding one object with the keys
                          "Allocation" and "Experimenter".
    """
    json_out = [
        {
            "Allocation": example["allocation"],
            "Experimenter": example["experimenter"],
        }
    ]
    # json.dumps produces real JSON (double quotes); formatting the list with an
    # f-string produced a Python repr with single quotes, which JSON parsers reject.
    target = json.dumps(json_out)
    inp = PROMPT_TEMPLATE.format(text=example["text"])
    return {"input_text": inp, "target_text": target}

In [None]:
# `token=` replaces the deprecated `use_auth_token=` keyword.
# NOTE(review): this rebinds `tokenizer`, which previously held the classifier's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=token)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def preprocess(examples):
    """Turn a batch of rows into fixed-length causal-LM training features.

    For every row the prompt and the target are concatenated into ONE sequence
    (prompt tokens, target tokens, EOS), because a causal LM can only learn the
    target if it is part of `input_ids`. Previously the target was tokenized into
    a separate 256-token `labels` tensor that did not line up with the
    2048-token `input_ids`.

    Args:
        examples: batched dict of columns (mapped with batched=True); must
            contain 'text', 'allocation' and 'experimenter'.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        sequences of exactly MAX_INPUT_LENGTH items. `labels` is -100 on prompt
        and padding positions and equals the token id on target positions.
    """
    # Rebuild one dict per row from the column-wise batch.
    row_count = len(examples['text'])  # 'text' is assumed to be present in every batch
    rows = [{col: examples[col][i] for col in examples.keys()} for i in range(row_count)]
    prompts = [make_prompt(row) for row in rows]

    pad_id = tokenizer.pad_token_id
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt in prompts:
        # Target: no special tokens, at most MAX_TARGET_LENGTH tokens including the final EOS.
        answer_ids = tokenizer(
            prompt["target_text"],
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_TARGET_LENGTH - 1,
        )["input_ids"] + [tokenizer.eos_token_id]

        # The prompt gets the remaining budget so prompt + target fits in MAX_INPUT_LENGTH.
        # NOTE(review): truncation cuts the END of the prompt, so very long documents lose
        # the trailing "### Output:" marker — consider truncating the document text instead.
        question_ids = tokenizer(
            prompt["input_text"],
            truncation=True,
            max_length=MAX_INPUT_LENGTH - len(answer_ids),
        )["input_ids"]

        token_ids = question_ids + answer_ids
        padding = MAX_INPUT_LENGTH - len(token_ids)

        batch["input_ids"].append(token_ids + [pad_id] * padding)
        batch["attention_mask"].append([1] * len(token_ids) + [0] * padding)
        # -100 is ignored by the loss: only the target positions are supervised.
        batch["labels"].append([-100] * len(question_ids) + answer_ids + [-100] * padding)
    return batch

# Apply preprocessing
dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=['pdf_path', 'allocation', 'experimenter', 'text'], # Remove non-numerical columns
)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [None]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the causal LM in 4-bit. The quantization settings go through a
# BitsAndBytesConfig, because the bare `load_in_4bit=` keyword is deprecated.
base_model_prompt = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
    trust_remote_code=True,
    token=token,  # same Hub token the tokenizer is loaded with
)

# LoRA adapter for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
)
model_prompt = get_peft_model(base_model_prompt, peft_config)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args_prompt = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    # Evaluation must use the small batch size too. With the library default of 8,
    # a batch of eight 2048-token sequences needs a float32 logits tensor of
    # 8 * 2048 * vocab * 4 bytes — about 7.83 GiB for a ~128k-token vocabulary
    # (presumably this model's), which matches the out-of-memory failure seen
    # when training reached the first evaluation.
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    fp16=True,
    remove_unused_columns=False,
    label_names=["labels"],
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
)

# Causal-LM collator (mlm=False).
# NOTE(review): with mlm=False this collator appears to rebuild `labels` from `input_ids`,
# replacing any labels prepared during preprocessing — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer_prompt = Trainer(
    model=model_prompt,
    args=training_args_prompt,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
)

In [None]:
# Launch the LoRA fine-tuning run; evaluation and checkpointing happen once per epoch.
trainer_prompt.train()



Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.83 GiB. GPU 0 has a total capacity of 14.74 GiB of which 960.12 MiB is free. Process 58793 has 13.80 GiB memory in use. Of the allocated memory 13.60 GiB is allocated by PyTorch, and 64.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)