In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


2025-11-18 19:46:32.634829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763495192.827181      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763495192.882903      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


In [2]:
TRAIN_FILE = "/kaggle/input/traindata/train_data.json"

with open(TRAIN_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Shape: (960, 4)
Columns: ['id', 'syllogism', 'validity', 'plausibility']


Unnamed: 0,id,syllogism,validity,plausibility
0,50146f21-d265-4e3a-8d93-8165cdbe89a3,All cars are a type of vehicle. No animal is a...,False,True
1,dfafb4f6-4e1d-4cd5-aeb4-75d36aafdf1a,Nothing that is a soda is a juice. A portion o...,True,True
2,e30b1f83-a4c3-49cb-8aaf-5f64208c625b,Everything that is a planet is a celestial bod...,False,False
3,a30e07d5-0fb3-4097-9892-4b145b0c54f5,Every cat is an invisible creature. A number o...,True,False
4,5b8161b7-b1bf-4e16-a854-cd52cdce8a1b,There are no capital cities which are oceans. ...,True,True


In [3]:
def validity_to_int(x):
    # handles True/False, "True"/"False", 0/1
    if isinstance(x, bool):
        return int(x)
    if isinstance(x, str):
        return 1 if x.lower() in ["true", "1", "valid"] else 0
    return int(x)

df["label"] = df["validity"].apply(validity_to_int)

# Keep only needed columns
df = df[["id", "syllogism", "label"]].rename(columns={"syllogism": "text"})

# VERY light cleaning: just collapse extra spaces
df["text"] = df["text"].str.replace(r"\s+", " ", regex=True).str.strip()

print(df["label"].value_counts())
df.head()


label
0    480
1    480
Name: count, dtype: int64


Unnamed: 0,id,text,label
0,50146f21-d265-4e3a-8d93-8165cdbe89a3,All cars are a type of vehicle. No animal is a...,0
1,dfafb4f6-4e1d-4cd5-aeb4-75d36aafdf1a,Nothing that is a soda is a juice. A portion o...,1
2,e30b1f83-a4c3-49cb-8aaf-5f64208c625b,Everything that is a planet is a celestial bod...,0
3,a30e07d5-0fb3-4097-9892-4b145b0c54f5,Every cat is an invisible creature. A number o...,1
4,5b8161b7-b1bf-4e16-a854-cd52cdce8a1b,There are no capital cities which are oceans. ...,1


In [4]:
from sklearn.utils import resample

label_counts = df["label"].value_counts(normalize=True)
print("Label ratios:", label_counts.to_dict())

# If one class > 15% more than the other, upsample minority
if abs(label_counts.get(0, 0) - label_counts.get(1, 0)) > 0.15:
    minority_label = df["label"].value_counts().idxmin()
    majority_label = df["label"].value_counts().idxmax()

    minority = df[df["label"] == minority_label]
    majority = df[df["label"] == majority_label]

    minority_upsampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=SEED
    )

    df_balanced = pd.concat([majority, minority_upsampled], ignore_index=True)
    df_balanced = df_balanced.sample(frac=1.0, random_state=SEED).reset_index(drop=True)  # shuffle

    df = df_balanced
    print("Rebalanced label counts:", df["label"].value_counts())
else:
    print("No heavy imbalance detected; using original data.")

print("Final dataset size:", len(df))


Label ratios: {0: 0.5, 1: 0.5}
No heavy imbalance detected; using original data.
Final dataset size: 960


In [5]:
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=SEED,
    stratify=df["label"]
)

print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Train label ratios:", train_df["label"].value_counts(normalize=True).to_dict())
print("Val label ratios:", val_df["label"].value_counts(normalize=True).to_dict())


Train size: 816
Val size: 144
Train label ratios: {0: 0.5, 1: 0.5}
Val label ratios: {0: 0.5, 1: 0.5}


In [6]:
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df.reset_index(drop=True))

train_dataset, val_dataset

(Dataset({
     features: ['id', 'text', 'label'],
     num_rows: 816
 }),
 Dataset({
     features: ['id', 'text', 'label'],
     num_rows: 144
 }))

In [7]:
MODEL_NAME = "microsoft/mdeberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,   # plenty for syllogisms
    )

train_tok = train_dataset.map(tokenize_fn, batched=True)
val_tok   = val_dataset.map(tokenize_fn, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Tokenization done.")
train_tok[0]


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Tokenization done.


{'id': 'd58fd355-2a98-4501-ac04-058a05090a80',
 'text': 'Anyone who is a president is a citizen. Every single person who is a senator is a citizen. This leads to the conclusion that there are some senators who are not citizens.',
 'label': 0,
 'input_ids': [1,
  299,
  38227,
  1867,
  340,
  260,
  263,
  13244,
  340,
  260,
  263,
  260,
  102208,
  261,
  39580,
  6676,
  2986,
  1867,
  340,
  260,
  263,
  260,
  120812,
  340,
  260,
  263,
  260,
  102208,
  261,
  1495,
  14867,
  264,
  289,
  288,
  260,
  49667,
  534,
  2109,
  419,
  2156,
  260,
  120812,
  264,
  1867,
  419,
  777,
  260,
  87669,
  261,
  2],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
).to(device)

# Explicit label mapping: 0=invalid, 1=valid
model.config.id2label = {0: "invalid", 1: "valid"}
model.config.label2id = {"invalid": 0, "valid": 1}

print("Model on:", device)
print("id2label:", model.config.id2label)


pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model on: cuda
id2label: {0: 'invalid', 1: 'valid'}


In [9]:
args = TrainingArguments(
    output_dir="./validity_results",
    num_train_epochs=6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    save_steps=9999999,   # effectively disables mid-training checkpoints
    eval_steps=200,
    do_eval=True,
    report_to="none",
)

print("TrainingArguments ready.")


TrainingArguments ready.


In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1": f1}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer ready.")

Trainer ready.


  trainer = Trainer(


In [11]:
train_output = trainer.train()
print("\nTraining finished.")

Step,Training Loss
50,0.7106
100,0.6851
150,0.5998
200,0.5869
250,0.5108
300,0.4367
350,0.3467
400,0.3654
450,0.274
500,0.2668



Training finished.


In [12]:
eval_metrics = trainer.evaluate(val_tok)
print("Raw eval metrics:", eval_metrics)

val_true = val_df["label"].tolist()
pred_logits = trainer.predict(val_tok).predictions
val_preds = pred_logits.argmax(axis=1)

acc = accuracy_score(val_true, val_preds)
f1  = f1_score(val_true, val_preds, average="macro")

print(f"\nFinal VALIDITY Accuracy: {acc:.4f}")
print(f"Final VALIDITY F1 (macro): {f1:.4f}")

Raw eval metrics: {'eval_loss': 0.6078586578369141, 'eval_accuracy': 0.8055555555555556, 'eval_f1': 0.8054054054054054, 'eval_runtime': 0.9717, 'eval_samples_per_second': 148.187, 'eval_steps_per_second': 37.047, 'epoch': 6.0}

Final VALIDITY Accuracy: 0.8056
Final VALIDITY F1 (macro): 0.8054


In [13]:
VALIDITY_SAVE_DIR = "./validity_deberta_finalversion"

model.save_pretrained(VALIDITY_SAVE_DIR)
tokenizer.save_pretrained(VALIDITY_SAVE_DIR)

print("Validity model saved to:", VALIDITY_SAVE_DIR)
print("Files:", os.listdir(VALIDITY_SAVE_DIR))

Validity model saved to: ./validity_deberta_finalversion
Files: ['tokenizer_config.json', 'tokenizer.json', 'model.safetensors', 'special_tokens_map.json', 'config.json', 'added_tokens.json', 'spm.model']
