In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

TEST_FILE = "/kaggle/input/testdataforpredictions/test.json" 

VALIDITY_MODEL_DIR = "/kaggle/input/valmodelretrainedon1118/validity_results/checkpoint-612" 
PLAUS_MODEL_DIR    = "/kaggle/input/finalplausibilitymodelds/plausibility_deberta_final" 

print("TEST_FILE:", TEST_FILE)
print("VALIDITY_MODEL_DIR:", VALIDITY_MODEL_DIR)
print("PLAUS_MODEL_DIR:", PLAUS_MODEL_DIR)

Using device: cuda
TEST_FILE: /kaggle/input/testdataforpredictions/test.json
VALIDITY_MODEL_DIR: /kaggle/input/valmodelretrainedon1118/validity_results/checkpoint-612
PLAUS_MODEL_DIR: /kaggle/input/finalplausibilitymodelds/plausibility_deberta_final


In [2]:
valid_tokenizer = AutoTokenizer.from_pretrained(VALIDITY_MODEL_DIR)
valid_model = AutoModelForSequenceClassification.from_pretrained(VALIDITY_MODEL_DIR).to(device)
valid_model.eval()

plaus_tokenizer = AutoTokenizer.from_pretrained(PLAUS_MODEL_DIR)
plaus_model = AutoModelForSequenceClassification.from_pretrained(PLAUS_MODEL_DIR).to(device)
plaus_model.eval()

2025-11-18 20:04:16.198700: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763496256.392312      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763496256.451378      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [3]:
with open(TEST_FILE, "r", encoding="utf-8") as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data)
df_test.head()


Unnamed: 0,id,syllogism,validity,plausibility
0,0,Not all canines are aquatic creatures known as...,False,True
1,1,All birds lay eggs. All chickens lay eggs. All...,False,True
2,2,There are some bridges that are considered art...,False,True
3,3,Every apparatus that indicates time can be cla...,False,True
4,4,There are certain activities that can be class...,False,True


In [4]:
def clean_test_text(t):
    t = t.strip()                     # remove leading/trailing spaces
    t = " ".join(t.split())           # remove weird spacing
    t = t.replace("T herefore", "Therefore")
    t = t.replace("t herefore", "therefore")
    return t

df_test["clean_text"] = df_test["syllogism"].apply(clean_test_text)
texts = df_test["clean_text"].tolist()


In [5]:
def predict_labels(model, tokenizer, texts, batch_size=16, max_length=512):
    all_preds = []
    all_probs = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        encodings = tokenizer(
            list(batch_texts),
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            logits = model(**encodings).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            preds = np.argmax(probs, axis=-1)

        all_preds.extend(preds)
        all_probs.extend(probs)

    return np.array(all_preds), np.array(all_probs)


In [6]:
valid_preds, valid_probs = predict_labels(
    valid_model, valid_tokenizer, texts
)

plaus_preds, plaus_probs = predict_labels(
    plaus_model, plaus_tokenizer, texts
)

print("Validity sample:", valid_preds[:10])
print("Plausibility sample:", plaus_preds[:10])


Validity sample: [1 0 1 1 1 0 1 0 1 1]
Plausibility sample: [1 0 1 1 0 0 1 1 1 1]


In [7]:
df_test.columns

Index(['id', 'syllogism', 'validity', 'plausibility', 'clean_text'], dtype='object')

In [8]:
df_test["validity_pred"] = valid_preds
df_test["plausibility_pred"] = plaus_preds

# Compare with actual labels in the test file
acc_valid = accuracy_score(df_test["validity"], valid_preds)
f1_valid  = f1_score(df_test["validity"], valid_preds, average="macro")

acc_plaus = accuracy_score(df_test["plausibility"], plaus_preds)
f1_plaus  = f1_score(df_test["plausibility"], plaus_preds, average="macro")

print("VALIDITY RESULTS")
print("Accuracy:", round(acc_valid,4))
print("F1-macro:", round(f1_valid,4))

print("\nPLAUSIBILITY RESULTS")
print("Accuracy:", round(acc_plaus,4))
print("F1-macro:", round(f1_plaus,4))

VALIDITY RESULTS
Accuracy: 0.5875
F1-macro: 0.5429

PLAUSIBILITY RESULTS
Accuracy: 0.75
F1-macro: 0.7498


In [9]:
from sklearn.metrics import classification_report
print(classification_report(df_test["validity"], valid_preds))


              precision    recall  f1-score   support

       False       0.73      0.28      0.40        40
        True       0.55      0.90      0.69        40

    accuracy                           0.59        80
   macro avg       0.64      0.59      0.54        80
weighted avg       0.64      0.59      0.54        80



In [10]:
from sklearn.metrics import balanced_accuracy_score

balanced_acc = balanced_accuracy_score(df_test["validity"], valid_preds)

print("Balanced Accuracy:", round(balanced_acc,4))
print("Macro F1:", round(f1_valid,4))


Balanced Accuracy: 0.5875
Macro F1: 0.5429


In [11]:
valid_map = {0: "invalid", 1: "valid"}
plaus_map = {0: "implausible", 1: "plausible"}

df_test["validity_label"] = df_test["validity_pred"].map(valid_map)
df_test["plausibility_label"] = df_test["plausibility_pred"].map(plaus_map)

df_test[["id", "validity_label", "plausibility_label"]].head()


Unnamed: 0,id,validity_label,plausibility_label
0,0,valid,plausible
1,1,invalid,implausible
2,2,valid,plausible
3,3,valid,plausible
4,4,valid,implausible


In [12]:
with open("submission.jsonl", "w", encoding="utf-8") as f:
    for _, row in df_test.iterrows():
        item = {
            "id": row["id"],
            "validity": row["validity_label"],
            "plausibility": row["plausibility_label"]
        }
        f.write(json.dumps(item) + "\n")

print("submission.jsonl saved.")


submission.jsonl saved.
