In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
train_df = pd.read_csv('/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv') 

In [9]:
texts = []
labels = []  
article_ids = []

def get_article_folder_name(article_id):
    """Convert numeric ID to folder name format: article_XXXX"""
    return f"article_{str(article_id).zfill(4)}"


In [10]:
def load_article_data(article_id, real_text_id, data_dir='/kaggle/input/fake-or-real-the-impostor-hunt/data/train'):
    # Convert numeric ID to folder name
    folder_name = get_article_folder_name(article_id)
    folder_path = os.path.join(data_dir, folder_name)
    
    try:
        with open(os.path.join(folder_path, 'file_1.txt'), 'r', encoding='utf-8') as f:
            text1 = f.read()
        with open(os.path.join(folder_path, 'file_2.txt'), 'r', encoding='utf-8') as f:
            text2 = f.read()
        
        return {
            'real_text': text1 if real_text_id == 1 else text2,
            'fake_text': text2 if real_text_id == 1 else text1
        }
    except FileNotFoundError as e:
        print(f"Error loading article {article_id}: {e}")
        return None

In [11]:
train_data = []
for idx, row in train_df.iterrows():
    article_id = row['id']
    real_text_id = row['real_text_id']
    
    article_data = load_article_data(article_id, real_text_id)
    
    if article_data:
        train_data.extend([
            {'article_id': article_id, 'text': article_data['real_text'], 'label': 1},
            {'article_id': article_id, 'text': article_data['fake_text'], 'label': 0}
        ])

train_df_processed = pd.DataFrame(train_data)
print(f"Loaded {len(train_df_processed)} text samples")
print(train_df_processed.head())

Loaded 190 text samples
   article_id                                               text  label
0           0  The VIRSA (Visible Infrared Survey Telescope A...      1
1           0  The China relay network has released a signifi...      0
2           1  The project aims to achieve an accuracy level ...      1
3           1  China\nThe goal of this project involves achie...      0
4           2  Scientists can learn about how galaxies form a...      1


In [13]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    TrainingArguments, Trainer, DataCollatorWithPadding)
import torch 
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [17]:
def prepare_data(texts, labels, tokenizer, max_length=512):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors = 'pt'
    )

    return Dataset.from_dict({
        'input_ids':encodings['input_ids'],
        'attention_mask':encodings['attention_mask'],
        'labels': torch.tensor(labels)
    })

In [14]:
train_text = train_df_processed['text'].tolist()
train_label = train_df_processed['label'].tolist()

In [16]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_text, train_label, test_size=0.2, random_state=42)

In [None]:
# Correct model names (note the 'a' in roberta)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

In [19]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', num_labels=2)
train_dataset = prepare_data(train_texts, train_labels, tokenizer)
val_dataset = prepare_data(val_texts, val_labels, tokenizer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [20]:
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions ,axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions)
    }

In [33]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
     report_to="none"  
)

In [34]:
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [35]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.705855,0.447368,0.618182
2,No log,0.687686,0.526316,0.653846
3,No log,0.618194,0.736842,0.772727


TrainOutput(global_step=57, training_loss=0.6770312660618832, metrics={'train_runtime': 1494.9114, 'train_samples_per_second': 0.305, 'train_steps_per_second': 0.038, 'total_flos': 119978641244160.0, 'train_loss': 0.6770312660618832, 'epoch': 3.0})

In [54]:
def predict_article_pair(model, tokenizer, text1, text2):
    input1 = tokenizer(text1, return_tensors='pt',truncation=True, padding=True, max_length=512)
    input2 = tokenizer(text2, return_tensors='pt',truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        output1 = model(**input1)
        output2 = model(**input2)

        prob1 = torch.softmax(output1.logits, dim=1)[0][1].item()
        prob2 = torch.softmax(output2.logits, dim=1)[0][1].item()

    return 1 if prob1 > prob2 else 2

In [51]:
def load_text_data(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except:
        return ""
    

In [52]:
def test_data_set(model, tokenizer, test_dir):
    test_predictions = []
    article_folders = [f for f in os.listdir(test_dir) if f.startswith('article_')]

    for article_folder in article_folders:
        folder_path = os.path.join(test_dir, article_folder)
        text1 = load_text_data(os.path.join(folder_path, 'file_1.txt'))
        text2 = load_text_data(os.path.join(folder_path, 'file_2.txt'))

        if not text1 and not text2:
            print(f"Warning: Empty files in {article_folder}, defaulting to file_1")
            predicted_real = 1
        else:
            predicted_real = predict_article_pair(model, tokenizer, text1, text2)

        article_id = article_folder.replace('article_', '')

        test_predictions.append({
            'id':article_id,
            'real_text_id':predicted_real
        })

    return pd.DataFrame(test_predictions)

In [56]:
def predict_test_set(model, tokenizer, test_dir):
    test_predictions = []
    article_folders = [f for f in os.listdir(test_dir) if f.startswith('article_')]
    
    for article_folder in article_folders:
        folder_path = os.path.join(test_dir, article_folder)

        text1 = load_text_data(os.path.join(folder_path, 'file_1.txt'))
        text2 = load_text_data(os.path.join(folder_path, 'file_2.txt'))

        if not text1 or not text2:
            print(f"Warning: Empty files in {article_folder}, defaulting to file_1")
            predicted_real = 1
        else:
            predicted_real = predict_article_pair(model, tokenizer, text1, text2)
        
        article_id = article_folder.replace('article_', '')
        
        test_predictions.append({
            'id': article_id,
            'real_text_id': predicted_real
        })
    
    return pd.DataFrame(test_predictions)

test_dir = '/kaggle/input/fake-or-real-the-impostor-hunt/data/test'
submission_df = predict_test_set(model, tokenizer, test_dir)




In [59]:
submission_df.shape

(1068, 2)

In [60]:
test_dir = '/kaggle/input/fake-or-real-the-impostor-hunt/data/test'
submitted_df = test_data_set(model, tokenizer, test_dir)
submitted_df.shape

(1, 2)

In [61]:
submission_df.to_csv('submission.csv', index=False)
print("Submission file created!")
print(submission_df.head())

Submission file created!
     id  real_text_id
0  0192             2
1  0956             1
2  0266             2
3  0435             1
4  1054             2
