# In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import os
import shutil

# Remove artifact folders left behind by earlier runs so this run starts clean
stale_dirs = ("./results", "./logs", "./submission")
for stale_dir in filter(os.path.exists, stale_dirs):
    shutil.rmtree(stale_dir)

# Switch off the Weights & Biases integration through its environment variable
os.environ["WANDB_DISABLED"] = "true"

# Text preprocessing with advanced cleaning
def preprocess_text(text):
    """Clean one raw text sample before tokenization.

    The steps run in this order: strip HTML-style tags, strip punctuation
    and symbols, strip digit runs, then collapse whitespace.

    Unicode combining marks (general category ``M*``) are deliberately kept.
    The regex word class only covers alphanumerics and the underscore, so a
    plain "remove everything that is not a word character or whitespace"
    pass would also delete the dependent vowel signs and virama that Tamil
    and Malayalam words are built from.

    Args:
        text: The raw sample. ``None`` and float NaN (how pandas represents
            empty cells) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    # Local import keeps this block self-contained; repeat imports are cheap.
    import unicodedata

    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    def _drop_unless_mark(match):
        # Keep combining marks, drop every other non-word, non-space character.
        char = match.group()
        return char if unicodedata.category(char).startswith("M") else ""

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', _drop_unless_mark, text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Input file locations on Kaggle, keyed by language.
# Training sets are CSV files; test sets are Excel workbooks.
train_paths = dict(
    tamil="/kaggle/input/dataset/tam_training_data_hum_ai.csv",
    malayalam="/kaggle/input/dataset/mal_training_data_hum_ai.csv",
)
test_paths = dict(
    tamil="/kaggle/input/dataset/tam_test_data_hum_ai.xlsx",
    malayalam="/kaggle/input/dataset/mal_test_data_hum_ai.xlsx",
)

# Verify files
# Raise explicitly rather than assert: assert statements are removed when
# Python runs with -O, which would silently skip this check.
for path in train_paths.values():
    if not os.path.exists(path):
        raise FileNotFoundError(f"Training file not found: {path}")
for path in test_paths.values():
    if not os.path.exists(path):
        raise FileNotFoundError(f"Test file not found: {path}")

# Read every language's files into memory: CSV for training, Excel for test
train_data = {}
for lang, path in train_paths.items():
    train_data[lang] = pd.read_csv(path)

test_data = {}
for lang, path in test_paths.items():
    test_data[lang] = pd.read_excel(path)

# Validate required columns
# Explicit raises are used instead of assert, which is removed under -O.
# The test frames must also carry 'ID': the submission step selects it, and
# checking here fails before any training time is spent.
for lang, df in train_data.items():
    if 'DATA' not in df.columns or 'LABEL' not in df.columns:
        raise ValueError(f"Missing columns in {lang} training data")
for lang, df in test_data.items():
    if 'DATA' not in df.columns:
        raise ValueError(f"Missing 'DATA' column in {lang} test data")
    if 'ID' not in df.columns:
        raise ValueError(f"Missing 'ID' column in {lang} test data")

# Store a cleaned copy of each training text next to the raw 'DATA' column
for df in train_data.values():
    cleaned_texts = df['DATA'].apply(preprocess_text)
    df['CLEANED_DATA'] = cleaned_texts

# Tokenizer and Model
# MODEL_NAME is the Hugging Face checkpoint id. It is used here to load the
# shared tokenizer and again in the training loop to load one classifier per
# language.
# NOTE(review): the inputs are Tamil and Malayalam text; presumably this
# checkpoint's vocabulary covers those scripts -- verify, since a multilingual
# checkpoint may be a better fit.
MODEL_NAME = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``CLEANED_DATA`` entry of ``examples`` with the module-level tokenizer.

    Each sequence is truncated to at most 512 tokens and padded out to
    exactly 512 tokens.
    """
    cleaned_texts = examples["CLEANED_DATA"]
    return tokenizer(
        cleaned_texts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Split datasets into training and validation
# Labels are mapped to integer ids first. Any label string outside the mapping
# would become NaN and only fail much later inside the split or the loss, so
# such values are reported immediately.
LABEL_TO_ID = {"HUMAN": 0, "AI": 1}
encoded_datasets = {}
for lang, df in train_data.items():
    label_ids = df['LABEL'].map(LABEL_TO_ID)
    unknown_labels = df.loc[label_ids.isna(), 'LABEL']
    if not unknown_labels.empty:
        raise ValueError(
            f"Unexpected labels in {lang} training data: {unknown_labels.unique().tolist()}"
        )
    df['LABEL'] = label_ids.astype(int)
    # Stratified 80/20 split with a fixed seed so the class balance and the
    # partition are reproducible.
    train_split, val_split = train_test_split(df, test_size=0.2, stratify=df['LABEL'], random_state=42)
    encoded_datasets[lang] = (train_split, val_split)

# Training arguments (shared by every language's Trainer)
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=500,
    report_to=["none"],  # no external experiment trackers
    # Evaluate and checkpoint once per epoch, on the same schedule
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # When training ends, reload the checkpoint with the best eval_loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Batching: 4 samples per device step, gradients accumulated over 8 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimization schedule and regularization
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    weight_decay=0.01,
    label_smoothing_factor=0.2,
    fp16=True,  # mixed precision training
)

class Dataset(torch.utils.data.Dataset):
    """Pairs tokenizer output with integer labels, one sample per index.

    Defined once at module level instead of being re-created on every pass
    of the per-language training loop.
    """

    def __init__(self, encodings, labels):
        # encodings: tokenizer output, a mapping from field name to per-sample lists
        # labels: one integer class id per sample, in the same order
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


def compute_macro_f1(eval_pred):
    """Return the macro-averaged F1 of the arg-max class predictions.

    ``eval_pred.predictions`` holds the per-class scores and
    ``eval_pred.label_ids`` the reference class ids.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred): references go first.
    return {"f1": f1_score(eval_pred.label_ids, predicted_ids, average="macro")}


# Fine-tune one classifier per language and keep the trained models by language
models = {}
for lang, (train_split, val_split) in encoded_datasets.items():
    train_encodings = tokenizer(list(train_split['CLEANED_DATA']), truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(list(val_split['CLEANED_DATA']), truncation=True, padding=True, max_length=512)

    train_dataset = Dataset(train_encodings, list(train_split['LABEL']))
    val_dataset = Dataset(val_encodings, list(val_split['LABEL']))

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_macro_f1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

    # Store the trained model. It is parked on the CPU so its weights do not
    # occupy accelerator memory while the next language is being trained.
    models[lang] = model.to("cpu")

    # Clear intermediate results. The trainer is dropped as well, because it
    # keeps the optimizer state alive; only then is the CUDA cache released.
    del trainer, train_dataset, val_dataset, train_split, val_split, train_encodings, val_encodings
    torch.cuda.empty_cache()

# Predictions for test data
# Inference runs in small batches under no_grad, with the inputs moved to the
# same device as the model. A single forward pass over the whole test set
# would keep every activation in memory, and CPU inputs cannot be fed to a
# model that lives on the GPU.
PREDICT_BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
submission = {}
for lang, df in test_data.items():
    df['CLEANED_DATA'] = df['DATA'].apply(preprocess_text)
    texts = list(df['CLEANED_DATA'])

    model = models[lang].to(device)
    model.eval()  # make sure dropout is disabled for prediction

    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), PREDICT_BATCH_SIZE):
            batch = tokenizer(
                texts[start:start + PREDICT_BATCH_SIZE],
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt",
            )
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits
            # Move results to the CPU before they are converted to NumPy.
            batch_preds.append(torch.argmax(logits, dim=1).cpu())

    # An empty test frame yields no batches; keep the column assignment valid.
    preds = torch.cat(batch_preds).numpy() if batch_preds else np.empty(0, dtype=int)
    df['PREDICTION'] = preds
    df['PREDICTION'] = df['PREDICTION'].map({0: "HUMAN", 1: "AI"})
    submission[lang] = df[['ID', 'PREDICTION']]

    # Free accelerator memory before the next language's model is moved over.
    model.to("cpu")
    torch.cuda.empty_cache()

# Write one tab-separated prediction file per language
os.makedirs("submission", exist_ok=True)
TEAM_NAME = "CUET_NetworkSociety"
for lang in submission:
    submission[lang].to_csv(f"submission/{TEAM_NAME}_{lang}_run.tsv", sep="\t", index=False)

# Bundle the submission folder into a zip archive in the Kaggle working directory
shutil.make_archive(
    base_name=f"/kaggle/working/{TEAM_NAME}",
    format='zip',
    root_dir="submission",
)

# Debug: list what was written to each output location
for heading, directory in (
    ("Submission Directory Contents:", "submission"),
    ("Root Directory Contents:", "/kaggle/working"),
):
    print(heading, os.listdir(directory))
