In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the joined path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
import seaborn as sns
import matplotlib.pyplot as plt

# ✅ Reproducibility: seed NumPy and PyTorch before any model weights are
# initialised or any DataLoader shuffles, so a "Restart & Run All" repeats the
# same run (GPU kernels may still introduce small nondeterminism).
np.random.seed(42)
torch.manual_seed(42)

# ✅ Load datasets
ai_df = pd.read_csv('/kaggle/input/urdu-human-and-ai-text-dataset-uhat/AI.csv')
human_df = pd.read_csv('/kaggle/input/urdu-human-and-ai-text-dataset-uhat/Human.csv')

# ✅ Combine and clean
# Keep only the two columns used downstream and drop rows missing either one.
df = pd.concat([ai_df, human_df], ignore_index=True)
df = df[['Text', 'Label']].dropna()
# Cast Text to str so that non-string cells (e.g. numbers parsed by read_csv)
# cannot reach the tokenizer, which expects a list of strings.
df = df.assign(Text=df['Text'].astype(str))
df = df[df['Text'].str.strip().astype(bool)]  # Remove empty / whitespace-only strings

# ✅ Device setup: use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Use multilingual BERT (supports Urdu)
# NOTE(review): the *uncased* checkpoint lower-cases and strips accent marks
# during tokenization, which may remove Urdu diacritics -- confirm this is
# acceptable. Switching checkpoints also requires changing the model load.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize_texts(texts, tokenizer, max_length=512):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: A pandas Series, NumPy array, list, tuple or other iterable of
            strings. A single bare string is treated as one text rather than
            being iterated character by character.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` (the notebook passes a ``BertTokenizer``).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch; the notebook reads the
        ``'input_ids'`` and ``'attention_mask'`` entries from it.
    """
    if isinstance(texts, str):
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        # pandas / NumPy containers: convert to a plain Python list.
        batch = texts.tolist()
    else:
        batch = list(texts)
    return tokenizer(
        batch,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# ✅ Train and evaluation functions
def train_epoch(model, loader, optimizer, loss_fn, device):
    """Run one training pass over ``loader`` and report mean loss and accuracy.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` (and ``.loss`` when ``labels`` is
            also passed).
        loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor. Pass ``None`` to use the loss the model computes itself
            from ``labels``.
        device: Device every tensor in a batch is moved to.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
        per-batch losses and ``accuracy`` is the fraction of processed samples
        predicted correctly. Both are ``nan`` if the loader yields no batches.
    """
    model.train()
    loss_sum, n_batches = 0.0, 0
    n_correct, n_seen = 0, 0
    for batch in loader:
        input_ids, attention_masks, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        if loss_fn is None:
            outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
        else:
            # Honour the caller-supplied criterion instead of ignoring it.
            outputs = model(input_ids=input_ids, attention_mask=attention_masks)
            loss = loss_fn(outputs.logits, labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_sum += loss.item()
        n_batches += 1
        preds = torch.argmax(outputs.logits, dim=-1)
        n_correct += (preds == labels).sum().item()
        # Count samples actually seen: correct even with drop_last / samplers.
        n_seen += labels.size(0)

    if n_batches == 0 or n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_batches, n_correct / n_seen

def evaluate_model(model, loader, loss_fn, device):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` (and ``.loss`` when ``labels`` is
            also passed).
        loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar
            tensor. Pass ``None`` to use the loss the model computes itself
            from ``labels``.
        device: Device every tensor in a batch is moved to.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
        per-batch losses and ``accuracy`` is the fraction of processed samples
        predicted correctly. Both are ``nan`` if the loader yields no batches.
    """
    model.eval()
    loss_sum, n_batches = 0.0, 0
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_masks, labels = [x.to(device) for x in batch]
            if loss_fn is None:
                outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
                loss = outputs.loss
            else:
                # Honour the caller-supplied criterion instead of ignoring it.
                outputs = model(input_ids=input_ids, attention_mask=attention_masks)
                loss = loss_fn(outputs.logits, labels)

            loss_sum += loss.item()
            n_batches += 1
            preds = torch.argmax(outputs.logits, dim=-1)
            n_correct += (preds == labels).sum().item()
            # Count samples actually seen: correct even with drop_last / samplers.
            n_seen += labels.size(0)

    if n_batches == 0 or n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_batches, n_correct / n_seen

# Hyper-parameters for this run, kept together so they are easy to tune.
BATCH_SIZE = 16
NUM_EPOCHS = 3
LEARNING_RATE = 3e-5

# ✅ Tokenize Urdu text
tokenized = tokenize_texts(df['Text'], tokenizer)
input_ids = tokenized['input_ids']
attention_masks = tokenized['attention_mask']
# Force int64 class indices: read_csv can yield a float (or object) Label
# column, and the sequence-classification head picks its loss from the label
# dtype, so non-integer labels would not be treated as class ids.
labels = torch.tensor(df['Label'].astype('int64').to_numpy(), dtype=torch.long)

# ✅ Train/val/test split: 70% train, then the remaining 30% is halved into
# validation and test. Both splits are stratified on the label.
X_train, X_temp, y_train, y_temp, mask_train, mask_temp = train_test_split(
    input_ids, labels, attention_masks, test_size=0.3, stratify=labels, random_state=42
)
X_val, X_test, y_val, y_test, mask_val, mask_test = train_test_split(
    X_temp, y_temp, mask_temp, test_size=0.5, stratify=y_temp, random_state=42
)

train_data = TensorDataset(X_train, mask_train, y_train)
val_data = TensorDataset(X_val, mask_val, y_val)
test_data = TensorDataset(X_test, mask_test, y_test)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

# ✅ Load model: multilingual BERT with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = CrossEntropyLoss()

# ✅ Train the model, reporting train/validation accuracy after every epoch.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, loss_fn, device)
    val_loss, val_acc = evaluate_model(model, val_loader, loss_fn, device)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")


In [None]:
# ✅ Final test evaluation
# Class ids and their display names, passed explicitly to every metric so the
# report rows and confusion-matrix axes always line up with the tick labels.
# NOTE(review): assumes Label 0 = Human and 1 = AI -- confirm against the dataset.
class_ids = [0, 1]
class_names = ['Human', 'AI']

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        # Batch-local names: do not overwrite the full-dataset tensors
        # (input_ids / attention_masks / labels) created in the training cell.
        batch_ids, batch_masks, batch_labels = [x.to(device) for x in batch]
        outputs = model(input_ids=batch_ids, attention_mask=batch_masks)
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# ✅ Print metrics
# zero_division=0 keeps the scores defined (as 0) without emitting warnings
# when a class is never predicted.
print("\n📊 Test Results:")
print(f"Accuracy: {accuracy_score(true_labels, predictions):.4f}")
print(f"Precision: {precision_score(true_labels, predictions, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(true_labels, predictions, average='weighted', zero_division=0):.4f}")
print(f"F1 Score: {f1_score(true_labels, predictions, average='weighted', zero_division=0):.4f}")

print("\nClassification Report:")
print(classification_report(
    true_labels, predictions, labels=class_ids, target_names=class_names, zero_division=0
))

# ✅ Plot confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(true_labels, predictions, labels=class_ids)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()