<a href="https://colab.research.google.com/github/shill7/APS360_Project/blob/main/FINAL_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle first so that the dataset download cell that
# follows can access private data sources.
import kagglehub
kagglehub.login()  # presumably prompts for Kaggle credentials — confirm against the kagglehub docs


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the training/validation dataset and the test dataset.
# NOTE(review): the two returned paths are not referenced anywhere else in the
# visible notebook; main() reads its CSV files from /kaggle/input/... by
# default, so outside Kaggle those paths may not exist — confirm.
tajrianislam_training_path = kagglehub.dataset_download('tajrianislam/training')
tajrianislam_testingdat_path = kagglehub.dataset_download('tajrianislam/testingdat')

print('Data source import complete.')


In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.8-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.8-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading cmudict-1.1.1-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.7/939.7 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.1.1 pyphen-0.17.2 textstat-0.7.8


In [None]:


import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaModel
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import nltk
from nltk.corpus import stopwords
from textstat import flesch_reading_ease
import string
from tqdm import tqdm
import matplotlib.pyplot as plt
import joblib



# NLTK resources used by the feature extractor: sentence/word tokenizer
# models and the English stop-word corpus.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ---- Hyper-parameters / configuration ----
MAX_LEN = 256              # tokenizer max_length (inputs are padded/truncated to it)
BATCH_SIZE = 128           # batch size for every DataLoader
EPOCHS = 4                 # number of passes over the training set
LEARNING_RATE = 2e-5       # AdamW learning rate
MODEL_NAME = 'roberta-base'  # checkpoint for both the tokenizer and the encoder

class LinguisticFeatureExtractor:
    """Computes a small set of hand-crafted linguistic statistics for a text."""

    def __init__(self):
        # English stop words from the NLTK corpus, stored as a set for fast lookup.
        self.stop_words = set(stopwords.words('english'))

    def extract_features(self, text):
        """Return a dict with seven linguistic features of ``text``.

        Keys (in this order): ``avg_sentence_length``, ``avg_word_length``,
        ``stopword_ratio``, ``punctuation_count``, ``uppercase_ratio``,
        ``readability_score`` and ``type_token_ratio``.  Every ratio that
        would divide by an empty token/sentence list is reported as ``0``.
        """
        n_sentences = len(nltk.sent_tokenize(text))
        tokens = nltk.word_tokenize(text)
        n_tokens = len(tokens)

        # Sentence length is guarded by the sentence count ...
        if n_sentences > 0:
            avg_sentence_length = n_tokens / n_sentences
        else:
            avg_sentence_length = 0

        # ... while all token-level ratios share one "any tokens at all?" guard.
        if n_tokens > 0:
            avg_word_length = sum(map(len, tokens)) / n_tokens
            stopword_ratio = sum(tok.lower() in self.stop_words for tok in tokens) / n_tokens
            uppercase_ratio = sum(tok.isupper() for tok in tokens) / n_tokens
            type_token_ratio = len(set(tokens)) / n_tokens
        else:
            avg_word_length = stopword_ratio = uppercase_ratio = type_token_ratio = 0

        # Punctuation is counted per character on the raw text, not per token.
        punctuation_count = sum(ch in string.punctuation for ch in text)

        return {
            'avg_sentence_length': avg_sentence_length,
            'avg_word_length': avg_word_length,
            'stopword_ratio': stopword_ratio,
            'punctuation_count': punctuation_count,
            'uppercase_ratio': uppercase_ratio,
            'readability_score': flesch_reading_ease(text),
            'type_token_ratio': type_token_ratio,
        }

class TextDataset(Dataset):
    """Map-style dataset yielding token ids, linguistic features and a label per text."""

    # Order in which the extractor's features are packed into the feature vector.
    _FEATURE_ORDER = (
        'avg_sentence_length',
        'avg_word_length',
        'stopword_ratio',
        'punctuation_count',
        'uppercase_ratio',
        'readability_score',
        'type_token_ratio',
    )

    def __init__(self, texts, labels, tokenizer, max_len, feature_extractor):
        """Store the raw texts/labels and the objects used to encode each sample.

        ``tokenizer`` must provide ``encode_plus`` and ``feature_extractor``
        must provide ``extract_features``; ``max_len`` is the padded length.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns a dict with ``input_ids`` and ``attention_mask`` (flattened
        tokenizer output), ``linguistic_features`` (float tensor with one
        entry per feature) and ``label`` (long tensor).
        """
        sample_text = str(self.texts[idx])  # coerce non-string entries to text
        sample_label = self.labels[idx]

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        tokenized = self.tokenizer.encode_plus(sample_text, **tokenizer_kwargs)

        extracted = self.feature_extractor.extract_features(sample_text)
        feature_vector = torch.tensor(
            [extracted[name] for name in self._FEATURE_ORDER],
            dtype=torch.float,
        )

        return {
            # The tokenizer returns a leading batch dimension; flatten it away.
            'input_ids': tokenized['input_ids'].flatten(),
            'attention_mask': tokenized['attention_mask'].flatten(),
            'linguistic_features': feature_vector,
            'label': torch.tensor(sample_label, dtype=torch.long),
        }

class HybridClassifier(nn.Module):
    """RoBERTa sentence embedding + hand-crafted linguistic features -> class logits.

    The pretrained encoder is frozen.  Its pooled output is concatenated with
    a 32-dimensional projection of the linguistic features and passed through
    a small MLP head.

    Args:
        n_linguistic_features: length of the linguistic feature vector.
        n_classes: number of output classes (default 2).
    """

    def __init__(self, n_linguistic_features, n_classes=2):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(MODEL_NAME)

        # Freeze the pretrained RoBERTa encoder.
        for param in self.roberta.parameters():
            param.requires_grad = False

        # The pooler (dense + tanh on the first token) is NOT part of the
        # roberta-base checkpoint: Transformers reports its weights as "newly
        # initialized".  Freezing it as well would feed the classifier a fixed
        # random projection, so the pooler is kept trainable.
        pooler = getattr(self.roberta, 'pooler', None)
        if pooler is not None:
            for param in pooler.parameters():
                param.requires_grad = True

        self.linguistic_fc = nn.Linear(n_linguistic_features, 32)
        self.classifier = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size + 32, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, n_classes)
        )

    def forward(self, input_ids, attention_mask, linguistic_features):
        """Return unnormalised class logits, one row per sample in the batch."""
        roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = roberta_output.pooler_output
        linguistic_output = torch.relu(self.linguistic_fc(linguistic_features))
        # Concatenate text and linguistic representations along the feature axis.
        combined = torch.cat((pooled_output, linguistic_output), dim=1)
        return self.classifier(combined)

# --------------------------
# Training Functions
# --------------------------

def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            linguistic_features=...)`` and returning class logits.
        data_loader: yields dict batches with keys ``input_ids``,
            ``attention_mask``, ``linguistic_features`` and ``label``; must
            expose ``.dataset`` with a length.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``len(data_loader.dataset)``) and ``mean_loss``
        is the mean of the per-batch losses.  An empty loader yields
        ``(0.0, nan)`` instead of raising.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()  # built once, not once per batch
    batch_losses = []
    n_correct = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        linguistic_features = batch['linguistic_features'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients *before* the backward pass so stale gradients left
        # on the parameters by earlier code can never leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            linguistic_features=linguistic_features
        )
        loss = criterion(outputs, labels)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())
        # Accumulate as a Python int so the result is well defined even when
        # the loop body never runs.
        n_correct += (outputs.argmax(dim=1) == labels).sum().item()

    n_samples = len(data_loader.dataset)
    accuracy = n_correct / n_samples if n_samples > 0 else 0.0
    mean_loss = float(np.mean(batch_losses)) if batch_losses else float('nan')
    return torch.tensor(accuracy, dtype=torch.double), mean_loss

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and print a classification report.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            linguistic_features=...)`` and returning class logits.
        data_loader: yields dict batches with keys ``input_ids``,
            ``attention_mask``, ``linguistic_features`` and ``label``; must
            expose ``.dataset`` with a length.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim double tensor
        (correct predictions / ``len(data_loader.dataset)``) and ``mean_loss``
        is the mean of the per-batch losses.  An empty loader yields
        ``(0.0, nan)`` instead of raising.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()  # built once, not once per batch
    batch_losses = []
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            linguistic_features = batch['linguistic_features'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                linguistic_features=linguistic_features
            )
            batch_losses.append(criterion(outputs, labels).item())

            all_preds.extend(outputs.argmax(dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    n_correct = sum(pred == true for pred, true in zip(all_preds, all_labels))
    n_samples = len(data_loader.dataset)
    accuracy = torch.tensor(
        n_correct / n_samples if n_samples > 0 else 0.0,
        dtype=torch.double,
    )

    print("\nClassification Report:")
    if all_labels:
        # Passing labels=[0, 1] keeps the report valid when only one class
        # occurs in the evaluated data; without it, the two target_names no
        # longer match the number of classes found and sklearn raises.
        print(classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=['Human', 'AI'],
            zero_division=0,
        ))
    else:
        print("(no samples to evaluate)")

    mean_loss = float(np.mean(batch_losses)) if batch_losses else float('nan')
    return accuracy, mean_loss

# --------------------------
# Main Training Pipeline
# --------------------------

def _load_labeled_split(data_dir, split):
    """Read ``ai_<split>.csv`` and ``human_<split>.csv`` from ``data_dir`` and
    return them as one frame with a ``label`` column (1=AI, 0=Human)."""
    ai_df = pd.read_csv(os.path.join(data_dir, f"ai_{split}.csv"))
    human_df = pd.read_csv(os.path.join(data_dir, f"human_{split}.csv"))
    ai_df['label'] = 1
    human_df['label'] = 0
    return pd.concat([ai_df, human_df], ignore_index=True)


def _build_loader(df, tokenizer, feature_extractor, shuffle=False):
    """Wrap the ``text_clean`` / ``label`` columns of ``df`` in a DataLoader."""
    dataset = TextDataset(
        texts=df['text_clean'].values,
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=MAX_LEN,
        feature_extractor=feature_extractor
    )
    return DataLoader(
        dataset=dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=2
    )


def _plot_curves(title, ylabel, iters, train_values, epoch_iters, val_values):
    """Plot a per-iteration training curve with per-epoch validation markers."""
    plt.figure(figsize=(10, 5))
    plt.title(title)
    plt.plot(iters, train_values, label="Train", color="#1f77b4")
    plt.plot(epoch_iters, val_values, label="Validation", color="#ff7f0e", marker='o')
    plt.xlabel("Iterations")
    plt.ylabel(ylabel)
    plt.legend(loc='best')
    plt.show()


def main(train_dir='/kaggle/input/training', test_dir='/kaggle/input/testingdat'):
    """Train the hybrid classifier, plot its learning curves and evaluate it.

    Args:
        train_dir: directory holding ``ai_train.csv``, ``human_train.csv``,
            ``ai_val.csv`` and ``human_val.csv``.
        test_dir: directory holding ``ai_test.csv`` and ``human_test.csv``.

    Every CSV must contain a ``text_clean`` column.  The weights with the
    best validation accuracy are written to ``best_model.bin`` and reloaded
    before the test-set evaluation.

    Raises:
        ValueError: if the training set yields no batches.
    """
    # Load datasets (label: 1=AI, 0=Human)
    train_df = _load_labeled_split(train_dir, 'train')
    val_df = _load_labeled_split(train_dir, 'val')

    # Initialize components
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
    feature_extractor = LinguisticFeatureExtractor()

    # Create data loaders (only the training data is shuffled)
    train_loader = _build_loader(train_df, tokenizer, feature_extractor, shuffle=True)
    val_loader = _build_loader(val_df, tokenizer, feature_extractor)
    if len(train_loader) == 0:
        raise ValueError("training set is empty")

    # Initialize model
    model = HybridClassifier(n_linguistic_features=7).to(device)

    # Set up optimizer and a linear-decay schedule without warm-up
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    criterion = nn.CrossEntropyLoss()

    # Per-iteration training metrics and per-epoch metrics for plotting
    iters = []
    train_losses = []
    train_accs = []
    epoch_train_accs = []
    val_losses = []
    val_accs = []
    epoch_iters = []  # iteration index at which each epoch ended
    iteration = 0

    # -inf guarantees the first epoch always writes a checkpoint, so the
    # reload before the test evaluation cannot fail on a missing file.
    best_accuracy = float('-inf')

    # Training loop
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        # eval_model() leaves the model in eval mode, so switch back each epoch.
        model.train()
        epoch_train_loss = 0
        epoch_train_correct = 0
        train_samples = 0

        for batch in tqdm(train_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            linguistic_features = batch['linguistic_features'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                linguistic_features=linguistic_features
            )
            loss = criterion(outputs, labels)

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            batch_size = input_ids.size(0)
            batch_correct = (outputs.argmax(dim=1) == labels).sum().item()
            batch_loss = loss.item()

            epoch_train_correct += batch_correct
            # Weight by batch size so a short final batch does not skew the mean.
            epoch_train_loss += batch_loss * batch_size
            train_samples += batch_size

            # Store metrics per iteration
            iters.append(iteration)
            train_losses.append(batch_loss)
            train_accs.append(batch_correct / batch_size)
            iteration += 1

        # Compute epoch-level training metrics
        epoch_train_loss /= train_samples
        epoch_train_acc = epoch_train_correct / train_samples
        epoch_train_accs.append(epoch_train_acc)
        epoch_iters.append(iteration - 1)
        print(f'Train loss: {epoch_train_loss:.4f}, accuracy: {epoch_train_acc:.4f}')

        # Validation
        val_acc, val_loss = eval_model(model, val_loader, device)
        val_acc = float(val_acc)  # accept a 0-dim tensor or a plain number
        print(f'Validation loss: {val_loss:.4f}, accuracy: {val_acc:.4f}')
        val_accs.append(val_acc)
        val_losses.append(val_loss)

        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model.bin')
            best_accuracy = val_acc

    # Plotting: validation points sit at the last iteration of each epoch
    _plot_curves("Training and Validation Loss", "Loss",
                 iters, train_losses, epoch_iters, val_losses)
    _plot_curves("Training and Validation Accuracy", "Accuracy",
                 iters, train_accs, epoch_iters, val_accs)

    # Report the accuracy over the whole last epoch, not just its last batch.
    print(f"Final Training Accuracy: {epoch_train_accs[-1]:.4f}")
    print(f"Final Validation Accuracy: {val_accs[-1]:.4f}")

    # Load best model and evaluate on test set
    print("\nEvaluating on test set...")
    model.load_state_dict(torch.load('best_model.bin', map_location=device))

    test_df = _load_labeled_split(test_dir, 'test')
    test_loader = _build_loader(test_df, tokenizer, feature_extractor)

    test_acc, _ = eval_model(model, test_loader, device)
    print(f'Test accuracy: {float(test_acc):.4f}')

# Run the full train / validate / test pipeline when this cell is executed.
if __name__ == "__main__":
    main()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/4
----------


Training: 100%|██████████| 690/690 [20:51<00:00,  1.81s/it]


Train loss: 0.5667, accuracy: 0.7012


Evaluating: 100%|██████████| 148/148 [04:27<00:00,  1.81s/it]



Classification Report:
              precision    recall  f1-score   support

       Human       0.77      0.96      0.86     12111
          AI       0.88      0.49      0.63      6813

    accuracy                           0.79     18924
   macro avg       0.82      0.73      0.74     18924
weighted avg       0.81      0.79      0.77     18924

Validation loss: 0.4885, accuracy: 0.7916

Epoch 2/4
----------


Training:  72%|███████▏  | 496/690 [15:06<05:53,  1.82s/it]