In [None]:
import pandas as pd
# Read the pre-processed dataset; the first CSV column is used as the row index.
df_clean = pd.read_csv(
    'df_typos.csv',
    index_col=0,
)

In [None]:
# Columns of interest: the label ('generated') followed by the model inputs.
features = [
    'generated',
    'text_length',
    'mean_word_length',
    'sentences',
    'sentence_length',
    'mean_sentence',
    'unique_word_count',
    'proper_noun_count',
    'number_count',
    'text_with_typos_replaced',
]

# Label column on its own, and every other selected column as the inputs.
y = df_clean['generated']
input_columns = [name for name in features if name != 'generated']
X = df_clean[input_columns]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd

# Load the WordPiece tokenizer that matches the pretrained BERT checkpoint used below.
PRETRAINED_TOKENIZER_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_TOKENIZER_NAME)

# Sample Dataset Class
class TextDataset(Dataset):
    """Dataset pairing each text with its numeric features and its label.

    Every item is a tuple ``(input_ids, attention_mask, features, label)``:
    the first two come from tokenizing the text (padded/truncated to
    ``max_length``), ``features`` is a float32 vector with one entry per
    column in ``FEATURE_COLUMNS``, and ``label`` is a long scalar.

    Args:
        df: DataFrame containing 'text_with_typos_replaced', 'generated'
            and every column listed in ``FEATURE_COLUMNS``.
        tokenizer: Callable used to encode a text. When ``None`` (default),
            the notebook-level ``tokenizer`` variable is looked up at item
            access time, which is what this class did before the parameter
            existed.
        max_length: Token length every text is padded/truncated to.
    """

    # Numeric columns passed to the classifier, in this order.
    # NOTE(review): the notebook's `features` list also contains 'sentences',
    # which is not used here — confirm the omission is intentional.
    FEATURE_COLUMNS = ['text_length', 'mean_word_length', 'sentence_length', 'mean_sentence',
                       'unique_word_count', 'proper_noun_count', 'number_count']

    def __init__(self, df, tokenizer=None, max_length=128):
        # Missing texts become empty strings so a NaN never reaches the
        # tokenizer as a float in the middle of an epoch.
        self.texts = df['text_with_typos_replaced'].fillna('').astype(str).tolist()
        self.labels = df['generated'].tolist()
        # Convert once up front; non-numeric feature values fail here rather
        # than on some arbitrary batch.
        self.features = df[self.FEATURE_COLUMNS].to_numpy(dtype='float32')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Fall back to the module-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = encode(self.texts[idx], padding='max_length', truncation=True,
                          max_length=self.max_length, return_tensors='pt')
        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack items itself.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        features = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return input_ids, attention_mask, features, label

# Define BERT-based Model
class BertClassifier(nn.Module):
    """BERT encoder plus numeric features feeding a two-class linear head.

    Args:
        num_features: Number of numeric features concatenated to the pooled
            BERT output for each sample.
        model_name: Pretrained checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint this notebook has always used.
    """

    def __init__(self, num_features, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so a different checkpoint does not silently mis-size the head.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size + num_features, 2)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return the two class logits for each sample in the batch.

        ``numerical_features`` is concatenated to the pooled BERT output along
        dim 1, so it must have one row per sample.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        combined_input = torch.cat((pooled_output, numerical_features), dim=1)
        # NOTE(review): ReLU is applied to the concatenated input itself, so
        # negative components of both the pooled vector and the numeric
        # features are zeroed before the linear layer — confirm this is intended.
        x = self.dropout(self.relu(combined_input))
        x = self.fc(x)
        return x

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Train on the 80% partition only. Building the training set from all of
# df_clean would put every validation row in front of the model during
# training and make the reported validation accuracy meaningless.
# train_test_split is deterministic for a fixed random_state, so the validation
# cell further down (same test_size / random_state / shuffle on df_clean)
# obtains exactly the complementary 20%.
from sklearn.model_selection import train_test_split

train_df, holdout_df = train_test_split(df_clean, test_size=0.2, random_state=42, shuffle=True)

dataset = TextDataset(train_df)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
# Build the classifier
import os
num_features = 7
model = BertClassifier(num_features)

# Pick the compute device: start from CPU and upgrade when an accelerator is found.
device = torch.device("cpu")
if 'COLAB_TPU_ADDR' in os.environ:
    # Running on a Google Colab TPU runtime
    try:
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()
    except ImportError:
        pass  # torch_xla is not installed: stay on CPU
elif torch.cuda.is_available():
    device = torch.device("cuda")  # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Metal (Mac GPU)

model.to(device)

# Loss function and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
pip install tqdm



In [None]:
from tqdm import tqdm

epochs = 2
for epoch in range(epochs):
    # Switch to training mode (the model contains a dropout layer)
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch

    # Progress bar over the batches of this epoch
    batch_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

    for batch in batch_bar:
        # Move every tensor of the batch to the selected device
        input_ids, attention_mask, numerical_features, labels = (
            tensor.to(device) for tensor in batch
        )

        # Standard step: clear gradients, forward, loss, backward, update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, numerical_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Show the most recent batch loss next to the bar
        batch_bar.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")


Epoch 1/2: 100%|██████████| 1822/1822 [13:52<00:00,  2.19it/s, loss=0.183]


Epoch 1 completed. Average Loss: 0.0679


Epoch 2/2: 100%|██████████| 1822/1822 [13:55<00:00,  2.18it/s, loss=0.00069]

Epoch 2 completed. Average Loss: 0.0151





In [None]:
# Persist only the model parameters (no optimizer state).
model_weights = model.state_dict()
torch.save(model_weights, "model_weights.pth")


In [None]:
# Bundle model and optimizer state into a single checkpoint file.
checkpoint = {
    'epoch': epoch,  # value of the training loop variable when the loop ended
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss.item(),  # loss of the most recently processed batch
}
torch.save(checkpoint, "checkpoint.pth")

In [None]:
# Hold out a validation partition of the data
from sklearn.model_selection import train_test_split

# The whole frame is split (not just the input columns) because TextDataset
# reads the label column from the frame it is given.
X = df_clean
y = df_clean['generated']

# Shuffled 80/20 split with a fixed seed, so the partition is reproducible.
# This is only an honest hold-out set if training did not see these rows.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Tokenizer for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the held-out rows; order does not matter for evaluation, so no shuffling.
val_dataset = TextDataset(X_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:

from tqdm import tqdm

# Function to evaluate the model on the validation set
def validate_model(model, val_loader, device):
    """Evaluate ``model`` on ``val_loader``; print and return loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            numerical_features)`` and returning per-class logits.
        val_loader: Iterable of ``(input_ids, attention_mask,
            numerical_features, labels)`` batches.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        tuple: ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean of the
        per-batch cross-entropy losses and ``accuracy`` is a percentage (0-100).

    Raises:
        ValueError: If ``val_loader`` yields no samples, since neither metric
            is defined in that case.
    """
    model.eval()  # Set the model to evaluation mode
    correct_predictions = 0
    total_predictions = 0
    total_loss = 0.0
    num_batches = 0  # counted here so loaders without __len__ also work

    criterion = nn.CrossEntropyLoss()  # Assuming a classification problem

    with torch.no_grad():  # No need to track gradients during validation
        # Wrap val_loader with tqdm for progress bar
        progress_bar = tqdm(val_loader, desc="Validation Progress", leave=True)

        for input_ids, attention_mask, numerical_features, labels in progress_bar:
            input_ids, attention_mask, numerical_features, labels = (
                input_ids.to(device),
                attention_mask.to(device),
                numerical_features.to(device),
                labels.to(device),
            )

            # Forward pass
            outputs = model(input_ids, attention_mask, numerical_features)
            loss = criterion(outputs, labels)

            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            # Predicted class = index of the largest logit
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

            # Update the progress bar with the latest loss
            progress_bar.set_postfix(loss=batch_loss)

    if total_predictions == 0:
        raise ValueError("val_loader produced no samples; cannot compute validation metrics")

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions / total_predictions * 100
    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.2f}%")
    return avg_loss, accuracy

# Evaluate the trained model on the validation loader; the trailing semicolon
# keeps the notebook from echoing any return value below the printed metrics.
validate_model(model, val_loader, device);


Validation Progress: 100%|██████████| 183/183 [01:15<00:00,  2.44it/s, loss=0.00043]

Validation Loss: 0.0190
Validation Accuracy: 99.33%



