In [None]:
# Mount Google Drive at /content/drive; every data and output path used
# further down in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import numpy as np
import random

# Function to set seed for reproducibility
# Ensures experiments can be repeated with the same results
def set_seed(value=42):
    """Seed every random number generator this notebook relies on.

    Args:
        value: Integer seed handed, in turn, to Python's ``random`` module,
            NumPy's global generator, PyTorch's default generator, and the
            generators of all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(value)

# Seed all RNGs with the default seed (42) before anything stochastic runs.
set_seed()

# Load the training data from Google Drive.
# Only the 'text' and 'label' columns are read; any other column in the file
# is ignored.
file_path = '/content/drive/My Drive/Colab Notebooks/train_V2_csv'
df = pd.read_csv(file_path, usecols=['text', 'label'])

# Show the first rows as this cell's output.
df.head()

Unnamed: 0,text,label
0,The pursuit of success is often portrayed as ...,1
1,"Sure, here's my essay on the benefits of offer...",1
2,The Development of Driverless Cars\n\nWhile dr...,1
3,Today technology is arriving into the schools....,0
4,The advantages of limiting car usage are becom...,1


In [None]:
# Preprocess the dataset
# Converts text data into a format that the BERT model can understand (token IDs, attention masks)

def preprocess_dataset(df, tokenizer=None, max_length=256):
    """Tokenize the ``text`` column of ``df`` into a ``TensorDataset`` for BERT.

    Every text is padded or truncated to exactly ``max_length`` tokens so the
    encoded examples can be stacked into fixed-size tensors. The returned
    attention mask lets the model ignore the padding positions: padding only
    exists to make the sequences the same length for batching and must not
    influence predictions.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a ``label``
            column.
        tokenizer: A callable Hugging Face tokenizer. When ``None`` (the
            default) the ``bert-base-uncased`` tokenizer is loaded, matching
            the previous behaviour. Passing one in avoids reloading it for
            every dataframe.
        max_length: Sequence length, in tokens, after padding/truncation.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)`` when ``df`` has
        a ``label`` column, otherwise
        ``TensorDataset(input_ids, attention_masks)``.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    encoded_data = tokenizer(
        # The tokenizer API expects a list of str, not a NumPy object array.
        df['text'].tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Request truncation explicitly instead of relying on the implicit
        # fallback that emits a warning whenever only `max_length` is given.
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']

    if 'label' in df.columns:
        # Class indices are stored as int64 (torch.long) regardless of how
        # pandas inferred the column dtype.
        labels = torch.tensor(df['label'].to_numpy(), dtype=torch.long)
        return TensorDataset(input_ids, attention_masks, labels)
    return TensorDataset(input_ids, attention_masks)



In [None]:
# Load the validation split; only the 'text' and 'label' columns are read.
file_path = '/content/drive/My Drive/Colab Notebooks/val_V2_csv'
val_df = pd.read_csv(file_path, usecols=['text', 'label'])

# Load the held-out test split with the same two columns.
# `file_path` is deliberately reused; it always names the most recently loaded file.
file_path = '/content/drive/My Drive/Colab Notebooks/test_V2_csv'
test_df = pd.read_csv(file_path, usecols=['text', 'label'])

# Show the first rows of the test split as this cell's output.
test_df.head()

Unnamed: 0,text,label
0,Just over twenty yards from the net with the b...,0
1,"Sure, here's my attempt at writing an essay re...",1
2,"PHONES & DRIVING\n\nIn the United States , car...",0
3,"Ralph Waldo Emerson once said, “What lies behi...",1
4,Many people believe that driving makes everyth...,0


In [None]:
# Preprocess the training, validation, and test datasets
# Mini-batch size shared by every DataLoader; lower it if the GPU runs out of memory.
batch_size = 32

# Tokenize the training (`df`), validation, and test dataframes, in that order.
train_dataset, val_dataset, test_dataset = [
    preprocess_dataset(frame) for frame in (df, val_df, test_df)
]

# Training batches are drawn in random order; validation and test batches
# keep the row order of their dataframes.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [None]:
# Load pretrained BERT with a freshly initialized 2-way classification head.
# Attention maps and hidden states are not returned; only loss/logits are needed.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2, # Binary classification
    output_attentions=False,
    output_hidden_states=False,
)

# Move model to GPU if available; `device` is reused by the training and
# inference cells to place each batch on the same device as the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
# Updates model parameters based on gradients to minimize the loss function.
# PyTorch's own AdamW is used because the AdamW class shipped with
# `transformers` is deprecated in favour of it. `weight_decay=0.0` is passed
# explicitly: torch.optim.AdamW defaults to 0.01, whereas the deprecated
# implementation defaulted to 0.0, so this keeps the original regularization.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import roc_auc_score

# Training loop (simplified version)
# Number of full passes over the training set.
epochs = 4
for epoch in range(epochs):
    # ---- Training phase ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all to the model's device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # When labels are supplied, the first output element is the loss.
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        # Cap the overall gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average Training Loss: {avg_train_loss}')

    # ---- Validation phase ----
    # ROC AUC measures how well the positive-class score separates the two
    # classes across all decision thresholds (TPR vs. FPR), so it needs a
    # score per example rather than a hard prediction.
    model.eval()
    predictions, true_labels = [], []

    # No gradients are computed or stored while evaluating.
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            # Without labels, the first output element is the logits.
            logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
            # Softmax over the two classes; column 1 is the positive-class probability.
            positive_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

            predictions.extend(positive_probs)
            true_labels.extend(labels.cpu().numpy())

    # AUC over the whole validation set for this epoch.
    auc = roc_auc_score(true_labels, predictions)
    print(f'Epoch {epoch+1} AUC: {auc}')
    print('-'*30)


Average Training Loss: 0.05953873241746309
Epoch 1 AUC: 0.999520205202418
------------------------------
Average Training Loss: 0.01746154747429389
Epoch 2 AUC: 0.9996982126088919
------------------------------
Average Training Loss: 0.008784682726602766
Epoch 3 AUC: 0.9997299197970742
------------------------------
Average Training Loss: 0.0068769807402995885
Epoch 4 AUC: 0.9997216677980985
------------------------------


In [None]:
# Persist the trained model to Google Drive. When the cells are run in order,
# this is the model as it stands after the final epoch; no per-epoch or
# best-AUC checkpoint is kept.
# NOTE(review): the whole model object is handed to torch.save rather than
# model.state_dict(). PyTorch's serialization guide recommends the state_dict
# form because a fully serialized module depends on the class definitions
# available when it is loaded — consider switching, together with whatever
# code loads this file.
torch.save(model, '/content/drive/My Drive/Colab Notebooks/bert_model.pth')



In [None]:
# Load the competition test essays. All columns are read here (no `usecols`),
# because the 'id' column is needed later when the submission is written.
file_path = '/content/drive/My Drive/Colab Notebooks/test_essays_v1.csv'
kaggle_test_df = pd.read_csv(file_path)

# Tokenize and batch in row order so the predictions line up with the rows of
# `kaggle_test_df`.
# NOTE(review): presumably this file has no 'label' column, in which case the
# dataset holds only input IDs and attention masks — verify against the CSV.
ktest_dataset = preprocess_dataset(kaggle_test_df)
ktest_dataloader = DataLoader(ktest_dataset, sampler=SequentialSampler(ktest_dataset), batch_size=batch_size)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Switch the model to evaluation mode before scoring the test essays.
model.eval()

# Positive-class probability for every row, in dataloader (row) order.
predictions = []

# Inference only: no gradients are needed.
with torch.no_grad():
    for batch in ktest_dataloader:
        # The first two tensors of a batch are the input IDs and attention
        # mask; any further tensors are moved to the device but not used.
        input_ids, attention_mask, *_ = (tensor.to(device) for tensor in batch)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Softmax over the two classes; keep column 1 (the positive class).
        predictions.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())

# Attach the scores to the test dataframe as a new 'generated' column.
kaggle_test_df['generated'] = predictions

In [None]:
# Keep only the 'id' column and the predicted positive-class probability
# ('generated') for the submission file.
submission = kaggle_test_df[['id', 'generated']]
# index=False leaves the DataFrame index out of the written CSV.
submission.to_csv('/content/drive/My Drive/Colab Notebooks/bert_submission.csv', index=False)