In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
import os

# Load the training data
train_essays = pd.read_csv('train_essays.csv')
train_prompts = pd.read_csv('train_prompts.csv')

# Merge the essays and prompts data on 'prompt_id'
train_data = pd.merge(train_essays, train_prompts, on='prompt_id', how='left')

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode the text data
tokenized_data = tokenizer(list(train_data['text']), padding=True, truncation=True, return_tensors='pt')

# Create DataLoader
dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], torch.tensor(train_data['generated'].values))
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

# Evaluate the model
model.eval()
val_tokenized_data = tokenizer(list(X_val), padding=True, truncation=True, return_tensors='pt').to(device)
with torch.no_grad():
    val_outputs = model(**val_tokenized_data)
    val_logits = val_outputs.logits

val_predictions = torch.argmax(val_logits, dim=1).cpu().numpy()

# Evaluation
accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_val, val_predictions))


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   7%|████▌                                                             | 12/173 [2:18:53<37:47:49, 845.15s/it]

In [4]:
os.getcwd()

'C:\\Users\\DELL\\Downloads\\VLG-Project'