## Import necessary libraries

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from transformers import BertModel
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm

In [20]:
# Load the pre-tokenized training DataFrame from the repository-relative pickle.
# A missing file is only reported, not raised; in that case this cell does not
# assign `data`, and it has to be loaded by another cell (e.g. the Colab one).
local_data_path = '../data/train/tokenized_train_data.pkl'
if not os.path.exists(local_data_path):
    print("File not found.")
else:
    data = pd.read_pickle(local_data_path)

If working in Google Colab:

In [ ]:
# Google Colab only: mount Google Drive, then read the tokenized training data from it.
from google.colab import drive

drive.mount('/content/drive')

base_path = '/content/drive/My Drive/'
data_path = os.path.join(base_path, 'NLP/data/train/tokenized_train_data.pkl')

# A missing file is reported rather than raised; `data` is not assigned in that case.
if not os.path.exists(data_path):
    print("File not found. Please ensure the file path is correct and run the previous cell to create the file.")
else:
    data = pd.read_pickle(data_path)

In [21]:
# Preview the first five rows (five is the default row count of `head`).
data.head()

Unnamed: 0,comment_text,hate,tokenized
0,Explanation Why the edits made under my userna...,0,"[input_ids, token_type_ids, attention_mask]"
1,Daww He matches this background colour Im seem...,0,"[input_ids, token_type_ids, attention_mask]"
2,Hey man Im really not trying to edit war Its j...,0,"[input_ids, token_type_ids, attention_mask]"
3,More I cant make any real suggestions on impr...,0,"[input_ids, token_type_ids, attention_mask]"
4,You sir are my hero Any chance you remember wh...,0,"[input_ids, token_type_ids, attention_mask]"


## Initialize BERT Model

In [22]:
# Identifier of the pretrained checkpoint to load.
model_name = "bert-base-uncased"

# Load the plain BertModel for that checkpoint.
# NOTE(review): the "Define the model and optimizer" cell further down rebinds
# `model` to a BertForSequenceClassification, so this instance is not the one
# that gets trained.
model = BertModel.from_pretrained(model_name)

## Prepare the training data

In [23]:
# Rows that failed to tokenize have a missing value in 'tokenized'; discard them.
data = data.dropna(subset=['tokenized'])
# Only the first 1000 remaining rows are used from here on.
subset_train = data.iloc[:1000]

# Collect each row's 'input_ids' / 'attention_mask' tensor and concatenate them
# along dim 0 (torch.cat's default) into one tensor per feature.
# NOTE(review): this presumes all rows were padded to a common length so that
# the tensors are concatenable — confirm against the tokenization step.
tokenized_rows = subset_train['tokenized']
input_ids = torch.cat([encoding['input_ids'] for encoding in tokenized_rows])
attention_mask = torch.cat([encoding['attention_mask'] for encoding in tokenized_rows])

# Class labels from the 'hate' column, as int64 for the classification loss.
labels = torch.tensor(subset_train['hate'].values, dtype=torch.long)

## Check if CUDA is available, otherwise use CPU

In [24]:
# Prefer the GPU when PyTorch reports CUDA as available; fall back to the CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using {device} for training")

Using cpu for training


## Create a DataLoader for the training and validation sets

In [26]:

# `input_ids`, `attention_mask`, and `labels` are the feature and label tensors
# built in the preparation cell. TensorDataset zips them so that item i is
# (input_ids[i], attention_mask[i], labels[i]).
dataset = TensorDataset(input_ids, attention_mask, labels)

# Split the dataset into training (90%) and validation (remaining) sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split explicitly. Without a fixed generator every kernel run draws a
# different train/validation partition, so validation metrics are not comparable
# between runs and samples validated in one run are trained on in the next.
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

## Define the model and optimizer

In [None]:
# Define the number of epochs for training
num_epochs = 3

# Load the BERT model for sequence classification with a two-class head
# (matching the binary 'hate' label).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)  # Move the model to the appropriate device (CPU or GPU)

# Prepare optimizer and schedule (linear decay, no warmup steps).
# The learning rate is 2e-5: the BERT authors recommend 2e-5 to 5e-5 for
# fine-tuning. The previous 2e-4 sits well above that range and, with no warmup,
# risks destroying the pretrained weights in the first updates.
optimizer = AdamW(model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

## Training loop

In [ ]:
# Per-epoch history: mean training loss, mean validation loss, validation F1
train_loss_values = []
val_loss_values = []
val_f1_scores = []

# Training and validation loop
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)  # Move the batch to the appropriate device
        b_input_ids, b_attention_mask, b_labels = batch

        optimizer.zero_grad()

        # Passing `labels` makes the model return the classification loss.
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()

        # Gradient clipping: cap the global gradient norm at 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        total_loss += loss.item()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

    avg_train_loss = total_loss / len(train_loader)
    train_loss_values.append(avg_train_loss)

    # ---- Validation phase ----
    model.eval()
    total_val_loss = 0
    # Predictions and labels are collected for the whole validation set so that
    # F1 is computed once per epoch. F1 is not decomposable over batches: the
    # mean of per-batch scores differs from the F1 of the full set, and a
    # 16-sample batch without any positive label or prediction has no
    # well-defined F1 at all.
    epoch_predictions = []
    epoch_label_ids = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch + 1}"):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_attention_mask, b_labels = batch

            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            total_val_loss += outputs.loss.item()

            # Predicted class = index of the largest logit per sample.
            logits = outputs.logits.detach().cpu().numpy()
            epoch_predictions.append(np.argmax(logits, axis=1).flatten())
            epoch_label_ids.append(b_labels.to('cpu').numpy().flatten())

    avg_val_loss = total_val_loss / len(val_loader)
    val_loss_values.append(avg_val_loss)

    # One F1 value per epoch, over every validation sample.
    epoch_f1 = f1_score(np.concatenate(epoch_label_ids), np.concatenate(epoch_predictions))
    val_f1_scores.append(epoch_f1)
    print(f"Epoch {epoch + 1}: train loss {avg_train_loss:.4f}, val loss {avg_val_loss:.4f}, val F1 {epoch_f1:.4f}")

# Plotting the learning curve (one point per epoch)
plt.figure(figsize=(10, 6))
plt.plot(train_loss_values, label='Trainingsverlust')
plt.plot(val_loss_values, label='Validierungsverlust')
plt.xlabel('Epochen')
plt.ylabel('Verlust')
plt.title('Lernkurve')
plt.legend()
plt.show()

# `val_f1_scores` now holds one full-validation-set F1 per epoch; the last entry
# is the score of the final model. The mean over epochs is still computed so the
# `average_val_f1_score` name stays available to later cells.
print(f"Validation F1 score per epoch: {val_f1_scores}")
average_val_f1_score = np.mean(val_f1_scores)
print(f"Average F1 score on the validation set: {average_val_f1_score}")

Training Epoch 1:  33%|███▎      | 19/57 [10:49<21:33, 34.03s/it]