# Import necessary libraries

In [1]:
import pandas as pd
import re
import string
import os
import platform

# Import data from CSV

In [2]:
def check_os_and_load_data(filename):
    """Load ``filename`` as a DataFrame from the OS-specific data directory.

    Windows reads from ``../data/train``; Linux reads from the ownCloud
    workbench folder below the user's home directory. On any other system a
    message is printed and ``None`` is returned. If the loaded frame has an
    ``id`` column, that column is removed before the frame is returned.
    """
    os_name = platform.system()

    if os_name == "Windows":
        print("This is a Windows system. Running Windows-specific code.")
        csv_path = os.path.join('..', 'data', 'train', filename)
    elif os_name == "Linux":
        print("This is a Linux system. Running Linux-specific code.")
        csv_path = os.path.join(
            os.path.expanduser('~'),
            'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch',
            '4 Semester',
            'NLP',
            'workbench',
            filename,
        )
    else:
        print("This is neither a Windows nor a Linux system. You're on your own, sorry.")
        return None

    # Both supported systems read the file the same way once the path is known.
    frame = pd.read_csv(csv_path, encoding='utf-8')
    if 'id' in frame.columns:
        del frame['id']
    return frame

# Example usage
# Pass only the filename, not the path.
# Note: on systems other than Windows/Linux the loader returns None, so the
# cells below that call methods on `data` will fail there.
data = check_os_and_load_data('train.csv')

This is a Windows system. Running Windows-specific code.


In [3]:
# Peek at the raw rows before any cleaning is applied.
data.head(8)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Text cleaning: we remove only bracketed text, links, HTML-style tags, punctuation, newlines, and tokens containing digits, because we want to keep the text as close to the original as possible and BERT does not require any further preprocessing. Stopwords are not removed because they can be important for the model to understand the context of the text.

In [4]:
def clean_text(text):
    """Strip markup noise from a comment while leaving the wording intact.

    Removal steps, in order:
      1. bracketed spans such as ``[citation]``
      2. URLs starting with ``http://``, ``https://`` or ``www.``
      3. angle-bracket tags such as ``<br>``
      4. every character in ``string.punctuation``
      5. newlines (each replaced by a single space)
      6. any token that contains a digit

    Whitespace is not collapsed, so removed spans can leave double spaces.

    Args:
        text: The raw comment. Non-string values (for example NaN coming from
            an empty CSV field) are treated as an empty comment.

    Returns:
        The cleaned string.
    """
    # re.sub raises TypeError on non-strings; an empty comment is the safest
    # stand-in for a missing value.
    if not isinstance(text, str):
        return ''

    # Raw strings keep the backslashes for the regex engine instead of relying
    # on Python passing unknown escapes through (which emits a warning).
    text = re.sub(r'\[.*?\]', '', text)
    # pattern = '[' followed by the shortest run of characters up to ']'

    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # pattern = 'http://' / 'https://' links, or anything starting with 'www.'

    text = re.sub(r'<.*?>+', '', text)
    # pattern = '<', the shortest run of characters, then one or more '>'

    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # pattern = any single punctuation character

    text = re.sub(r'\n', ' ', text)
    # pattern = any newline

    text = re.sub(r'\w*\d\w*', '', text)
    # pattern = a run of word characters that contains at least one digit

    return text

In [5]:
# Run the cleaner over every comment, then preview the cleaned rows.
data['comment_text'] = data['comment_text'].apply(clean_text)
data.head(8)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0
5,Congratulations from me as well use the tool...,0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Collapse the six label columns into a single binary `hate` column (1 if any label is set, otherwise 0).

In [6]:
def fun(x):
    """Return 1 when the values in ``x`` sum to something other than zero, else 0."""
    return 1 if x.sum() != 0 else 0

# The six label columns that are merged into one binary target.
rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# 1 when a row's labels sum to a non-zero value, else 0. Computed in a single
# vectorized pass instead of calling a Python function once per row.
data['hate'] = data[rows].sum(axis=1).ne(0).astype(int)

# The individual labels are no longer needed once 'hate' exists.
data = data.drop(columns=rows)

In [7]:
# Check that the six label columns were replaced by the single 'hate' column.
data.head(10)

Unnamed: 0,comment_text,hate
0,Explanation Why the edits made under my userna...,0
1,Daww He matches this background colour Im seem...,0
2,Hey man Im really not trying to edit war Its j...,0
3,More I cant make any real suggestions on impr...,0
4,You sir are my hero Any chance you remember wh...,0
5,Congratulations from me as well use the tool...,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,Your vandalism to the Matt Shirvington article...,0
8,Sorry if the word nonsense was offensive to yo...,0
9,alignment on this subject and which are contra...,0


# Now comes the fun stuff (trying to build a BERT-based model pipeline)

Import the necessary libraries

In [8]:
from transformers import BertModel, BertTokenizer
from tqdm.auto import tqdm
import torch

  torch.utils._pytree._register_pytree_node(


Initialize the BERT tokenizer and model

In [9]:
# Checkpoint name shared by the tokenizer and the model so both use the same vocabulary.
model_name = 'bert-base-uncased'
# Tokenizer instance that tokenize_data reads as a module-level name.
tokenizer = BertTokenizer.from_pretrained(model_name)
# Pretrained BertModel loaded from the same checkpoint.
model = BertModel.from_pretrained(model_name)

# Tokenization
Now we define the function that tokenizes the text.

In [10]:
def tokenize_data(text, max_length=512, padding='max_length', truncation=True):
    """Tokenize one comment with the module-level BERT ``tokenizer``.

    Args:
        text: The comment to encode.
        max_length: Length passed to the tokenizer as ``max_length``
            (default 512, the value this notebook previously hard-coded).
        padding: Padding strategy passed to the tokenizer
            (default ``'max_length'``).
        truncation: Truncation setting passed to the tokenizer
            (default ``True``).

    Returns:
        The tokenizer's encoding, requested as PyTorch tensors, or ``None``
        when tokenization raises, so that failed rows can be dropped later.
    """
    try:
        # Remember to record these settings in the model info file before
        # saving the hidden states.
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        return tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
            return_tensors='pt',
        )
    except Exception as e:
        # Deliberate best-effort: a bad row is reported and skipped instead of
        # aborting tokenization of the whole dataset.
        print(f"Error tokenizing text: {e}")
        return None

Apply tokenization to the dataframe one row at a time for memory efficiency.

In [11]:
# Tokenization runs once per row, so expect this to be slow on a large dataset.
tqdm.pandas(desc="Tokenizing data")
comments = data['comment_text']
data['tokenized'] = comments.progress_apply(tokenize_data)

# Persist the frame, including the new 'tokenized' column, as a pickle.
data.to_pickle('../data/train/tokenized_train_data.pkl')

Tokenizing data:   0%|          | 0/159571 [00:00<?, ?it/s]

In [12]:
# Reload the tokenized frame from disk when the pickle is present; otherwise
# only report the missing file and leave `data` as it is.
tokenized_pickle = '../data/train/tokenized_train_data.pkl'
if not os.path.exists(tokenized_pickle):
    print("File not found. Please run the previous cell to create the file.")
else:
    data = pd.read_pickle(tokenized_pickle)

In [13]:
# Confirm the frame now carries the 'tokenized' column.
data.head(5)

Unnamed: 0,comment_text,hate,tokenized
0,Explanation Why the edits made under my userna...,0,"[input_ids, token_type_ids, attention_mask]"
1,Daww He matches this background colour Im seem...,0,"[input_ids, token_type_ids, attention_mask]"
2,Hey man Im really not trying to edit war Its j...,0,"[input_ids, token_type_ids, attention_mask]"
3,More I cant make any real suggestions on impr...,0,"[input_ids, token_type_ids, attention_mask]"
4,You sir are my hero Any chance you remember wh...,0,"[input_ids, token_type_ids, attention_mask]"


# Training of the model

Now we extract the input IDs and attention masks and store them in tensors.

In [14]:
# Rows whose tokenization failed hold None in 'tokenized'; discard them first.
data = data.dropna(subset=['tokenized'])

# Work on the first 1000 remaining rows only.
subset_train = data.iloc[:1000]

# Stack the per-row 'input_ids' / 'attention_mask' tensors along the first axis.
encodings = subset_train['tokenized'].tolist()
input_ids = torch.cat([encoding['input_ids'] for encoding in encodings])
attention_mask = torch.cat([encoding['attention_mask'] for encoding in encodings])
labels = torch.tensor(subset_train['hate'].values, dtype=torch.long)

In [15]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import numpy as np
from tqdm import tqdm

# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the number of epochs for training
num_epochs = 3

# `input_ids`, `attention_mask`, and `labels` are the tensors built in the
# previous cell.
dataset = TensorDataset(input_ids, attention_mask, labels)

# Split the dataset into training (90%) and validation (10%) sets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same rows land in train/validation on every run;
# otherwise the reported F1 score is not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Load the BERT model for sequence classification.
# It is kept under its own name so that `model` (the plain BertModel loaded
# earlier) is not overwritten: the embedding cell further down reads
# `last_hidden_state`, which a sequence-classification output does not have.
classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
classifier = classifier.to(device)  # Move the model to the appropriate device (CPU or GPU)

# Prepare optimizer and schedule (linear decay, no warmup steps).
# NOTE(review): lr=2e-4 is well above the 2e-5..5e-5 range commonly used for
# BERT fine-tuning - confirm this value is intentional.
optimizer = AdamW(classifier.parameters(), lr=2e-4)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(num_epochs):
    classifier.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)  # Move the batch to the appropriate device
        b_input_ids, b_attention_mask, b_labels = batch

        classifier.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = classifier(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        total_loss += loss.item()

        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not per epoch

    avg_train_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss}")


# Validation loop (runs once, after the final epoch)
classifier.eval()
predictions = []
true_labels = []

for batch in tqdm(val_loader, desc="Validating"):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_labels = batch

    with torch.no_grad():
        outputs = classifier(b_input_ids, attention_mask=b_attention_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Collect predictions and true labels for each batch
    batch_predictions = np.argmax(logits, axis=1).flatten()
    predictions.extend(batch_predictions)
    true_labels.extend(label_ids.flatten())

# Calculate the F1 score
f1 = f1_score(true_labels, predictions)
print(f"Validation F1 Score: {f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1:  21%|██        | 12/57 [08:23<31:29, 41.98s/it]


KeyboardInterrupt: 

Now we define the batch size.

Here are the pros and cons of each choice:

Smaller Batch Sizes:


Memory Efficiency: They require less memory, which means you can train the model even with limited GPU memory.

Regularization Effect: They can have a regularizing effect, helping to prevent overfitting. This is because smaller batches tend to add some noise to the learning process, which means that the model is less likely to learn the noise in the training data.

Frequent Updates: They result in more updates to the model parameters per epoch, which can lead to faster learning initially.

Convergence Quality: They may lead to better generalization in some cases since they explore the error landscape more thoroughly.

Stochastic Nature: They make the optimization process more stochastic, which can help the model to escape local minima.

Larger Batch Sizes:

Computational Efficiency: They make better use of GPU parallelization capabilities, leading to faster computation and training time per epoch.

Stable Gradient Estimates: They provide more accurate estimates of the gradient. However, this can sometimes lead to a model that converges to sharp minima that do not generalize as well.

Less Noise: They add less noise to the learning process, which can be a double-edged sword: it can lead to more stable and reliable convergence but can also result in getting stuck in local minima.

Memory Demands: They require more memory, which can be a limiting factor for machines with less RAM or GPU memory.

Generalization Capability: They might lead to poorer generalization on test data if the model overfits to the training data.



In [29]:
# Number of rows sent through the model per forward pass in the
# embedding-extraction loop below.
batch_size = 16 # You might need to adjust this depending on your GPU memory

Now we put the tokenized data through BERT and create the word embeddings.

In [30]:
# Use the bare encoder even when `model` carries a task head (for example a
# sequence classifier): only the encoder's output exposes `last_hidden_state`.
encoder = model.base_model
encoder.eval()  # disable dropout so repeated runs give the same embeddings
encoder_device = next(encoder.parameters()).device

last_hidden_states_batches = []

for i in tqdm(range(0, input_ids.size(0), batch_size), desc='Processing'):
    # Inputs must live on the same device as the encoder's weights.
    batch_input_ids = input_ids[i:i+batch_size].to(encoder_device)
    batch_attention_mask = attention_mask[i:i+batch_size].to(encoder_device)

    with torch.no_grad():
        batch_outputs = encoder(batch_input_ids, attention_mask=batch_attention_mask)

    # Move each result to the CPU right away so accelerator memory only ever
    # holds a single batch of hidden states.
    last_hidden_states_batches.append(batch_outputs.last_hidden_state.cpu())

# Combine the results from all batches
last_hidden_states = torch.cat(last_hidden_states_batches)

Processing:   0%|          | 0/9974 [00:00<?, ?it/s]

KeyboardInterrupt: 

Now we can save the hidden states for later use.

In [43]:
import datetime
import platform

def save_hidden_state_and_info(last_hidden_states, model_identifier, max_length, padding, truncation, base_filename):
    """Save a hidden-states tensor plus a text file describing how it was made.

    Two timestamped files are written to the OS-specific data directory:
    ``<base_filename>_hidden_states_<timestamp>.pt`` (via ``torch.save``) and
    ``<base_filename>_info_<timestamp>.txt``. If the tensor file is larger than
    100 MB the user is asked to confirm; answering anything other than "yes"
    deletes the tensor file and skips the info file.

    Args:
        last_hidden_states: Object to serialize with ``torch.save``.
        model_identifier: Model name written to the info file.
        max_length: Tokenizer ``max_length`` written to the info file.
        padding: Tokenizer ``padding`` setting written to the info file.
        truncation: Tokenizer ``truncation`` setting written to the info file.
        base_filename: Prefix for both output file names.

    Returns:
        None. On an unsupported operating system nothing is written.
    """
    # Get the current operating system
    system = platform.system()

    # Define base path according to the operating system
    if system == "Windows":
        base_path = os.path.join('..', 'data', 'train')
    elif system == "Linux":
        base_path = os.path.join(os.path.expanduser('~'), 'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch', '4 Semester', 'NLP', 'workbench')
    else:
        print("Unsupported operating system. You're on your own, sorry.")
        return

    # torch.save does not create missing directories.
    os.makedirs(base_path, exist_ok=True)

    # Prepare file names with timestamp
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    hidden_states_path = os.path.join(base_path, f'{base_filename}_hidden_states_{timestamp}.pt')
    info_path = os.path.join(base_path, f'{base_filename}_info_{timestamp}.txt')

    # The size is only known after writing, so save first and remove the file
    # again if the user declines to keep it.
    torch.save(last_hidden_states, hidden_states_path)
    file_size_mb = os.path.getsize(hidden_states_path) / (1024 * 1024)  # File size in MB

    print(f"File size of hidden states: {file_size_mb:.2f} MB")

    if file_size_mb > 100:
        user_input = input("The file exceeds 100MB. Do you still want to save it? (yes/no): ")
        if user_input.lower() != 'yes':
            os.remove(hidden_states_path)
            print("File not saved.")
            return

    # Reached for small files and for large files the user chose to keep.
    print(f"Saved hidden states to {hidden_states_path}")

    # Save the model info with dynamic parameters
    with open(info_path, 'w', encoding='utf-8') as f:
        f.write(f'Model: {model_identifier}\n')
        f.write(f'Tokenization: max_length={max_length}, padding="{padding}", truncation={truncation}\n')
    print(f"Saved model info to {info_path}")

# Example usage setup:
# last_hidden_states = # your tensor here
# model_identifier = 'bert-base-uncased'
# max_length = 512  # Example variable
# padding = 'max_length'  # Example variable
# truncation = True  # Example variable
# base_filename = 'my_model'

# To use the function, uncomment and populate the above example setup with real values, then call:
# save_hidden_state_and_info(last_hidden_states, model_identifier, max_length, padding, truncation, base_filename)
