In [64]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import re
import string
import torch
import torch.nn as nn

In [65]:
# Read the train and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [66]:
# Build a single text column per split: description first, then title, separated by one space
for frame in (train, test):
    frame["Combined"] = frame["Description"] + " " + frame["Title"]

In [67]:
# Inputs are the combined text column; targets are the "Class Index" column
X_train, y_train = train["Combined"], train["Class Index"]
X_test, y_test = test["Combined"], test["Class Index"]

In [68]:
# Patterns and the translation table are built once, not on every call.
_HTML_PATTERN = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
_URL_OR_MENTION_PATTERN = re.compile(r"(\w+:\/\/\S+)|(@[A-Za-z0-9]+)")
_NON_ALNUM_PATTERN = re.compile(r"[^0-9A-Za-z \t]")
_DIGITS_AND_PUNCTUATION = str.maketrans('', '', string.digits + string.punctuation)


def clean_text(text):
    """Normalise one raw document into lowercase, space-separated ASCII words.

    Steps, in order:
      1. delete HTML tags and HTML entities;
      2. replace URLs (``scheme://...``) and ``@username`` mentions with a space;
      3. delete digits and ASCII punctuation;
      4. replace any remaining character that is not an ASCII letter, space or tab
         with a space;
      5. collapse whitespace and lowercase.

    URLs and mentions must be handled *before* punctuation is stripped: once
    ``:``, ``/`` and ``@`` are gone they can no longer be recognised and would be
    left behind as a single mangled token.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or a pandas NaN
            produced by concatenating a missing field) are treated as empty.

    Returns:
        The cleaned string; ``""`` when nothing is left or the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML tags and entities from the text
    text = _HTML_PATTERN.sub('', text)

    # Remove URLs and usernames while their punctuation is still present
    text = _URL_OR_MENTION_PATTERN.sub(" ", text)

    # Remove digits and punctuation marks from the text
    text = text.translate(_DIGITS_AND_PUNCTUATION)

    # Replace any remaining special (non-ASCII-alphanumeric) characters with a space
    text = _NON_ALNUM_PATTERN.sub(" ", text)

    # Collapse runs of whitespace and convert to lowercase
    return ' '.join(text.split()).lower()

In [69]:
# Run clean_text over every document of both splits (train first, then test)
X_train, X_test = (texts.apply(clean_text) for texts in (X_train, X_test))

In [None]:
# Checkpoint name shared by the tokenizer and the encoder
model_name = "bert-base-uncased"

# Load the matching tokenizer and encoder for that checkpoint.
# output_hidden_states=True asks the model to return its hidden states as well.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name, output_hidden_states=True),
)

In [72]:
# Longest training document, measured as the number of ids tokenizer.encode returns for it.
# The value is used later as the fixed padding length.
# NOTE(review): nothing caps this at the model's maximum input length — confirm no document exceeds it.
token_counts = (len(tokenizer.encode(seq)) for seq in X_train)
max_token_length = max(token_counts)

## Training set

In [89]:
# Helper that turns one text into per-position BERT outputs
def get_bert_embeddings(text, max_length=max_token_length):
    """Encode `text`, run it through `model` and return the first model output as NumPy.

    The text is tokenized with special tokens and padded up to `max_length`; the ids are
    fed to the model as a batch of one, and the batch dimension is removed from the result.

    NOTE(review): the input is padded but no attention_mask is passed to the model, so the
    model falls back to its default mask — confirm padding positions are meant to be
    attended to. Inputs longer than `max_length` are not truncated here.

    Args:
        text: The document to embed.
        max_length: Padding length handed to the tokenizer (default fixed at definition time).

    Returns:
        NumPy array with one row per input position (presumably
        (sequence_length, hidden_size) — TODO confirm).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True, padding="max_length", max_length=max_length)
    # Wrap the id list so the model sees a batch containing a single sequence
    batch_of_one = torch.tensor([token_ids])
    # Inference only: no gradients are needed
    with torch.no_grad():
        model_output = model(batch_of_one)
    return model_output[0].squeeze(0).numpy()

CLS token

In [90]:
# Obtain BERT embeddings for the training set using the [CLS] token

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the training data (ceiling division)
num_texts = X_train.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the training embeddings
training_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_train.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Row 0 of the per-position output is the [CLS] position
        batch_embeddings[j, :] = get_bert_embeddings(text)[0]

    # Assign the batch embeddings to the corresponding rows of the training embeddings array
    training_embeddings[start_idx:end_idx, :] = batch_embeddings


In [91]:
# Persist the [CLS] training embeddings (np.save appends ".npy" to the name)
np.save("bert_train_embeddings_cls", arr=training_embeddings)

Mean - pooling

In [108]:
# Obtain BERT embeddings for the training set with mean pooling

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the training data (ceiling division)
num_texts = X_train.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the training embeddings
training_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_train.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Per-position embeddings for the current text
        embeddings = get_bert_embeddings(text)

        # Sentence embedding = mean over axis 0 (all returned positions).
        # NOTE(review): get_bert_embeddings pads to max_length, so padding rows are
        # included in this mean — confirm that is intended.
        batch_embeddings[j, :] = np.mean(embeddings, 0)

    # Assign the batch embeddings to the corresponding rows of the training embeddings array
    training_embeddings[start_idx:end_idx, :] = batch_embeddings

In [109]:
# Persist the mean-pooled training embeddings (np.save appends ".npy" to the name)
np.save("bert_train_embeddings_mean", arr=training_embeddings)

Max - pooling

In [13]:
# Obtain BERT embeddings for the training set with max pooling

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the training data (ceiling division)
num_texts = X_train.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the training embeddings
training_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_train.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Per-position embeddings for the current text
        embeddings = get_bert_embeddings(text)

        # Sentence embedding = element-wise maximum over axis 0 (all returned positions).
        # NOTE(review): get_bert_embeddings pads to max_length, so padding rows take
        # part in this maximum — confirm that is intended.
        batch_embeddings[j, :] = np.max(embeddings, axis=0)

    # Assign the batch embeddings to the corresponding rows of the training embeddings array
    training_embeddings[start_idx:end_idx, :] = batch_embeddings

In [14]:
# Persist the max-pooled training embeddings (np.save appends ".npy" to the name)
np.save("bert_train_embeddings_max", arr=training_embeddings)

Attention - pooling

In [114]:
# Obtain BERT embeddings for the training set with attention pooling

# Define an attention layer.
# The layer is randomly initialised and is not trained anywhere in this notebook, so the
# RNG is seeded first: otherwise every kernel restart would produce different embeddings.
torch.manual_seed(0)
attention_layer = nn.Linear(model.config.hidden_size, 1)

def get_attention_pooled_embedding(text):
    """Return one attention-pooled sentence vector for `text` as a NumPy array.

    The text is tokenized with special tokens and padded to `max_token_length`, then run
    through `model` with the tokenizer's attention mask. `attention_layer` scores every
    position, the scores are soft-maxed over the sequence axis, and the first model output
    is summed over positions with those weights.

    NOTE(review): the softmax runs over all positions, padding included — confirm that
    padding positions are meant to receive weight. Inputs longer than `max_token_length`
    are not truncated here.

    Args:
        text: The document to embed.

    Returns:
        1-D NumPy array (presumably of length hidden_size — TODO confirm).
    """
    tokens = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_token_length,
                                    padding='max_length', return_tensors='pt')

    # Inference only: keep the pooling inside no_grad as well so no autograd graph is built
    with torch.no_grad():
        # Use the tokenizer's own mask instead of assuming the padding id is 0
        hidden_states = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0]

        # Compute attention weights over the sequence axis
        attention_weights = torch.softmax(attention_layer(hidden_states), dim=1)

        # Apply attention pooling: weighted sum over positions, then drop the batch axis
        sentence_embedding = torch.sum(hidden_states * attention_weights, dim=1).squeeze()

    return sentence_embedding.numpy()

# Number of texts copied into the output array per outer-loop step.
# get_attention_pooled_embedding is still called once per text, so this is bookkeeping only.
batch_size = 512
num_texts = X_train.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size  # ceiling division

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the training embeddings
training_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_train.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # One attention-pooled sentence vector per text
        batch_embeddings[j, :] = get_attention_pooled_embedding(text)

    # Assign the batch embeddings to the corresponding rows of the training embeddings array
    training_embeddings[start_idx:end_idx, :] = batch_embeddings

In [115]:
# Persist the attention-pooled training embeddings (np.save appends ".npy" to the name)
np.save("bert_train_embeddings_attention", arr=training_embeddings)

Mixed - pooling

In [116]:
# Mixed pooling helper for the training set
def get_mixed_pooled_embedding(text):
    """Return the concatenation of a mean-pooled and a max-pooled slice of the model output.

    The text is tokenized with special tokens, truncated/padded to `max_token_length`, and
    run through `model` with a mask that is true wherever the token id is greater than 0.
    The first model output is then split along axis 1 at index 6: the first six entries are
    averaged, the remaining entries are max-pooled, and both results are concatenated.

    NOTE(review): axis 1 of the first model output is presumably the token-position axis,
    so this pools the first six *positions* versus the rest, not groups of layers, and the
    max-pooled slice can include padding positions — confirm this is the intended design.

    Args:
        text: The document to embed.

    Returns:
        torch.Tensor with a leading batch axis of size 1 (the two pooled vectors
        concatenated along axis 1).
    """
    encoded = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_token_length, truncation=True,
                                    padding='max_length', return_tensors='pt')
    input_ids = encoded['input_ids']
    # Inference only: no gradients are needed
    with torch.no_grad():
        hidden_states = model(input_ids, attention_mask=(input_ids > 0))[0]

    # Split along axis 1 at index 6
    head, tail = hidden_states[:, :6, :], hidden_states[:, 6:, :]

    # Mean over the first slice, element-wise max over the second, joined along the feature axis
    pooled_parts = (head.mean(dim=1), tail.max(dim=1).values)
    return torch.cat(pooled_parts, dim=1)

# Number of texts copied into the output array per outer-loop step.
# get_mixed_pooled_embedding is still called once per text, so this is bookkeeping only.
batch_size = 512
num_texts = X_train.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size  # ceiling division

# Mixed pooling concatenates two pooled vectors, so the width is twice the model's
# hidden size (derived from the loaded model instead of hard-coding 1536)
embedding_size = 2 * model.config.hidden_size

training_embeddings = np.empty((num_texts, embedding_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_train.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), embedding_size))

    for j, text in enumerate(batch_texts):
        # Obtain the sentence-level embedding using the get_mixed_pooled_embedding function
        sentence_embedding = get_mixed_pooled_embedding(text)

        # The helper returns a tensor with a leading batch axis; convert and flatten it
        # explicitly instead of relying on implicit conversion during assignment
        batch_embeddings[j, :] = np.asarray(sentence_embedding).reshape(-1)

    # Assign the batch embeddings to the corresponding rows of the training embeddings array
    training_embeddings[start_idx:end_idx, :] = batch_embeddings

In [117]:
# Persist the mixed-pooled training embeddings (np.save appends ".npy" to the name)
np.save("bert_train_embeddings_mixed", arr=training_embeddings)

## Test set

In [119]:
# Helper that turns one text into per-position BERT outputs
# (re-declares the helper with the same body as in the training-set section)
def get_bert_embeddings(text, max_length=max_token_length):
    """Encode `text`, run it through `model` and return the first model output as NumPy.

    The text is tokenized with special tokens and padded up to `max_length`; the ids are
    fed to the model as a batch of one, and the batch dimension is removed from the result.

    NOTE(review): the input is padded but no attention_mask is passed to the model, so the
    model falls back to its default mask — confirm padding positions are meant to be
    attended to. Inputs longer than `max_length` are not truncated here.

    Args:
        text: The document to embed.
        max_length: Padding length handed to the tokenizer (default fixed at definition time).

    Returns:
        NumPy array with one row per input position (presumably
        (sequence_length, hidden_size) — TODO confirm).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True, padding="max_length", max_length=max_length)
    # Wrap the id list so the model sees a batch containing a single sequence
    batch_of_one = torch.tensor([token_ids])
    # Inference only: no gradients are needed
    with torch.no_grad():
        model_output = model(batch_of_one)
    return model_output[0].squeeze(0).numpy()

CLS Token

In [120]:
# Obtain BERT embeddings for the test set using the [CLS] token

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the test data (ceiling division)
num_texts = X_test.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the test embeddings
test_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_test.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Row 0 of the per-position output is the [CLS] position
        batch_embeddings[j, :] = get_bert_embeddings(text)[0]

    # Assign the batch embeddings to the corresponding rows of the test embeddings array
    test_embeddings[start_idx:end_idx, :] = batch_embeddings

In [121]:
# Persist the [CLS] test embeddings (np.save appends ".npy" to the name)
np.save("bert_test_embeddings_cls", arr=test_embeddings)

Mean - pooling

In [123]:
# Obtain BERT embeddings for the test set with mean pooling

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the test data (ceiling division)
num_texts = X_test.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the test embeddings
test_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_test.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Per-position embeddings for the current text
        embeddings = get_bert_embeddings(text)

        # Sentence embedding = mean over axis 0 (all returned positions).
        # NOTE(review): get_bert_embeddings pads to max_length, so padding rows are
        # included in this mean — confirm that is intended.
        batch_embeddings[j, :] = np.mean(embeddings, 0)

    # Assign the batch embeddings to the corresponding rows of the test embeddings array
    test_embeddings[start_idx:end_idx, :] = batch_embeddings

In [124]:
# Persist the mean-pooled test embeddings (np.save appends ".npy" to the name)
np.save("bert_test_embeddings_mean", arr=test_embeddings)

Max - pooling

In [24]:
# Obtain BERT embeddings for the test set with max pooling

# Number of texts copied into the output array per outer-loop step.
# get_bert_embeddings is still called once per text, so this is bookkeeping only.
batch_size = 512

# Calculate the number of batches required to process the test data (ceiling division)
num_texts = X_test.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the test embeddings
test_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_test.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # Per-position embeddings for the current text
        embeddings = get_bert_embeddings(text)

        # Sentence embedding = element-wise maximum over axis 0 (all returned positions).
        # NOTE(review): get_bert_embeddings pads to max_length, so padding rows take
        # part in this maximum — confirm that is intended.
        batch_embeddings[j, :] = np.max(embeddings, axis=0)

    # Assign the batch embeddings to the corresponding rows of the test embeddings array
    test_embeddings[start_idx:end_idx, :] = batch_embeddings

In [25]:
# Persist the max-pooled test embeddings (np.save appends ".npy" to the name)
np.save("bert_test_embeddings_max", arr=test_embeddings)

Attention - pooling

In [126]:
# Obtain BERT embeddings for the test set with attention pooling

# Reuse the attention layer created in the training-set attention pooling cell.
# That layer is randomly initialised and untrained, so building a fresh nn.Linear here
# would pool the test set with different weights than the training set and make the
# two sets of embeddings incomparable.
if "attention_layer" not in globals():
    raise RuntimeError(
        "attention_layer is not defined: run the training-set attention pooling cell first "
        "so that train and test embeddings share the same pooling weights"
    )

def get_attention_pooled_embedding(text):
    """Return one attention-pooled sentence vector for `text` as a NumPy array.

    The text is tokenized with special tokens and padded to `max_token_length`, then run
    through `model` with the tokenizer's attention mask. `attention_layer` scores every
    position, the scores are soft-maxed over the sequence axis, and the first model output
    is summed over positions with those weights.

    NOTE(review): the softmax runs over all positions, padding included — confirm that
    padding positions are meant to receive weight. Inputs longer than `max_token_length`
    are not truncated here.

    Args:
        text: The document to embed.

    Returns:
        1-D NumPy array (presumably of length hidden_size — TODO confirm).
    """
    tokens = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_token_length,
                                    padding='max_length', return_tensors='pt')

    # Inference only: keep the pooling inside no_grad as well so no autograd graph is built
    with torch.no_grad():
        # Use the tokenizer's own mask instead of assuming the padding id is 0
        hidden_states = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])[0]

        # Compute attention weights over the sequence axis
        attention_weights = torch.softmax(attention_layer(hidden_states), dim=1)

        # Apply attention pooling: weighted sum over positions, then drop the batch axis
        sentence_embedding = torch.sum(hidden_states * attention_weights, dim=1).squeeze()

    return sentence_embedding.numpy()

# Number of texts copied into the output array per outer-loop step.
# get_attention_pooled_embedding is still called once per text, so this is bookkeeping only.
batch_size = 512
num_texts = X_test.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size  # ceiling division

# Take the embedding width from the loaded model instead of hard-coding 768,
# so the cell keeps working if model_name is changed
hidden_size = model.config.hidden_size

# Create an empty array to store the test embeddings
test_embeddings = np.empty((num_texts, hidden_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_test.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), hidden_size))

    for j, text in enumerate(batch_texts):
        # One attention-pooled sentence vector per text
        batch_embeddings[j, :] = get_attention_pooled_embedding(text)

    # Assign the batch embeddings to the corresponding rows of the test embeddings array
    test_embeddings[start_idx:end_idx, :] = batch_embeddings

In [127]:
# Persist the attention-pooled test embeddings (np.save appends ".npy" to the name)
np.save("bert_test_embeddings_attention", arr=test_embeddings)

Mixed - pooling

In [128]:
# Mixed pooling helper for the test set
# (re-declares the helper with the same body as in the training-set section)
def get_mixed_pooled_embedding(text):
    """Return the concatenation of a mean-pooled and a max-pooled slice of the model output.

    The text is tokenized with special tokens, truncated/padded to `max_token_length`, and
    run through `model` with a mask that is true wherever the token id is greater than 0.
    The first model output is then split along axis 1 at index 6: the first six entries are
    averaged, the remaining entries are max-pooled, and both results are concatenated.

    NOTE(review): axis 1 of the first model output is presumably the token-position axis,
    so this pools the first six *positions* versus the rest, not groups of layers, and the
    max-pooled slice can include padding positions — confirm this is the intended design.

    Args:
        text: The document to embed.

    Returns:
        torch.Tensor with a leading batch axis of size 1 (the two pooled vectors
        concatenated along axis 1).
    """
    encoded = tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_token_length, truncation=True,
                                    padding='max_length', return_tensors='pt')
    input_ids = encoded['input_ids']
    # Inference only: no gradients are needed
    with torch.no_grad():
        hidden_states = model(input_ids, attention_mask=(input_ids > 0))[0]

    # Split along axis 1 at index 6
    head, tail = hidden_states[:, :6, :], hidden_states[:, 6:, :]

    # Mean over the first slice, element-wise max over the second, joined along the feature axis
    pooled_parts = (head.mean(dim=1), tail.max(dim=1).values)
    return torch.cat(pooled_parts, dim=1)

# Number of texts copied into the output array per outer-loop step.
# get_mixed_pooled_embedding is still called once per text, so this is bookkeeping only.
batch_size = 512
num_texts = X_test.shape[0]
num_batches = (num_texts + batch_size - 1) // batch_size  # ceiling division

# Mixed pooling concatenates two pooled vectors, so the width is twice the model's
# hidden size (derived from the loaded model instead of hard-coding 1536)
embedding_size = 2 * model.config.hidden_size

test_embeddings = np.empty((num_texts, embedding_size))

# Iterate over the batches
for i in range(num_batches):
    # Determine the start and end indices for the current batch
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, num_texts)

    # Extract the texts for the current batch (explicitly positional)
    batch_texts = X_test.iloc[start_idx:end_idx]

    # Buffer for the embeddings of the current batch
    batch_embeddings = np.empty((len(batch_texts), embedding_size))

    for j, text in enumerate(batch_texts):
        # Obtain the sentence-level embedding using the get_mixed_pooled_embedding function
        sentence_embedding = get_mixed_pooled_embedding(text)

        # The helper returns a tensor with a leading batch axis; convert and flatten it
        # explicitly instead of relying on implicit conversion during assignment
        batch_embeddings[j, :] = np.asarray(sentence_embedding).reshape(-1)

    # Assign the batch embeddings to the corresponding rows of the test embeddings array
    test_embeddings[start_idx:end_idx, :] = batch_embeddings

In [129]:
# Persist the mixed-pooled test embeddings (np.save appends ".npy" to the name)
np.save("bert_test_embeddings_mixed", arr=test_embeddings)