In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets transformers
!pip install datasets transformers accelerate



In [3]:
import os
os.chdir("/content/drive/My Drive/nlp_hw4")

In [4]:
import os
import time
import random
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
nltk.download('punkt')
download('stopwords')
download('wordnet')
download('omw-1.4')
random.seed(42)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
!pip install tqdm



In [6]:
from tqdm import tqdm

# read data

In [7]:
train_data_np = np.load('/content/drive/My Drive/IMDB/x_train.npy', allow_pickle=True)
val_data_np = np.load('/content/drive/My Drive/IMDB/x_val.npy', allow_pickle=True)
test_data_np = np.load('/content/drive/My Drive/IMDB/x_test.npy', allow_pickle=True)
unsupervised_data_np = np.load('/content/drive/My Drive/IMDB/unsupervised.npy', allow_pickle=True)

df_train = pd.DataFrame(train_data_np)
df_val = pd.DataFrame(val_data_np)
df_test = pd.DataFrame(test_data_np, columns=["sentence"])

labels_train = pd.read_csv('/content/drive/My Drive/IMDB/y_train.txt', header=None)
labels_val = pd.read_csv('/content/drive/My Drive/IMDB/y_val.txt', header=None)

df_train_combined = pd.concat([df_train, labels_train], axis=1)
df_train_combined.columns = ["sentence", "label"]
df_train_combined['label'] = df_train_combined['label'].astype(int)

df_val_combined = pd.concat([df_val, labels_val], axis=1)
df_val_combined.columns = ["sentence", "label"]
df_val_combined['label'] = df_val_combined['label'].astype(int)

df_unsupervised = pd.DataFrame(unsupervised_data_np, columns=["sentence"])




In [8]:
def split_into_sentences(text):
    return sent_tokenize(text)

split_sentences = []
for text in df_unsupervised['sentence']:
    split_sentences.extend(split_into_sentences(text))

df_split_sentences = pd.DataFrame(split_sentences, columns=["sentence"])


# 1 preprocess

In [9]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords

def preprocess_text_data_fast(df, text_column):

    stops = set(stopwords.words('english'))
    processed_texts = []

    for text in df[text_column]:
        text = text.lower()
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = text.split()
        filtered_words = [word for word in words if word not in stops]
        cleaned_text = ' '.join(filtered_words)
        processed_texts.append(cleaned_text)

    # Create a new column in the DataFrame for the processed text
    df[text_column + '_clean'] = processed_texts
    return df



In [10]:
df_sentences_processed = preprocess_text_data_fast(df_split_sentences, 'sentence')
df_train_processed = preprocess_text_data_fast(df_train_combined, 'sentence')
df_val_processed = preprocess_text_data_fast(df_val_combined, 'sentence')
df_test_processed = preprocess_text_data_fast(df_test, 'sentence')




# 2 BPE

In [11]:
def save_text_to_file(dataframe, text_column, file_path):
    """
    Save text data from a DataFrame column to a text file.

    Args:
    dataframe (pd.DataFrame): The DataFrame containing the text data.
    text_column (str): The name of the column containing the text data.
    file_path (str): The file path where the text data will be saved.
    """
    text_data = dataframe[text_column].tolist()
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{line}\n" for line in text_data)

output_file_path = '/content/drive/My Drive/nlp_hw4/unsupervised_data.txt'
save_text_to_file(df_sentences_processed, 'sentence_clean', output_file_path)

In [12]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# initialize a Byte-Pair Encoding (BPE) model tokenizer
bpe_tokenizer = Tokenizer(BPE())

# Define a BPE trainer with specific special tokens and vocab size
vocab_size = 30000  # Set the desired vocabulary size
bpe_trainer = BpeTrainer(special_tokens=["<UNK>", "<CLS>", "<SEP>", "<PAD>", "<MASK>"], vocab_size=vocab_size)

# set the tokenizer to use whitespace pre-tokenization
bpe_tokenizer.pre_tokenizer = Whitespace()

# train the tokenizer on the preprocessed unsupervised data file
unsupervised_data_file = "/content/drive/My Drive/nlp_hw4/unsupervised_data.txt"
bpe_tokenizer.train(files=[unsupervised_data_file], trainer=bpe_trainer)

tokenizer_path = "/content/drive/My Drive/nlp_hw4/tokenizer.json"
bpe_tokenizer.save(tokenizer_path)

# 3 encoder

In [13]:
def encode_text_column(df, text_column, tokenizer):

    tokens_list = []
    ids_list = []

    for text in df[text_column]:
        encoded = tokenizer.encode(text)
        tokens_list.append(encoded.tokens)
        ids_list.append(encoded.ids)

    df[text_column + "_tokens"] = tokens_list
    df[text_column + "_ids"] = ids_list
    return df

df_train_encoded = encode_text_column(df_train_processed, 'sentence_clean', bpe_tokenizer)
df_val_encoded = encode_text_column(df_val_processed, 'sentence_clean', bpe_tokenizer)




In [14]:
def calculate_token_lengths(df, tokens_column):

    return df[tokens_column].apply(len)

token_lengths_train = calculate_token_lengths(df_train_encoded, 'sentence_clean_tokens')

print(token_lengths_train.head())


0     39
1    356
2     87
3     54
4    252
Name: sentence_clean_tokens, dtype: int64


In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataSequence(Dataset):

    def __init__(self, dataframe, max_seq_len=150):
        self.dataframe = dataframe
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        label = self.dataframe.iloc[index]['label']
        token_ids = self.dataframe.iloc[index]['sentence_clean_ids']

        # ensure the sequence is of maximum length
        if len(token_ids) > self.max_seq_len:
            token_ids = token_ids[:self.max_seq_len]
        else:
            token_ids += [0] * (self.max_seq_len - len(token_ids))

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataSequence(df_train_encoded)
validation_dataset = TextDataSequence(df_val_encoded)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[ 1065,  1065,  1233,  ...,     0,     0,     0],
        [ 1173,   537,   182,  ...,     0,     0,     0],
        [ 2462,  1866,  3967,  ...,     0,     0,     0],
        ...,
        [16770,  3465, 16770,  ..., 18618, 10388,  3411],
        [  361,  1725,   888,  ...,   878,   349,  1802],
        [  568,   674,  2403,  ...,     0,     0,     0]]), 'label': tensor([1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0])}


In [33]:

embedding_dimension = 128
batch_size = 64
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#learning_rate = 1e-6
#hidden_layer_size = 512
num_classes = 2
vocab_size = bpe_tokenizer.get_vocab_size()
num_attention_heads = 8
feedforward_dim = 1028
transformer_layers = 2
dropout_rate = 0.1

In [34]:
import torch
import torch.nn as nn

class TextTransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, n_heads, ff_hidden_dim, n_layers, dropout_rate):
        super(TextTransformerModel, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embedding_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=n_heads, dim_feedforward=ff_hidden_dim, dropout=dropout_rate)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, input_ids):
        # token and positional embeddings
        seq_len = input_ids.size(1)
        embeddings = self.token_embedding(input_ids) + self.positional_encoding[:, :seq_len, :]

        # transformer encoding
        transformer_output = self.transformer_encoder(embeddings)

        # global average pooling
        pooled_output = transformer_output.mean(dim=1)

        # classification layer
        logits = self.classifier(pooled_output)
        return logits

model = TextTransformerModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dimension,
    num_classes=num_classes,
    n_heads=num_attention_heads,
    ff_hidden_dim=feedforward_dim,
    n_layers=transformer_layers,
    dropout_rate = dropout_rate
)

example_input = torch.randint(0, vocab_size, (32, 50))

# forward pass
output_logits = model(example_input)
print(output_logits.shape)  # expected output shape: (32, 2)


torch.Size([32, 2])


In [35]:
def compute_accuracy(predictions, targets):

    correct_preds = (predictions.argmax(dim=1) == targets).sum().item()
    accuracy = correct_preds / len(targets)
    return accuracy

def train_model(model, dataloader, optimizer, loss_function):
    total_loss = 0
    total_accuracy = 0
    model.train()

    progress_bar = tqdm(total=len(dataloader), desc="Training", unit="batch")

    for batch in dataloader:
        optimizer.zero_grad()

        inputs = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(inputs).squeeze(1)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        accuracy = compute_accuracy(outputs.cpu(), labels.cpu())
        total_loss += loss.item()
        total_accuracy += accuracy

        progress_bar.update(1)

    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)

    return average_loss, average_accuracy

def evaluate_model(model, dataloader, loss_function):
    total_loss = 0
    total_accuracy = 0
    model.eval()

    progress_bar = tqdm(total=len(dataloader), desc="Evaluating", unit="batch")

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(inputs).squeeze(1)
            loss = loss_function(outputs, labels)

            accuracy = compute_accuracy(outputs.cpu(), labels.cpu())
            total_loss += loss.item()
            total_accuracy += accuracy

            progress_bar.update(1)

    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)

    return average_loss, average_accuracy


In [36]:
model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

validation_loss, validation_accuracy = evaluate_model(model, validation_loader, loss_function)
print(f'\nValidation Loss: {validation_loss:.2f} | Validation Accuracy: {validation_accuracy * 100:.2f}%')


Evaluating: 100%|██████████| 196/196 [00:03<00:00, 50.62batch/s]


Validation Loss: 0.70 | Validation Accuracy: 50.03%





In [37]:
# initialize the best validation loss to infinity
lowest_val_loss = float('inf')

# training loop
for epoch in range(epochs):
    start_time = time.time()

    training_loss, training_accuracy = train_model(model, train_loader, optimizer, loss_function)

    validation_loss, validation_accuracy = evaluate_model(model, validation_loader, loss_function)

    # save the model if the validation loss is the best we've seen so far
    if validation_loss < lowest_val_loss:
        torch.save(model.state_dict(), '/content/drive/My Drive/nlp_hw4/best_model.pt')
        lowest_val_loss = validation_loss

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train Loss: {training_loss:.2f} | Train Accuracy: {training_accuracy * 100:.2f}%')
    print(f'Validation Loss: {validation_loss:.2f} | Validation Accuracy: {validation_accuracy * 100:.2f}%')
    print(f'Epoch Time: {time.time() - start_time:.2f} seconds')


Training: 100%|██████████| 391/391 [00:13<00:00, 28.71batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 60.90batch/s]


Epoch 1/10
Train Loss: 0.57 | Train Accuracy: 69.01%
Validation Loss: 0.48 | Validation Accuracy: 77.78%
Epoch Time: 16.92 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.37batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 61.08batch/s]


Epoch 2/10
Train Loss: 0.41 | Train Accuracy: 81.67%
Validation Loss: 0.43 | Validation Accuracy: 81.15%
Epoch Time: 17.07 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.44batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 54.10batch/s]


Epoch 3/10
Train Loss: 0.34 | Train Accuracy: 85.59%
Validation Loss: 0.43 | Validation Accuracy: 81.66%
Epoch Time: 17.39 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.54batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 49.61batch/s]


Epoch 4/10
Train Loss: 0.29 | Train Accuracy: 87.93%
Validation Loss: 0.43 | Validation Accuracy: 82.04%
Epoch Time: 17.67 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.60batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 60.15batch/s]


Epoch 5/10
Train Loss: 0.25 | Train Accuracy: 90.09%
Validation Loss: 0.43 | Validation Accuracy: 83.02%
Epoch Time: 16.94 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.72batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 60.41batch/s]


Epoch 6/10
Train Loss: 0.20 | Train Accuracy: 91.84%
Validation Loss: 0.44 | Validation Accuracy: 83.01%
Epoch Time: 16.88 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.54batch/s]
Evaluating: 100%|██████████| 196/196 [00:04<00:00, 47.49batch/s]


Epoch 7/10
Train Loss: 0.18 | Train Accuracy: 93.13%
Validation Loss: 0.48 | Validation Accuracy: 83.05%
Epoch Time: 17.84 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.83batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 55.40batch/s]


Epoch 8/10
Train Loss: 0.14 | Train Accuracy: 94.65%
Validation Loss: 0.54 | Validation Accuracy: 82.99%
Epoch Time: 17.11 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.64batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 60.88batch/s]


Epoch 9/10
Train Loss: 0.12 | Train Accuracy: 95.53%
Validation Loss: 0.55 | Validation Accuracy: 83.44%
Epoch Time: 16.89 seconds


Training: 100%|██████████| 391/391 [00:13<00:00, 28.83batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 60.81batch/s]

Epoch 10/10
Train Loss: 0.10 | Train Accuracy: 96.43%
Validation Loss: 0.63 | Validation Accuracy: 83.31%
Epoch Time: 16.80 seconds





# 4 predict ofr test

In [46]:
print(type(df_test_processed))
print(df_test_processed[:1])

<class 'pandas.core.frame.DataFrame'>
                                            sentence  \
0  <br /><br />I'm not sure who decides what cate...   

                                      sentence_clean  \
0  im sure decides category movie fits movie horr...   

                               sentence_clean_tokens  \
0  [im, sure, decides, category, movie, fits, mov...   

                                  sentence_clean_ids  
0  [176, 693, 2650, 5193, 182, 3759, 182, 636, 18...  


In [47]:
# Modify TextDataSequence to handle test data without labels
class TextDataSequence(Dataset):
    """
    PyTorch Dataset for text data sequences.

    Args:
    dataframe (pd.DataFrame): DataFrame containing the text data and optionally labels.
    max_seq_len (int): Maximum length of token sequences.
    has_labels (bool): Whether the dataframe contains labels.
    """
    def __init__(self, dataframe, max_seq_len=150, has_labels=True):
        self.dataframe = dataframe
        self.max_seq_len = max_seq_len
        self.has_labels = has_labels

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        token_ids = self.dataframe.iloc[index]['sentence_clean_ids']

        # ensure the sequence is of maximum length
        if len(token_ids) > self.max_seq_len:
            token_ids = token_ids[:self.max_seq_len]
        else:
            token_ids += [0] * (self.max_seq_len - len(token_ids))

        item = {
            'input_ids': torch.tensor(token_ids, dtype=torch.long)
        }

        if self.has_labels:
            label = self.dataframe.iloc[index]['label']
            item['label'] = torch.tensor(label, dtype=torch.long)

        return item

# encode the test data using the BPE tokenizer
df_test_encoded = encode_text_column(df_test_processed, 'sentence_clean', bpe_tokenizer)

test_dataset = TextDataSequence(df_test_encoded, has_labels=False)

batch_size = 64  
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.to(device)

def predict_model(model, dataloader):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=1)
            all_predictions.extend(predictions.cpu().numpy())
    return all_predictions

test_predictions = predict_model(model, test_loader)

with open('/content/drive/My Drive/nlp_hw4/y_test.txt', 'w') as file:
    for prediction in test_predictions:
        file.write(f"{float(prediction)}\n")


# 5 training bert

In [48]:
from transformers import BertConfig, BertForMaskedLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

# load the BPE tokenizer you trained and saved
tokenizer_path = "/content/drive/My Drive/nlp_hw4/tokenizer.json"
bpe_tokenizer = Tokenizer.from_file(tokenizer_path)

# define BERT configuration with specified parameters
bert_config = BertConfig(
    vocab_size=bpe_tokenizer.get_vocab_size(),  # This should be the same as tokenizer's vocab size
    hidden_size=128,  # embedding size
    num_hidden_layers=4,  # number of transformer layers
    num_attention_heads=4,  # number of attention heads
    intermediate_size=512,  # size of the feedforward layer
    max_position_embeddings=128  # this should match the max sequence length
)

# initialize the BERT model for Masked Language Modeling
bert_model = BertForMaskedLM(config=bert_config)

# tokenize the text data using the BPE tokenizer
def tokenize_function(examples):
    return {'input_ids': [bpe_tokenizer.encode(text).ids for text in examples['sentence_clean']]}

# convert DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train_processed[['sentence_clean']])
val_dataset = Dataset.from_pandas(df_val_processed[['sentence_clean']])

# apply the tokenization function to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=['sentence_clean'])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=['sentence_clean'])

# define a custom data collator for MLM
class CustomDataCollator:
    def __init__(self, tokenizer, mlm=True, mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability
        self.pad_token_id = tokenizer.token_to_id('<PAD>')
        self.mask_token_id = tokenizer.token_to_id('<MASK>')

    def __call__(self, examples):
        input_ids = [ex['input_ids'] for ex in examples]
        input_ids = self.pad_sequences(input_ids, 128)
        inputs, labels = self.mask_tokens(torch.tensor(input_ids, dtype=torch.long))
        return {'input_ids': inputs, 'labels': labels}

    def pad_sequences(self, sequences, max_length):
        padded_sequences = []
        for seq in sequences:
            if len(seq) > max_length:
                seq = seq[:max_length]
            else:
                seq = seq + [self.pad_token_id] * (max_length - len(seq))
            padded_sequences.append(seq)
        return padded_sequences

    def mask_tokens(self, inputs):
        labels = inputs.clone()
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        special_tokens_mask = self.get_special_tokens_mask(labels.tolist())
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # we only compute loss on masked tokens
        inputs[masked_indices] = self.mask_token_id
        return inputs, labels

    def get_special_tokens_mask(self, input_ids):
        # manually create the special tokens mask
        special_tokens_mask = []
        for ids in input_ids:
            mask = [1 if token in [self.pad_token_id, self.mask_token_id] else 0 for token in ids]
            special_tokens_mask.append(mask)
        return special_tokens_mask

data_collator = CustomDataCollator(tokenizer=bpe_tokenizer, mlm=True, mlm_probability=0.15)

#  training arguments:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=200,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=bert_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

model_save_path = '/content/drive/My Drive/nlp_hw4/bert-small-mlm'
bert_model.save_pretrained(model_save_path)
bpe_tokenizer.save(model_save_path + '/bpe_tokenizer.json')

#function to count the number of parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_params = count_parameters(bert_model)
print(f'The model has {num_params:,} trainable parameters')


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
500,9.1413,8.758295
1000,8.5138,8.501783
1500,8.4934,8.467978
2000,8.4648,8.446367
2500,8.4611,8.454931
3000,8.448,8.449119
3500,8.4462,8.439943
4000,8.4373,8.443792
4500,8.447,8.435773
5000,8.4453,8.431726


The model has 4,696,752 trainable parameters


In [49]:
model_save_path = '/content/drive/My Drive/nlp_hw4/bert-small-mlm'
bert_model.save_pretrained(model_save_path)
bpe_tokenizer.save(model_save_path + '/bpe_tokenizer.json')


# measurment score for custom bert:

In [50]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

model_save_path = '/content/drive/My Drive/nlp_hw4/bert-small-mlm'
bert_model = BertForMaskedLM.from_pretrained(model_save_path)
bpe_tokenizer = Tokenizer.from_file(model_save_path + '/bpe_tokenizer.json')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

class TextDatasetWithAttention(Dataset):

    def __init__(self, dataframe, max_seq_len=128, pad_token_id=0):
        self.dataframe = dataframe
        self.max_seq_len = max_seq_len
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        label = self.dataframe.iloc[index]['label']
        token_ids = self.dataframe.iloc[index]['sentence_clean_ids']

        # ensure the sequence is of maximum length
        if len(token_ids) > self.max_seq_len:
            token_ids = token_ids[:self.max_seq_len]
        else:
            token_ids += [self.pad_token_id] * (self.max_seq_len - len(token_ids))

        attention_mask = [1 if token_id != self.pad_token_id else 0 for token_id in token_ids]

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

# use the pad_token_id from the tokenizer
pad_token_id = bpe_tokenizer.token_to_id("<PAD>") if "<PAD>" in bpe_tokenizer.get_vocab() else 0

validation_dataset = TextDatasetWithAttention(df_val_encoded, pad_token_id=pad_token_id)

batch_size = 64
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

def compute_accuracy(predictions, labels):
    _, preds = torch.max(predictions, dim=1)
    correct = (preds == labels).sum().item()
    accuracy = correct / len(labels)
    return accuracy

def evaluate_accuracy(model, dataloader):
    model.eval()
    total_accuracy = 0
    total_examples = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=inputs, attention_mask=attention_mask).logits

            predictions = outputs.argmax(dim=1)

            accuracy = compute_accuracy(predictions, labels)
            total_accuracy += accuracy * len(labels)
            total_examples += len(labels)

    avg_accuracy = total_accuracy / total_examples
    return avg_accuracy

validation_accuracy = evaluate_accuracy(bert_model, validation_loader)
print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')


Evaluating: 100%|██████████| 196/196 [00:08<00:00, 22.91it/s]

Validation Accuracy: 0.44%





# 6 fully connected on BERT

In [52]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


class TextDataset(Dataset):
    def __init__(self, dataframe, max_seq_len=128):
        self.dataframe = dataframe
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        label = self.dataframe.iloc[index]['label']
        token_ids = self.dataframe.iloc[index]['sentence_clean_ids']

        # ensure the sequence is of maximum length
        if len(token_ids) > self.max_seq_len:
            token_ids = token_ids[:self.max_seq_len]
        else:
            token_ids += [0] * (self.max_seq_len - len(token_ids))

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataset(df_train_encoded)
validation_dataset = TextDataset(df_val_encoded)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # extract the [CLS] token output
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits

# load the trained BERT model
model_path = '/content/drive/My Drive/nlp_hw4/bert-small-mlm'
bert_model = BertModel.from_pretrained(model_path)

# freeze BERT model parameters
for param in bert_model.parameters():
    param.requires_grad = False

# instantiate the classifier with the BERT model
num_classes = 2  # For binary classification
classifier_model = BertClassifier(bert_model, num_classes)

classifier_model.to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier_model.parameters(), lr=2e-5)

def compute_accuracy(predictions, targets):
    correct_preds = (predictions.argmax(dim=1) == targets).sum().item()
    accuracy = correct_preds / len(targets)
    return accuracy

# Training function for the classifier
def train_classifier(model, dataloader, optimizer, loss_function):
    total_loss = 0
    total_accuracy = 0
    model.train()

    progress_bar = tqdm(total=len(dataloader), desc="Training", unit="batch")

    for batch in dataloader:
        optimizer.zero_grad()

        inputs = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        outputs = model(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        accuracy = compute_accuracy(outputs.cpu(), labels.cpu())
        total_loss += loss.item()
        total_accuracy += accuracy

        progress_bar.update(1)

    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)

    return average_loss, average_accuracy

def evaluate_classifier(model, dataloader, loss_function):
    total_loss = 0
    total_accuracy = 0
    model.eval()

    progress_bar = tqdm(total=len(dataloader), desc="Evaluating", unit="batch")

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            outputs = model(inputs)
            loss = loss_function(outputs, labels)

            accuracy = compute_accuracy(outputs.cpu(), labels.cpu())
            total_loss += loss.item()
            total_accuracy += accuracy

            progress_bar.update(1)

    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)

    return average_loss, average_accuracy

epochs = 10
lowest_val_loss = float('inf')

for epoch in range(epochs):
    start_time = time.time()

    training_loss, training_accuracy = train_classifier(classifier_model, train_loader, optimizer, loss_function)

    validation_loss, validation_accuracy = evaluate_classifier(classifier_model, validation_loader, loss_function)

    if validation_loss < lowest_val_loss:
        torch.save(classifier_model.state_dict(), '/content/drive/My Drive/nlp_hw4/best_classifier_model.pt')
        lowest_val_loss = validation_loss

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train Loss: {training_loss:.2f} | Train Accuracy: {training_accuracy * 100:.2f}%')
    print(f'Validation Loss: {validation_loss:.2f} | Validation Accuracy: {validation_accuracy * 100:.2f}%')
    print(f'Epoch Time: {time.time() - start_time:.2f} seconds')


Some weights of BertModel were not initialized from the model checkpoint at /content/drive/My Drive/nlp_hw4/bert-small-mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 391/391 [00:15<00:00, 25.84batch/s]
Evaluating: 100%|██████████| 196/196 [00:04<00:00, 41.84batch/s]


Epoch 1/10
Train Loss: 0.72 | Train Accuracy: 51.18%
Validation Loss: 0.69 | Validation Accuracy: 54.34%
Epoch Time: 19.91 seconds


Training: 100%|██████████| 391/391 [00:14<00:00, 26.10batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 54.48batch/s]


Epoch 2/10
Train Loss: 0.69 | Train Accuracy: 55.38%
Validation Loss: 0.66 | Validation Accuracy: 60.33%
Epoch Time: 18.67 seconds


Training: 100%|██████████| 391/391 [00:15<00:00, 25.84batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 52.33batch/s]


Epoch 3/10
Train Loss: 0.62 | Train Accuracy: 64.98%
Validation Loss: 0.55 | Validation Accuracy: 72.54%
Epoch Time: 19.00 seconds


Training: 100%|██████████| 391/391 [00:15<00:00, 25.63batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 51.67batch/s]


Epoch 4/10
Train Loss: 0.44 | Train Accuracy: 79.50%
Validation Loss: 0.41 | Validation Accuracy: 82.24%
Epoch Time: 19.15 seconds


Training: 100%|██████████| 391/391 [00:14<00:00, 26.19batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 56.07batch/s]


Epoch 5/10
Train Loss: 0.31 | Train Accuracy: 87.30%
Validation Loss: 0.38 | Validation Accuracy: 83.93%
Epoch Time: 18.51 seconds


Training: 100%|██████████| 391/391 [00:14<00:00, 26.12batch/s]
Evaluating: 100%|██████████| 196/196 [00:04<00:00, 41.98batch/s]


Epoch 6/10
Train Loss: 0.24 | Train Accuracy: 90.14%
Validation Loss: 0.38 | Validation Accuracy: 84.60%
Epoch Time: 19.66 seconds


Training: 100%|██████████| 391/391 [00:14<00:00, 26.38batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 55.93batch/s]


Epoch 7/10
Train Loss: 0.20 | Train Accuracy: 92.50%
Validation Loss: 0.40 | Validation Accuracy: 84.90%
Epoch Time: 18.34 seconds


Training: 100%|██████████| 391/391 [00:14<00:00, 26.12batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 55.87batch/s]


Epoch 8/10
Train Loss: 0.16 | Train Accuracy: 93.95%
Validation Loss: 0.43 | Validation Accuracy: 84.74%
Epoch Time: 18.49 seconds


Training: 100%|██████████| 391/391 [00:15<00:00, 25.83batch/s]
Evaluating: 100%|██████████| 196/196 [00:04<00:00, 43.53batch/s]


Epoch 9/10
Train Loss: 0.13 | Train Accuracy: 95.03%
Validation Loss: 0.48 | Validation Accuracy: 84.12%
Epoch Time: 19.66 seconds


Training: 100%|██████████| 391/391 [00:15<00:00, 25.85batch/s]
Evaluating: 100%|██████████| 196/196 [00:03<00:00, 56.55batch/s]

Epoch 10/10
Train Loss: 0.11 | Train Accuracy: 96.17%
Validation Loss: 0.48 | Validation Accuracy: 85.30%
Epoch Time: 18.61 seconds





In [53]:
# Ensure test data is preprocessed and tokenized
df_test_processed = preprocess_text_data_fast(df_test, 'sentence')
df_test_encoded = encode_text_column(df_test_processed, 'sentence_clean', bpe_tokenizer)

# Define the custom dataset for the test data
class TextTestDataset(Dataset):
    def __init__(self, dataframe, max_seq_len=128):
        self.dataframe = dataframe
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        token_ids = self.dataframe.iloc[index]['sentence_clean_ids']

        # Ensure the sequence is of maximum length
        if len(token_ids) > self.max_seq_len:
            token_ids = token_ids[:self.max_seq_len]
        else:
            token_ids += [0] * (self.max_seq_len - len(token_ids))

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long)
        }

# Create Dataset instance
test_dataset = TextTestDataset(df_test_encoded)

# Create DataLoader instance
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load the trained classifier model
classifier_model.load_state_dict(torch.load('/content/drive/My Drive/nlp_hw4/best_classifier_model.pt'))
classifier_model.eval()

# Prediction function
def predict(model, dataloader):
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            inputs = batch['input_ids'].to(device)
            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=1)
            all_predictions.extend(predictions.cpu().numpy())

    return all_predictions

# Predict on the test data
test_predictions = predict(classifier_model, test_loader)

# Save predictions to a file
output_file_path = '/content/drive/My Drive/nlp_hw4/y_test2.txt'
np.savetxt(output_file_path, test_predictions, fmt='%d')


Predicting: 100%|██████████| 196/196 [00:02<00:00, 77.11it/s]


# 7 train both bert and fully connected

In [56]:
from transformers import AdamW, get_linear_schedule_with_warmup

# adding a classification head to the BERT model
class BertWithClassificationHead(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertWithClassificationHead, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids):
        outputs = self.bert(input_ids)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# instantiate the model with the classification head
model_with_classifier = BertWithClassificationHead(bert_model, num_classes)

model_with_classifier.to(device)

optimizer = AdamW(model_with_classifier.parameters(), lr=2e-5)

# total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_loader) * epochs

#  learning rate scheduler:
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # default value in transformers
    num_training_steps=total_steps
)

def train_model(model, dataloader, optimizer, loss_function, scheduler):
    total_loss = 0
    total_accuracy = 0
    model.train()

    progress_bar = tqdm(total=len(dataloader), desc="Training", unit="batch")

    for batch in dataloader:
        optimizer.zero_grad()

        inputs = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        logits = model(inputs)
        loss = loss_function(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        accuracy = compute_accuracy(logits.cpu(), labels.cpu())
        total_loss += loss.item()
        total_accuracy += accuracy

        progress_bar.update(1)

    average_loss = total_loss / len(dataloader)
    average_accuracy = total_accuracy / len(dataloader)

    return average_loss, average_accuracy

lowest_val_loss = float('inf')

for epoch in range(epochs):
    start_time = time.time()

    training_loss, training_accuracy = train_model(model_with_classifier, train_loader, optimizer, loss_function, scheduler)

    validation_loss, validation_accuracy = evaluate_model(model_with_classifier, validation_loader, loss_function)

    # save the model if the validation loss is the best we've seen so far
    if validation_loss < lowest_val_loss:
        torch.save(model_with_classifier.state_dict(), '/content/drive/My Drive/nlp_hw4/best_model.pt')
        lowest_val_loss = validation_loss

    print(f'Epoch {epoch + 1}/{epochs}')
    print(f'Train Loss: {training_loss:.2f} | Train Accuracy: {training_accuracy * 100:.2f}%')
    print(f'Validation Loss: {validation_loss:.2f} | Validation Accuracy: {validation_accuracy * 100:.2f}%')
    print(f'Epoch Time: {time.time() - start_time:.2f} seconds')




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   0%|          | 1/391 [00:00<01:00,  6.47batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:32, 11.92batch/s][A[A

Training:   1%|▏         | 5/391 [00:00<00:27, 14.26batch/s][A[A

Training:   2%|▏         | 7/391 [00:00<00:24, 15.57batch/s][A[A

Training:   3%|▎         | 10/391 [00:00<00:20, 19.04batch/s][A[A

Training:   3%|▎         | 13/391 [00:00<00:17, 21.14batch/s][A[A

Training:   4%|▍         | 16/391 [00:00<00:16, 22.44batch/s][A[A

Training:   5%|▍         | 19/391 [00:00<00:15, 23.72batch/s][A[A

Training:   6%|▌         | 22/391 [00:01<00:14, 24.79batch/s][A[A

Training:   6%|▋         | 25/391 [00:01<00:14, 25.47batch/s][A[A

Training:   7%|▋         | 28/391 [00:01<00:14, 25.58batch/s][A[A

Training:   8%|▊         | 31/391 [00:01<00:14, 25.10batch/s][A[A

Training:   9%|▊         | 34/391 [00:01<00:14, 24.82batch/s][A[A

Training:   9%|▉         | 37/391 [00:01<00:1

Epoch 1/10
Train Loss: 0.31 | Train Accuracy: 89.91%
Validation Loss: 0.38 | Validation Accuracy: 85.04%
Epoch Time: 19.88 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 21.98batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:16, 23.47batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:15, 24.40batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:17, 22.03batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:17, 22.10batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:16, 22.69batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:15, 23.19batch/s][A[A

Training:   6%|▌         | 24/391 [00:01<00:15, 23.07batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:15, 23.04batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:15, 22.89batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:16, 21.59batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:16, 21.59batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:16, 21.66batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 2/10
Train Loss: 0.21 | Train Accuracy: 92.14%
Validation Loss: 0.38 | Validation Accuracy: 85.14%
Epoch Time: 20.90 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 22.36batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:15, 24.50batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:15, 24.81batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:15, 24.97batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 24.34batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:15, 24.72batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:14, 25.21batch/s][A[A

Training:   6%|▌         | 24/391 [00:00<00:14, 25.42batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 25.39batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 25.37batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:13, 25.65batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:13, 25.57batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:14, 24.89batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 3/10
Train Loss: 0.18 | Train Accuracy: 93.31%
Validation Loss: 0.39 | Validation Accuracy: 85.30%
Epoch Time: 20.05 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:15, 25.84batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:15, 25.50batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:15, 25.40batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:15, 24.80batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 24.14batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:14, 24.98batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:14, 25.38batch/s][A[A

Training:   6%|▌         | 24/391 [00:00<00:14, 25.15batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 24.88batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 25.11batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:14, 24.78batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:15, 23.16batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:16, 21.45batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 4/10
Train Loss: 0.16 | Train Accuracy: 94.25%
Validation Loss: 0.41 | Validation Accuracy: 85.53%
Epoch Time: 21.37 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:18, 20.50batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:18, 21.27batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:16, 22.74batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:16, 23.66batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 23.86batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:15, 24.33batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:15, 24.54batch/s][A[A

Training:   6%|▌         | 24/391 [00:01<00:14, 24.74batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 24.72batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 24.67batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:14, 24.59batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:14, 24.82batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:13, 25.18batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 5/10
Train Loss: 0.14 | Train Accuracy: 95.17%
Validation Loss: 0.43 | Validation Accuracy: 85.30%
Epoch Time: 20.60 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 22.14batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:16, 23.92batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:15, 24.22batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:15, 24.13batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 23.77batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:15, 24.20batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:15, 24.21batch/s][A[A

Training:   6%|▌         | 24/391 [00:00<00:14, 24.64batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 24.27batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 24.62batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:14, 24.50batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:14, 24.42batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:15, 23.03batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 6/10
Train Loss: 0.13 | Train Accuracy: 95.49%
Validation Loss: 0.44 | Validation Accuracy: 85.46%
Epoch Time: 21.97 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:18, 20.97batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:17, 22.21batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:16, 23.30batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:15, 23.80batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:16, 23.15batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:15, 23.71batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:15, 24.28batch/s][A[A

Training:   6%|▌         | 24/391 [00:01<00:15, 24.34batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 24.66batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 24.87batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:14, 25.06batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:14, 25.24batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:14, 24.43batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 7/10
Train Loss: 0.11 | Train Accuracy: 96.17%
Validation Loss: 0.45 | Validation Accuracy: 85.44%
Epoch Time: 20.60 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 22.71batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:16, 23.67batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:15, 24.43batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:15, 24.80batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 24.94batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:16, 23.02batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:16, 22.70batch/s][A[A

Training:   6%|▌         | 24/391 [00:01<00:17, 20.83batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:17, 21.12batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:16, 21.74batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:16, 22.22batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:16, 21.74batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:16, 21.68batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 8/10
Train Loss: 0.11 | Train Accuracy: 96.25%
Validation Loss: 0.46 | Validation Accuracy: 85.39%
Epoch Time: 21.61 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 22.54batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:16, 23.50batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:16, 23.41batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:16, 23.53batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:15, 23.88batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:15, 24.32batch/s][A[A

Training:   5%|▌         | 21/391 [00:00<00:15, 24.66batch/s][A[A

Training:   6%|▌         | 24/391 [00:00<00:14, 24.80batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:14, 24.82batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:14, 24.81batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:14, 24.60batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:14, 24.37batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:14, 24.21batch/s][A[A

Training:  11%|█         | 42/391 [00:01<00:

Epoch 9/10
Train Loss: 0.10 | Train Accuracy: 96.54%
Validation Loss: 0.47 | Validation Accuracy: 85.38%
Epoch Time: 20.64 seconds




Training:   0%|          | 0/391 [00:00<?, ?batch/s][A[A

Training:   1%|          | 3/391 [00:00<00:17, 22.69batch/s][A[A

Training:   2%|▏         | 6/391 [00:00<00:18, 20.32batch/s][A[A

Training:   2%|▏         | 9/391 [00:00<00:18, 20.38batch/s][A[A

Training:   3%|▎         | 12/391 [00:00<00:18, 20.73batch/s][A[A

Training:   4%|▍         | 15/391 [00:00<00:18, 20.40batch/s][A[A

Training:   5%|▍         | 18/391 [00:00<00:18, 20.10batch/s][A[A

Training:   5%|▌         | 21/391 [00:01<00:17, 20.63batch/s][A[A

Training:   6%|▌         | 24/391 [00:01<00:18, 20.06batch/s][A[A

Training:   7%|▋         | 27/391 [00:01<00:18, 19.70batch/s][A[A

Training:   8%|▊         | 30/391 [00:01<00:17, 20.08batch/s][A[A

Training:   8%|▊         | 33/391 [00:01<00:17, 20.88batch/s][A[A

Training:   9%|▉         | 36/391 [00:01<00:17, 20.17batch/s][A[A

Training:  10%|▉         | 39/391 [00:01<00:17, 20.62batch/s][A[A

Training:  11%|█         | 42/391 [00:02<00:

Epoch 10/10
Train Loss: 0.10 | Train Accuracy: 96.67%
Validation Loss: 0.47 | Validation Accuracy: 85.34%
Epoch Time: 21.72 seconds





In [58]:
# function to preprocess and encode test data
def preprocess_and_encode_test_data(test_df, tokenizer, max_seq_len=128):
    test_df_clean = preprocess_text_data_fast(test_df, 'sentence')

    test_encoded = encode_text_column(test_df_clean, 'sentence_clean', tokenizer)

    # ensure the sequences are of maximum length with padding
    class TextDataSequenceTest(Dataset):
        def __init__(self, dataframe, max_seq_len=150):
            self.dataframe = dataframe
            self.max_seq_len = max_seq_len

        def __len__(self):
            return len(self.dataframe)

        def __getitem__(self, index):
            token_ids = self.dataframe.iloc[index]['sentence_clean_ids']
            if len(token_ids) > self.max_seq_len:
                token_ids = token_ids[:self.max_seq_len]
            else:
                token_ids += [0] * (self.max_seq_len - len(token_ids))

            return {
                'input_ids': torch.tensor(token_ids, dtype=torch.long)
            }

    test_dataset = TextDataSequenceTest(test_encoded, max_seq_len=max_seq_len)

    return test_dataset

test_dataset = preprocess_and_encode_test_data(df_test, bpe_tokenizer)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

def predict_test(model, dataloader):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            outputs = model(inputs)
            logits = outputs
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
    return all_predictions

y_test_predictions = predict_test(model_with_classifier, test_loader)

# save predictions to y_test3.txt
output_file_path = '/content/drive/My Drive/nlp_hw4/y_test3.txt'
with open(output_file_path, 'w') as file:
    for prediction in y_test_predictions:
        file.write(f"{prediction:.1f}\n")

print(f"Predictions saved to {output_file_path}")


Predictions saved to /content/drive/My Drive/nlp_hw4/y_test3.txt
