## BERT Grammer Checker

In [None]:
# Install the latest version of the 'transformers' library directly from the Hugging Face GitHub repository
# The '--quiet' flag suppresses the output for a cleaner installation process
%pip install --quiet git+https://github.com/huggingface/transformers

In [None]:
# Display the current status of the NVIDIA GPU, including memory usage, GPU utilization, and running processes
!nvidia-smi

###	Importing Dependencies

In [None]:
# Suppress warnings to keep the output clean
import warnings
warnings.filterwarnings('ignore')

# Import necessary libraries for model training and evaluation
import torch
import pandas as pd
from sklearn.utils import shuffle  # Shuffle data for randomness in training and validation splits
from transformers import RobertaTokenizer, BertTokenizer  # Tokenizers for Roberta and Bert models
from torch.utils.data import TensorDataset  # Dataset to hold input/output tensors
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler  # DataLoader for batching and shuffling
from transformers import BertForSequenceClassification, BertConfig, RobertaForSequenceClassification  # Pretrained models for classification tasks
from transformers import AdamW  # Optimizer for training
import numpy as np
from transformers import get_linear_schedule_with_warmup  # Learning rate scheduler
import time  # Track time for training and evaluation
import datetime  # Format time into a readable format
from sklearn.metrics import confusion_matrix, classification_report  # For evaluation metrics
import random  # Set random seeds for reproducibility
import matplotlib.pyplot as plt  # Plotting for visualizing training progress
%matplotlib inline  # Enable inline plotting for Jupyter Notebooks
import seaborn as sns  # For creating better plots
import weightwatcher as ww  # For monitoring the training process and analyzing weight changes
import os  # To handle file and directory operations

### Setting up GPU

In [3]:
# Check if a GPU is available and set the device accordingly (use GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the name of the GPU being used (if available)
print('We will use the GPU:', torch.cuda.get_device_name(0))

We will use the GPU: NVIDIA GeForce RTX 3050 Laptop GPU


### Loading and transforming data for use

In [4]:
# Load the cleaned Lang-8 dataset into a DataFrame
df_lang_0 = pd.read_csv('./Cleaned_Lang8_new.csv')

In [5]:
# Select the first 90,000 sentences from column '0', assign label 0, and rename the column to 'sentence'
df_lang_t0 = df_lang_0[0:90000]['0'].to_frame()
df_lang_t0['label'] = 0
df_lang_t0 = df_lang_t0.rename(columns={'0': 'sentence'})

# Select sentences from rows 110,000 to 200,000 from column '1', assign label 1, and rename the column to 'sentence'
df_lang_t1 = df_lang_0[110000:200000]['1'].to_frame()
df_lang_t1['label'] = 1
df_lang_t1 = df_lang_t1.rename(columns={'1': 'sentence'})

# Combine the labeled dataframes for training and shuffle the rows
df_train = pd.concat([df_lang_t0, df_lang_t1], ignore_index=True)
df_train = shuffle(df_train)

In [6]:
# Create a validation set by sampling 10,000 sentences from rows 90,000 to 100,000 in column '0', assign label 0, and rename the column
df_val0 = df_lang_0[90000:100000]['0'].sample(10000).to_frame()
df_val0['label'] = 0
df_val0 = df_val0.rename(columns={'0': 'sentence'})

# Create a validation set by sampling 10,000 sentences from rows 100,000 to 110,000 in column '1', assign label 1, and rename the column
df_val1 = df_lang_0[100000:110000]['1'].sample(10000).to_frame()
df_val1['label'] = 1
df_val1 = df_val1.rename(columns={'1': 'sentence'})

# Combine the validation dataframes and shuffle the rows
df_val = pd.concat([df_val0, df_val1], ignore_index=True)
df_val = shuffle(df_val)

In [None]:
# Print the number of sentences in the training and validation sets
print('Number of training sentences: {:,}'.format(df_train.shape[0]))
print('Number of validation sentences: {:,}'.format(df_val.shape[0]))

In [None]:
df_train['label'].value_counts(), df_val['label'].value_counts()

In [9]:
# Extract training sentences and their corresponding labels from the training DataFrame
sentences_train = df_train.sentence.values
labels_train = df_train.label.values

# Extract validation sentences and their corresponding labels from the validation DataFrame
sentences_val = df_val.sentence.values
labels_val = df_val.label.values

### Loading Tokenizer

In [10]:
# Initialize the tokenizer for RoBERTa (or BERT, commented out)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Tokenization and encoding

In [11]:
# Preparing the training data
input_ids_train = []  # List to store tokenized input IDs for training data
attention_masks_train = []  # List to store attention masks for training data

# Tokenize each training sentence and create input IDs and attention masks
for sent in sentences_train:
    encoded_dict = tokenizer.encode_plus(
        sent,                      # The sentence to encode
        add_special_tokens=True,   # Add [CLS] and [SEP] tokens
        max_length=64,             # Truncate sentences to a maximum length of 64
        pad_to_max_length=True,    # Pad sentences shorter than 64 to this length
        return_attention_mask=True,  # Generate attention masks
        return_tensors='pt',       # Return PyTorch tensors
    )
    input_ids_train.append(encoded_dict['input_ids'])  # Append tokenized input IDs
    attention_masks_train.append(encoded_dict['attention_mask'])  # Append attention masks

# Convert lists to tensors for PyTorch
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(labels_train)  # Convert labels to a tensor

# Create a TensorDataset for training data
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)


# Preparing the validation data
input_ids_val = []  # List to store tokenized input IDs for validation data
attention_masks_val = []  # List to store attention masks for validation data

# Tokenize each validation sentence and create input IDs and attention masks
for sent in sentences_val:
    encoded_dict = tokenizer.encode_plus(
        sent,                      # The sentence to encode
        add_special_tokens=True,   # Add [CLS] and [SEP] tokens
        max_length=64,             # Truncate sentences to a maximum length of 64
        pad_to_max_length=True,    # Pad sentences shorter than 64 to this length
        return_attention_mask=True,  # Generate attention masks
        return_tensors='pt',       # Return PyTorch tensors
    )
    input_ids_val.append(encoded_dict['input_ids'])  # Append tokenized input IDs
    attention_masks_val.append(encoded_dict['attention_mask'])  # Append attention masks

# Convert lists to tensors for PyTorch
input_ids_val = torch.cat(input_ids_val, dim=0)
attention_masks_val = torch.cat(attention_masks_val, dim=0)
labels_val = torch.tensor(labels_val)  # Convert labels to a tensor

# Create a TensorDataset for validation data
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Data Batching and Loading

In [12]:
batch_size = 32  # Define the batch size for both training and validation

# Create a DataLoader for the training data
# RandomSampler shuffles the data at each epoch for better generalization
train_dataloader = DataLoader(
    dataset_train,              # The training dataset
    sampler=RandomSampler(dataset_train),  # Random sampling
    batch_size=batch_size       # Number of samples per batch
)

# Create a DataLoader for the validation data
# SequentialSampler ensures data is not shuffled for evaluation
validation_dataloader = DataLoader(
    dataset_val,                # The validation dataset
    sampler=SequentialSampler(dataset_val),  # Sequential sampling
    batch_size=batch_size       # Number of samples per batch
)

### Loading Model

In [None]:
# Initialize the RoBERTa model for sequence classification with 2 output labels
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",           # Pretrained model to use ('roberta-base' or 'roberta-large')
    num_labels=2,             # Number of output labels (binary classification)
    output_attentions=False,  # Do not return attention weights
    output_hidden_states=False  # Do not return hidden states
)

# Adjust the dropout rate for the last encoder layer and the classifier layer to 0.65 for better regularization
model.roberta.encoder.layer[-1].output.dropout = torch.nn.Dropout(0.65)  # type: ignore
model.classifier.dropout = torch.nn.Dropout(0.65)  # type: ignore

# Move the model to the specified device (GPU or CPU)
model.to(device)


# Initialize the BERT model for sequence classification (commented out)
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased",        # Pretrained model to use ('bert-base-uncased' or 'bert-large-uncased')
#     num_labels=2,              # Number of output labels (binary classification)
#     output_attentions=False,   # Do not return attention weights
#     output_hidden_states=False  # Do not return hidden states
# )
# Adjust the dropout rate for the last encoder layer and the classifier layer to 0.65 for better regularization
# model.bert.encoder.layer[-1].output.dropout = torch.nn.Dropout(0.65)  # type: ignore
# model.classifier.dropout = torch.nn.Dropout(0.65)  # type: ignore

# Move the model to the specified device (GPU or CPU)
# model.to(device)

### Getting initial parameters and selecting layers to freeze

#### Option 1

In [15]:
# Store a copy of the initial parameters of the model for potential reference or comparison
initial_params = {}
for name, param in model.named_parameters():  # Iterate through all model parameters
    initial_params[name] = param.clone().detach()  # Clone and detach each parameter to avoid altering the original

In [31]:
# Specify the layers to freeze (prevent updates during training) by their parameter names
layers_to_freeze = [
    'bert.encoder.layer.9.attention.self.value',     # Layer 9 attention self-value
    'bert.encoder.layer.11.attention.self.value',   # Layer 11 attention self-value
    'bert.encoder.layer.11.attention.output.dense'  # Layer 11 attention output dense
]

# Iterate through model parameters and freeze the specified layers
for name, param in model.named_parameters():  # Iterate through all model parameters
    if any(layer in name for layer in layers_to_freeze):  # Check if the parameter belongs to the specified layers
        param.requires_grad = False  # Freeze the parameter (no gradient updates during training)

#### Option 2

In [32]:
# Freeze all layers in the model by disabling gradient updates
for param in model.parameters():  # Iterate through all model parameters
    param.requires_grad = False  # Disable gradient computation for all parameters

In [None]:
# Specify the layers to unfreeze (enable updates during training) by their parameter names
layers_to_unfreeze = [
    'bert.encoder.layer.7.attention.output.dense',   # Layer 7 attention output dense
    'bert.encoder.layer.8.attention.self.value',    # Layer 8 attention self-value
    'bert.encoder.layer.9.attention.self.value',    # Layer 9 attention self-value
    'bert.encoder.layer.11.attention.self.value',   # Layer 11 attention self-value
    'bert.encoder.layer.11.attention.output.dense'  # Layer 11 attention output dense
]

# Iterate through model parameters and selectively unfreeze the specified layers
for name, param in model.named_parameters():  # Iterate through all model parameters with their names
    if any(layer in name for layer in layers_to_unfreeze):  # Check if the parameter belongs to the specified layers
        param.requires_grad = True  # Enable gradient computation for the specified layers

In [None]:
# Unfreeze all parameters in the last encoder layer of the BERT model (Layer -1)
for param in model.bert.encoder.layer[-1].parameters():  # Iterate through all parameters in the last encoder layer
    param.requires_grad = True  # Enable gradient computation for the last encoder layer's parameters

### Setting up Optimizer, Epochs, Steps and Scheduler

In [14]:
# Initialize the AdamW optimizer with weight decay for regularization
# The commented line would filter parameters based on whether they require gradients (i.e., only trainable parameters)
# Used when freezing layers
# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr = 2e-5, eps = 1e-8, weight_decay=0.2)

# Current optimizer setup: AdamW for all model parameters with a learning rate of 2e-5, epsilon of 1e-8, and weight decay of 0.5
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8, weight_decay=0.5)

In [15]:
# Set the number of training epochs
epochs = 4

# Calculate the total number of steps based on the number of training batches and epochs
total_steps = len(train_dataloader) * epochs

# Initialize a learning rate scheduler (linear warm-up followed by linear decay) for gradual learning rate adjustment
scheduler = get_linear_schedule_with_warmup(
    optimizer,                # The optimizer to adjust the learning rate for
    num_warmup_steps=0,       # No warm-up steps, meaning the learning rate starts immediately at the initial value
    num_training_steps=total_steps  # Total number of training steps (batches * epochs)
)

### Defined functions to calculate accuracy, time and prediction, labels

In [16]:
# Function to compute the accuracy of model predictions
def flat_accuracy(preds, labels):
    # Flatten the predicted values and labels, then compute accuracy
    pred_flat = np.argmax(preds, axis=1).flatten()  # Get the predicted class by finding the max value
    labels_flat = labels.flatten()  # Flatten the true labels
    return np.sum(pred_flat == labels_flat) / len(labels_flat)  # Calculate accuracy as the ratio of correct predictions

In [17]:
# Function to format elapsed time into a more readable format (HH:MM:SS)
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))  # Round elapsed time to nearest integer
    return str(datetime.timedelta(seconds=elapsed_rounded))  # Convert seconds to a string representation of time

In [18]:
# Function to obtain model predictions and labels from the dataloader
def get_predictions_and_labels(model, dataloader):
    model.eval()  # Set the model to evaluation mode (disables dropout layers)

    all_preds = []  # List to store all predictions
    all_labels = []  # List to store all true labels

    # Iterate through batches in the dataloader
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)  # Move the batch data to the specified device (GPU/CPU)
        b_input_ids, b_input_mask, b_labels = batch  # Unpack batch into input IDs, attention masks, and labels

        # Disable gradient computation for evaluation (no backpropagation needed)
        with torch.no_grad():
            # Get model outputs
            outputs = model(b_input_ids,
                            token_type_ids=None,  # No token type IDs for single-sentence classification
                            attention_mask=b_input_mask,
                            labels=b_labels,
                            return_dict=True)  # Return outputs as a dictionary

        # Extract logits from model output (the raw predictions)
        if isinstance(outputs, tuple):
            logits = outputs[1]  # If the output is a tuple, the second element is logits
        elif isinstance(outputs, dict):
            logits = outputs['logits']  # If the output is a dictionary, get logits from the 'logits' key
        else:
            raise ValueError("Unsupported model output format")  # Raise error if output format is unsupported

        # Get predictions by choosing the class with the highest probability
        preds = np.argmax(logits.cpu().numpy(), axis=1)
        labels = b_labels.cpu().numpy()  # Convert labels to numpy for easier handling

        # Store predictions and labels
        all_preds.extend(preds)
        all_labels.extend(labels)

    return all_preds, all_labels  # Return all predictions and labels for evaluation

### Fine-Tuning/Training Model

In [None]:
# Set the random seed for reproducibility across all libraries (random, numpy, torch)
seed_val = 42
random.seed(seed_val)  # Set seed for Python's random module
np.random.seed(seed_val)  # Set seed for NumPy
torch.manual_seed(seed_val)  # Set seed for PyTorch on CPU
torch.cuda.manual_seed_all(seed_val)  # Set seed for all GPUs if available

# Initialize a list to store training statistics across epochs
training_stats = []

# Track total time for training
total_t0 = time.time()

# Loop over each epoch for training
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Track time for the current epoch
    t0 = time.time()

    # Initialize metrics for tracking training progress
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()  # Set model to training mode

    # Iterate through each batch in the training dataloader
    for step, batch in enumerate(train_dataloader):
        # Print progress every 40 steps
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,} of {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the appropriate device (GPU or CPU)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Zero the gradients before backward pass
        model.zero_grad()

        # Perform forward pass
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels,
                             return_dict=False)  # Get loss and logits from the model

        # Accumulate training loss
        total_train_loss += loss.item()

        # Convert logits and labels to numpy for accuracy computation
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Perform backward pass (gradient calculation)
        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update model parameters using the optimizer
        optimizer.step()

        # Update the learning rate using the scheduler
        scheduler.step()

        # Accumulate training accuracy
        total_train_accuracy += flat_accuracy(logits, label_ids)

    # Compute average training accuracy and loss for the epoch
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print("Training Accuracy: {0:.2f}%".format(avg_train_accuracy * 100))

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Format and print training time for the epoch
    training_time = format_time(time.time() - t0)
    print("Training Loss: {0:.2f}".format(avg_train_loss))
    print("Training took: {:}".format(training_time))

    # Run validation after each epoch
    print("Running Validation...")

    # Track time for the validation phase
    t1 = time.time()

    # Set model to evaluation mode
    model.eval()

    # Initialize metrics for validation
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Iterate through validation batches
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Disable gradient calculation during validation
        with torch.no_grad():
            # Perform forward pass during validation
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels,
                                 return_dict=False)

        # Accumulate validation loss
        total_eval_loss += loss.item()

        # Convert logits and labels to numpy for accuracy computation
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate validation accuracy
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Compute average validation accuracy and loss
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("Validation Accuracy: {0:.2f}%".format(avg_val_accuracy * 100))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Format and print validation time
    validation_time = format_time(time.time() - t1)
    print("Validation Loss: {0:.2f}".format(avg_val_loss))
    print("Validation took: {:}".format(validation_time))

    # Append epoch stats to the training statistics list
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Accur.': avg_train_accuracy,
            'Training Time': training_time,
            'Test Loss': avg_val_loss,
            'Test Accur.': avg_val_accuracy,
            'Test Time': validation_time
        }
    )

# Print training completion message
print("")
print("Training complete!")

# Print total training time
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

### Getting updated parameters and calculating changes (Used when using freeze layers part)

In [23]:
# Update parameters to CPU and check for changes between initial and updated parameters
updated_params = {name: param.cpu() for name, param in model.named_parameters()}

In [None]:
# Compare the updated parameters with the initial ones
for name, param in updated_params.items():
    # If the parameter has changed, print details of the change
    if not torch.allclose(initial_params[name].cpu().detach(), param):
        print(f"Parameter {name} changed:")
        print(f"\tShape: {initial_params[name].shape}")
        print(f"\tPrevious Value: {initial_params[name].cpu().detach().numpy()}")
        print(f"\tShape: {updated_params[name].shape}")
        print(f"\tNew Value: {param.cpu().detach().numpy()}")

In [28]:
# Convert initial parameters to NumPy arrays
initial_params_cpu = {name: param.cpu().detach().numpy() for name, param in initial_params.items()}
# Convert updated parameters to NumPy arrays
updated_params_cpu = {name: param.cpu().detach().numpy() for name, param in updated_params.items()}

In [29]:
# Calculate the percentage change for each parameter
percentage_change = {}
for name in initial_params_cpu.keys():
    # Calculate the absolute difference and normalized change percentage
    numerator = np.abs(updated_params_cpu[name] - initial_params_cpu[name])
    denominator = np.maximum(np.abs(initial_params_cpu[name]), np.abs(updated_params_cpu[name]))
    percentage_change[name] = (numerator / denominator) * 100

In [30]:
# Compute the mean and standard deviation of percentage change for each parameter
mean_percentage_change = {name: np.mean(change) for name, change in percentage_change.items()}
std_percentage_change = {name: np.std(change) for name, change in percentage_change.items()}

In [31]:
# Create a DataFrame to store the mean and std of percentage changes
df = pd.DataFrame({
    'Parameter': list(initial_params_cpu.keys()),
    'Mean Percentage Change': list(mean_percentage_change.values()),
    'Std Percentage Change': list(std_percentage_change.values())
})

In [32]:
# Save the DataFrame to a CSV file
df.to_csv("Param_percent_change_mean_std_roberta_base.csv", index=False)

In [41]:
# Identify and retrieve the parameters of the classifier's linear layer (weights)
linear_layer_params = None
for name, param in model.named_parameters():
    # Look for the classifier weight parameter in the model
    if 'classifier.weight' in name:
        linear_layer_params = param
        break

In [None]:
# Display the data of the linear layer weights
linear_layer_params.data

### Getting predictions, labels with classification report

In [None]:
# Get predictions and labels from the validation set
val_preds, val_labels = get_predictions_and_labels(model, validation_dataloader)

# Compute the confusion matrix based on the true labels and predicted labels
conf_mat_validation = confusion_matrix(val_labels, val_preds)

# Extract values from the confusion matrix (True Positives, False Positives, False Negatives, True Negatives)
TN_val, FP_val, FN_val, TP_val = conf_mat_validation.ravel()

# Print out the values of the confusion matrix for the validation set
print("Validation Set:")
print(f"True Positives (TP): {TP_val}")
print(f"True Negatives (TN): {TN_val}")
print(f"False Negatives (FN): {FN_val}")
print(f"False Positives (FP): {FP_val}")

# Generate and print the classification report, which includes precision, recall, and F1-score
cr_val = classification_report(val_labels, val_preds)
print(cr_val)

### Using WeightWatcher to analyze model layer by layer

In [39]:
# Initialize the WeightWatcher object with the model
watcher = ww.WeightWatcher(model=model)

In [40]:
# Analyze the model's weights using the WeightWatcher
analyze = watcher.analyze()

In [None]:
# Get the detailed information about the model's weights and their analysis
details = watcher.get_details()

# Save the detailed analysis of the weights to a CSV file
details.to_csv("all_UT_freezed.csv", index=False) # type: ignore

# Display the detailed analysis of the model's weights
details

In [None]:
# Get and display a summary of the model's weight analysis
watcher.get_summary()

### Loading training stats and plotting graph

In [21]:
# Convert the training statistics into a DataFrame for easier manipulation and visualization
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')  # Set 'epoch' as the index for better clarity
df_stats

Unnamed: 0_level_0,Training Loss,Training Accur.,Training Time,Test Loss,Test Accur.,Test Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.342279,0.858011,0:59:17,0.375627,0.89105,0:01:59
2,0.217442,0.917861,0:59:49,0.375876,0.90275,0:02:00
3,0.148877,0.948278,0:59:29,0.412361,0.9042,0:02:00
4,0.104273,0.967822,0:59:37,0.530868,0.9047,0:02:01


In [None]:
# Plotting Loss for Training and Testing

# Set the visual style for the plot
sns.set(style='darkgrid')
sns.set(font_scale=1.5)  # Increase font size for readability
plt.rcParams["figure.figsize"] = (12, 6)  # Set figure size

# Plot the Training Loss and Test Loss for each epoch
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")  # Training loss in blue
plt.plot(df_stats['Test Loss'], 'r-o', label="Testing")  # Test loss in red

# Add title, labels, and legend to the plot
plt.title("Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(range(1, epochs+1))  # Set x-axis ticks for each epoch

# Display the plot
plt.show()

# Plotting Accuracy for Training and Testing

# Set the visual style for the accuracy plot (same as above)
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Plot the Training Accuracy and Test Accuracy for each epoch
plt.plot(df_stats['Training Accur.'], 'b-o', label="Training")  # Training accuracy in blue
plt.plot(df_stats['Test Accur.'], 'r-o', label="Testing")  # Test accuracy in red

# Add title, labels, and legend to the accuracy plot
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.xticks(range(1, epochs+1))  # Set x-axis ticks for each epoch

# Display the accuracy plot
plt.show()

### Saving Fine-Tuned Model

In [None]:
# Define the output directory where the model and tokenizer will be saved
output_dir = './model_20k_roberta_base/'

# Check if the directory exists, and create it if it doesn't
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Print a message indicating where the model will be saved
print("Saving model to %s" % output_dir)

# Determine whether the model is wrapped in a DataParallel module and save the model
model_to_save = model.module if hasattr(model, 'module') else model  # Ensure saving the model correctly
model_to_save.save_pretrained(output_dir)  # Save the model's weights and configuration
tokenizer.save_pretrained(output_dir)  # Save the tokenizer configuration and vocab

In [24]:
# Create a checkpoint dictionary to save the optimizer and scheduler states
checkpoint = {
    'optimizer_state_dict': optimizer.state_dict(),  # Save the optimizer's state
    'scheduler_state_dict': scheduler.state_dict(),  # Save the scheduler's state
}

# Save the checkpoint (optimizer and scheduler states) to the output directory
torch.save(checkpoint, os.path.join(output_dir, 'optimizer_scheduler_checkpoint.pth'))