In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from torch.nn.parallel import DataParallel
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the labelled text data; the relative path is resolved against the current working directory.
df = pd.read_csv("text.csv")
# Show the first three rows to check the columns that were read.
df.head(3)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4


# We need to traverse our dataset and find the distribution of each **class**

In [None]:
# (rows, columns) of the raw dataframe.
df.shape

(416809, 3)

In [None]:
# Number of rows per label value, used to check how balanced the classes are.
df.label.value_counts()


label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

# We can see that the class distribution is unbalanced. Such bias in the dataset may cause the model's predictions to be prone to favouring the classes with more data points. Therefore, we need to eliminate this bias by balancing the number of samples of each class passed into the model.





In [None]:
class balanced_data(Dataset):
    """Class-balanced random subsample of a labelled DataFrame.

    Every label is first down-sampled to the size of the rarest label, and
    ``length`` rows are then drawn at random from that balanced pool, so the
    final class counts are approximately (not exactly) equal.

    Args:
        df: DataFrame with a ``label`` column.
        length: Number of rows to keep. ``None`` keeps the whole balanced pool.
        random_state: Optional seed forwarded to the pandas sampling calls so
            the draw can be reproduced.

    Raises:
        ValueError: If ``df`` is empty, or ``length`` is larger than ``df`` or
            larger than the balanced pool.
    """

    def __init__(self, df, length=None, random_state=None):
        if length is not None and length > df.shape[0]:
            raise ValueError("Length parameter cannot be greater than the size of the dataset.")
        if df.empty:
            raise ValueError("Cannot balance an empty dataset.")
        self.random_state = random_state
        self.length = length
        self.df = self.stratify(df)
        # Record the number of rows actually kept (relevant when length was None).
        self.length = len(self.df)

    def stratify(self, df):
        """Return ``self.length`` rows drawn from a per-label balanced pool of ``df``."""
        min_count = int(df['label'].value_counts().min())
        # GroupBy.sample keeps the grouping column and avoids the deprecated
        # groupby.apply-on-grouping-columns behaviour.
        pool = (
            df.groupby('label', group_keys=False)
            .sample(n=min_count, random_state=self.random_state)
            .reset_index(drop=True)
        )
        target = len(pool) if self.length is None else self.length
        if target > len(pool):
            # The pool only holds min_count rows per label, which can be far
            # fewer rows than the raw frame.
            raise ValueError(
                f"Length parameter ({target}) cannot be greater than the balanced pool size ({len(pool)})."
            )
        return pool.sample(n=target, random_state=self.random_state)

    def get_all(self):
        """Return the sampled rows as a DataFrame."""
        return self.df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional access: the sampled frame keeps a shuffled index.
        return self.df.iloc[idx]

# Draw a class-balanced sample; the second argument is the number of rows to keep.
df_balanced = balanced_data(df,25000)

  df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [None]:
# Unwrap the sampler into its DataFrame. Guarded so that re-running this cell
# (when df_balanced is already a DataFrame) is a no-op instead of an AttributeError.
if isinstance(df_balanced, balanced_data):
    df_balanced = df_balanced.get_all()

# We can manually adjust the number of data points visible to the model by changing the second parameter of the class balanced_data

In [None]:
# Rows per label in the balanced sample.
df_balanced.label.value_counts()

label
1    4233
3    4203
5    4183
0    4181
4    4114
2    4086
Name: count, dtype: int64

# Because the English texts stored in the dataframe are hard for our model to understand and interpret, tokenization is needed to convert each text sequence into tokens, and encoding is then applied to convert them into vectors.
# After this pre-processing, the dataset and dataloader are created for loading data into the model


In [None]:
# Function to tokenize texts
def tokenize_texts(tokenizer, texts, max_length=128):
    """Run ``tokenizer`` over ``texts`` and return its output.

    The tokenizer is called with truncation and padding enabled, the given
    ``max_length`` and ``return_tensors='pt'``.
    """
    options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# Function to create a dataset from tokenized texts and labels
def create_dataset(encodings, labels):
    """Bundle tokenizer output and class labels into a ``TensorDataset``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        labels: Array-like of integer class ids, one per encoded text.

    Returns:
        ``TensorDataset`` yielding ``(input_ids, attention_mask, label)`` items.
    """
    # Class indices are stored as integers (the dtype nn.CrossEntropyLoss takes
    # for class targets) rather than float64; a later .long() cast is a no-op.
    labels_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels_tensor)

# Load the pretrained tokenizer and the bare encoder (no classification head yet).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Hold out 20% of the balanced sample for validation; random_state fixes the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(df_balanced["text"], df_balanced["label"], test_size=0.2, random_state=42)

# Tokenize each split (the pandas Series are converted to plain lists first).
train_encodings = tokenize_texts(tokenizer, train_texts.tolist())
val_encodings = tokenize_texts(tokenizer, val_texts.tolist())

# Pair the encodings with their labels.
train_dataset = create_dataset(train_encodings, np.array(train_labels))
val_dataset = create_dataset(val_encodings, np.array(val_labels))

# Batches of 32; only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)



In [None]:
# Number of training batches per epoch.
len(train_dataloader)

625

# We construct a neural network from BERT followed by a fully connected layer on its pooled output. We train it with backpropagation, using Adam as the optimizer and cross-entropy loss as the loss function to minimize

In [None]:
class PlainBert_with_fcl(nn.Module):
    """Transformer encoder followed by one linear classification layer.

    Args:
        transformer_model: Encoder whose forward accepts ``input_ids`` and
            ``attention_mask`` keywords and returns an object exposing
            ``pooler_output``.
        num_classes: Number of output logits.
    """

    def __init__(self, transformer_model, num_classes):
        super().__init__()
        self.transformer = transformer_model
        # Size the head from the backbone's own config when it has one, so other
        # encoder widths work; 768 stays as the fallback.
        backbone_config = getattr(transformer_model, "config", None)
        hidden_size = getattr(backbone_config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits, one row per input sequence."""
        output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(output.pooler_output)

num_classes = 6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Add the classification head to the bare encoder, wrap it for multi-GPU use and move it to `device`.
model = DataParallel(PlainBert_with_fcl(model, num_classes)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss, total_correct = 0.0, 0

    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # The loss needs integer class indices.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total_correct += (predicted == labels).sum().item()

    # Loss is averaged per batch, accuracy per sample.
    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / len(train_dataset)

    # ---- validation pass ----
    model.eval()
    total_val_loss, total_val_correct = 0.0, 0
    # Rebuilt every epoch, so after the loop these hold the last epoch's results.
    val_predicted, val_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / len(val_dataset)

    print('[%d] loss: %.3f |  Val loss: %.3f | acc: %.2f  | Val acc: %.2f | ' % (epoch + 1, train_loss, val_loss, train_accuracy, val_accuracy))

[1] loss: 0.762 |  Val loss: 0.202 | acc: 0.72  | Val acc: 0.94 | 
[2] loss: 0.190 |  Val loss: 0.155 | acc: 0.94  | Val acc: 0.95 | 
[3] loss: 0.135 |  Val loss: 0.137 | acc: 0.95  | Val acc: 0.95 | 
[4] loss: 0.115 |  Val loss: 0.133 | acc: 0.95  | Val acc: 0.96 | 
[5] loss: 0.101 |  Val loss: 0.146 | acc: 0.96  | Val acc: 0.95 | 
[6] loss: 0.093 |  Val loss: 0.143 | acc: 0.96  | Val acc: 0.95 | 
[7] loss: 0.086 |  Val loss: 0.152 | acc: 0.96  | Val acc: 0.95 | 
[8] loss: 0.076 |  Val loss: 0.171 | acc: 0.97  | Val acc: 0.95 | 
[9] loss: 0.064 |  Val loss: 0.177 | acc: 0.97  | Val acc: 0.94 | 
[10] loss: 0.059 |  Val loss: 0.196 | acc: 0.98  | Val acc: 0.95 | 


# Using other forms of evaluation metrics

In [None]:
# Per-class precision / recall / F1 for the predictions collected during the
# last validation pass of the training loop.
val_predicted = np.array(val_predicted)
val_labels = np.array(val_labels)

# Passing the label ids explicitly keeps target_names aligned with the classes
# even if some class is absent from the validation predictions and labels.
class_ids = list(range(num_classes))
report = classification_report(
    val_labels,
    val_predicted,
    labels=class_ids,
    target_names=[f'Class {i}' for i in class_ids],
)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.97      0.95      0.96       839
     Class 1       0.98      0.89      0.94       811
     Class 2       0.92      0.99      0.95       804
     Class 3       0.96      0.95      0.95       869
     Class 4       0.94      0.91      0.92       820
     Class 5       0.92      1.00      0.96       857

    accuracy                           0.95      5000
   macro avg       0.95      0.95      0.95      5000
weighted avg       0.95      0.95      0.95      5000



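# Test our model performance on all data points of the balanced subset (the 25,000 rows of df_balanced, not the full ~420,000-row dataset); note that this subset includes the rows used for training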

In [None]:
# Build the evaluation set from df_balanced.
# NOTE(review): df_balanced is the frame the train/validation split was drawn from,
# so this set contains the training rows and the accuracy is not a held-out estimate.
test_encodings = tokenize_texts(tokenizer, df_balanced["text"].tolist())
test_dataset = create_dataset(test_encodings, np.array(df_balanced["label"]))
# NOTE(review): shuffle=True is not needed to compute accuracy; confirm it is intentional.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
def test_model(model):
    total_test_correct = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            labels = labels.to(device).long()
            _, predicted = torch.max(outputs.data, 1)
            total_test_correct += (predicted == labels).sum().item()
    print('Testing accuracy is: %d %%' % (100 * total_test_correct / len(test_dataset)))
# Report the accuracy of the current model over test_dataloader.
test_model(model)

Testing accuracy is: 97 %


We will then try some hyperparameter tuning by comparing the performance of a big (32) and a small (4) batch size, and of a small (1e-5) and a large (0.0001) learning rate. Therefore, we have 4 combinations of hyperparameters

In [None]:
# Training loader with batch size 4 for the small-batch experiments (the same loader is built again in the next cell).
small_batch_trainloader=DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# Function to tokenize texts
def tokenize_texts(tokenizer, texts, max_length=128):
    """Run ``tokenizer`` over ``texts`` and return its output.

    The tokenizer is called with truncation and padding enabled, the given
    ``max_length`` and ``return_tensors='pt'``.
    """
    options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# Function to create a dataset from tokenized texts and labels
def create_dataset(encodings, labels):
    """Bundle tokenizer output and class labels into a ``TensorDataset``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        labels: Array-like of integer class ids, one per encoded text.

    Returns:
        ``TensorDataset`` yielding ``(input_ids, attention_mask, label)`` items.
    """
    # Class indices are stored as integers (the dtype nn.CrossEntropyLoss takes
    # for class targets) rather than float64; a later .long() cast is a no-op.
    labels_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels_tensor)

# Reload the tokenizer and a fresh bare encoder: the earlier training cell rebound
# `model` to the wrapped network it trained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Same 80/20 split as before (same random_state).
train_texts, val_texts, train_labels, val_labels = train_test_split(df_balanced["text"], df_balanced["label"], test_size=0.2, random_state=42)

# Tokenize each split.
train_encodings = tokenize_texts(tokenizer, train_texts.tolist())
val_encodings = tokenize_texts(tokenizer, val_texts.tolist())

# Pair the encodings with their labels.
train_dataset = create_dataset(train_encodings, np.array(train_labels))
val_dataset = create_dataset(val_encodings, np.array(val_labels))

# Small-batch (4) training loader. Note that `train_dataloader` is NOT rebuilt here,
# so that name still refers to the batch-size-32 loader from the earlier cell.
small_batch_trainloader=DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)


In [None]:
num_classes = 6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Replicate over at most two GPUs, but only over GPUs that actually exist;
# None defers to DataParallel's own default when no CUDA device is present.
device_ids = list(range(min(2, torch.cuda.device_count()))) or None
# NOTE: `model` must be the freshly loaded bare encoder here. Re-running this cell
# without re-running the loading cell would wrap an already wrapped network.
model = PlainBert_with_fcl(model, num_classes)
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0

    for batch in small_batch_trainloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        # The loss needs integer class indices.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()

    # Average over the batches of the loader that was actually iterated.
    # Dividing by len(train_dataloader) (the batch-size-32 loader) inflated the
    # reported loss by the ratio of the two batch counts.
    train_loss = total_loss / len(small_batch_trainloader)
    train_accuracy = total_correct / len(train_dataset)

    # Validation loop
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    # Rebuilt every epoch, so after the loop these hold the last epoch's results.
    val_predicted = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / len(val_dataset)

    print('[%d] loss: %.3f |  Val loss: %.3f | acc: %.2f  | Val acc: %.2f | ' % (epoch + 1, train_loss, val_loss, train_accuracy, val_accuracy))


[1] loss: 3.164 |  Val loss: 0.151 | acc: 0.86  | Val acc: 0.95 | 
[2] loss: 1.142 |  Val loss: 0.129 | acc: 0.95  | Val acc: 0.95 | 
[3] loss: 0.905 |  Val loss: 0.130 | acc: 0.95  | Val acc: 0.96 | 
[4] loss: 0.827 |  Val loss: 0.137 | acc: 0.96  | Val acc: 0.96 | 
[5] loss: 0.781 |  Val loss: 0.152 | acc: 0.96  | Val acc: 0.95 | 
[6] loss: 0.723 |  Val loss: 0.161 | acc: 0.96  | Val acc: 0.96 | 
[7] loss: 0.624 |  Val loss: 0.183 | acc: 0.97  | Val acc: 0.95 | 
[8] loss: 0.532 |  Val loss: 0.192 | acc: 0.97  | Val acc: 0.94 | 
[9] loss: 0.420 |  Val loss: 0.206 | acc: 0.98  | Val acc: 0.94 | 
[10] loss: 0.354 |  Val loss: 0.256 | acc: 0.98  | Val acc: 0.94 | 


In [None]:
# Function to tokenize texts
def tokenize_texts(tokenizer, texts, max_length=128):
    """Run ``tokenizer`` over ``texts`` and return its output.

    The tokenizer is called with truncation and padding enabled, the given
    ``max_length`` and ``return_tensors='pt'``.
    """
    options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# Function to create a dataset from tokenized texts and labels
def create_dataset(encodings, labels):
    """Bundle tokenizer output and class labels into a ``TensorDataset``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        labels: Array-like of integer class ids, one per encoded text.

    Returns:
        ``TensorDataset`` yielding ``(input_ids, attention_mask, label)`` items.
    """
    # Class indices are stored as integers (the dtype nn.CrossEntropyLoss takes
    # for class targets) rather than float64; a later .long() cast is a no-op.
    labels_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels_tensor)

# Reload the tokenizer and a fresh bare encoder: the previous training cell rebound
# `model` to the wrapped network it trained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Same 80/20 split as before (same random_state).
train_texts, val_texts, train_labels, val_labels = train_test_split(df_balanced["text"], df_balanced["label"], test_size=0.2, random_state=42)

# Tokenize each split.
train_encodings = tokenize_texts(tokenizer, train_texts.tolist())
val_encodings = tokenize_texts(tokenizer, val_texts.tolist())

# Pair the encodings with their labels.
train_dataset = create_dataset(train_encodings, np.array(train_labels))
val_dataset = create_dataset(val_encodings, np.array(val_labels))

# Small-batch (4) training loader. `train_dataloader` is not rebuilt in this cell.
small_batch_trainloader=DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)

In [None]:
num_classes = 6

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Replicate over at most two GPUs, but only over GPUs that actually exist;
# None defers to DataParallel's own default when no CUDA device is present.
device_ids = list(range(min(2, torch.cuda.device_count()))) or None
# NOTE: `model` must be the freshly loaded bare encoder here. Re-running this cell
# without re-running the loading cell would wrap an already wrapped network.
model = PlainBert_with_fcl(model, num_classes)
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0

    for batch in small_batch_trainloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        # The loss needs integer class indices.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()

    # Average over the batches of the loader that was actually iterated.
    # Dividing by len(train_dataloader) (the batch-size-32 loader) inflated the
    # reported loss by the ratio of the two batch counts.
    train_loss = total_loss / len(small_batch_trainloader)
    train_accuracy = total_correct / len(train_dataset)

    # Validation loop
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    # Rebuilt every epoch, so after the loop these hold the last epoch's results.
    val_predicted = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / len(val_dataset)

    print('[%d] loss: %.3f |  Val loss: %.3f | acc: %.2f  | Val acc: %.2f | ' % (epoch + 1, train_loss, val_loss, train_accuracy, val_accuracy))


[1] loss: 14.521 |  Val loss: 1.823 | acc: 0.17  | Val acc: 0.16 | 
[2] loss: 14.502 |  Val loss: 1.809 | acc: 0.17  | Val acc: 0.17 | 
[3] loss: 14.509 |  Val loss: 1.806 | acc: 0.17  | Val acc: 0.16 | 
[4] loss: 14.503 |  Val loss: 1.834 | acc: 0.17  | Val acc: 0.17 | 
[5] loss: 14.507 |  Val loss: 1.804 | acc: 0.16  | Val acc: 0.16 | 


In [None]:
# Function to tokenize texts
def tokenize_texts(tokenizer, texts, max_length=128):
    """Run ``tokenizer`` over ``texts`` and return its output.

    The tokenizer is called with truncation and padding enabled, the given
    ``max_length`` and ``return_tensors='pt'``.
    """
    options = {
        "truncation": True,
        "padding": True,
        "max_length": max_length,
        "return_tensors": 'pt',
    }
    return tokenizer(texts, **options)

# Function to create a dataset from tokenized texts and labels
def create_dataset(encodings, labels):
    """Bundle tokenizer output and class labels into a ``TensorDataset``.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        labels: Array-like of integer class ids, one per encoded text.

    Returns:
        ``TensorDataset`` yielding ``(input_ids, attention_mask, label)`` items.
    """
    # Class indices are stored as integers (the dtype nn.CrossEntropyLoss takes
    # for class targets) rather than float64; a later .long() cast is a no-op.
    labels_tensor = torch.tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels_tensor)

# Reload the tokenizer and a fresh bare encoder: the previous training cell rebound
# `model` to the wrapped network it trained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Same 80/20 split as before (same random_state).
train_texts, val_texts, train_labels, val_labels = train_test_split(df_balanced["text"], df_balanced["label"], test_size=0.2, random_state=42)

# Tokenize each split.
train_encodings = tokenize_texts(tokenizer, train_texts.tolist())
val_encodings = tokenize_texts(tokenizer, val_texts.tolist())

# Pair the encodings with their labels.
train_dataset = create_dataset(train_encodings, np.array(train_labels))
val_dataset = create_dataset(val_encodings, np.array(val_labels))

# Back to the large batch size (32) for this experiment.
train_dataloader=DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)

In [None]:
# Number of target classes; the label column holds six distinct values (0-5).
num_classes =6

class PlainBert_with_fcl(nn.Module):
    """Transformer encoder followed by one linear classification layer.

    Args:
        transformer_model: Encoder whose forward accepts ``input_ids`` and
            ``attention_mask`` keywords and returns an object exposing
            ``pooler_output``.
        num_classes: Number of output logits.
    """

    def __init__(self, transformer_model, num_classes):
        super().__init__()
        self.transformer = transformer_model
        # Size the head from the backbone's own config when it has one, so other
        # encoder widths work; 768 stays as the fallback.
        backbone_config = getattr(transformer_model, "config", None)
        hidden_size = getattr(backbone_config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits, one row per input sequence."""
        output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(output.pooler_output)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Replicate over at most two GPUs, but only over GPUs that actually exist;
# None defers to DataParallel's own default when no CUDA device is present.
device_ids = list(range(min(2, torch.cuda.device_count()))) or None
# NOTE: `model` must be the freshly loaded bare encoder here. Re-running this cell
# without re-running the loading cell would wrap an already wrapped network.
model = PlainBert_with_fcl(model, num_classes)
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        # The loss needs integer class indices.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()

    # Loss is averaged per batch, accuracy per sample.
    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / len(train_dataset)

    # Validation loop
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    # Rebuilt every epoch, so after the loop these hold the last epoch's results.
    val_predicted = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / len(val_dataset)

    print('[%d] loss: %.3f |  Val loss: %.3f | acc: %.2f  | Val acc: %.2f | ' % (epoch + 1, train_loss, val_loss, train_accuracy, val_accuracy))


[1] loss: 0.405 |  Val loss: 0.157 | acc: 0.86  | Val acc: 0.94 | 
[2] loss: 0.158 |  Val loss: 0.147 | acc: 0.94  | Val acc: 0.94 | 
[3] loss: 0.167 |  Val loss: 0.176 | acc: 0.94  | Val acc: 0.94 | 
[4] loss: 0.137 |  Val loss: 0.151 | acc: 0.95  | Val acc: 0.94 | 
[5] loss: 0.133 |  Val loss: 0.173 | acc: 0.95  | Val acc: 0.94 | 


# We can see that three of the models share similar performance, whereas the model combining a small batch size with a large learning rate performed poorly, with low training and validation accuracy. This may be caused by a bad initialization of the parameters and the difficulty of finding a good direction for updating the weights.

In [None]:
# Reload the tokenizer and a fresh bare encoder for the convolutional model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# NOTE(review): this split is taken from the full, unbalanced `df`, not from df_balanced
# as in the earlier experiments; confirm this is intended.
train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Convert the pandas objects to the plain list / array forms used below.
train_texts = train_texts.tolist()
val_texts = val_texts.tolist()
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)

# Same tokenizer settings as tokenize_texts, written inline.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Labels are stored as float64 here; the training loop casts them with .long().
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels,dtype=torch.float64))
val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels,dtype=torch.float64))

# Batches of 32; only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)


# We tried adding more layers to our model so that it may better capture the inherent patterns within the dataset. A convolutional layer was added with a kernel size of 3, and 256 filters will be learnt.

In [None]:
class PlainBert_with_fclWithConv(nn.Module):
    """Transformer encoder -> 1-D convolution over tokens -> max-pool -> linear head.

    The convolution runs along the token axis of the encoder's per-token
    states. Applying it to the single pooled vector (a length-1 sequence)
    would leave all but the centre kernel tap looking at zero padding.

    Args:
        transformer_model: Encoder whose forward accepts ``input_ids`` and
            ``attention_mask`` keywords and returns an object exposing
            ``last_hidden_state``.
        num_classes: Number of output logits.
        kernel_size: Width of the convolution window, in tokens.
        num_filters: Number of convolution output channels.
    """

    def __init__(self, transformer_model, num_classes, kernel_size=3, num_filters=256):
        super().__init__()
        self.transformer = transformer_model
        # Size the convolution from the backbone's config when it has one; 768 is the fallback.
        backbone_config = getattr(transformer_model, "config", None)
        hidden_size = getattr(backbone_config, "hidden_size", 768)
        # padding = kernel_size // 2 keeps the sequence length for odd kernels
        # (it equals the previous fixed padding of 1 for the default kernel of 3).
        self.conv = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=num_filters,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.fc = nn.Linear(num_filters, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits, one row per input sequence."""
        output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Assumes last_hidden_state is (batch, seq_len, hidden), as Hugging Face
        # BERT-style encoders return it.
        token_states = output.last_hidden_state
        # Zero the padded positions so their content cannot leak into neighbouring windows.
        keep = attention_mask.to(token_states.dtype).unsqueeze(-1)
        conv_in = (token_states * keep).transpose(1, 2)  # Conv1d wants (batch, channels, length)

        conv_out = F.relu(self.conv(conv_in))
        # Even kernel sizes yield one extra position; trim back to the input length.
        conv_out = conv_out[:, :, :attention_mask.size(1)]
        # Exclude padded positions from the max. Post-ReLU values are >= 0, so
        # filling with 0 can never beat a real activation.
        conv_out = conv_out.masked_fill(attention_mask.unsqueeze(1) == 0, 0.0)

        pooled_conv_out, _ = torch.max(conv_out, dim=2)
        return self.fc(pooled_conv_out)

In [None]:
num_classes = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Replicate over at most two GPUs, but only over GPUs that actually exist;
# None defers to DataParallel's own default when no CUDA device is present.
device_ids = list(range(min(2, torch.cuda.device_count()))) or None

# NOTE: `model` must be the freshly loaded bare encoder here. Re-running this cell
# without re-running the loading cell would wrap an already wrapped network.
model = PlainBert_with_fclWithConv(model, num_classes)
model = nn.DataParallel(model, device_ids=device_ids)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_correct = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        # The loss needs integer class indices.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).float()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_correct += (predicted == labels).sum().item()

    # Loss is averaged per batch, accuracy per sample.
    train_loss = total_loss / len(train_dataloader)
    train_accuracy = total_correct / len(train_dataset)

    # Validation loop
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    # Rebuilt every epoch, so after the loop these hold the last epoch's results.
    val_predicted = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).float()
            loss = criterion(outputs, labels)

            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_val_correct += (predicted == labels).sum().item()
            val_predicted.extend(predicted.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = total_val_correct / len(val_dataset)

    print('[%d] loss: %.3f |  Val loss: %.3f | acc: %.2f  | Val acc: %.2f | ' % (epoch + 1, train_loss, val_loss, train_accuracy, val_accuracy))

[1] loss: 0.943 |  Val loss: 0.328 | acc: 0.66  | Val acc: 0.92 | 
[2] loss: 0.247 |  Val loss: 0.198 | acc: 0.93  | Val acc: 0.94 | 
[3] loss: 0.158 |  Val loss: 0.172 | acc: 0.95  | Val acc: 0.95 | 


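# We may find a 2% increase in overall performance after the convolution layer was added (note: the validation reports shown in this notebook both read 0.95 accuracy, so this gain should be re-verified)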

In [None]:
# Per-class precision / recall / F1 for the predictions collected during the
# last validation pass of the training loop.
val_predicted = np.array(val_predicted)
val_labels = np.array(val_labels)

# Passing the label ids explicitly keeps target_names aligned with the classes
# even if some class is absent from the validation predictions and labels.
class_ids = list(range(num_classes))
report = classification_report(
    val_labels,
    val_predicted,
    labels=class_ids,
    target_names=[f'Class {i}' for i in class_ids],
)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.97      0.94      0.96       839
     Class 1       0.99      0.89      0.93       811
     Class 2       0.92      0.99      0.95       804
     Class 3       0.96      0.96      0.96       869
     Class 4       0.94      0.91      0.92       820
     Class 5       0.92      1.00      0.96       857

    accuracy                           0.95      5000
   macro avg       0.95      0.95      0.95      5000
weighted avg       0.95      0.95      0.95      5000

