In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset path used later in
# this notebook lives underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Location of the tab-separated training file on the mounted Google Drive.
data_path = '/content/drive/My Drive/Data_Mining/training_set_rel3.tsv'

# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(
    data_path,
    sep='\t',
    encoding='ISO-8859-1',
)

# Tokenizer matching the 'bert-base-uncased' checkpoint used by the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_texts(texts, tokenizer, max_len=512):
    """Tokenize a collection of texts and return the ids and attention mask.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the raw strings to encode.
        tokenizer: Callable tokenizer; it is invoked with truncation and
            padding enabled and asked for PyTorch tensors
            (``return_tensors='pt'``).
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors='pt',
    )
    batch = tokenizer(texts.tolist(), **tokenizer_options)
    token_ids = batch['input_ids']
    mask = batch['attention_mask']
    return token_ids, mask

# Tokenize every essay up front; both returned tensors have one row per essay.
input_ids, attention_mask = preprocess_texts(df['essay'], tokenizer)

import torch
# Regression targets: the 'domain1_score' column as a tensor.
labels = torch.tensor(df['domain1_score'].values)

# Split ids, masks and labels in ONE call per stage so the three stay
# row-aligned by construction. Splitting the masks in a separate call only
# stays aligned while both calls share the same length, test_size and
# random_state, which is easy to break silently when editing one of them.
# The resulting partition is the same as before: 70% train, 15% val, 15% test.
X_train, X_temp, train_mask, temp_mask, y_train, y_temp = train_test_split(
    input_ids, attention_mask, labels, test_size=0.3, random_state=42
)

X_val, X_test, val_mask, test_mask, y_val, y_test = train_test_split(
    X_temp, temp_mask, y_temp, test_size=0.5, random_state=42
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import BertModel
import torch.nn as nn

class BERTRegressor(nn.Module):
    """BERT encoder followed by a small MLP head that emits one value per input.

    The head is: dropout -> Linear(hidden_size, 256) -> ReLU -> dropout ->
    Linear(256, 1), applied to the hidden state at sequence position 0.

    Args:
        dropout: Dropout probability shared by both dropout applications.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the encoder's configured hidden size.
        self.linear = nn.Linear(self.bert.config.hidden_size, 256)
        self.relu = nn.ReLU()
        self.out = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of token ids and attention masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.relu(self.linear(self.dropout(first_token_state)))
        return self.out(self.dropout(hidden))

# Build the regressor with its default dropout; constructing it loads the
# pretrained 'bert-base-uncased' weights via BertModel.from_pretrained.
model = BERTRegressor()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
import numpy as np

# Bundle ids, masks and labels so every batch yields the three together.
train_data = TensorDataset(X_train, train_mask, y_train)
val_data = TensorDataset(X_val, val_mask, y_val)
test_data = TensorDataset(X_test, test_mask, y_test)

# Only the training loader shuffles; validation/test keep their stored order.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)
test_loader = DataLoader(test_data, batch_size=16)

# Move the model to its device BEFORE building the optimizer, so the optimizer
# is constructed over the parameters as they will be used during training.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW. It is kept here because the two differ in defaults
# (e.g. weight decay, eps) - confirm the intended settings before switching.
optimizer = AdamW(model.parameters(), lr=3e-5)
loss_fn = nn.MSELoss()

def train(model, train_loader, val_loader, optimizer, loss_fn, epochs=10):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches used for optimisation.
        val_loader: Iterable of batches with the same layout, evaluated
            without gradient tracking after every training pass.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported via ``print``.
    """
    # Batches are sent to the device the model actually lives on, so the
    # function does not depend on a notebook-level ``device`` variable that
    # may be stale or undefined after a kernel restart.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            input_ids, attention_mask, labels = [item.to(run_device) for item in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # squeeze(-1) removes only the trailing output dimension. A bare
            # squeeze() would also drop the batch dimension for a batch of
            # one, making the loss broadcast a 0-d prediction against a
            # (1,)-shaped target.
            loss = loss_fn(outputs.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        val_loss = 0.0
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [item.to(run_device) for item in batch]
                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs.squeeze(-1), labels.float())
                val_loss += loss.item()

        # Guard the averages so an empty loader reports 0.0 instead of raising
        # ZeroDivisionError.
        train_batches = max(len(train_loader), 1)
        val_batches = max(len(val_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/train_batches}, Val Loss: {val_loss/val_batches}")

# Run 10 epochs; train() prints the mean train and validation loss per epoch.
train(model, train_loader, val_loader, optimizer, loss_fn, epochs=10)




Epoch 1/10, Train Loss: 42.36474877942196, Val Loss: 11.332362229218248
Epoch 2/10, Train Loss: 6.710108077022391, Val Loss: 5.841879755014279
Epoch 3/10, Train Loss: 3.9840647572467867, Val Loss: 4.312326348707324
Epoch 4/10, Train Loss: 3.311872138891002, Val Loss: 3.580108459244986
Epoch 5/10, Train Loss: 2.640248112170629, Val Loss: 3.589663312449807
Epoch 6/10, Train Loss: 2.475804371680592, Val Loss: 3.677722912342822
Epoch 7/10, Train Loss: 2.3274308241851314, Val Loss: 3.2738018465823813
Epoch 8/10, Train Loss: 2.0107503380941254, Val Loss: 2.7970713071647237
Epoch 9/10, Train Loss: 1.8046970986826738, Val Loss: 2.8060486806464975
Epoch 10/10, Train Loss: 1.567525113185107, Val Loss: 3.4169895282534304


In [None]:
import os

import torch

# data_path is the path of the TSV *file*, so concatenating a file name onto it
# would write ".../training_set_rel3.tsvbert_model.pth". Save the weights as
# "bert_model.pth" in the dataset's directory instead.
model_save_path = os.path.join(os.path.dirname(data_path), 'bert_model.pth')
torch.save(model.state_dict(), model_save_path)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
import numpy as np

def evaluate(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect predictions and labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode before inference.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Tuple ``(predictions, true_labels)`` of flat Python lists with one
        entry per example, in the order the loader produced them.
    """
    # Batches are sent to the device the model actually lives on, so the
    # function does not depend on a notebook-level ``device`` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = [item.to(run_device) for item in batch]
            outputs = model(input_ids, attention_mask)
            # squeeze(-1) keeps the batch dimension. A bare squeeze() turns a
            # batch of one into a 0-d array, which list.extend() cannot
            # iterate (TypeError).
            predictions.extend(outputs.squeeze(-1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return predictions, true_labels

def calculate_metrics(y_true, y_pred):
    """Compute regression errors and a quadratic-weighted kappa.

    Args:
        y_true: Reference scores.
        y_pred: Predicted scores; they are rounded and cast to ``int`` only
            for the kappa computation.

    Returns:
        Tuple ``(mse, rmse, mae, kappa)``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(y_true, y_pred)
    # Kappa compares discrete ratings, so predictions are rounded to integers.
    rounded_scores = np.round(y_pred).astype(int)
    agreement = cohen_kappa_score(y_true, rounded_scores, weights='quadratic')
    return squared_error, root_error, absolute_error, agreement

def print_metrics(set_name, mse, rmse, mae, kappa):
    """Print a labelled metric report for one data split, then a blank line.

    Args:
        set_name: Name of the split, used in the heading line.
        mse: Mean squared error to report.
        rmse: Root mean squared error to report.
        mae: Mean absolute error to report.
        kappa: Cohen's kappa score to report.
    """
    report_lines = (
        f"{set_name} set evaluation:",
        f'Mean Squared Error (MSE): {mse}',
        f'Root Mean Squared Error (RMSE): {rmse}',
        f'Mean Absolute Error (MAE): {mae}',
        f'Cohen\'s Kappa Score: {kappa}',
    )
    for report_line in report_lines:
        print(report_line)
    # Blank separator so consecutive reports do not run together.
    print()

# Re-select the device in this cell and make sure the model is on it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training split.
y_train_pred, y_train_true = evaluate(model, train_loader)
train_scores = calculate_metrics(y_train_true, y_train_pred)
train_mse, train_rmse, train_mae, train_kappa = train_scores
print_metrics("Training", *train_scores)

# Validation split.
y_val_pred, y_val_true = evaluate(model, val_loader)
val_scores = calculate_metrics(y_val_true, y_val_pred)
val_mse, val_rmse, val_mae, val_kappa = val_scores
print_metrics("Validation", *val_scores)

# Held-out test split.
y_test_pred, y_test_true = evaluate(model, test_loader)
test_scores = calculate_metrics(y_test_true, y_test_pred)
test_mse, test_rmse, test_mae, test_kappa = test_scores
print_metrics("Test", *test_scores)