In [1]:
import pandas as pd
import numpy as np
import pickle

# Load the three pre-cleaned datasets. Each file is opened in a context manager so
# its handle is closed as soon as the object has been unpickled.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load files that come from a trusted source.
with open('../data/connection_clean.pkl', 'rb') as fh:
    connection = pickle.load(fh)
with open('../data/subject_clean.pkl', 'rb') as fh:
    subject = pickle.load(fh)
with open('../data/objective_clean.pkl', 'rb') as fh:
    objective = pickle.load(fh)

In [21]:
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# WordPiece tokenizer for the uncased BERT checkpoint (lower-casing requested explicitly).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)
# Bare BERT encoder, loaded with output_hidden_states=True.
# NOTE(review): `model` is re-bound by later cells -- confirm this instance is still needed.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

In [3]:
# source: https://www.kaggle.com/code/nayansakhiya/text-classification-using-bert

def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
    max_len : int, default 512
        Length of every encoded row, including the two special tokens.

    Returns
    -------
    tuple of np.ndarray
        ``(token_ids, attention_masks, segment_ids)``; each is an int64 array
        of shape ``(len(texts), max_len)``. Masks are 1 on real tokens and 0 on
        padding; segment ids are all 0.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2, since ``[CLS]`` and ``[SEP]`` would
        not fit and rows would no longer share one length.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Leave room for the two special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    # Explicit dtype and reshape keep the result 2-D and integer-typed even
    # when `texts` is empty (np.array([]) alone would be a 1-D float array).
    return tuple(
        np.array(rows, dtype=np.int64).reshape(-1, max_len)
        for rows in (all_tokens, all_masks, all_segments)
    )

# connection_bert_tokens, connection_bert_masks, connection_bert_segments = bert_encode(connection['text'], tokenizer, max_len=512)
# subject_bert_tokens, subject_bert_masks, subject_bert_segments = bert_encode(subject['text'], tokenizer, max_len=512)
# objective_bert_tokens, objective_bert_masks, objective_bert_segments = bert_encode(objective['text'], tokenizer, max_len=512)

In [4]:
# Data Loading and Encoding
def load_and_encode_data(data, labels, tokenizer, multi_label=False, max_len=512):
    """Encode the ``'text'`` column of ``data`` and prepare its labels.

    Parameters
    ----------
    data : mapping or DataFrame
        Must expose the texts under the key ``'text'``.
    labels : array-like
        Class labels (single-label) or a label matrix (multi-label).
    tokenizer : object
        Tokenizer forwarded to ``bert_encode``.
    multi_label : bool, default False
        If True, ``labels`` is converted with ``np.array`` and otherwise left
        as is; if False, it is integer-encoded with a fresh ``LabelEncoder``.
    max_len : int, default 512
        Sequence length forwarded to ``bert_encode``.

    Returns
    -------
    tuple
        ``(tokens, masks, segments, labels_encoded)``.
    """
    tokens, masks, segments = bert_encode(data['text'], tokenizer, max_len=max_len)
    if multi_label:
        labels_encoded = np.array(labels)
    else:
        # The fitted encoder is not kept, so the class <-> id mapping is not
        # recoverable from the return value.
        labels_encoded = LabelEncoder().fit_transform(labels)
    return tokens, masks, segments, labels_encoded

In [5]:
# Connection task: single-label target taken from the 'connection' column.
(connection_bert_tokens,
 connection_bert_masks,
 connection_bert_segments,
 connection_y_encoded) = load_and_encode_data(
    connection, connection['connection'], tokenizer
)

# Subject task: single-label target taken from the 'subject' column.
(subject_bert_tokens,
 subject_bert_masks,
 subject_bert_segments,
 subject_y_encoded) = load_and_encode_data(
    subject, subject['subject'], tokenizer
)

# Objective task: multi-label target built from the column slice 1:14 of the frame.
(objective_bert_tokens,
 objective_bert_masks,
 objective_bert_segments,
 objective_y_encoded) = load_and_encode_data(
    objective, objective.iloc[:, 1:14], tokenizer, multi_label=True
)

In [14]:
# build BERT classifier
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and size the head for ``num_classes`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalised) class scores for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The head consumes the encoder's pooled output, with dropout applied first.
        return self.fc(self.dropout(encoded.pooler_output))


In [18]:
# define train function
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch with cross-entropy loss.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(input_ids, attention_mask)`` and expected to return logits.
    data_loader : iterable
        Yields batches indexable as ``batch[0]`` (input ids), ``batch[1]``
        (attention mask) and ``batch[3]`` (labels); ``batch[2]`` is ignored.
    optimizer, scheduler
        Both are stepped once per batch.
    device : str or torch.device
        Device the batch tensors are moved to.

    Returns
    -------
    float
        Mean training loss over the batches of this epoch (0.0 if the loader
        yielded no batches). Callers that ignore the return value are unaffected.
    """
    # Build the loss module once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[3].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1
    return running_loss / num_batches if num_batches else 0.0

In [17]:
# build evaluation
def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Batches are indexed as ``batch[0]`` (input ids), ``batch[1]`` (attention
    mask) and ``batch[3]`` (labels). The predicted class is the index of the
    largest logit in each row.

    Returns
    -------
    tuple
        ``(accuracy_score(...), classification_report(...))``, both computed
        with the true labels as first argument and the predictions as second.
    """
    model.eval()
    predicted = []
    actual = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (batch[i].to(device) for i in (0, 1, 3))
            logits = model(input_ids, attention_mask)
            predicted += torch.max(logits, dim=1).indices.cpu().tolist()
            actual += labels.cpu().tolist()
    return accuracy_score(actual, predicted), classification_report(actual, predicted)

---

### CONNECTION

In [9]:
# Stratified 70/30 hold-out split of the connection token ids for the baseline models.
X_train, X_test, y_train, y_test = train_test_split(
    connection_bert_tokens,
    connection_y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=connection_y_encoded,
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Baseline classifiers, keyed by the name printed above each report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, C=100),
    "KNN": KNeighborsClassifier(n_neighbors=20),
    # Seeded so the forest (bootstrap samples, feature subsets) is reproducible
    # across runs; 42 matches the seed used for train_test_split.
    "Random Forest": RandomForestClassifier(n_estimators=10, random_state=42),
    "SVM": SVC(C=10)
}

In [None]:
# Fit every baseline on the training split and report on the held-out split.
# The loop variable is `clf`, not `model`, so the global BERT `model` is not clobbered.
for model_name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(model_name)
    # zero_division=0 reports 0 for classes that are never predicted (the same
    # value as the default) without emitting a warning for each such class.
    print(classification_report(y_test, y_pred, zero_division=0))
    print()

---

BERT Classifier

In [11]:
# random split masks, segments, tokens, y_encoded
from torch.utils.data.dataset import random_split

# connection
# Bundle the four aligned arrays so each sample is (tokens, mask, segments, label).
dataset = TensorDataset(
    torch.tensor(connection_bert_tokens),
    torch.tensor(connection_bert_masks),
    torch.tensor(connection_bert_segments),
    torch.tensor(connection_y_encoded),
)
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the train/validation partition is the same on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

In [12]:
# Batch the two splits. Only the training loader shuffles: evaluation metrics do
# not depend on batch order, so the validation loader keeps a fixed order.
dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16, shuffle=False)

**Set Parameters**

1. scheduler: adjusts the learning rate during training. A well-tuned learning-rate schedule can help the model converge faster and to a better solution.
2. optimizer: the algorithm that updates the model's weights from the gradients during training.
3. steps: the total number of training steps, i.e. the number of optimizer updates: batches per epoch × number of epochs (`len(dataloader_train) * num_epochs`).


In [22]:
# Instantiate the model
num_classes = 2
model = BERTClassifier('bert-base-uncased', num_classes=num_classes)

# Pick the best available backend instead of assuming Apple's MPS is present:
# MPS first (the original target), then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

num_epochs = 5
learning_rate = 2e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(dataloader_train) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()

In [None]:
# Alternate one training pass with one validation pass and print the metrics per epoch.
for epoch in range(num_epochs):
    train(model, dataloader_train, optimizer, scheduler, device)
    accuracy, report = evaluate(model, dataloader_val, device)
    print(f"Epoch {epoch+1} Accuracy: {accuracy}", report, sep="\n")

---

### SUBJECT

In [26]:
# Repeat the pipeline for the subject task.
# First unbind the connection arrays and the objects built from them.
del (
    connection_bert_tokens,
    connection_bert_masks,
    connection_bert_segments,
    connection_y_encoded,
)
del (
    dataset,
    train_dataset,
    val_dataset,
    dataloader_train,
    dataloader_val,
)

In [27]:
# Stratified 70/30 hold-out split of the subject token ids.
X_train, X_test, y_train, y_test = train_test_split(
    subject_bert_tokens,
    subject_y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=subject_y_encoded,
)

# The loop variable is `clf`, not `model`, so the global BERT `model` is not clobbered.
for model_name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(model_name)
    # zero_division=0 reports 0 for classes that are never predicted (the same
    # value as the default) without emitting a warning for each such class.
    print(classification_report(y_test, y_pred, zero_division=0))
    print()

Logistic Regression
              precision    recall  f1-score   support

           0       0.57      0.47      0.52        34
           1       0.20      0.25      0.22         4
           2       0.35      0.50      0.42        22
           3       0.00      0.00      0.00         5

    accuracy                           0.43        65
   macro avg       0.28      0.31      0.29        65
weighted avg       0.43      0.43      0.42        65


KNN
              precision    recall  f1-score   support

           0       0.60      0.74      0.66        34
           1       0.00      0.00      0.00         4
           2       0.48      0.50      0.49        22
           3       0.00      0.00      0.00         5

    accuracy                           0.55        65
   macro avg       0.27      0.31      0.29        65
weighted avg       0.47      0.55      0.51        65


Random Forest
              precision    recall  f1-score   support

           0       0.52      0.79  

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# dataset and dataloader
# Bundle the four aligned arrays so each sample is (tokens, mask, segments, label).
dataset = TensorDataset(
    torch.tensor(subject_bert_tokens),
    torch.tensor(subject_bert_masks),
    torch.tensor(subject_bert_segments),
    torch.tensor(subject_y_encoded),
)
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the train/validation partition is the same on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; evaluation metrics do not depend on batch order.
dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# Fresh classifier for the subject task.
num_classes = 4
model = BERTClassifier('bert-base-uncased', num_classes=num_classes)

# Pick the best available backend instead of assuming Apple's MPS is present:
# MPS first (the original target), then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

num_epochs = 5
learning_rate = 2e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(dataloader_train) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss()

In [None]:
# Alternate one training pass with one validation pass and print the metrics per epoch.
for epoch in range(num_epochs):
    train(model, dataloader_train, optimizer, scheduler, device)
    accuracy, report = evaluate(model, dataloader_val, device)
    print(f"Epoch {epoch+1} Accuracy: {accuracy}", report, sep="\n")

---
### OBJECTIVE

In [30]:
# Unbind the subject arrays, the objects built from them, and the baseline splits.
del (
    subject_bert_tokens,
    subject_bert_masks,
    subject_bert_segments,
    subject_y_encoded,
)
del (
    dataset,
    train_dataset,
    val_dataset,
    dataloader_train,
    dataloader_val,
)
del (X_train, X_test, y_train, y_test)

In [33]:
# 70/30 hold-out split of the objective token ids; no stratification is requested here.
X_train, X_test, y_train, y_test = train_test_split(
    objective_bert_tokens,
    objective_y_encoded,
    test_size=0.3,
    random_state=42,
)

In [34]:
from sklearn.multioutput import MultiOutputClassifier

# Multi-label baselines: each estimator is wrapped in MultiOutputClassifier.
models = {
    "Logistic Regression": MultiOutputClassifier(LogisticRegression(max_iter=1000, C=100)),
    "KNN": MultiOutputClassifier(KNeighborsClassifier(n_neighbors=20)),
    # Seeded so the forest is reproducible across runs; 42 matches the seed
    # used for train_test_split.
    "Random Forest": MultiOutputClassifier(RandomForestClassifier(n_estimators=10, random_state=42)),
    "SVM": MultiOutputClassifier(SVC(C=10))
}

# Fit every multi-label baseline and report on the held-out split.
# The loop variable is `clf`, not `model`, so the global BERT `model` is not clobbered.
for model_name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(model_name)
    # zero_division=0 reports 0 for labels that are never predicted (the same
    # value as the default) without emitting a warning for each such label.
    print(classification_report(y_test, y_pred, zero_division=0))
    print()

Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.44      0.48      0.46        29
           3       0.00      0.00      0.00         5
           4       1.00      0.17      0.29         6
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         3
           7       0.29      0.17      0.21        12
           8       0.00      0.00      0.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         5
          11       0.43      0.60      0.50         5
          12       0.00      0.00      0.00         2

   micro avg       0.38      0.27      0.31        79
   macro avg       0.24      0.15      0.16        79
weighted avg       0.33      0.27      0.27        79
 samples avg       0.25      0.24      0.23        79


KNN


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.48      0.38      0.42        29
           3       0.00      0.00      0.00         5
           4       1.00      0.17      0.29         6
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         5
          11       1.00      0.20      0.33         5
          12       0.00      0.00      0.00         2

   micro avg       0.52      0.18      0.26        79
   macro avg       0.27      0.10      0.13        79
weighted avg       0.34      0.18      0.21        79
 samples avg       0.18      0.14      0.15        79


SVM
      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


---

BERT Classifier - Multilabel

In [50]:
class BERTClassifierMultilabel(nn.Module):
    """BERT encoder with dropout and a linear head emitting one logit per label."""

    def __init__(self, bert_model_name, num_labels):
        """Load the pretrained encoder and size the head for ``num_labels`` outputs."""
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw per-label logits; no sigmoid is applied here."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoded.pooler_output)
        return self.fc(pooled)

In [51]:
# Size the head from the label matrix itself so it always matches the number of
# label columns selected when the objective data was encoded.
num_labels = objective_y_encoded.shape[1]
model = BERTClassifierMultilabel('bert-base-uncased', num_labels=num_labels)
# loss function: binary cross-entropy on raw logits, one independent term per label
criterion = nn.BCEWithLogitsLoss()
# optimizer: reuses `learning_rate` set in the earlier model-setup cells
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [54]:
# Bundle the four aligned arrays so each sample is (tokens, mask, segments, label row).
dataset = TensorDataset(
    torch.tensor(objective_bert_tokens),
    torch.tensor(objective_bert_masks),
    torch.tensor(objective_bert_segments),
    torch.tensor(objective_y_encoded),
)
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the train/validation partition is the same on every run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; evaluation metrics do not depend on batch order.
dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
num_epochs = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Training: one optimizer step per batch, BCE-with-logits loss on float label rows.
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader_train:
        input_ids, attention_mask, _, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluation
# Collect predictions and labels from EVERY validation batch. Reporting on the
# loop variables after the loop would score only the final batch.
model.eval()
val_labels = []
val_preds = []
with torch.no_grad():
    for batch in dataloader_val:
        input_ids, attention_mask, _, labels = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        test_outputs = model(input_ids, attention_mask)
        # A label is predicted when its sigmoid probability exceeds 0.5; cast to
        # int so predictions and labels are both integer indicator matrices.
        test_preds = (torch.sigmoid(test_outputs) > 0.5).int().cpu().numpy()

        val_preds.append(test_preds)
        val_labels.append(labels.cpu().numpy())

# zero_division=0 reports 0 for labels that are never predicted (the same value
# as the default) without emitting a warning for each such label.
print(classification_report(np.concatenate(val_labels), np.concatenate(val_preds), zero_division=0))