In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!nvidia-smi

Sat Mar 11 06:51:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    25W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers

In [4]:
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import shutil
import sys
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
train_path = "/content/drive/MyDrive/KLTN/Dataset/act/csv/train.csv"
test_path = "/content/drive/MyDrive/KLTN/Dataset/act/csv/test.csv"

In [6]:
train_df = pd.read_csv(train_path, delimiter="\t")
test_df = pd.read_csv(test_path, delimiter="\t")

In [7]:
label_list = train_df.columns[1:]
label_list

Index(['Attraction-Inform', 'Attraction-NoOffer', 'Attraction-Recommend',
       'Attraction-Request', 'Attraction-Select', 'Booking-Book',
       'Booking-Inform', 'Booking-NoBook', 'Booking-Request',
       'Hospital-Inform', 'Hospital-Request', 'Hotel-Inform', 'Hotel-NoOffer',
       'Hotel-Recommend', 'Hotel-Request', 'Hotel-Select', 'Police-Inform',
       'Police-Request', 'Restaurant-Inform', 'Restaurant-NoOffer',
       'Restaurant-Recommend', 'Restaurant-Request', 'Restaurant-Select',
       'Taxi-Inform', 'Taxi-Request', 'Train-Inform', 'Train-NoOffer',
       'Train-OfferBook', 'Train-OfferBooked', 'Train-Request', 'Train-Select',
       'general-bye', 'general-greet', 'general-reqmore', 'general-thank',
       'general-welcome'],
      dtype='object')

In [8]:
def get_labels(df):
    labels = []
    for i in range(len(df)):
        row = []
        for j in label_list:
            if ((j in df.columns) and (df.iloc[i][j] == 1)):
                row.append(1)
            else:
                row.append(0)
        labels.append(row)
    return labels

# Y_test = get_labels(test_df)

In [9]:
from transformers import BertTokenizer, BertModel

In [10]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 1e-05

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, is_test_df):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text = self.df['utterance']
        if (is_test_df):
            self.labels = get_labels(self.df)
        else:
            self.labels = self.df[label_list].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'labels': torch.FloatTensor(self.labels[index])
        }

In [12]:
class BERTClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BERTClassifier, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs['pooler_output']
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

In [13]:
def predict(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device, dtype = torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)

            logits = model(input_ids, attention_mask, token_type_ids)

            predictions.append(torch.sigmoid(logits).cpu().detach().numpy())
    return np.concatenate(predictions)

In [14]:
def load_checkpoint(checkpoint_path, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # Load checkpoint
    checkpoint = torch.load(checkpoint_path)
    # Initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # Initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # Initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # Return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

**Main:**

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [None]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BERTClassifier(len(label_list))
model.to(device)

In [17]:
# Initialize optimizer and loss function
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.BCEWithLogitsLoss()

In [18]:
# Load data
train_size = 0.8
train_df2 = train_df.sample(frac=train_size, random_state=200)
val_df = train_df.drop(train_df2.index).reset_index(drop=True)
train_df=train_df2.reset_index(drop=True)

train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN, False)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN, False)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN, True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

**K-Nearest Neighbors Classification:**

In [19]:
# Load the model
checkpoint_path = "/content/drive/MyDrive/KLTN/Model/BERT/best_model.pt"

model, optimizer, epoch, loss = load_checkpoint(checkpoint_path, model, optimizer)

In [20]:
def get_features_and_labels(model, dataloader, device):
    model.eval()
    features = []
    labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device, dtype = torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)

            labels.append(batch['labels'])

            last_hidden_states = model(input_ids, attention_mask, token_type_ids).cpu().numpy()
            
            features.append(last_hidden_states)
    return np.concatenate(features), np.concatenate(labels)

In [21]:
train_features, train_labels = get_features_and_labels(model, train_dataloader, device)

In [22]:
test_features, test_labels = get_features_and_labels(model, test_dataloader, device)

In [23]:
# Create KNN Classifier object
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

# Train the model using the training sets
knn.fit(train_features, train_labels)

In [24]:
# Find the distances and indices of the k-neighbors of each point
distances, indices = knn.kneighbors(test_features)

In [25]:
# Define custom weight function
def my_weight_func(distances, temperature):
    weights = []

    for i in distances:
        weights_item = []
        
        denominator = 0
        for j in i:
            denominator += math.exp((-1)*(j)/temperature)

        for k in i:
            numerator = math.exp((-1)*(k)/temperature)
            alpha = numerator / denominator
            weights_item.append(alpha)

        weights.append(weights_item)

    return np.array(weights)

In [26]:
# Compute weights for each test point
weights = my_weight_func(distances, 1)

In [27]:
# Compute the weighted sum of the y values for each test point
y_pred_KNN = np.zeros((test_features.shape[0], train_features.shape[1]))
for i in range(test_features.shape[0]):
    for j in range(indices.shape[1]):
        y_pred_KNN[i] += weights[i, j] * knn._y[indices[i, j]]

In [28]:
# Print the predicted y values
print("Predicted y values:", y_pred_KNN)

Predicted y values: [[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [1.  0.  0.  ... 1.  0.  0. ]
 [0.  0.  0.  ... 0.  1.  0. ]
 [0.  0.  0.  ... 0.  0.2 0.2]]


In [29]:
# Evaluate KNN model
threshold = 0.5

y_true = test_labels
y_pred = y_pred_KNN

for item in y_pred:
    for j in range(len(y_pred[0])):
        if (item[j] > threshold):
            item[j] = 1
        else:
            item[j] = 0

print("Test Accuracy : {}".format(accuracy_score(y_true, y_pred)))
print("\nClassification Report : ")
print(classification_report(y_true, y_pred, target_names=label_list))

Test Accuracy : 0.7767227346717309

Classification Report : 
                      precision    recall  f1-score   support

   Attraction-Inform       0.90      0.89      0.90      1522
  Attraction-NoOffer       0.88      0.87      0.87        60
Attraction-Recommend       0.72      0.66      0.69       148
  Attraction-Request       0.78      0.71      0.74       676
   Attraction-Select       0.64      0.51      0.57        55
        Booking-Book       0.90      0.96      0.93       537
      Booking-Inform       0.93      0.90      0.91       564
      Booking-NoBook       0.97      0.97      0.97       131
     Booking-Request       0.93      0.94      0.93       321
     Hospital-Inform       0.00      0.00      0.00         0
    Hospital-Request       0.00      0.00      0.00         0
        Hotel-Inform       0.89      0.89      0.89      2156
       Hotel-NoOffer       0.86      0.85      0.86        67
     Hotel-Recommend       0.72      0.64      0.68       140
       H

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Combine 2 models
lamb = 0.5

y_pred_BERT = predict(model, test_dataloader, device)
y_pred_KNN

output = np.add(np.multiply(lamb, y_pred_KNN), np.multiply((1 - lamb), y_pred_BERT))

In [31]:
# Evaluate BERT
threshold = 0.5

y_true = test_labels
y_pred = y_pred_BERT

for item in y_pred:
    for j in range(len(y_pred[0])):
        if (item[j] > threshold):
            item[j] = 1
        else:
            item[j] = 0

print("Test Accuracy : {}".format(accuracy_score(y_true, y_pred)))
print("\nClassification Report : ")
print(classification_report(y_true, y_pred, target_names=label_list))

Test Accuracy : 0.7755018990775909

Classification Report : 
                      precision    recall  f1-score   support

   Attraction-Inform       0.90      0.90      0.90      1522
  Attraction-NoOffer       0.89      0.90      0.89        60
Attraction-Recommend       0.74      0.62      0.67       148
  Attraction-Request       0.87      0.62      0.73       676
   Attraction-Select       0.67      0.53      0.59        55
        Booking-Book       0.90      0.94      0.92       537
      Booking-Inform       0.92      0.90      0.91       564
      Booking-NoBook       0.98      0.93      0.95       131
     Booking-Request       0.93      0.93      0.93       321
     Hospital-Inform       0.00      0.00      0.00         0
    Hospital-Request       0.00      0.00      0.00         0
        Hotel-Inform       0.89      0.90      0.90      2156
       Hotel-NoOffer       0.83      0.88      0.86        67
     Hotel-Recommend       0.79      0.66      0.72       140
       H

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Evaluate BERT + kNN
threshold = 0.5

y_true = test_labels
y_pred = output

for item in y_pred:
    for j in range(len(y_pred[0])):
        if (item[j] > threshold):
            item[j] = 1
        else:
            item[j] = 0

print("Test Accuracy : {}".format(accuracy_score(y_true, y_pred)))
print("\nClassification Report : ")
print(classification_report(y_true, y_pred, target_names=label_list))

Test Accuracy : 0.7767227346717309

Classification Report : 
                      precision    recall  f1-score   support

   Attraction-Inform       0.90      0.89      0.90      1522
  Attraction-NoOffer       0.88      0.87      0.87        60
Attraction-Recommend       0.72      0.66      0.69       148
  Attraction-Request       0.78      0.71      0.74       676
   Attraction-Select       0.64      0.51      0.57        55
        Booking-Book       0.90      0.96      0.93       537
      Booking-Inform       0.93      0.90      0.91       564
      Booking-NoBook       0.97      0.97      0.97       131
     Booking-Request       0.93      0.94      0.93       321
     Hospital-Inform       0.00      0.00      0.00         0
    Hospital-Request       0.00      0.00      0.00         0
        Hotel-Inform       0.89      0.89      0.89      2156
       Hotel-NoOffer       0.86      0.85      0.86        67
     Hotel-Recommend       0.72      0.64      0.68       140
       H

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
