# Install necessary packages

In [None]:
!pip install sentencepiece
!pip install datasets



# Set working directory

In [None]:
# Change path if not match your path of shared directory DS
# Make sure that you've mounted with your drive
%cd /content/drive/MyDrive/AI/DS

/content/drive/MyDrive/AI/DS


# **Setup - Import project's dependencies**

In [None]:
from transformers import T5EncoderModel, AutoTokenizer, DataCollatorWithPadding

import torch
from torch import nn
from torch.nn import functional as tf
from torch.utils.data import DataLoader

from datasets import Dataset

import numpy as np
from sklearn.preprocessing import MinMaxScaler

from matplotlib import pyplot as plt

import json
import math
from tqdm import tqdm

In [None]:
pretrained_model_name = "VietAI/vit5-base"

# **Design model**

In [None]:
class AttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super(AttentionPooling, self).__init__()
        self.proj = nn.Linear(2*input_dim, input_dim)
        self.attention = nn.Linear(input_dim, 1)
        self.scaling = math.sqrt(input_dim)

    def forward(self, x, attention_mask=None):
        # x shape: [batch_size, seq_len, input_dim]
        x = self.proj(x)
        scores = self.attention(x)
        scores = scores / self.scaling

        if attention_mask is not None:
          attention_mask = (1.0 - attention_mask[:, :, None].to(dtype=torch.float32)) * torch.finfo(torch.float32).min
          scores = scores + attention_mask

        attention_weights = tf.softmax(scores, dim=1)
        # attention_weights shape: [batch_size, seq_len, 1]
        output = torch.sum(attention_weights * x, dim=1)
        # output shape: [batch_size, input_dim]
        return output

In [None]:
class HousePricePredictionModel(nn.Module):
  def __init__(self, pretrained_model_name):
    super(HousePricePredictionModel, self).__init__()

    self.base_model = T5EncoderModel.from_pretrained(pretrained_model_name, output_hidden_states=True)

    self.pooler = AttentionPooling(768)
    self.regression_fc = nn.Linear(768, 1)

  def forward(self, input_ids, attention_mask):
    encoder_outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
    x = encoder_outputs.hidden_states[-2:]
    x = torch.cat([x[0], x[1]], dim=-1)
    x = tf.gelu(self.pooler(x, attention_mask))
    logits = self.regression_fc(x)
    return logits


# **Transform data**

## **Label**
- LogTransform -> MinMaxScaler
1. `Logarithmize labels` to avoid uneven distribution of value ranges
2. `Apply MinMaxScaler` to bring the value into the range `[0, 1]`

## **Input features**

- Feature columns: `["estate_type", "description", "square"]`
- Combine: `"Loại: {estate_type}. Diện tích: {square}. Mô tả: {description}"`


### Load json data

In [None]:
data_dir = "data/"

In [None]:
with open(data_dir + "train_dict.json", "r", encoding="utf-8") as f:
  train_dict = json.load(f)

with open(data_dir + "validation_dict.json", "r", encoding="utf-8") as f:
  validation_dict = json.load(f)

with open(data_dir + "test_dict.json", "r", encoding="utf-8") as f:
  test_dict = json.load(f)

### Process data

In [None]:
class Scaler:
  def __init__(self, scale_down_factor=1_000_000):
    self.scale_down_factor = scale_down_factor
    self.minmaxscaler = MinMaxScaler()

  def fit(self, data):
    if not isinstance(data, np.ndarray):
      raise ValueError()

    assert len(data.shape) == 2

    data = data/self.scale_down_factor
    data = np.log(data)
    self.minmaxscaler.fit(data)

  def transform(self, data):
    if not isinstance(data, np.ndarray):
      raise ValueError()

    assert len(data.shape) == 2

    scaled_data = data/self.scale_down_factor
    scaled_data = np.log(scaled_data)
    scaled_data = self.minmaxscaler.transform(scaled_data)
    return scaled_data

  def invert(self, data):
    if not isinstance(data, np.ndarray):
      raise ValueError()

    assert len(data.shape) == 2

    inverted_data = self.minmaxscaler.inverse_transform(data)
    inverted_data = np.exp(inverted_data)
    inverted_data = inverted_data*self.scale_down_factor
    return inverted_data

In [None]:
def normalized_label(scaler, labels, mode=None):
  if mode == "test":
    return [None]*len(labels)

  labels = np.asarray(labels)
  labels = labels[:, None]
  if mode == "train":
    scaler.fit(labels)

  return scaler.transform(labels)

def merge_and_encode_input(tokenizer, data):
  inputs = []
  for i in tqdm(range(len(data["price"]))):
    input_text = f"Loại: {data['estate_type'][i]}{tokenizer.eos_token}Diện tích: {data['square'][i]}{tokenizer.eos_token}Mô tả: {data['description'][i]}"

    inputs.append(tokenizer(input_text, return_attention_mask=False).input_ids)

  return inputs

def filter_length(data, max_ids_length):
  filtered_ids = []
  filtered_labels = []
  for _, (ids, label) in enumerate(tqdm(zip(data["input_ids"], data["labels"]))):
    if len(ids) <= max_ids_length:
      filtered_ids.append(ids)
      filtered_labels.append(label)

  return Dataset.from_dict({
      "input_ids": filtered_ids,
      "labels": filtered_labels
  })


def process_data(tokenizer, train, validation=None, test=None, max_ids_length=1024):
  scaler = Scaler(min(train["price"]))

  # train set
  train_data = {"input_ids": merge_and_encode_input(tokenizer, train),
                "labels": normalized_label(scaler, train["price"], mode="train")}
  train_data = filter_length(train_data, max_ids_length)
  # validation set

  if validation is not None:
    validation_data = {"input_ids": merge_and_encode_input(tokenizer, validation),
                "labels": normalized_label(scaler, validation["price"])}
    validation_data = filter_length(validation_data, max_ids_length)
  else:
    validation_data = None

  # test set
  if test is not None:
    test_data = {"input_ids": merge_and_encode_input(tokenizer, test),
                "labels": normalized_label(scaler, test["price"], mode="test")}
    test_data = filter_length(test_data, max_ids_length)
  else:
    test_data = None

  return {
      "scaler": scaler,
      "train": train_data,
      "validation": validation_data,
      "test": test_data
  }


In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [None]:
scaled_data = process_data(tokenizer, train_dict, validation_dict, test_dict)

100%|██████████| 199911/199911 [02:31<00:00, 1316.07it/s]
199911it [00:00, 1283462.76it/s]
100%|██████████| 43961/43961 [00:28<00:00, 1524.02it/s]
43961it [00:00, 1023705.82it/s]
100%|██████████| 6924/6924 [00:03<00:00, 1955.10it/s]
6924it [00:00, 1590174.72it/s]


In [None]:
scaled_data["scaler"].invert(np.asarray(scaled_data["validation"]["labels"]))

array([[3.0e+09],
       [3.8e+09],
       [3.3e+09],
       ...,
       [2.0e+09],
       [9.4e+09],
       [3.5e+09]])

In [None]:
# scaled_data["train"] = Dataset.from_dict(scaled_data["train"])
# scaled_data["validation"] = Dataset.from_dict(scaled_data["validation"])
# scaled_data["test"] = Dataset.from_dict(scaled_data["test"])

# **Training**

In [None]:
model = HousePricePredictionModel(pretrained_model_name)

In [None]:
train_dataloader = DataLoader(scaled_data["train"], collate_fn=DataCollatorWithPadding(tokenizer), batch_size=4, shuffle=True)
val_dataloader = DataLoader(scaled_data["validation"], collate_fn=DataCollatorWithPadding(tokenizer), batch_size=4)

In [None]:
epochs = 10
gradient_accumulation_steps = 8
steps_per_epoch = len(train_dataloader)
total_steps = epochs*(len(train_dataloader) // gradient_accumulation_steps)
warmup_steps = int(0.05*total_steps)

In [None]:
class WarmupLinearLR(torch.optim.lr_scheduler.LambdaLR):
    """ Linear warmup and then linear decay.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        Linearly decreases learning rate from 1. to 0. over remaining `total_steps - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, total_steps, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        super(WarmupLinearLR, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.total_steps - step) / float(max(1.0, self.total_steps - self.warmup_steps)))

In [None]:
no_decay = ['bias', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.001},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, weight_decay=0.0)
scheduler = WarmupLinearLR(optimizer, warmup_steps, total_steps)

In [None]:
total_steps

62450

In [None]:
history = {
    "train": [],
    "validation": []
}

criterion = nn.MSELoss()

optimizer.zero_grad()

global_steps = 0

best_val_loss = float("inf")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Found device:", device)

model = model.to(device)

model.train()

for ep in range(1, epochs + 1):
  print("Epoch:", ep)
  epoch_loss = 0.0
  local_steps = 0
  for step, batch in enumerate(tqdm(train_dataloader)):
    logits = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
    loss = criterion(logits, batch["labels"].to(device))
    epoch_loss += loss.item()
    loss = loss / gradient_accumulation_steps
    loss.backward()

    if ((step + 1) % gradient_accumulation_steps == 0) or (step + 1 == steps_per_epoch):
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()

      local_steps += 1
      global_steps += 1

      newline = False

      if local_steps % 100 == 0 or (step + 1 == steps_per_epoch):
        print()
        print(f"    mse_loss:{round(epoch_loss / (step + 1), 4)}")
        newline = True

      if global_steps % 100 == 0:
        history["train"].append((global_steps, epoch_loss/(step + 1)))

      if (local_steps % 500 == 0) or (step + 1 == steps_per_epoch):
        model.eval()
        total_val_loss = 0
        for _, val_batch in enumerate(val_dataloader):
          val_logits = model(val_batch["input_ids"].to(device), val_batch["attention_mask"].to(device))
          val_loss = criterion(val_logits, val_batch["labels"].to(device))
          total_val_loss += val_loss.item()

        total_val_loss /= len(val_dataloader)
        if not newline:
          print()
        print(f"    val_mse_loss: {round(total_val_loss, 4)}")
        history["validation"].append((global_steps, total_val_loss))
        model.train()

        if total_val_loss < best_val_loss:
          print(f"Update val_loss from {best_val_loss} to {total_val_loss}.")
          best_val_loss = total_val_loss

          torch.save(model.state_dict(), "checkpoint/model_state_dict.pt")



Found device: cuda
Epoch: 1


  0%|          | 0/49964 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  2%|▏         | 800/49964 [03:02<3:35:11,  3.81it/s]


    mse_loss:0.343


  3%|▎         | 1600/49964 [06:04<2:39:45,  5.05it/s]


    mse_loss:0.192


  5%|▍         | 2400/49964 [09:11<2:47:25,  4.73it/s]


    mse_loss:0.1356


  6%|▋         | 3200/49964 [12:14<2:30:22,  5.18it/s]


    mse_loss:0.1058


  8%|▊         | 3999/49964 [15:19<2:50:17,  4.50it/s]


    mse_loss:0.0879
    val_mse_loss: 0.0125
Update val_loss from inf to 0.012494099604589465.


 10%|▉         | 4800/49964 [32:52<3:12:35,  3.91it/s]


    mse_loss:0.0756


 11%|█         | 5601/49964 [35:54<2:27:12,  5.02it/s]


    mse_loss:0.0666


 13%|█▎        | 6400/49964 [38:57<2:54:02,  4.17it/s]


    mse_loss:0.0596


 14%|█▍        | 7200/49964 [42:01<3:47:53,  3.13it/s]


    mse_loss:0.054


 16%|█▌        | 7999/49964 [45:06<3:01:20,  3.86it/s]


    mse_loss:0.0494
    val_mse_loss: 0.0124
Update val_loss from 0.012494099604589465 to 0.012356780385189158.


 18%|█▊        | 8800/49964 [1:02:50<2:37:43,  4.35it/s]


    mse_loss:0.0456


 19%|█▉        | 9600/49964 [1:05:54<3:14:28,  3.46it/s]


    mse_loss:0.0425


 21%|██        | 10400/49964 [1:08:55<2:32:57,  4.31it/s]


    mse_loss:0.0399


 22%|██▏       | 11201/49964 [1:11:59<2:29:52,  4.31it/s]


    mse_loss:0.0376


 24%|██▍       | 11999/49964 [1:15:00<1:51:33,  5.67it/s]


    mse_loss:0.0355
    val_mse_loss: 0.0075
Update val_loss from 0.012356780385189158 to 0.007519347937811621.


 26%|██▌       | 12800/49964 [1:32:40<3:51:49,  2.67it/s]


    mse_loss:0.0337


 27%|██▋       | 13600/49964 [1:35:44<2:12:36,  4.57it/s]


    mse_loss:0.0321


 29%|██▉       | 14400/49964 [1:38:50<1:48:54,  5.44it/s]


    mse_loss:0.0307


 30%|███       | 15200/49964 [1:41:56<2:07:18,  4.55it/s]


    mse_loss:0.0296


 32%|███▏      | 15999/49964 [1:44:59<2:02:53,  4.61it/s]


    mse_loss:0.0284
    val_mse_loss: 0.007
Update val_loss from 0.007519347937811621 to 0.007012768230643227.


 34%|███▎      | 16800/49964 [2:02:30<2:22:29,  3.88it/s]


    mse_loss:0.0275


 35%|███▌      | 17600/49964 [2:05:37<2:49:58,  3.17it/s]


    mse_loss:0.0265


 37%|███▋      | 18400/49964 [2:08:38<1:55:47,  4.54it/s]


    mse_loss:0.0256


 38%|███▊      | 19200/49964 [2:11:41<1:54:19,  4.48it/s]


    mse_loss:0.0248


 40%|████      | 19999/49964 [2:14:45<1:35:05,  5.25it/s]


    mse_loss:0.024


 40%|████      | 20000/49964 [2:29:15<2174:28:16, 261.25s/it]

    val_mse_loss: 0.0079


 42%|████▏     | 20800/49964 [2:32:17<2:09:50,  3.74it/s]


    mse_loss:0.0232


 43%|████▎     | 21600/49964 [2:35:21<1:30:15,  5.24it/s]


    mse_loss:0.0226


 45%|████▍     | 22400/49964 [2:38:27<1:37:00,  4.74it/s]


    mse_loss:0.0219


 46%|████▋     | 23201/49964 [2:41:30<1:29:23,  4.99it/s]


    mse_loss:0.0213


 48%|████▊     | 23999/49964 [2:44:33<1:35:53,  4.51it/s]


    mse_loss:0.0209
    val_mse_loss: 0.0033
Update val_loss from 0.007012768230643227 to 0.003270867454466307.


 50%|████▉     | 24800/49964 [3:02:18<1:47:10,  3.91it/s]


    mse_loss:0.0203


 51%|█████     | 25600/49964 [3:05:25<1:29:56,  4.51it/s]


    mse_loss:0.0198


 53%|█████▎    | 26400/49964 [3:08:29<1:19:05,  4.97it/s]


    mse_loss:0.0194


 54%|█████▍    | 27200/49964 [3:11:31<1:19:48,  4.75it/s]


    mse_loss:0.0189


 56%|█████▌    | 27999/49964 [3:14:33<1:21:36,  4.49it/s]


    mse_loss:0.0186


 56%|█████▌    | 28001/49964 [3:28:59<1111:02:18, 182.11s/it]

    val_mse_loss: 0.0036


 58%|█████▊    | 28800/49964 [3:32:00<1:16:46,  4.59it/s]


    mse_loss:0.0182


 59%|█████▉    | 29600/49964 [3:35:05<1:13:47,  4.60it/s]


    mse_loss:0.0178


 61%|██████    | 30400/49964 [3:38:10<1:06:46,  4.88it/s]


    mse_loss:0.0174


 62%|██████▏   | 31200/49964 [3:41:16<57:23,  5.45it/s]  


    mse_loss:0.0171


 64%|██████▍   | 31999/49964 [3:44:25<1:27:44,  3.41it/s]


    mse_loss:0.0168


 64%|██████▍   | 32001/49964 [3:58:54<911:03:44, 182.59s/it] 

    val_mse_loss: 0.0037


 66%|██████▌   | 32801/49964 [4:01:52<1:16:34,  3.74it/s]


    mse_loss:0.0165


 66%|██████▌   | 32938/49964 [4:02:23<2:05:17,  2.26it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-21-74f2061d1db8>", line 30, in <cell line: 21>
    loss.backward()
  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 492, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 251, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, anothe

In [25]:
!pwd

shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
pwd: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected


In [None]:
torch.save(history, "/content/history.pt")

In [None]:
history

{'train': [(100, 0.34296076580649243),
  (200, 0.19204115401253147),
  (300, 0.13557909917184588),
  (400, 0.10578545405207479),
  (500, 0.08789204291180795),
  (600, 0.07555573743092889),
  (700, 0.06660222473950593),
  (800, 0.059637631347272874),
  (900, 0.05400406651394911),
  (1000, 0.04941117544225562),
  (1100, 0.04562390729314674),
  (1200, 0.042510896982338786),
  (1300, 0.03992522787426354),
  (1400, 0.03762528118944504),
  (1500, 0.03552466011215377),
  (1600, 0.03373269474016013),
  (1700, 0.03214559967702732),
  (1800, 0.030693790687194376),
  (1900, 0.02961599357585723),
  (2000, 0.028444992745657715),
  (2100, 0.027455283573894488),
  (2200, 0.026469039698565976),
  (2300, 0.025570998157608715),
  (2400, 0.02477071847555436),
  (2500, 0.02396307180886606),
  (2600, 0.0232318351419661),
  (2700, 0.022599394484175937),
  (2800, 0.021942619631843904),
  (2900, 0.02134170469466512),
  (3000, 0.02086111937586755),
  (3100, 0.0203286289113232),
  (3200, 0.019829305913787464),
