In [None]:
!pip install sentencepiece
!pip install datasets

In [1]:
# Change this path if it does not match the location of your shared DS directory.
# Make sure your drive is mounted before running this cell.
%cd /content/drive/MyDrive/AI/DS

/content/drive/MyDrive/AI/DS


In [3]:
import json
import math

import numpy as np
import torch
from datasets import Dataset
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from torch import nn
from torch.nn import functional as tf
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import T5EncoderModel, AutoTokenizer, DataCollatorWithPadding

from modeling import create_model
from scaler import Scaler

In [None]:
# Hugging Face model id; used to load the tokenizer and passed to create_model.
pretrained_model_name = "VietAI/vit5-base"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# scaler.pt is expected to hold a pickled scaler object rather than plain
# tensors, so weights_only must be False: PyTorch >= 2.6 defaults it to True
# and refuses arbitrary pickled classes.
# Only load files you trust -- unpickling can execute arbitrary code.
scaler = torch.load("scaler.pt", weights_only=False)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Build the model with the project's create_model helper (imported from modeling).
# NOTE(review): presumably loads the weights stored at state_dict_path and
# places the model on `device` -- confirm in modeling.py.
model = create_model(pretrained_model_name, state_dict_path="checkpoint/model_state_dict.pt", device=device)

In [None]:
def transform_input(tokenizer, input_dict):
  """Serialize one listing into the prompt format and tokenize it.

  The prompt joins three labelled fields -- estate type ("Loại"), area
  ("Diện tích") and description ("Mô tả") -- with the tokenizer's EOS token.

  Args:
    tokenizer: Callable tokenizer that exposes an `eos_token` attribute.
    input_dict: Mapping with the keys 'estate_type', 'square' and
      'description'.

  Returns:
    Whatever `tokenizer(prompt, return_tensors="pt")` returns.
  """
  separator = f"{tokenizer.eos_token}"
  fields = (
      f"Loại: {input_dict['estate_type']}",
      f"Diện tích: {input_dict['square']}",
      f"Mô tả: {input_dict['description']}",
  )
  prompt = separator.join(fields)

  return tokenizer(prompt, return_tensors="pt")

In [None]:
def predict(model, scaler, input_encoding):
  """Run the model on one encoded batch and return prices in original units.

  Args:
    model: A `torch.nn.Module`, called as `model(**input_encoding)`.
    scaler: Object whose `invert(array)` maps model outputs back to prices.
    input_encoding: Mapping of keyword arguments for the model, e.g. the
      output of `transform_input`.

  Returns:
    The inverted predictions converted to nested Python lists.
  """
  model.eval()

  # Inputs built by the tokenizer are not moved to any device, while the
  # model may sit on the GPU. Put every tensor next to the model weights so
  # the forward pass does not fail with a device mismatch.
  first_param = next(model.parameters(), None)
  if first_param is not None:
    input_encoding = {
        key: value.to(first_param.device) if torch.is_tensor(value) else value
        for key, value in input_encoding.items()
    }

  with torch.no_grad():
    logits = model(**input_encoding)

  # numpy() needs a CPU tensor, hence the explicit .cpu().
  prices = scaler.invert(logits.cpu().numpy())
  return prices.tolist()

In [None]:
# Smoke-test the prompt builder on a small hand-written listing
# (transform_input requires the listing dict as its second argument).
transform_input(tokenizer, {
    "estate_type": "Đất",
    "square": 140.0,
    "description": "Bán đất Cổ Linh, gần ngã tư Thạch Bàn",
})

In [4]:
data_dir = "data/"

# Read the validation and test splits from their JSON files.
with open(f"{data_dir}validation_dict.json", mode="r", encoding="utf-8") as f:
  validation_dict = json.loads(f.read())

with open(f"{data_dir}test_dict.json", mode="r", encoding="utf-8") as f:
  test_dict = json.loads(f.read())

In [None]:
# NOTE(review): in the cells visible here `random_index` is only assigned in a
# commented-out line, so on a fresh kernel this cell raises NameError.
random_index

568

In [6]:
# Take the i-th entry of every field in validation_dict to rebuild one listing.
i = 568
input_dict = {k: v[i] for k, v in validation_dict.items()}


In [7]:
input_dict

{'description': 'Bán đất Cổ Linh, gần ngã tư Thạch Bàn, Thế giới di động, Aeonmall khu vực đang phát triển từng ngày Diện tích 140 m2, MT 5,5 m có vỉa hè, đường ô tô tránh, cách mặt đường Cổ Linh khoảng 40 m thích hợp xây tòa nhà kinh doanh, đón đầu tương lai Sổ đỏ full thổ cư, pháp lý chuẩn, sẵn sàng giao dịch Vui lòng liên hệ để biết thêm thông tin chi tiết 0902116316',
 'estate_type': 'Đất',
 'square': 140.0,
 'price': 13300000000.0}

Best indices: [568, 28, 1041, 7498, 4791]

In [None]:
# random_index = np.random.randint(0, 10000)
# Scan the validation split one listing at a time and remember the indices
# whose predicted price is within 1% of the listed price.
selected_indices = []
for i in tqdm(range(len(validation_dict["price"]))):
  # Rebuild the i-th listing from the per-field lists.
  input_dict = {k: v[i] for k, v in validation_dict.items()}

  input_encoding = transform_input(tokenizer, input_dict)
  prices = predict(model, scaler, input_encoding)

  # predict returns nested lists; take the single value for this listing.
  price = prices[0][0]
  true_price = input_dict["price"]
  relative_error = np.abs(price - true_price)/true_price
  if relative_error <= 0.01:
    separator = "---------------------------"
    print(separator)
    print("True:", true_price)
    print("Pred:", price)
    print(separator)
    selected_indices.append(i)

  0%|          | 11/43961 [00:21<15:24:18,  1.26s/it]

---------------------------
True: 4000000000.0
Pred: 4008560128.0
---------------------------


  0%|          | 29/43961 [00:39<10:31:08,  1.16it/s]

---------------------------
True: 3000000000.0
Pred: 2989390336.0
---------------------------


  0%|          | 74/43961 [01:18<17:23:15,  1.43s/it]

---------------------------
True: 6500000000.0
Pred: 6564337152.0
---------------------------


  0%|          | 91/43961 [01:30<8:35:44,  1.42it/s]

---------------------------
True: 4800000000.0
Pred: 4837413888.0
---------------------------


  0%|          | 162/43961 [02:23<9:46:46,  1.24it/s]

---------------------------
True: 5000000000.0
Pred: 5022181888.0
---------------------------


  1%|          | 349/43961 [05:00<13:35:51,  1.12s/it]

---------------------------
True: 23000000000.0
Pred: 22937497600.0
---------------------------


  1%|          | 399/43961 [05:42<9:49:21,  1.23it/s]

---------------------------
True: 170000000000.0
Pred: 168603009024.0
---------------------------


  1%|          | 411/43961 [05:53<9:12:17,  1.31it/s]

---------------------------
True: 2550000000.0
Pred: 2558163456.0
---------------------------


  1%|          | 463/43961 [06:33<9:35:45,  1.26it/s]

---------------------------
True: 24000000000.0
Pred: 24106174464.0
---------------------------


  1%|▏         | 569/43961 [07:56<7:10:57,  1.68it/s]

---------------------------
True: 13300000000.0
Pred: 13202822144.0
---------------------------


  1%|▏         | 644/43961 [08:52<6:38:33,  1.81it/s]

---------------------------
True: 5800000000.0
Pred: 5777718784.0
---------------------------


  2%|▏         | 693/43961 [09:32<14:10:25,  1.18s/it]

---------------------------
True: 4500000000.0
Pred: 4534204928.0
---------------------------


  2%|▏         | 720/43961 [09:53<12:08:04,  1.01s/it]

---------------------------
True: 2700000000.0
Pred: 2686789888.0
---------------------------


  2%|▏         | 983/43961 [13:42<16:57:37,  1.42s/it]

---------------------------
True: 5500000000.0
Pred: 5462982656.0
---------------------------


  2%|▏         | 995/43961 [13:50<6:37:48,  1.80it/s]

---------------------------
True: 7900000000.0
Pred: 7883261440.0
---------------------------


  2%|▏         | 1074/43961 [14:51<9:34:08,  1.24it/s] 

---------------------------
True: 4500000000.0
Pred: 4480089088.0
---------------------------


  3%|▎         | 1237/43961 [17:11<9:03:11,  1.31it/s] 

---------------------------
True: 4000000000.0
Pred: 3992482816.0
---------------------------


  3%|▎         | 1297/43961 [17:58<6:23:28,  1.85it/s]

---------------------------
True: 5650000000.0
Pred: 5617570816.0
---------------------------


  3%|▎         | 1361/43961 [18:46<11:46:03,  1.01it/s]

---------------------------
True: 3360000000.0
Pred: 3340183296.0
---------------------------


  3%|▎         | 1415/43961 [19:35<8:55:49,  1.32it/s]

---------------------------
True: 31000000000.0
Pred: 30951317504.0
---------------------------


  3%|▎         | 1436/43961 [19:54<10:28:46,  1.13it/s]

---------------------------
True: 20000000000.0
Pred: 19995453440.0
---------------------------


  3%|▎         | 1473/43961 [20:21<8:14:48,  1.43it/s]

---------------------------
True: 3800000000.0
Pred: 3824655616.0
---------------------------


  3%|▎         | 1497/43961 [20:41<8:15:31,  1.43it/s]

---------------------------
True: 19500000000.0
Pred: 19471054848.0
---------------------------


  3%|▎         | 1510/43961 [20:49<8:54:12,  1.32it/s]

---------------------------
True: 3000000000.0
Pred: 2973901824.0
---------------------------


  3%|▎         | 1526/43961 [21:01<6:53:31,  1.71it/s]

---------------------------
True: 5500000000.0
Pred: 5495877632.0
---------------------------


  4%|▎         | 1602/43961 [22:05<11:01:30,  1.07it/s]

---------------------------
True: 3750000000.0
Pred: 3721140736.0
---------------------------


  4%|▎         | 1609/43961 [22:12<8:54:19,  1.32it/s]

---------------------------
True: 5900000000.0
Pred: 5849632768.0
---------------------------


  4%|▍         | 1672/43961 [23:10<8:24:29,  1.40it/s]

---------------------------
True: 3850000000.0
Pred: 3847373568.0
---------------------------


  4%|▍         | 1876/43961 [26:17<8:47:34,  1.33it/s] 

---------------------------
True: 11860000000.0
Pred: 11858387968.0
---------------------------


  4%|▍         | 1892/43961 [26:32<10:45:53,  1.09it/s]

---------------------------
True: 4200000000.0
Pred: 4159721216.0
---------------------------


  5%|▍         | 2158/43961 [30:07<7:50:03,  1.48it/s]

---------------------------
True: 6200000000.0
Pred: 6231059968.0
---------------------------


  5%|▍         | 2197/43961 [30:34<9:19:36,  1.24it/s] 

---------------------------
True: 2990000000.0
Pred: 2976481024.0
---------------------------


  5%|▌         | 2242/43961 [31:17<16:26:05,  1.42s/it]

---------------------------
True: 8800000000.0
Pred: 8758582272.0
---------------------------


  5%|▌         | 2293/43961 [32:04<13:04:55,  1.13s/it]

---------------------------
True: 17200000000.0
Pred: 17211746304.0
---------------------------


  5%|▌         | 2357/43961 [32:51<5:13:23,  2.21it/s]

---------------------------
True: 3900000000.0
Pred: 3868518144.0
---------------------------


  5%|▌         | 2381/43961 [33:11<10:54:17,  1.06it/s]

---------------------------
True: 8666000000.0
Pred: 8603628544.0
---------------------------


  6%|▌         | 2477/43961 [34:34<8:13:45,  1.40it/s]

---------------------------
True: 4900000000.0
Pred: 4858297856.0
---------------------------


  6%|▌         | 2517/43961 [35:05<8:31:24,  1.35it/s]

---------------------------
True: 13800000000.0
Pred: 13816620032.0
---------------------------


  6%|▌         | 2574/43961 [35:52<8:14:37,  1.39it/s]

---------------------------
True: 10000000000.0
Pred: 10003959808.0
---------------------------


  6%|▌         | 2734/43961 [38:04<7:26:54,  1.54it/s]

---------------------------
True: 2800000000.0
Pred: 2810214912.0
---------------------------


  7%|▋         | 2893/43961 [40:12<7:58:08,  1.43it/s]

---------------------------
True: 12500000000.0
Pred: 12426356736.0
---------------------------


  7%|▋         | 2961/43961 [41:05<7:26:04,  1.53it/s]

---------------------------
True: 3400000000.0
Pred: 3415140352.0
---------------------------


  7%|▋         | 3040/43961 [42:12<6:14:01,  1.82it/s]

---------------------------
True: 4550000000.0
Pred: 4575963648.0
---------------------------


  7%|▋         | 3046/43961 [42:16<8:10:28,  1.39it/s]

---------------------------
True: 3300000000.0
Pred: 3276962560.0
---------------------------


  7%|▋         | 3187/43961 [44:11<9:48:15,  1.16it/s] 

---------------------------
True: 4099999999.9999995
Pred: 4140971776.0
---------------------------


  7%|▋         | 3196/43961 [44:18<7:49:15,  1.45it/s]

---------------------------
True: 5600000000.0
Pred: 5596966912.0
---------------------------


  7%|▋         | 3223/43961 [44:42<9:52:26,  1.15it/s]

---------------------------
True: 7900000000.0
Pred: 7902704128.0
---------------------------


  7%|▋         | 3226/43961 [44:45<10:36:01,  1.07it/s]

---------------------------
True: 12200000000.0
Pred: 12316186624.0
---------------------------


  8%|▊         | 3306/43961 [45:57<13:55:52,  1.23s/it]

---------------------------
True: 3750000000.0
Pred: 3753227776.0
---------------------------


  8%|▊         | 3330/43961 [46:16<11:02:27,  1.02it/s]

---------------------------
True: 3750000000.0
Pred: 3753227776.0
---------------------------


  8%|▊         | 3379/43961 [46:57<8:25:47,  1.34it/s]

---------------------------
True: 7000000000.0
Pred: 7050596352.0
---------------------------


  8%|▊         | 3478/43961 [48:22<8:31:25,  1.32it/s]

---------------------------
True: 16000000000.0
Pred: 15879023616.0
---------------------------


  8%|▊         | 3480/43961 [48:23<8:24:54,  1.34it/s]

---------------------------
True: 13000000000.0
Pred: 13047723008.0
---------------------------


  8%|▊         | 3583/43961 [50:05<12:21:30,  1.10s/it]

---------------------------
True: 680000000.0
Pred: 674029440.0
---------------------------


  8%|▊         | 3629/43961 [50:51<10:26:38,  1.07it/s]

---------------------------
True: 5550000000.0
Pred: 5522883584.0
---------------------------


  9%|▊         | 3808/43961 [53:29<7:15:41,  1.54it/s]

---------------------------
True: 20000000000.0
Pred: 20188135424.0
---------------------------


  9%|▊         | 3844/43961 [54:06<9:26:24,  1.18it/s]

---------------------------
True: 4099999999.9999995
Pred: 4074428416.0
---------------------------


  9%|▉         | 3848/43961 [54:10<12:07:51,  1.09s/it]

---------------------------
True: 3400000000.0
Pred: 3387794688.0
---------------------------


  9%|▉         | 3867/43961 [54:27<11:01:37,  1.01it/s]

---------------------------
True: 6000000000.0
Pred: 6012317696.0
---------------------------


  9%|▉         | 3974/43961 [56:00<6:41:14,  1.66it/s]

---------------------------
True: 4000000000.0
Pred: 3984620544.0
---------------------------


  9%|▉         | 4045/43961 [57:00<10:18:23,  1.08it/s]

---------------------------
True: 6850000000.0
Pred: 6889608192.0
---------------------------


  9%|▉         | 4149/43961 [58:23<7:51:08,  1.41it/s]

---------------------------
True: 2750000000.0
Pred: 2750366208.0
---------------------------


  9%|▉         | 4159/43961 [58:33<11:31:12,  1.04s/it]

---------------------------
True: 18000000000.0
Pred: 17965172736.0
---------------------------


  9%|▉         | 4164/43961 [58:35<7:07:10,  1.55it/s]

---------------------------
True: 4000000000.0
Pred: 4033313024.0
---------------------------


 10%|▉         | 4235/43961 [59:35<8:49:12,  1.25it/s]

---------------------------
True: 11000000000.0
Pred: 11085683712.0
---------------------------


 10%|▉         | 4259/43961 [59:55<10:55:15,  1.01it/s]

---------------------------
True: 3500000000.0
Pred: 3498132736.0
---------------------------


 10%|▉         | 4271/43961 [1:00:07<9:04:15,  1.22it/s]

---------------------------
True: 4700000000.0
Pred: 4741073920.0
---------------------------


 10%|█         | 4401/43961 [1:01:53<8:58:13,  1.23it/s]

---------------------------
True: 12500000000.0
Pred: 12593486848.0
---------------------------


 10%|█         | 4407/43961 [1:01:58<8:24:53,  1.31it/s]

---------------------------
True: 4000000000.0
Pred: 4027470336.0
---------------------------


 10%|█         | 4424/43961 [1:02:17<11:34:36,  1.05s/it]

---------------------------
True: 6200000000.0
Pred: 6216631296.0
---------------------------


 10%|█         | 4507/43961 [1:03:33<8:49:48,  1.24it/s]

---------------------------
True: 4500000000.0
Pred: 4515438080.0
---------------------------


 11%|█         | 4623/43961 [1:05:10<9:37:40,  1.13it/s]

---------------------------
True: 11600000000.0
Pred: 11501065216.0
---------------------------


 11%|█         | 4682/43961 [1:06:01<12:30:48,  1.15s/it]

---------------------------
True: 14700000000.0
Pred: 14633508864.0
---------------------------


 11%|█         | 4743/43961 [1:06:52<8:47:30,  1.24it/s]

---------------------------
True: 3700000000.0
Pred: 3681820416.0
---------------------------


 11%|█         | 4767/43961 [1:07:14<13:07:10,  1.21s/it]

---------------------------
True: 4500000000.0
Pred: 4460597760.0
---------------------------


 11%|█         | 4795/43961 [1:07:38<7:48:29,  1.39it/s]

---------------------------
True: 25000000000.0
Pred: 25172846592.0
---------------------------


 11%|█         | 4911/43961 [1:09:06<8:35:31,  1.26it/s]

---------------------------
True: 10900000000.0
Pred: 10942487552.0
---------------------------


 11%|█▏        | 4954/43961 [1:09:42<9:12:40,  1.18it/s] 

---------------------------
True: 4500000000.0
Pred: 4500395520.0
---------------------------


 11%|█▏        | 4972/43961 [1:09:58<7:34:04,  1.43it/s]

---------------------------
True: 22000000000.0
Pred: 22098952192.0
---------------------------


 11%|█▏        | 5049/43961 [1:10:55<6:37:42,  1.63it/s]

---------------------------
True: 10000000000.0
Pred: 9902178304.0
---------------------------


 12%|█▏        | 5088/43961 [1:11:32<7:13:12,  1.50it/s]

---------------------------
True: 3800000000.0
Pred: 3785930752.0
---------------------------


 12%|█▏        | 5113/43961 [1:11:53<6:43:36,  1.60it/s]

---------------------------
True: 8900000000.0
Pred: 8817087488.0
---------------------------


 12%|█▏        | 5123/43961 [1:12:03<10:05:07,  1.07it/s]

---------------------------
True: 5500000000.0
Pred: 5522167296.0
---------------------------


 12%|█▏        | 5366/43961 [1:15:31<11:27:55,  1.07s/it]

---------------------------
True: 5350000000.0
Pred: 5392651776.0
---------------------------


 12%|█▏        | 5418/43961 [1:16:15<7:52:11,  1.36it/s]

---------------------------
True: 5000000000.0
Pred: 5035303424.0
---------------------------


 12%|█▏        | 5437/43961 [1:16:32<7:44:36,  1.38it/s]

---------------------------
True: 3550000000.0
Pred: 3550273536.0
---------------------------


 13%|█▎        | 5532/43961 [1:17:56<10:44:44,  1.01s/it]

---------------------------
True: 3200000000.0
Pred: 3218874624.0
---------------------------


 13%|█▎        | 5558/43961 [1:18:20<10:21:08,  1.03it/s]

---------------------------
True: 4540000000.0
Pred: 4552268288.0
---------------------------


 13%|█▎        | 5634/43961 [1:19:31<8:28:04,  1.26it/s]

---------------------------
True: 6900000000.0
Pred: 6955680256.0
---------------------------


 13%|█▎        | 5655/43961 [1:19:50<9:39:55,  1.10it/s] 

---------------------------
True: 5800000000.0
Pred: 5782023680.0
---------------------------


 13%|█▎        | 5656/43961 [1:19:51<8:16:15,  1.29it/s]

---------------------------
True: 4099999999.9999995
Pred: 4115782144.0
---------------------------


 13%|█▎        | 5671/43961 [1:20:10<11:15:51,  1.06s/it]

---------------------------
True: 4500000000.0
Pred: 4485463040.0
---------------------------


 13%|█▎        | 5728/43961 [1:20:54<8:12:51,  1.29it/s]

---------------------------
True: 6500000000.0
Pred: 6548329984.0
---------------------------


 13%|█▎        | 5761/43961 [1:21:22<6:34:20,  1.61it/s]

---------------------------
True: 4800000000.0
Pred: 4755994624.0
---------------------------


 13%|█▎        | 5804/43961 [1:21:58<6:55:10,  1.53it/s]

---------------------------
True: 5200000000.0
Pred: 5227784192.0
---------------------------


 13%|█▎        | 5850/43961 [1:22:35<6:58:51,  1.52it/s]

---------------------------
True: 5000000000.0
Pred: 4972612608.0
---------------------------


 14%|█▎        | 5948/43961 [1:24:00<8:33:24,  1.23it/s]

---------------------------
True: 3150000000.0
Pred: 3137979136.0
---------------------------


 14%|█▎        | 5962/43961 [1:24:10<8:46:14,  1.20it/s]

---------------------------
True: 4200000000.0
Pred: 4214349568.0
---------------------------


 14%|█▎        | 6010/43961 [1:24:54<7:56:02,  1.33it/s]

---------------------------
True: 9700000000.0
Pred: 9631978496.0
---------------------------


 14%|█▍        | 6063/43961 [1:25:41<10:54:38,  1.04s/it]

---------------------------
True: 5700000000.0
Pred: 5646306304.0
---------------------------


 14%|█▍        | 6128/43961 [1:26:46<8:38:15,  1.22it/s]

---------------------------
True: 4400000000.0
Pred: 4427536384.0
---------------------------


 14%|█▍        | 6176/43961 [1:27:30<7:10:19,  1.46it/s]

---------------------------
True: 70000000000.0
Pred: 70073638912.0
---------------------------


 14%|█▍        | 6231/43961 [1:28:12<6:09:30,  1.70it/s]

---------------------------
True: 1456000000.0
Pred: 1446232576.0
---------------------------


 14%|█▍        | 6313/43961 [1:29:24<8:00:30,  1.31it/s]

---------------------------
True: 5200000000.0
Pred: 5200717824.0
---------------------------


 14%|█▍        | 6343/43961 [1:29:53<8:41:36,  1.20it/s]

---------------------------
True: 4650000000.0
Pred: 4632373248.0
---------------------------


 15%|█▍        | 6441/43961 [1:31:04<6:57:23,  1.50it/s]

---------------------------
True: 12000000000.0
Pred: 12115440640.0
---------------------------


 15%|█▍        | 6455/43961 [1:31:11<4:28:25,  2.33it/s]

---------------------------
True: 7500000000.0
Pred: 7447503872.0
---------------------------


 15%|█▍        | 6567/43961 [1:32:35<5:25:29,  1.91it/s]

---------------------------
True: 2750000000.0
Pred: 2751456512.0
---------------------------


 15%|█▍        | 6569/43961 [1:32:36<5:19:24,  1.95it/s]

---------------------------
True: 4000000000.0
Pred: 3976670976.0
---------------------------


 15%|█▌        | 6678/43961 [1:34:07<6:31:17,  1.59it/s]

---------------------------
True: 1800000000.0
Pred: 1808301056.0
---------------------------


 15%|█▌        | 6715/43961 [1:34:35<10:13:58,  1.01it/s]

---------------------------
True: 7900000000.0
Pred: 7826771456.0
---------------------------


 16%|█▌        | 6829/43961 [1:36:07<6:34:08,  1.57it/s]

---------------------------
True: 3900000000.0
Pred: 3894343936.0
---------------------------


 16%|█▌        | 7026/43961 [1:38:48<4:35:19,  2.24it/s]

---------------------------
True: 2369000000.0
Pred: 2348080640.0
---------------------------


 16%|█▌        | 7055/43961 [1:39:12<10:17:35,  1.00s/it]

---------------------------
True: 4600000000.0
Pred: 4572360704.0
---------------------------


 16%|█▌        | 7112/43961 [1:39:59<6:23:48,  1.60it/s]

---------------------------
True: 6000000000.0
Pred: 6003116544.0
---------------------------


 16%|█▋        | 7182/43961 [1:40:54<8:28:23,  1.21it/s]

---------------------------
True: 10500000000.0
Pred: 10471769088.0
---------------------------


 16%|█▋        | 7225/43961 [1:41:27<7:38:09,  1.34it/s]

---------------------------
True: 23500000000.0
Pred: 23549057024.0
---------------------------


 17%|█▋        | 7333/43961 [1:42:54<4:50:20,  2.10it/s]

---------------------------
True: 4630000000.0
Pred: 4630076416.0
---------------------------


 17%|█▋        | 7412/43961 [1:44:06<5:58:32,  1.70it/s]

---------------------------
True: 1200000000.0
Pred: 1194705152.0
---------------------------


 17%|█▋        | 7453/43961 [1:44:39<6:54:18,  1.47it/s]

---------------------------
True: 5500000000.0
Pred: 5553220096.0
---------------------------


 17%|█▋        | 7530/43961 [1:45:39<9:01:25,  1.12it/s]

---------------------------
True: 2400000000.0
Pred: 2415554560.0
---------------------------


 17%|█▋        | 7556/43961 [1:46:08<10:21:07,  1.02s/it]

---------------------------
True: 15800000000.0
Pred: 15802824704.0
---------------------------


 17%|█▋        | 7606/43961 [1:46:46<8:15:40,  1.22it/s]

---------------------------
True: 6800000000.0
Pred: 6745303552.0
---------------------------


 17%|█▋        | 7633/43961 [1:47:07<10:51:57,  1.08s/it]

---------------------------
True: 5200000000.0
Pred: 5164248064.0
---------------------------


 18%|█▊        | 7865/43961 [1:50:22<8:10:55,  1.23it/s]

---------------------------
True: 8000000000.0
Pred: 7921485824.0
---------------------------


 18%|█▊        | 7890/43961 [1:50:44<6:23:55,  1.57it/s]

---------------------------
True: 3500000000.0
Pred: 3467234560.0
---------------------------


 18%|█▊        | 7912/43961 [1:51:04<10:47:40,  1.08s/it]

---------------------------
True: 12000000000.0
Pred: 12115300352.0
---------------------------


 18%|█▊        | 7940/43961 [1:51:28<9:37:51,  1.04it/s]

---------------------------
True: 10500000000.0
Pred: 10604102656.0
---------------------------


 18%|█▊        | 8049/43961 [1:52:57<6:28:13,  1.54it/s]

---------------------------
True: 7900000000.0
Pred: 7899803648.0
---------------------------


 19%|█▊        | 8184/43961 [1:54:51<7:02:17,  1.41it/s]

---------------------------
True: 5600000000.0
Pred: 5575875072.0
---------------------------


 19%|█▊        | 8185/43961 [1:54:51<7:03:21,  1.41it/s]

---------------------------
True: 15500000000.0
Pred: 15607653376.0
---------------------------


 19%|█▉        | 8317/43961 [1:57:01<8:22:09,  1.18it/s]

---------------------------
True: 4500000000.0
Pred: 4484599296.0
---------------------------


 19%|█▉        | 8364/43961 [1:57:41<9:13:42,  1.07it/s] 

---------------------------
True: 5300000000.0
Pred: 5251048960.0
---------------------------


 19%|█▉        | 8369/43961 [1:57:44<6:09:27,  1.61it/s]

---------------------------
True: 620000000.0
Pred: 615733568.0
---------------------------


 19%|█▉        | 8522/43961 [1:59:53<6:32:14,  1.51it/s]

---------------------------
True: 5300000000.0
Pred: 5319172608.0
---------------------------


 20%|█▉        | 8643/43961 [2:01:37<6:50:03,  1.44it/s]

---------------------------
True: 7200000000.0
Pred: 7228453376.0
---------------------------


 20%|█▉        | 8689/43961 [2:02:16<7:40:34,  1.28it/s]

---------------------------
True: 3600000000.0
Pred: 3629924608.0
---------------------------


 20%|█▉        | 8725/43961 [2:02:46<7:21:48,  1.33it/s]

---------------------------
True: 3700000000.0
Pred: 3710293760.0
---------------------------


 20%|█▉        | 8756/43961 [2:03:11<7:13:17,  1.35it/s]

---------------------------
True: 2000000000.0
Pred: 1982348800.0
---------------------------


 20%|█▉        | 8763/43961 [2:03:18<9:50:03,  1.01s/it] 

---------------------------
True: 5900000000.0
Pred: 5876914688.0
---------------------------


 20%|██        | 8810/43961 [2:03:59<6:38:02,  1.47it/s]

---------------------------
True: 1900000000.0
Pred: 1902205824.0
---------------------------


 20%|██        | 8825/43961 [2:04:14<12:00:04,  1.23s/it]

---------------------------
True: 18500000000.0
Pred: 18382032896.0
---------------------------


 20%|██        | 9003/43961 [2:06:40<10:07:30,  1.04s/it]

---------------------------
True: 11800000000.0
Pred: 11802157056.0
---------------------------


 21%|██        | 9030/43961 [2:06:59<5:26:05,  1.79it/s]

---------------------------
True: 18600000000.0
Pred: 18647341056.0
---------------------------


 21%|██        | 9077/43961 [2:07:46<9:59:55,  1.03s/it] 

---------------------------
True: 3200000000.0
Pred: 3180982016.0
---------------------------


 21%|██        | 9104/43961 [2:08:04<5:26:05,  1.78it/s]

---------------------------
True: 4650000000.0
Pred: 4640893952.0
---------------------------


 21%|██        | 9145/43961 [2:08:38<10:41:45,  1.11s/it]

---------------------------
True: 21800000000.0
Pred: 21718132736.0
---------------------------


 21%|██        | 9174/43961 [2:09:09<11:40:23,  1.21s/it]

---------------------------
True: 5000000000.0
Pred: 5031208960.0
---------------------------


 21%|██        | 9298/43961 [2:11:04<8:14:39,  1.17it/s]

---------------------------
True: 4800000000.0
Pred: 4773872128.0
---------------------------


 21%|██        | 9327/43961 [2:11:30<9:33:03,  1.01it/s]

---------------------------
True: 2850000000.0
Pred: 2854936320.0
---------------------------


 21%|██▏       | 9418/43961 [2:12:54<6:58:42,  1.37it/s]

---------------------------
True: 3700000000.0
Pred: 3673921280.0
---------------------------


 21%|██▏       | 9434/43961 [2:13:07<5:21:58,  1.79it/s]

---------------------------
True: 5000000000.0
Pred: 5043083776.0
---------------------------


 22%|██▏       | 9544/43961 [2:14:48<7:49:15,  1.22it/s] 

---------------------------
True: 4200000000.0
Pred: 4219159296.0
---------------------------


 22%|██▏       | 9650/43961 [2:16:21<7:28:16,  1.28it/s]

---------------------------
True: 3650000000.0
Pred: 3639154688.0
---------------------------


 22%|██▏       | 9677/43961 [2:16:53<13:42:45,  1.44s/it]

---------------------------
True: 16850000000.000002
Pred: 16776904704.0
---------------------------


 22%|██▏       | 9839/43961 [2:19:38<9:29:38,  1.00s/it]

---------------------------
True: 3250000000.0
Pred: 3280004608.0
---------------------------


 22%|██▏       | 9871/43961 [2:20:04<7:39:43,  1.24it/s]

---------------------------
True: 4600000000.0
Pred: 4556563968.0
---------------------------


 23%|██▎       | 9905/43961 [2:20:36<7:12:48,  1.31it/s]

---------------------------
True: 2750000000.0
Pred: 2734457088.0
---------------------------


 23%|██▎       | 10098/43961 [2:23:29<9:12:04,  1.02it/s] 

---------------------------
True: 820000000.0
Pred: 823994048.0
---------------------------


 23%|██▎       | 10106/43961 [2:23:35<7:24:17,  1.27it/s]

---------------------------
True: 2049999999.9999998
Pred: 2059621632.0
---------------------------


 23%|██▎       | 10197/43961 [2:24:56<9:57:54,  1.06s/it] 

---------------------------
True: 3700000000.0
Pred: 3667703808.0
---------------------------


 23%|██▎       | 10253/43961 [2:25:52<10:36:42,  1.13s/it]

---------------------------
True: 7700000000.0
Pred: 7756980224.0
---------------------------


 23%|██▎       | 10295/43961 [2:26:27<9:30:21,  1.02s/it] 

---------------------------
True: 3800000000.0
Pred: 3803072768.0
---------------------------


 24%|██▎       | 10341/43961 [2:27:06<5:41:51,  1.64it/s]

---------------------------
True: 4850000000.0
Pred: 4861110784.0
---------------------------


 24%|██▎       | 10392/43961 [2:27:52<9:17:52,  1.00it/s]

---------------------------
True: 2650000000.0
Pred: 2674589184.0
---------------------------


 24%|██▎       | 10406/43961 [2:28:06<9:38:15,  1.03s/it]

---------------------------
True: 4500000000.0
Pred: 4532553216.0
---------------------------


 24%|██▍       | 10476/43961 [2:29:13<6:00:36,  1.55it/s]

---------------------------
True: 4400000000.0
Pred: 4368498176.0
---------------------------


 24%|██▍       | 10614/43961 [2:31:34<10:23:48,  1.12s/it]

---------------------------
True: 5200000000.0
Pred: 5228816384.0
---------------------------


 24%|██▍       | 10668/43961 [2:32:27<7:30:35,  1.23it/s]

---------------------------
True: 2700000000.0
Pred: 2704523008.0
---------------------------


 25%|██▍       | 10811/43961 [2:35:03<8:05:43,  1.14it/s]

---------------------------
True: 1310000000.0
Pred: 1303383168.0
---------------------------


 25%|██▍       | 10949/43961 [2:37:11<9:57:45,  1.09s/it] 

---------------------------
True: 2200000000.0
Pred: 2216937984.0
---------------------------


 25%|██▌       | 10992/43961 [2:37:53<10:16:06,  1.12s/it]

---------------------------
True: 4420000000.0
Pred: 4403494912.0
---------------------------


 25%|██▌       | 11014/43961 [2:38:14<10:28:38,  1.14s/it]

---------------------------
True: 6800000000.0
Pred: 6773544448.0
---------------------------


 25%|██▌       | 11059/43961 [2:38:53<7:30:43,  1.22it/s]

---------------------------
True: 3950000000.0
Pred: 3924839424.0
---------------------------


 25%|██▌       | 11183/43961 [2:41:08<8:45:24,  1.04it/s]

---------------------------
True: 5050000000.0
Pred: 5033517568.0
---------------------------


 25%|██▌       | 11198/43961 [2:41:23<8:28:35,  1.07it/s]

---------------------------
True: 3200000000.0
Pred: 3202016512.0
---------------------------


 26%|██▌       | 11231/43961 [2:41:52<10:41:06,  1.18s/it]

---------------------------
True: 15000000000.0
Pred: 15001790464.0
---------------------------


 26%|██▌       | 11236/43961 [2:41:57<7:26:06,  1.22it/s]

---------------------------
True: 2900000000.0
Pred: 2895099136.0
---------------------------


 26%|██▌       | 11290/43961 [2:42:42<7:10:48,  1.26it/s]

---------------------------
True: 10500000000.0
Pred: 10591953920.0
---------------------------


 26%|██▌       | 11457/43961 [2:45:18<6:27:50,  1.40it/s]

---------------------------
True: 10900000000.0
Pred: 10996466688.0
---------------------------


 26%|██▌       | 11464/43961 [2:45:25<9:25:59,  1.05s/it]

---------------------------
True: 11000000000.0
Pred: 10985650176.0
---------------------------


 26%|██▌       | 11476/43961 [2:45:36<8:22:32,  1.08it/s]

---------------------------
True: 5350000000.0
Pred: 5302453248.0
---------------------------


 26%|██▋       | 11558/43961 [2:46:54<8:54:40,  1.01it/s]

---------------------------
True: 3000000000.0
Pred: 3018447360.0
---------------------------


 27%|██▋       | 11672/43961 [2:48:48<8:42:49,  1.03it/s]

---------------------------
True: 3000000000.0
Pred: 3001983488.0
---------------------------


 27%|██▋       | 11761/43961 [2:50:30<8:33:37,  1.04it/s]

---------------------------
True: 7000000000.0
Pred: 6996474880.0
---------------------------


 27%|██▋       | 11856/43961 [2:52:05<7:44:24,  1.15it/s]

---------------------------
True: 4650000000.0
Pred: 4678880768.0
---------------------------


 27%|██▋       | 11967/43961 [2:54:10<7:45:39,  1.15it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-128-d662d4c4bba5>", line 12, in <cell line: 3>
    prices = predict(model, scaler, input_encoding)
  File "<ipython-input-14-6bca2d629542>", line 4, in predict
    logits = model(**input_encoding)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/AI/DS/modeling.py", line 43, in forward
    encoder_outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._

In [None]:
# Save the list of selected validation indices to disk.
# NOTE(review): /content is outside the Drive directory chosen by the %cd
# cell -- confirm this file is not meant to live next to the project data.
torch.save(selected_indices, "/content/selected_indices.pt")

In [None]:
np.abs(9_445_432_576 - 9_345_432_576)/9345432576

0.010700414259775406

In [None]:
selected_indices

In [None]:
# Predict the price for whatever listing `input_dict` currently holds
# (the name is reassigned by several earlier cells).
input_encoding = transform_input(tokenizer, input_dict)
prices = predict(model, scaler, input_encoding)

prices

[[6749865472.0]]

In [None]:
np.exp(np.log(8199999999/1000000))

8199.999999000005

In [None]:
np.log(7229101056/1000000)

8.88586997215283

In [None]:
Ground Truth: 15000000000
Prediction  : 18185609216

In [None]:
def normalized_label(scaler, labels, mode=None):
  """Scale raw labels with ``scaler`` unless the split is the test split.

  Args:
    scaler: object exposing ``transform``; it is handed the labels as an
      array with an extra axis inserted at position 1.
    labels: sequence of raw label values.
    mode: when equal to ``"test"`` no scaling is performed.

  Returns:
    A list holding one ``None`` per label when ``mode == "test"``; otherwise
    whatever ``scaler.transform`` returns for the reshaped labels.
  """
  if mode != "test":
    # Insert a second axis so a 1-D label vector becomes a single-column array.
    label_column = np.asarray(labels)[:, np.newaxis]
    return scaler.transform(label_column)

  # Test split: one placeholder per label instead of a scaled value.
  return [None for _ in range(len(labels))]

def merge_and_encode_input(tokenizer, data):
  """Build one text prompt per row of ``data`` and tokenize it.

  Args:
    tokenizer: callable tokenizer exposing ``eos_token``; its result must
      expose ``input_ids``.
    data: column-oriented mapping with the keys ``"price"``,
      ``"estate_type"``, ``"square"`` and ``"description"``.

  Returns:
    A list with the ``input_ids`` of every row, in row order.
  """
  def encode_row(row_idx):
    # The three fields are joined with the tokenizer's EOS token as separator.
    merged_text = f"Loại: {data['estate_type'][row_idx]}{tokenizer.eos_token}Diện tích: {data['square'][row_idx]}{tokenizer.eos_token}Mô tả: {data['description'][row_idx]}"
    return tokenizer(merged_text, return_attention_mask=False).input_ids

  # The "price" column is only used to determine how many rows to encode.
  return [encode_row(row_idx) for row_idx in tqdm(range(len(data["price"])))]

def filter_length(data, max_ids_length):
  """Drop every example whose token-id sequence exceeds ``max_ids_length``.

  Args:
    data: mapping with parallel ``"input_ids"`` and ``"labels"`` sequences.
    max_ids_length: largest allowed ``len(input_ids)`` (inclusive).

  Returns:
    ``Dataset.from_dict`` applied to the surviving ``input_ids`` / ``labels``.

  NOTE(review): the ``from datasets import Dataset`` line in the notebook's
  import cell is commented out — confirm ``Dataset`` is in scope before calling.
  """
  kept = {"input_ids": [], "labels": []}

  for token_ids, target in tqdm(zip(data["input_ids"], data["labels"])):
    if len(token_ids) > max_ids_length:
      continue  # too long for the model's input budget
    kept["input_ids"].append(token_ids)
    kept["labels"].append(target)

  return Dataset.from_dict(kept)


def process_data(tokenizer, data, max_ids_length=1024, mode=None):
  """Tokenize one data split and pair it with its (optionally scaled) labels.

  Args:
    tokenizer: tokenizer forwarded to ``merge_and_encode_input``.
    data: column-oriented mapping of the split; ``data["price"]`` supplies
      the labels.
    max_ids_length: examples with more token ids than this are discarded.
    mode: forwarded to ``normalized_label`` (``"test"`` skips label scaling).

  Returns:
    ``{"scaler": <loaded scaler>, "data": <length-filtered dataset>}``.
  """
  # The scaler is read from "scaler.pt" in the current working directory.
  # NOTE(review): torch.load unpickles the file — only load a trusted scaler.pt.
  scaler = torch.load("scaler.pt")

  token_ids = merge_and_encode_input(tokenizer, data)
  targets = normalized_label(scaler, data["price"], mode)
  dataset = filter_length({"input_ids": token_ids, "labels": targets}, max_ids_length)

  return dict(scaler=scaler, data=dataset)


In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

In [None]:
# process_data handles ONE split per call: (tokenizer, data, max_ids_length, mode).
# Passing all three dictionaries positionally would bind validation_dict to
# max_ids_length and test_dict to mode, so each split is processed separately.
train_split = process_data(tokenizer, train_dict)
validation_split = process_data(tokenizer, validation_dict)
# mode="test" makes normalized_label skip scaling and emit None labels;
# NOTE(review): drop it if test_dict carries real prices that should be scaled.
test_split = process_data(tokenizer, test_dict, mode="test")

# Keep the "scaler" key at the top level so later cells can still read
# scaled_data["scaler"]; every call loads the same scaler.pt.
scaled_data = {
  "scaler": train_split["scaler"],
  "train": train_split["data"],
  "validation": validation_split["data"],
  "test": test_split["data"],
}

100%|██████████| 199911/199911 [03:06<00:00, 1069.61it/s]
199911it [00:00, 1103547.47it/s]
100%|██████████| 43961/43961 [01:06<00:00, 656.83it/s]
43961it [00:00, 308922.76it/s]
100%|██████████| 6924/6924 [00:09<00:00, 746.00it/s]
6924it [00:00, 371467.91it/s]


In [None]:
# Write the scaler object to "scaler.pt" in the current working directory.
# NOTE(review): process_data, as defined earlier in this notebook, loads its
# scaler from this same file — confirm this save is still needed.
torch.save(scaled_data["scaler"], "scaler.pt")

In [None]:
scaled_data["scaler"]

NameError: ignored

In [9]:
from flask import Flask, request, jsonify
import torch
import torch.nn as nn
from torchvision import transforms
# from your_data_preprocessing_module import preprocess_input_data, postprocess_output

app = Flask(__name__)

# Define your PyTorch model
# class YourModel(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(YourModel, self).__init__()
#         # Define your neural network architecture here

#     def forward(self, x):
#         # Define the forward pass of your neural network here
#         return x

# # Initialise the model
# model = YourModel(input_size, hidden_size, output_size)
# model.load_state_dict(torch.load('your_model_path.pth'))  # Path to the saved weights

# # Switch the model to evaluation mode
# model.eval()

@app.route('/predict', methods=['POST'])
def predict():
    """Flask endpoint: estimate a price from a JSON description of a listing.

    Expects a JSON object with the keys ``type``, ``area`` and ``description``.
    They are mapped onto the ``estate_type`` / ``square`` / ``description``
    fields that the notebook's ``transform_input`` helper formats into a prompt.

    Returns:
        ``{"price": <float>}`` on success, or ``{"error": <message>}`` with
        HTTP status 400 when the body is not a JSON object or a field is missing.

    NOTE(review): this definition rebinds the notebook-level name ``predict``.
    The inference steps are therefore inlined here instead of calling the
    earlier ``predict(model, scaler, input_encoding)`` helper, and cells that
    use that helper must run before this one.
    """
    # silent=True: a malformed or non-JSON body yields None instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    missing = [key for key in ('type', 'area', 'description') if key not in data]
    if missing:
        return jsonify({'error': f"Missing field(s): {', '.join(missing)}"}), 400

    # Preprocess the request with the same prompt format used elsewhere in the notebook.
    input_encoding = transform_input(tokenizer, {
        'estate_type': data['type'],
        'square': data['area'],
        'description': data['description'],
    })

    # Predict the (scaled) price without tracking gradients.
    model.eval()
    with torch.no_grad():
        logits = model(**input_encoding)

    # Move to CPU before converting to NumPy, then undo the label scaling.
    prices = scaler.invert(logits.cpu().numpy())

    # One request carries one listing: return a plain float so it is JSON-serialisable.
    output_price = float(np.asarray(prices).ravel()[0])

    return jsonify({'price': output_price})

if __name__ == '__main__':
    import threading

    print("run")
    # app.run() blocks until interrupted; inside a notebook that freezes the
    # cell, so no later cell can send a request while the server is alive.
    # Serving from a daemon thread keeps the kernel free. The reloader is
    # disabled explicitly because it cannot run outside the main thread.
    # Re-running this cell while the server is up will fail to bind port 5000.
    server_thread = threading.Thread(
        target=app.run,
        kwargs={'port': 5000, 'debug': False, 'use_reloader': False},
        daemon=True,
    )
    server_thread.start()

# List open network sockets; -P and -n keep port numbers and addresses numeric.
!sudo lsof -i -P -n

run
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


COMMAND   PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
node        7 root   21u  IPv6  18971      0t0  TCP *:8080 (LISTEN)
node        7 root   27u  IPv4  24222      0t0  TCP 172.28.0.12:52088->172.28.0.12:6000 (ESTABLISHED)
node        7 root   28u  IPv6 108026      0t0  TCP 172.28.0.12:8080->172.28.0.1:57554 (ESTABLISHED)
kernel_ma  21 root    3u  IPv4  24223      0t0  TCP 172.28.0.12:6000->172.28.0.12:52088 (ESTABLISHED)
kernel_ma  21 root    7u  IPv4  18503      0t0  TCP 172.28.0.12:6000 (LISTEN)
kernel_ma  21 root    8u  IPv4  18249      0t0  TCP 172.28.0.12:57822->172.28.0.12:9000 (ESTABLISHED)
kernel_ma  21 root    9u  IPv4  25065      0t0  TCP 172.28.0.12:47156->172.28.0.12:9000 (ESTABLISHED)
kernel_ma  21 root   10u  IPv4  27078      0t0  TCP 172.28.0.12:6000->172.28.0.1:56130 (ESTABLISHED)
colab-fil  59 root    3u  IPv4  18069      0t0  TCP 127.0.0.1:3453 (LISTEN)
jupyter-n  80 root    7u  IPv4  18239      0t0  TCP 172.28.0.12:9000 (LISTEN)
jupyter-n  80 root    8u  IPv4  1

In [8]:
# List open network sockets; -P and -n keep port numbers and addresses numeric.
!sudo lsof -i -P -n

COMMAND   PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
node        7 root   21u  IPv6  18971      0t0  TCP *:8080 (LISTEN)
node        7 root   26u  IPv6 104791      0t0  TCP 172.28.0.12:8080->172.28.0.1:47648 (ESTABLISHED)
node        7 root   27u  IPv4  24222      0t0  TCP 172.28.0.12:52088->172.28.0.12:6000 (ESTABLISHED)
node        7 root   28u  IPv6 104914      0t0  TCP 172.28.0.12:8080->172.28.0.1:47662 (ESTABLISHED)
kernel_ma  21 root    3u  IPv4  24223      0t0  TCP 172.28.0.12:6000->172.28.0.12:52088 (ESTABLISHED)
kernel_ma  21 root    7u  IPv4  18503      0t0  TCP 172.28.0.12:6000 (LISTEN)
kernel_ma  21 root    8u  IPv4  18249      0t0  TCP 172.28.0.12:57822->172.28.0.12:9000 (ESTABLISHED)
kernel_ma  21 root    9u  IPv4  25065      0t0  TCP 172.28.0.12:47156->172.28.0.12:9000 (ESTABLISHED)
kernel_ma  21 root   10u  IPv4  27078      0t0  TCP 172.28.0.12:6000->172.28.0.1:56130 (ESTABLISHED)
colab-fil  59 root    3u  IPv4  18069      0t0  TCP 127.0.0.1:3453 (LISTEN)
jupyter-n 

In [7]:
!curl https://localhost:5000

curl: (7) Failed to connect to localhost port 5000 after 0 ms: Connection refused
