### 1.	Crawl dữ liệu từ các trang báo điện tử Việt Nam sử dụng BeautifulSoup và Requests.<br />Ví dụ thu thập tiêu đề, mô tả ngắn &rarr; làm dữ liệu cho bài toán phân lớp.
- Import thư viện
- BeautifulSoup là thư viện Python nhằm lấy dữ liệu từ file html, xml

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://vnexpress.net/khanh-thanh-khoi-cong-5-du-an-trong-diem-o-tp-hcm-4876018.html'
response = requests.get(url)

- Lấy file html của web

In [4]:
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
soup.title.text

'Khánh thành, khởi công 5 dự án trọng điểm ở TP HCM - Báo VnExpress'

- class_ để không nhầm lẫn giữa các class của python

In [5]:
soup.find('p', class_='description')

<p class="description">Nhà ga T3 Tân Sơn Nhất, đường kết nối và 20 km cao tốc Bến Lức - Long Thành được đưa vào khai thác; hai đoạn Vành đai 2 TP HCM được khởi công, sáng 19/4.</p>

### 2.	Thu thập dữ liệu từ các nguồn có sẵn (VNTC, UIT-VSFC, ...)

In [1]:
import os
import torch
from typing import List, Tuple, Dict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
TRAIN_PATH = "/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full"
TEST_PATH = "/Users/quytien/VNTC/Data/10Topics/Ver1.1/Test_Full"

In [3]:
category2id = {category: idx for idx, category in enumerate(os.listdir(TRAIN_PATH))}
category2id

{'Van hoa': 0,
 'The gioi': 1,
 'Khoa hoc': 2,
 'Suc khoe': 3,
 'Chinh tri Xa hoi': 4,
 'Vi tinh': 5,
 'Kinh doanh': 6,
 'The thao': 7,
 'Phap luat': 8,
 'Doi song': 9}

In [4]:
data = []
def read_file(category, file_name):
    text_path = os.path.join(TRAIN_PATH, category, file_name)
    with open(text_path, "r", encoding = 'utf-16') as file:
        content = file.read()
    return (content, category2id[category])
print("TRAIN:")
for category in os.listdir(TRAIN_PATH):
    category_path = os.path.join(TRAIN_PATH, category)
    print(category_path)
    data.extend([ read_file(category, file_name) for file_name in os.listdir(category_path)])
test_data = []
print("TEST:")
for category in os.listdir(TEST_PATH):
    category_path = os.path.join(TRAIN_PATH, category)
    print(category_path)
    test_data.extend([ read_file(category, file_name) for file_name in os.listdir(category_path)])

TRAIN:
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Van hoa
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/The gioi
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Khoa hoc
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Suc khoe
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Chinh tri Xa hoi
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Vi tinh
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Kinh doanh
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/The thao
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Phap luat
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Doi song
TEST:
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Van hoa
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/The gioi
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Khoa hoc
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Suc khoe
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Chinh tri Xa hoi
/Users/quytien/VNTC/Data/10Topics/Ver1.1/Train_Full/Vi t

In [5]:
class VNTCDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

In [6]:
train_data, valid_data = train_test_split(data, test_size = 0.1)

In [7]:
vntc_train_data = VNTCDataset(train_data)
vntc_valid_data = VNTCDataset(valid_data)
vntc_test_data = VNTCDataset(test_data)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [9]:
print("Model max_length: ", tokenizer.model_max_length)

Model max_length:  1000000000000000019884624838656


In [10]:
def collate_fn(data: List[Tuple[str, int]]):
    texts = []
    labels = []
    for content, label in data:
        texts.append(content)
        labels.append(label)    
    input_ids = tokenizer.batch_encode_plus(texts, padding = True, truncation = True)['input_ids']
    input_ids = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    return input_ids, labels

In [11]:
train_dataloader = torch.utils.data.DataLoader(dataset = vntc_train_data, batch_size = 32, collate_fn = collate_fn, shuffle = True)
valid_dataloader = torch.utils.data.DataLoader(dataset = vntc_valid_data, batch_size = 32, collate_fn = collate_fn)
test_dataloader = torch.utils.data.DataLoader(dataset = vntc_test_data, batch_size = 32, collate_fn = collate_fn)

In [12]:
for input_ids, labels in train_dataloader:
    print(input_ids, labels)
    break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[    0,  1266,   175,  ...,     1,     1,     1],
        [    0,  9916,  3230,  ...,     1,     1,     1],
        [    0,   449,   585,  ...,     1,     1,     1],
        ...,
        [    0, 61610, 12886,  ...,     1,     1,     1],
        [    0,  1696, 21486,  ...,     1,     1,     1],
        [    0,  3490, 33101,  ...,     1,     1,     1]]) tensor([9, 4, 4, 0, 6, 8, 0, 7, 1, 8, 2, 7, 1, 4, 5, 4, 7, 7, 4, 9, 9, 4, 4, 3,
        2, 8, 7, 3, 7, 7, 3, 9])


In [13]:
class PhobertForNewsClassification(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.phobert = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels = num_classes)
        for param in self.phobert.roberta.parameters():
            param.requires_grad = False
    def forward(self, input_ids, labels):
        out = self.phobert(input_ids, labels = labels)
        probs = torch.softmax(out['logits'], dim = -1)
        preds = torch.argmax(probs, dim = -1).cpu().detach().numpy()
        out['preds'] = preds
        return out

In [14]:
model = PhobertForNewsClassification(len(category2id))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def get_num_parameters(module: torch.nn.Module) -> Dict[str, int]:
    """

    Get the number of trainable, fixed and total parameters of a pytorch module.

    :param module: pytorch module
    :return: dict containing number of parameters
    """
    trainable = 0
    fixed = 0
    for p in module.parameters():
        if p.requires_grad:
            trainable += p.numel()
        else:
            fixed += p.numel()
    return {"trainable": trainable, "fixed": fixed, "total": trainable + fixed}

In [16]:
get_num_parameters(model)

{'trainable': 598282, 'fixed': 134407680, 'total': 135005962}

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [18]:
LR = 1e-4
EPOCH = 5
LOG_STEP = 100

In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr= LR)

In [20]:
print(f"Train Loader Step: {len(train_dataloader)}")
print(f"Valid Loader Step: {len(valid_dataloader)}")

Train Loader Step: 950
Valid Loader Step: 106


In [21]:
model = model.to(device)

In [22]:
losses = 0
for i in range(EPOCH):
    print(f"Epoch {i}:")
    model.train()
    for step, (input_ids, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        out = model(input_ids, labels)
        loss = out['loss']
        loss.backward()
        optimizer.step()
        losses += loss.detach().cpu().item()
        if step % LOG_STEP == 0 and step > 0:
            print(f"Step {step}, Avg Loss {loss}")
            losses = 0
    
    preds = []
    trues = []
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for step, (input_ids, labels) in enumerate(valid_dataloader):
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            out = model(input_ids, labels)
            loss = out['loss']
            valid_loss += loss.detach().cpu().item()
            preds.extend(out['preds'])
            trues.extend(labels.detach().cpu().numpy())
    print(classification_report(trues, preds, target_names = list(category2id.keys())))
    print(f"Valid loss: {valid_loss / len(valid_dataloader)}")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 0:


RuntimeError: The expanded size of the tensor (3221) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [32, 3221].  Tensor sizes: [1, 258]

In [23]:
preds = []
trues = []
model.eval()
with torch.no_grad():
    for step, (input_ids, labels) in enumerate(test_dataloader):
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        out = model(input_ids, labels)
        loss = out['loss']
        preds.extend(out['preds'])
        trues.extend(labels.detach().cpu().numpy())
        
print(classification_report(trues, preds, target_names = list(category2id.keys())))

RuntimeError: The expanded size of the tensor (2017) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [32, 2017].  Tensor sizes: [1, 258]