# Sentence Classification with Pytorch

## Installation

In [1]:
!pip install -q 'transformers[torch]' sentencepiece

## Import Library

In [2]:
import requests
import torch
import pandas as pd
import numpy as np
from typing import Dict
from tqdm import tqdm
from torch import nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel

## Download Training and testing dataset

In [3]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [4]:
TRAIN_TEXT_URL = "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/kaggle-competition/train.txt"
TRAIN_LABEL_URL = "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/kaggle-competition/train_label.txt"
TEST_TEXT_URL = "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/kaggle-competition/test.txt"
TEST_LABEL_URL = "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/master/kaggle-competition/test_label.txt"

train_text = requests.get(TRAIN_TEXT_URL).text
train_label = requests.get(TRAIN_LABEL_URL).text
test_text = requests.get(TEST_TEXT_URL).text
test_label = requests.get(TEST_LABEL_URL).text

train_df = pd.DataFrame(
    {"text": train_text.split("\n")[:-1], "label": train_label.split("\n")[:-1]}
)

test_df = pd.DataFrame(
    {"text": test_text.split("\n")[:-1], "label": test_label.split("\n")[:-1]}
)

train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)


## Define Pytorch Dataset

In [5]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, df: pd.DataFrame, label2index: Dict[str, int], tokenizer: AutoTokenizer):
    self.labels = [label2index[label] for label in df["label"]]
    self.texts = [
      tokenizer(
        text,
        padding="max_length",
        max_length=416,
        truncation=True,
        return_tensors="pt",
      )
      for text in df["text"]
    ]

  def classes(self):
    return self.labels

  def __len__(self):
    return len(self.labels)

  def get_batch_labels(self, idx):
    return np.array(self.labels[idx])

  def get_batch_texts(self, idx):
    return self.texts[idx]

  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)
    batch_y = self.get_batch_labels(idx)
    return batch_texts, batch_y

## Define Classifier

In [6]:
class WangchanBERTaClassifier(nn.Module):
  def __init__(self, model_name: str, num_classes: int = 4, dropout: float = 0.5):
    super(WangchanBERTaClassifier, self).__init__()
    self.bert = AutoModel.from_pretrained(model_name)
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, num_classes)
    self.relu = nn.ReLU()

  def forward(self, input_id, mask):
    _, pooled_output = self.bert(
      input_ids=input_id, attention_mask=mask, return_dict=False
    )
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)
    return final_layer

## Define Training Function

In [7]:
def train(model, train_dataloader, val_dataloader, learning_rate, epochs):
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  criterion = nn.CrossEntropyLoss()
  optimizer = Adam(model.parameters(), lr=learning_rate)

  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      train_label = train_label.to(device)
      mask = train_input["attention_mask"].to(device)
      input_id = train_input["input_ids"].squeeze(1).to(device)

      output = model(input_id, mask)

      batch_loss = criterion(output, train_label.long())
      total_loss_train += batch_loss.item()

      acc = (output.argmax(dim=1) == train_label).sum().item()
      total_acc_train += acc

      model.zero_grad()
      batch_loss.backward()
      optimizer.step()

    total_acc_val = 0
    total_loss_val = 0

    with torch.no_grad():
      for val_input, val_label in val_dataloader:
        val_label = val_label.to(device)
        mask = val_input["attention_mask"].to(device)
        input_id = val_input["input_ids"].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, val_label.long())
        total_loss_val += batch_loss.item()

        acc = (output.argmax(dim=1) == val_label).sum().item()
        total_acc_val += acc

    print(
      f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader.dataset): .3f} \
        | Train Accuracy: {total_acc_train / len(train_dataloader.dataset): .3f} \
        | Val Loss: {total_loss_val / len(val_dataloader.dataset): .3f} \
        | Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}"
    )

## Define Evaluating Function

In [8]:
def evaluate(model, test_dataloader):
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  with torch.no_grad():
    for test_input, test_label in test_dataloader:
      test_label = test_label.to(device)
      mask = test_input["attention_mask"].to(device)
      input_id = test_input["input_ids"].squeeze(1).to(device)

      output = model(input_id, mask)

      acc = (output.argmax(dim=1) == test_label).sum().item()
      total_acc_test += acc

  print(f"Test Accuracy: {total_acc_test / len(test_dataloader.dataset): .3f}")

## Setting

In [9]:
torch.manual_seed(677)
np.random.seed(677)

label2index = {
  "pos": 0,
  "neu": 1,
  "neg": 2,
  "q": 3,
}

MODEL_NAME = "airesearch/wangchanBERTa-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = WangchanBERTaClassifier(MODEL_NAME, 4)
EPOCHS = 5
LR = 1e-6

Some weights of the model checkpoint at airesearch/wangchanBERTa-base-att-spm-uncased were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Read data

In [10]:
df = pd.read_csv("data/train.csv")

df_train, df_val = np.split(
    df.sample(frac=1, random_state=42), [int(0.8 * len(df))]
)
print(len(df_train), len(df_val))

train_dataset, val_dataset = Dataset(df_train, label2index, tokenizer), Dataset(df_val, label2index, tokenizer)

19250 4813


## Train model

In [None]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=16)
train(model, train_dataloader, val_dataloader, LR, EPOCHS)

  2%|▏         | 13/602 [00:28<21:32,  2.19s/it]

## Test model

In [None]:
df_test = pd.read_csv("data/test.csv")
test_dataset = Dataset(df_test, label2index, tokenizer)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32)
evaluate(model, test_dataloader)