## Import Libraries

In [1]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import XLMRobertaTokenizer, XLMRobertaModel

In [2]:
# Tokenizer loaded from the pretrained "xlm-roberta-base" checkpoint; the
# dataset objects built further down receive this instance.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Load the two monolingual review sets and tag every row with its language so
# the combined frame can be filtered per language later on.
df_en = pd.read_csv("../../data/english_only_data.csv")
df_ko = pd.read_csv("../../data/korean_only_data.csv")
df_en["language"] = "en"
df_ko["language"] = "ko"

# Merge both sets and shuffle with a fixed seed so the row order is reproducible.
df = pd.concat([df_en, df_ko], ignore_index=True)
df = shuffle(df, random_state=70)

# Encode labels as 0 (negative) / 1 (positive). Labels are normalised
# (whitespace stripped, lower-cased) before mapping so every casing variant is
# handled, and anything still unmapped is reported by name instead of surfacing
# as an opaque NaN-to-int conversion error from `.astype(int)`.
label_to_id = {"negative": 0, "positive": 1}
sentiment_ids = df["Sentiment"].astype(str).str.strip().str.lower().map(label_to_id)
if sentiment_ids.isna().any():
    unmapped_labels = sorted(df.loc[sentiment_ids.isna(), "Sentiment"].astype(str).unique())
    raise ValueError(f"Unmapped Sentiment labels: {unmapped_labels}")
df["Sentiment"] = sentiment_ids.astype(int)
df.head(10)

Unnamed: 0,Comment,Sentiment,language
133506,맛있고 분위기도 좋아요! 야외에서도 식사 가능해서 날씨 좋으면 더 좋아요,1,ko
189273,어느 수산시장에서나 나오면 먹을 수 있는 민어.의 맛집 서비스 업종 역사상 최악 ...,0,ko
2925,wow wow! what a funky little fleet!!! too sassy,1,en
41958,The invention original purpose is the most Bri...,1,en
104357,양도 푸짐하고 소스 모자라면 리필도 가능해서 좋아요. 한 명이 먹기에는 좀 많은 양...,1,ko
32398,I feel sorry for this guy because you can tell...,0,en
158051,맛의 고장에 이런 식당이 있는 게 불가사의함..,0,ko
81396,I've never heard someone talk so beautifully a...,1,en
156946,평점 0점은 왜 없는 거죠... 맛 집이라 해서 꾸불꾸불 산길을 비포장도로 따라 내...,0,ko
108275,솔직히 별 하나도 아깝네요. 팥빙수라고 시켰는데 우유도 아니고 연유로 범벅을 해 가...,0,ko


In [4]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item, on demand.

    Args:
        texts: Indexable sequence of comments; each item is converted with
            ``str()`` before tokenization.
        labels: Indexable sequence of integer class labels aligned with
            ``texts``.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"``
            and ``"attention_mask"`` (the notebook passes a Hugging Face
            tokenizer).
        max_length: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Catch misaligned inputs here rather than as an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize example ``index``.

        Returns:
            dict: ``"ids"`` and ``"attention_mask"`` as flattened tensors and
            ``"labels"`` as a ``torch.long`` tensor built from the label.
        """
        # Call the tokenizer directly; `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            str(self.texts[index]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # flatten() collapses the tokenizer output to 1-D so the DataLoader
        # can stack items into a batch.
        return {
            "ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[index], dtype=torch.long),
        }

In [5]:
# Ten-row sample used to sanity-check tokenization before any training run.
dataset = SentimentDataset(
    texts=df["Comment"].iloc[:10].tolist(),
    labels=df["Sentiment"].iloc[:10].tolist(),
    tokenizer=tokenizer,
    max_length=128,
)

In [6]:
# Inspect one encoded example (token ids, attention mask, label).
dataset[2]

{'ids': tensor([     0, 104130, 104130,     38,   2367,     10,  23884,     53,  10176,
          18738,    126,   1564,   5792,     57,  46048,      2,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,     

In [7]:
class SentimentClassifier(nn.Module):
    """Two-class head on top of an encoder model.

    The encoder's ``last_hidden_state`` at sequence position 0 goes through
    dropout (p=0.3) and a linear layer that produces 2 logits.

    Args:
        model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(in_features=model.config.hidden_size, out_features=2)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first position's hidden state."""
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        first_position = encoded.last_hidden_state[:, 0, :]
        return self.linear(self.dropout(first_position))

In [14]:
def _run_epoch(model, criterion, loader, optimizer=None, desc=""):
    """Run one pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    When ``optimizer`` is given, the model is put in train mode and updated
    after every batch. Otherwise the model is put in eval mode and gradient
    tracking is disabled.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()
    total_loss = 0.0
    y_true = []
    y_pred = []

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, leave=False, desc=desc):
            ids = batch["ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(ids, mask)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    return total_loss / len(loader), accuracy_score(y_true, y_pred)


def train_model(model, optimizer, criterion, train_loader, test_loader, epochs, lang):
    """Train ``model`` for ``epochs`` epochs, evaluating and checkpointing after each.

    Args:
        model: Module called as ``model(ids, mask)`` that returns class logits.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        train_loader: Loader yielding dicts with ``"ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        test_loader: Loader with the same batch layout, evaluated each epoch.
        epochs: Number of passes over ``train_loader``.
        lang: Tag embedded in the checkpoint file name.

    Raises:
        ValueError: If either loader yields no batches.

    Side effects:
        Prints per-epoch loss and accuracy, and writes the model's
        ``state_dict`` to ``../../model/{lang}_fine_tune_classifier_{epoch}.pt``
        after each epoch. ``epoch`` in the file name is zero-based, while the
        printed counter is one-based.
    """
    # Fail before any training: an empty loader would otherwise only surface
    # as a ZeroDivisionError once the loss average is computed.
    if len(train_loader) == 0 or len(test_loader) == 0:
        raise ValueError("train_loader and test_loader must each yield at least one batch")

    # Create the checkpoint directory up front so a missing folder cannot
    # throw away a finished epoch when torch.save runs.
    checkpoint_dir = Path("../../model")
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        avg_train_loss, train_acc = _run_epoch(
            model, criterion, train_loader, optimizer=optimizer, desc="Training"
        )
        print(f"Train loss: {avg_train_loss:.4f} | Train accuracy: {train_acc:.4f}")

        avg_test_loss, test_acc = _run_epoch(model, criterion, test_loader, desc="Evaluating")
        print(f"Test loss: {avg_test_loss:.4f} | Test accuracy: {test_acc:.4f}")

        torch.save(model.state_dict(), checkpoint_dir / f"{lang}_fine_tune_classifier_{epoch}.pt")

In [9]:
def run_model(model, optimizer, criterion, language, df, max_length, epochs,
              batch_size=16, test_size=0.2, random_state=42):
    """Split ``df`` into train/test sets, build loaders, and train ``model``.

    Args:
        model: Classifier passed through to ``train_model``.
        optimizer: Optimizer passed through to ``train_model``.
        criterion: Loss passed through to ``train_model``.
        language: One of ``"English"``, ``"Korean"`` or ``"English + Korean"``;
            selects the tag used in checkpoint file names.
        df: Frame with ``"Comment"`` (text) and ``"Sentiment"`` (label) columns.
        max_length: Token length each example is padded or truncated to.
        epochs: Number of training epochs.
        batch_size: Batch size for both loaders (default 16).
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Seed for the train/test split (default 42).

    Raises:
        KeyError: If ``language`` is not one of the supported names.
    """
    lang_map = {
        "English": "en",
        "Korean": "ko",
        "English + Korean": "both",
    }
    # Validate before doing any work so a typo yields a readable error.
    if language not in lang_map:
        raise KeyError(f"Unknown language {language!r}; expected one of {list(lang_map)}")

    print(f"\n{'-'*40} {language} {'-'*40}")

    X_train, X_test, y_train, y_test = train_test_split(
        df["Comment"], df["Sentiment"], test_size=test_size, random_state=random_state
    )
    train_ds = SentimentDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_length)
    test_ds = SentimentDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_length)

    # Only the training loader is shuffled; evaluation order stays fixed.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    train_model(model, optimizer, criterion, train_loader, test_loader, epochs, lang_map[language])

In [13]:
# Hyperparameters shared by every training run below.
EPOCHS = 2  # passes over the training split
LEARNING_RATE = 2e-5  # learning rate handed to AdamW
MAX_LENGTH = 128  # max_length handed to the tokenizer


def prepare_model():
    """Build a fresh classifier, optimizer and loss for one training run.

    Returns:
        tuple: ``(model, optimizer, criterion)``: a ``SentimentClassifier``
        wrapping pretrained ``xlm-roberta-base`` and moved to ``device``, an
        ``AdamW`` optimizer over its parameters, and a ``CrossEntropyLoss``.
    """
    encoder = XLMRobertaModel.from_pretrained("xlm-roberta-base")
    classifier = SentimentClassifier(encoder).to(device)
    return (
        classifier,
        AdamW(classifier.parameters(), lr=LEARNING_RATE),
        nn.CrossEntropyLoss(),
    )

In [15]:
# Combined: fine-tune a fresh model on the full English + Korean frame.
model, optimizer, criterion = prepare_model()
run_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    language="English + Korean",
    df=df,
    max_length=MAX_LENGTH,
    epochs=EPOCHS,
)


---------------------------------------- English + Korean ----------------------------------------

Epoch 1/2


                                                                     

Train loss: 0.2614 | Train accuracy: 0.8882


                                                                     

Test loss: 0.2103 | Test accuracy: 0.9111

Epoch 2/2


                                                                   

Train loss: 0.1949 | Train accuracy: 0.9201


                                                               

Test loss: 0.2090 | Test accuracy: 0.9143


In [None]:
# English only: fine-tune a fresh model on the rows tagged "en".
df_en = df.loc[df["language"] == "en"]
model, optimizer, criterion = prepare_model()
run_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    language="English",
    df=df_en,
    max_length=MAX_LENGTH,
    epochs=EPOCHS,
)

In [None]:
# Korean only: fine-tune a fresh model on the rows tagged "ko".
df_ko = df.loc[df["language"] == "ko"]
model, optimizer, criterion = prepare_model()
run_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    language="Korean",
    df=df_ko,
    max_length=MAX_LENGTH,
    epochs=EPOCHS,
)