### Import Libraries

In [None]:
import pandas as pd
import torch
# Ask PyTorch to release cached, currently unused CUDA memory.
# NOTE(review): the device is set to CPU further down, so this presumably only
# matters when a GPU was used earlier in the same kernel — confirm it is needed.
torch.cuda.empty_cache()

from transformers import XLMRobertaTokenizer, XLMRobertaModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [39]:
# All tensors and the encoder live on the CPU in this notebook.
device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
model = model.to(device)

### Import Dataset

In [4]:
# Load the English and Korean comment files, one frame per language.
df_en, df_ko = (
    pd.read_csv("../../data/english_only_data.csv"),
    pd.read_csv("../../data/korean_only_data.csv"),
)

In [5]:
# Tag every row with its source language before the two frames are stacked.
df_en["language"] = "en"
df_ko["language"] = "ko"

# Stack both frames with a fresh index, then shuffle with a fixed seed so the
# row order is repeatable.
df = shuffle(
    pd.concat([df_en, df_ko], ignore_index=True),
    random_state=70,
)
df.head(10)

Unnamed: 0,Comment,Sentiment,language
133506,맛있고 분위기도 좋아요! 야외에서도 식사 가능해서 날씨 좋으면 더 좋아요,positive,ko
189273,어느 수산시장에서나 나오면 먹을 수 있는 민어.의 맛집 서비스 업종 역사상 최악 ...,negative,ko
2925,wow wow! what a funky little fleet!!! too sassy,Positive,en
41958,The invention original purpose is the most Bri...,Positive,en
104357,양도 푸짐하고 소스 모자라면 리필도 가능해서 좋아요. 한 명이 먹기에는 좀 많은 양...,positive,ko
32398,I feel sorry for this guy because you can tell...,Negative,en
158051,맛의 고장에 이런 식당이 있는 게 불가사의함..,negative,ko
81396,I've never heard someone talk so beautifully a...,Positive,en
156946,평점 0점은 왜 없는 거죠... 맛 집이라 해서 꾸불꾸불 산길을 비포장도로 따라 내...,negative,ko
108275,솔직히 별 하나도 아깝네요. 팥빙수라고 시켰는데 우유도 아니고 연유로 범벅을 해 가...,negative,ko


In [6]:
# Sanity check: number of rows and columns in the combined frame.
df.shape

(200000, 3)

In [7]:
# Encode the sentiment labels as integers (negative -> 0, positive -> 1).
# Matching is case-insensitive because the English rows use
# "Positive"/"Negative" while the Korean rows use lowercase labels.
# `Series.map` is used instead of chained `replace` calls: swapping strings for
# ints through `replace` makes pandas emit a FutureWarning about implicit
# downcasting, whereas `map` builds the integer column directly.
label_to_id = {"negative": 0, "positive": 1}


def _encode_sentiment(label):
    """Return the integer id for a sentiment label; pass other values through."""
    # Non-string values (e.g. already-encoded ints when this cell is re-run)
    # and unrecognised strings are left untouched, as `replace` left them.
    if isinstance(label, str):
        return label_to_id.get(label.lower(), label)
    return label


df["Sentiment"] = df["Sentiment"].map(_encode_sentiment)
df["Sentiment"].value_counts()

  df["Sentiment"] = df["Sentiment"].replace({"negative": 0, "positive": 1})


Sentiment
1    100000
0    100000
Name: count, dtype: int64

In [8]:
# Features are the raw comment texts; targets are the encoded sentiment labels.
X, y = df["Comment"], df["Sentiment"]

In [9]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
def preprocess_text(texts, tokenizer, max_len):
    """Tokenize ``texts`` into fixed-length id and attention-mask tensors.

    Every text is truncated or padded to exactly ``max_len`` tokens (special
    tokens included), so all rows share one shape and can be batched.

    Args:
        texts: Iterable of strings (for example a pandas Series of comments).
        tokenizer: Hugging Face tokenizer; it is called once on the whole
            list of texts instead of once per text.
        max_len: Length of every returned row.

    Returns:
        Tuple ``(input_ids, attention_masks)``; each is a tensor of shape
        ``(number of texts, max_len)``.
    """
    texts = list(texts)

    # An empty batch cannot be tokenized or concatenated; return correctly
    # shaped empty tensors instead of failing with an opaque error.
    if not texts:
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-text `encode_plus` loop and already
    # returns stacked tensors, so no manual `torch.cat` is needed.
    encoding = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    return encoding["input_ids"], encoding["attention_mask"]

In [11]:
# Tokenize both splits to the same fixed length of 10 tokens.
# NOTE(review): 10 tokens (special tokens included) likely truncates many
# comments — confirm this length is intended.
train_input_ids, train_attention_masks = preprocess_text(X_train, tokenizer, max_len=10)
test_input_ids, test_attention_masks = preprocess_text(X_test, tokenizer, max_len=10)

In [12]:

# Place the token ids and attention masks of both splits on the chosen device.
train_input_ids, train_attention_masks = (
    train_input_ids.to(device),
    train_attention_masks.to(device),
)
test_input_ids, test_attention_masks = (
    test_input_ids.to(device),
    test_attention_masks.to(device),
)

In [15]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


def get_embeddings(input_ids, attention_masks, model, batch_size=16, device="mps"):
    """Run ``model`` over tokenized inputs and collect one vector per sequence.

    The model is switched to eval mode and moved to ``device``. Inputs are fed
    in batches of ``batch_size`` without gradient tracking, and the hidden
    state at position 0 (the CLS token) of each sequence is kept.

    Args:
        input_ids: Tensor of token ids, one row per sequence.
        attention_masks: Tensor of attention masks aligned with ``input_ids``.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of sequences per forward pass.
        device: Device used for the forward passes (defaults to ``"mps"``).

    Returns:
        CPU tensor with the position-0 hidden states of all sequences,
        concatenated along dimension 0 in input order.
    """
    model.to(device)
    model.eval()

    loader = DataLoader(
        TensorDataset(input_ids, attention_masks), batch_size=batch_size
    )
    cls_chunks = []

    with torch.no_grad():
        for ids, mask in tqdm(loader, desc="Extracting embeddings"):
            hidden = model(
                input_ids=ids.to(device), attention_mask=mask.to(device)
            ).last_hidden_state
            # Keep only the CLS position and park the result on the CPU so
            # the accumulated chunks do not occupy device memory.
            cls_chunks.append(hidden[:, 0, :].cpu())

    return torch.cat(cls_chunks, dim=0)

In [16]:
# Embed both splits with the same model, batch size and device.
train_embeddings = get_embeddings(
    input_ids=train_input_ids,
    attention_masks=train_attention_masks,
    model=model,
    batch_size=16,
    device=device,
)
test_embeddings = get_embeddings(
    input_ids=test_input_ids,
    attention_masks=test_attention_masks,
    model=model,
    batch_size=16,
    device=device,
)

Extracting embeddings: 100%|██████████| 10000/10000 [03:35<00:00, 46.42it/s]
Extracting embeddings: 100%|██████████| 2500/2500 [00:53<00:00, 47.01it/s]


In [18]:
# Check that the token tensors and the embedding matrix share the same number
# of rows (one per training example).
train_input_ids.shape, train_attention_masks.shape, train_embeddings.shape

(torch.Size([160000, 10]), torch.Size([160000, 10]), torch.Size([160000, 768]))

In [22]:
import numpy as np

# Write both embedding matrices to disk as .npy files (presumably so the
# extraction step does not have to be repeated — confirm).
# NOTE(review): np.save is handed torch tensors and relies on NumPy's implicit
# conversion; this assumes CPU tensors, which get_embeddings returns.
np.save("../../model/train_embeddings.npy", train_embeddings)
np.save("../../model/test_embeddings.npy", test_embeddings)

In [23]:
import torch

# Store the same embedding tensors again in PyTorch's own serialization
# format, alongside the .npy copies.
torch.save(train_embeddings, "../../model/train_embeddings.pt")
torch.save(test_embeddings, "../../model/test_embeddings.pt")

In [24]:
# Turn the embedding tensors into NumPy feature matrices for scikit-learn.
# Note: this rebinds X_train / X_test, which held the raw comment texts so far.
X_train, X_test = (
    train_embeddings.cpu().numpy(),
    test_embeddings.cpu().numpy(),
)

In [26]:
# Logistic-regression classifier on the embedding features, with the
# iteration cap set to 1000.
lr = LogisticRegression(
    max_iter=1000,
)
lr.fit(X_train, y_train)

In [27]:
# Test evaluation
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75     20063
           1       0.75      0.73      0.74     19937

    accuracy                           0.74     40000
   macro avg       0.74      0.74      0.74     40000
weighted avg       0.74      0.74      0.74     40000



In [28]:
# Train evaluation
y_pred = lr.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74     79937
           1       0.75      0.73      0.74     80063

    accuracy                           0.74    160000
   macro avg       0.74      0.74      0.74    160000
weighted avg       0.74      0.74      0.74    160000



In [30]:
# Linear SVM on the same embedding features.
# The seed is pinned, as it already is for the shuffle and the train/test
# split, so that refits are repeatable whenever the solver shuffles the data.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

In [32]:
# Test evaluation
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75     20063
           1       0.75      0.73      0.74     19937

    accuracy                           0.75     40000
   macro avg       0.75      0.75      0.75     40000
weighted avg       0.75      0.75      0.75     40000



In [33]:
# Train evaluation
y_pred = svm.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75     79937
           1       0.75      0.73      0.74     80063

    accuracy                           0.75    160000
   macro avg       0.75      0.75      0.75    160000
weighted avg       0.75      0.75      0.75    160000



### English Only