In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

In [3]:
fasttext_ko = KeyedVectors.load_word2vec_format(
    "../../fasttext/wiki.ko.align.vec", binary=False
)
fasttext_en = KeyedVectors.load_word2vec_format(
    "../../fasttext/wiki.en.align.vec", binary=False
)

In [2]:
df = pd.read_csv("../../data/embedding_preprocessed_data.csv")
df.head()

Unnamed: 0,Comment,Sentiment,Language
0,"['맛있', '분위기', '좋', '어요', '야외', '에서', '식사', '가능...",1,ko
1,"['어느', '수산', '시장', '에서', '나', '나오', '면', '먹', ...",0,ko
2,"['wow', 'wow', 'funky', 'little', 'fleet', 'sa...",1,en
3,"['invention', 'original', 'purpose', 'british'...",1,en
4,"['양', '푸짐', '소스', '모자라', '면', '리필', '가능', '어서'...",1,ko


In [4]:
df_copy = df.copy()


In [4]:
X = df[["Comment", "Language"]]
y = df["Sentiment"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
def create_vector(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [7]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

In [8]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [12]:
X_train_fasttext = np.stack(X_train["embedding"].values)
X_test_fasttext = np.stack(X_test["embedding"].values)

In [13]:
bert_train = np.load("../../model/train_embeddings.npy")  
bert_test = np.load("../../model/test_embeddings.npy")

In [None]:
X_train_fasttext = normalize(X_train_fasttext)
X_test_fasttext = normalize(X_test_fasttext)
bert_train = normalize(bert_train)
bert_test = normalize(bert_test)

X_train_hybrid = np.concatenate([X_train_fasttext, bert_train], axis=1)  
X_test_hybrid = np.concatenate([X_test_fasttext, bert_test], axis=1)

In [20]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_hybrid, y_train)
y_pred = lr.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.77      0.76     20063
           1       0.76      0.73      0.75     19937

    accuracy                           0.75     40000
   macro avg       0.75      0.75      0.75     40000
weighted avg       0.75      0.75      0.75     40000



In [21]:
y_pred = lr.predict(X_train_hybrid)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75     79937
           1       0.76      0.73      0.74     80063

    accuracy                           0.75    160000
   macro avg       0.75      0.75      0.75    160000
weighted avg       0.75      0.75      0.75    160000



In [17]:
svm = LinearSVC()
svm.fit(X_train_hybrid, y_train)
y_pred = svm.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80     20063
           1       0.81      0.77      0.79     19937

    accuracy                           0.80     40000
   macro avg       0.80      0.80      0.80     40000
weighted avg       0.80      0.80      0.80     40000



### English Only

In [33]:
df_en = df_copy[df_copy["Language"] == "en"]
X= df_en[["Comment", "Language"]]
y= df_en["Sentiment"]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [35]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_en
    ),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_en
    ),
    axis=1,
)

In [36]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [37]:
X_train_fasttext = np.vstack(X_train["embedding"].values)
X_test_fasttext = np.vstack(X_test["embedding"].values)

In [38]:
bert_train = np.load("../../model/en_train_embeddings.npy")
bert_test = np.load("../../model/en_test_embeddings.npy")

In [39]:
X_train_fasttext = normalize(X_train_fasttext)
X_test_fasttext = normalize(X_test_fasttext)
bert_train = normalize(bert_train)
bert_test = normalize(bert_test)

X_train_hybrid = np.concatenate([X_train_fasttext, bert_train], axis=1)
X_test_hybrid = np.concatenate([X_test_fasttext, bert_test], axis=1)

In [40]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_hybrid, y_train)
y_pred = lr.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71     10035
           1       0.71      0.62      0.66      9965

    accuracy                           0.69     20000
   macro avg       0.69      0.69      0.69     20000
weighted avg       0.69      0.69      0.69     20000



In [41]:
svm = LinearSVC()
svm.fit(X_train_hybrid, y_train)
y_pred = svm.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.79      0.75     10035
           1       0.76      0.68      0.72      9965

    accuracy                           0.73     20000
   macro avg       0.74      0.73      0.73     20000
weighted avg       0.74      0.73      0.73     20000



### Korean Only

In [42]:
df_kr = df_copy[df_copy["Language"] == "ko"]
X = df_kr[["Comment", "Language"]]
y = df_kr["Sentiment"]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [44]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(row["Comment"], fasttext_ko),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(row["Comment"], fasttext_ko),
    axis=1,
)

In [45]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [46]:
X_train_fasttext = np.vstack(X_train["embedding"].values)
X_test_fasttext = np.vstack(X_test["embedding"].values)

In [47]:
bert_train = np.load("../../model/en_train_embeddings.npy")
bert_test = np.load("../../model/en_test_embeddings.npy")

In [48]:
X_train_fasttext = normalize(X_train_fasttext)
X_test_fasttext = normalize(X_test_fasttext)
bert_train = normalize(bert_train)
bert_test = normalize(bert_test)

X_train_hybrid = np.concatenate([X_train_fasttext, bert_train], axis=1)
X_test_hybrid = np.concatenate([X_test_fasttext, bert_test], axis=1)

In [49]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_hybrid, y_train)
y_pred = lr.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     10108
           1       0.82      0.82      0.82      9892

    accuracy                           0.82     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.82      0.82      0.82     20000



In [50]:
svm = LinearSVC()
svm.fit(X_train_hybrid, y_train)
y_pred = svm.predict(X_test_hybrid)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86     10108
           1       0.86      0.86      0.86      9892

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

