### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [None]:
fasttext_ko = KeyedVectors.load_word2vec_format("../../fasttext/wiki.ko.align.vec", binary=False)
fasttext_en = KeyedVectors.load_word2vec_format("../../fasttext/wiki.en.align.vec", binary=False)

In [None]:
df = pd.read_csv('../../data/embedding_preprocessed_data.csv')
df.head()

Unnamed: 0,Comment,Sentiment,Language
0,"['맛있', '분위기', '좋', '어요', '야외', '에서', '식사', '가능...",1,ko
1,"['어느', '수산', '시장', '에서', '나', '나오', '면', '먹', ...",0,ko
2,"['wow', 'wow', 'funky', 'little', 'fleet', 'sa...",1,en
3,"['invention', 'original', 'purpose', 'british'...",1,en
4,"['양', '푸짐', '소스', '모자라', '면', '리필', '가능', '어서'...",1,ko


In [None]:
df_copy = df.copy()

### Train Test Split

In [None]:
X = df[["Comment", "Language"]]
y = df["Sentiment"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def create_vector(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [None]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

In [None]:
X_train["embedding"][1]

In [None]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [None]:
X_train_emb = np.stack(X_train["embedding"].values)
X_test_emb = np.stack(X_test["embedding"].values)

In [None]:
len(X_train_emb[0])

### Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_emb, y_train)

In [None]:
y_pred = lr.predict(X_test_emb)
print(classification_report(y_test, y_pred))

In [None]:
y_pred = lr.predict(X_train_emb)
print(classification_report(y_train, y_pred))

### SVM

In [None]:
svm = LinearSVC()
svm.fit(X_train_emb, y_train)

In [None]:
y_pred = svm.predict(X_test_emb)
print(classification_report(y_test, y_pred))

In [None]:
y_pred = svm.predict(X_train_emb)
print(classification_report(y_train, y_pred))

## English Only

In [11]:
df_en = df_copy[df_copy["Language"] == "en"]

In [12]:
X = df_en[["Comment", "Language"]]
y = df_en["Sentiment"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_en
    ),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_en
    ),
    axis=1,
)

In [15]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [16]:
X_train_emb = np.stack(X_train["embedding"].values)
X_test_emb = np.stack(X_test["embedding"].values)

In [17]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_emb, y_train)

In [18]:
y_pred = lr.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.61      0.60     10035
           1       0.59      0.56      0.58      9965

    accuracy                           0.59     20000
   macro avg       0.59      0.59      0.59     20000
weighted avg       0.59      0.59      0.59     20000



In [19]:
svm = LinearSVC()
svm.fit(X_train_emb, y_train)

In [20]:
y_pred = svm.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.64      0.61     10035
           1       0.60      0.56      0.58      9965

    accuracy                           0.60     20000
   macro avg       0.60      0.60      0.60     20000
weighted avg       0.60      0.60      0.60     20000



### Korean Only

In [21]:
df_ko = df_copy[df_copy["Language"] == "ko"]

In [22]:
X = df_ko[["Comment", "Language"]]
y = df_ko["Sentiment"]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(row["Comment"], fasttext_ko),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(row["Comment"], fasttext_ko),
    axis=1,
)

In [25]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [26]:
X_train_emb = np.stack(X_train["embedding"].values)
X_test_emb = np.stack(X_test["embedding"].values)

In [27]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_emb, y_train)

In [28]:
y_pred = lr.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80     10108
           1       0.79      0.80      0.80      9892

    accuracy                           0.80     20000
   macro avg       0.80      0.80      0.80     20000
weighted avg       0.80      0.80      0.80     20000



In [29]:
svm = LinearSVC()
svm.fit(X_train_emb, y_train)

In [30]:
y_pred = svm.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     10108
           1       0.85      0.85      0.85      9892

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

