### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [14]:
fasttext_ko = KeyedVectors.load_word2vec_format("../../fasttext/wiki.ko.align.vec", binary=False)
fasttext_en = KeyedVectors.load_word2vec_format("../../fasttext/wiki.en.align.vec", binary=False)

In [16]:
df = pd.read_csv('../../data/embedding_preprocessed_data.csv')
df.head()

Unnamed: 0,Comment,Sentiment,Language
0,"['맛있', '분위기', '좋', '어요', '야외', '에서', '식사', '가능...",1,ko
1,"['어느', '수산', '시장', '에서', '나', '나오', '면', '먹', ...",0,ko
2,"['wow', 'wow', 'funky', 'little', 'fleet', 'sa...",1,en
3,"['invention', 'original', 'purpose', 'british'...",1,en
4,"['양', '푸짐', '소스', '모자라', '면', '리필', '가능', '어서'...",1,ko


### Train Test Split

In [17]:
X = df[["Comment", "Language"]]
y = df["Sentiment"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
def create_vector(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [33]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

In [34]:
X_train["embedding"][1]

array([-0.04984117,  0.01956353,  0.03349841, -0.01947345, -0.04708381,
       -0.01375938, -0.01546457, -0.01142292,  0.02165678, -0.05314055,
        0.0437271 , -0.06167399,  0.10019061, -0.03364216, -0.03999376,
       -0.01131406, -0.01599532, -0.0396073 ,  0.02014323, -0.06736665,
        0.05553126,  0.01765363, -0.06495213, -0.02530153,  0.02619374,
        0.0586219 ,  0.02327238, -0.05710835, -0.01440677,  0.00603072,
        0.02190677,  0.00946719, -0.02199894, -0.00036198, -0.01261251,
       -0.09411349, -0.02478021, -0.06060888,  0.04547862, -0.02956721,
        0.00509427,  0.01570884, -0.01359792,  0.04314795, -0.02471094,
        0.01916926,  0.00710729, -0.03360518, -0.07296191, -0.01872085,
        0.03054063,  0.03369947,  0.00245469, -0.02058281, -0.01608336,
        0.00273125, -0.04715838,  0.03353961, -0.01380104, -0.01960678,
        0.02873229, -0.12366251,  0.03902762, -0.02434842, -0.01991978,
        0.0788343 , -0.00743542, -0.05525935, -0.03099061,  0.05

In [35]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [36]:
X_train_emb = np.stack(X_train["embedding"].values)
X_test_emb = np.stack(X_test["embedding"].values)

In [37]:
len(X_train_emb[0])

300

### Logistic Regression

In [38]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_emb, y_train)

In [39]:
y_pred = lr.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.70      0.69     20063
           1       0.69      0.68      0.68     19937

    accuracy                           0.69     40000
   macro avg       0.69      0.69      0.69     40000
weighted avg       0.69      0.69      0.69     40000



In [40]:
y_pred = lr.predict(X_train_emb)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.70      0.69     79937
           1       0.70      0.67      0.68     80063

    accuracy                           0.69    160000
   macro avg       0.69      0.69      0.69    160000
weighted avg       0.69      0.69      0.69    160000



### SVM

In [41]:
svm = LinearSVC()
svm.fit(X_train_emb, y_train)

In [42]:
y_pred = svm.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.74      0.73     20063
           1       0.73      0.71      0.72     19937

    accuracy                           0.72     40000
   macro avg       0.72      0.72      0.72     40000
weighted avg       0.72      0.72      0.72     40000



In [43]:
y_pred = svm.predict(X_train_emb)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.74      0.73     79937
           1       0.73      0.70      0.72     80063

    accuracy                           0.72    160000
   macro avg       0.72      0.72      0.72    160000
weighted avg       0.72      0.72      0.72    160000

