### Import Libraries

In [11]:
import pandas as pd
import numpy as np
from gensim.models.fasttext import load_facebook_model
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [12]:
fasttext_ko = load_facebook_model("../../fasttext/cc.ko.300.bin.gz")
fasttext_en = load_facebook_model("../../fasttext/cc.en.300.bin.gz")

In [13]:
df = pd.read_csv('../../data/embedding_preprocessed_data.csv')
df.head()

Unnamed: 0,Comment,Sentiment,Language
0,"['맛있', '분위기', '좋', '어요', '야외', '에서', '식사', '가능...",1,ko
1,"['어느', '수산', '시장', '에서', '나', '나오', '면', '먹', ...",0,ko
2,"['wow', 'wow', 'funky', 'little', 'fleet', 'sa...",1,en
3,"['invention', 'original', 'purpose', 'british'...",1,en
4,"['양', '푸짐', '소스', '모자라', '면', '리필', '가능', '어서'...",1,ko


### Train Test Split

In [14]:
X = df[["Comment", "Language"]]
y = df["Sentiment"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
fasttext_ko.wv.similar_by_word("감사합니다", topn=10)

[('정보감사합니다', 0.680259644985199),
 ('소식감사합니다', 0.6199125647544861),
 ('감사합니다-', 0.6165705919265747),
 ('회원님의', 0.6129526495933533),
 ('중입니다', 0.6093058586120605),
 ('제출해주셔서', 0.6072656512260437),
 ('좋은정보감사합니다', 0.600220263004303),
 ('감사합니', 0.594984769821167),
 ('감사합니다ㅠㅠ', 0.5819443464279175),
 ('문의해주셔서', 0.5784170031547546)]

In [17]:
fasttext_en.wv.similar_by_word("thank", topn=10)

[('Thank', 0.8313500881195068),
 ('thanks', 0.7889633774757385),
 ('thankyou', 0.7695834040641785),
 ('Thank-you', 0.7579421401023865),
 ('thank-you', 0.7306676506996155),
 ('-Thank', 0.7267282009124756),
 ('thanking', 0.7214101552963257),
 ('-thank', 0.7014520168304443),
 ('Thanks', 0.7009347677230835),
 ('thnak', 0.698961615562439)]

In [18]:
def create_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [19]:
X_train["embedding"] = X_train.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

In [20]:
X_test["embedding"] = X_test.apply(
    lambda row: create_vector(
        row["Comment"], fasttext_ko if row["Language"] == "ko" else fasttext_en
    ),
    axis=1,
)

In [21]:
X_train = X_train.drop(columns=["Comment", "Language"])
X_test = X_test.drop(columns=["Comment", "Language"])

In [22]:
X_train_emb = np.stack(X_train["embedding"].values)
X_test_emb = np.stack(X_test["embedding"].values)

In [23]:
len(X_train_emb[0])

300

### Logistic Regression

In [24]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_emb, y_train)

In [25]:
y_pred = lr.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72     20063
           1       0.72      0.71      0.71     19937

    accuracy                           0.72     40000
   macro avg       0.72      0.72      0.72     40000
weighted avg       0.72      0.72      0.72     40000



In [26]:
y_pred = lr.predict(X_train_emb)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.73      0.72     79937
           1       0.72      0.70      0.71     80063

    accuracy                           0.72    160000
   macro avg       0.72      0.72      0.72    160000
weighted avg       0.72      0.72      0.72    160000



### SVM

In [27]:
svm = LinearSVC()
svm.fit(X_train_emb, y_train)

In [28]:
y_pred = svm.predict(X_test_emb)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.75      0.74     20063
           1       0.74      0.72      0.73     19937

    accuracy                           0.73     40000
   macro avg       0.73      0.73      0.73     40000
weighted avg       0.73      0.73      0.73     40000



In [29]:
y_pred = svm.predict(X_train_emb)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.75      0.73     79937
           1       0.74      0.72      0.73     80063

    accuracy                           0.73    160000
   macro avg       0.73      0.73      0.73    160000
weighted avg       0.73      0.73      0.73    160000

