In [10]:
import pandas as pd

In [11]:
# Load the preprocessed dataset, decoding the file as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    filepath_or_buffer="./assets/preprocessed.csv",
    encoding="ISO-8859-1",
)

# Models

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Candidate models that are each fitted and scored on every embedding.
# Hyperparameters are fixed here; this notebook does no tuning.
classifiers = [
    KNeighborsClassifier(n_neighbors=10),
    GaussianNB(),
    # Seeded so the forest's random sampling is repeatable between runs,
    # using the same seed value as the train/test split.
    RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=42),
    SVC(),
    # NOTE(review): `use_label_encoder` is reported as deprecated in newer
    # XGBoost releases — confirm against the installed version before removing.
    XGBClassifier(use_label_encoder=False),
]

# Embedding

In [13]:
from embed_bag_of_words import load_embedded_bag_of_words
from embed_tf_idf import load_embedded_tf_idf
from embed_word2vec import load_embedded_word2vec
from embed_glove import load_embedded_glove
# Display name and loader are declared side by side so the two lists cannot
# drift out of order; `embeddings[i]` is the label for `methods[i]`.
embeddings, methods = map(list, zip(
    ('bag of words', load_embedded_bag_of_words),
    ('tf idf', load_embedded_tf_idf),
    ('word2vec', load_embedded_word2vec),
    ('glove', load_embedded_glove),
))

# Train

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
# Target vector: the "label" column of the dataset as a NumPy array.
label = df.loc[:, "label"].to_numpy()

In [16]:
# Fit and score every classifier on every embedding, using the same seeded
# 80/20 train/test split for each embedding.
for embedding_name, load_embedding in zip(embeddings, methods):
    print(embedding_name)
    # Each loader returns the feature matrix for one embedding
    # (assumed to be row-aligned with `label` — TODO confirm in the embed_* modules).
    data = load_embedding()
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=0.2, random_state=42
    )
    for classifier in classifiers:
        # Ask for all CPU cores only from estimators that declare an `n_jobs`
        # parameter. Assigning the attribute unconditionally just attaches an
        # unused attribute to the others and suggests parallelism that is not
        # actually happening.
        if "n_jobs" in classifier.get_params():
            classifier.set_params(n_jobs=-1)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        acc_test = accuracy_score(y_test, y_pred)
        print(f"{type(classifier).__name__} test accuracy: {acc_test:.3f}")
    print()

bag of words
KNeighborsClassifier test accuracy: 0.286
GaussianNB test accuracy: 0.444
RandomForestClassifier test accuracy: 0.607
SVC test accuracy: 0.782
XGBClassifier test accuracy: 0.769

tf idf
KNeighborsClassifier test accuracy: 0.204
GaussianNB test accuracy: 0.442
RandomForestClassifier test accuracy: 0.602
SVC test accuracy: 0.792
XGBClassifier test accuracy: 0.757

word2vec
KNeighborsClassifier test accuracy: 0.590
GaussianNB test accuracy: 0.509
RandomForestClassifier test accuracy: 0.662
SVC test accuracy: 0.703
XGBClassifier test accuracy: 0.684

glove
KNeighborsClassifier test accuracy: 0.589
GaussianNB test accuracy: 0.552
RandomForestClassifier test accuracy: 0.641
SVC test accuracy: 0.676
XGBClassifier test accuracy: 0.662

