In [22]:
import pandas as pd


In [23]:
# Read the preprocessed dataset from the assets folder; the file is decoded
# as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(
    './assets/preprocessed.csv',
    encoding='ISO-8859-1',
)


# Models


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Seed shared by the estimators below that draw random numbers, so that
# re-running the notebook reproduces the same accuracies.
RANDOM_STATE = 42

# Candidate models to benchmark. Kept as a tuple so the evaluation order
# (and therefore the order of the printed results) is fixed.
classifiers = (
    LogisticRegression(max_iter=100000),
    KNeighborsClassifier(n_neighbors=10),
    GaussianNB(),
    # Random forests bootstrap rows and sample features at random; without a
    # seed every run yields a slightly different model.
    RandomForestClassifier(max_depth=20, n_estimators=1000,
                           random_state=RANDOM_STATE),
    # LinearSVC's solver can shuffle the data internally, so it is seeded too.
    LinearSVC(max_iter=100000, random_state=RANDOM_STATE),
    # NOTE(review): `use_label_encoder` looks deprecated in recent xgboost
    # releases -- confirm against the installed version before removing it.
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
)


# Embeddings


In [25]:
from embed_bag_of_words import load_embedded_bag_of_words
from embed_tf_idf import load_embedded_tf_idf
from embed_word2vec import load_embedded_word2vec
from embed_glove import load_embedded_glove

# Display name -> zero-argument loader returning the corresponding embedded
# feature matrix. Loaders are stored uncalled so each embedding is only
# loaded when it is actually needed.
embeddings = {
    'bag of words': load_embedded_bag_of_words,
    'tf idf': load_embedded_tf_idf,
    'word2vec': load_embedded_word2vec,
    'glove': load_embedded_glove,
}


# Training


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time


In [27]:
# Target vector: the 'label' column of the dataset as a NumPy array.
labels = df.loc[:, 'label'].to_numpy()


In [28]:
# Benchmark every classifier on every embedding and print, per pair, the
# held-out accuracy and the time spent in fit().
for name, load_func in embeddings.items():
    print(name)
    data = load_func()
    # 80/20 split with a fixed seed, so each embedding is split the same way
    # on every run.
    X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=0.2,
                                                        random_state=42)
    for classifier in classifiers:
        # Turn on parallelism only for estimators that really expose an
        # `n_jobs` parameter; blindly assigning the attribute would attach a
        # meaningless field to the ones that do not.
        if 'n_jobs' in classifier.get_params():
            classifier.set_params(n_jobs=-1)
        # perf_counter() is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted mid-measurement.
        start_time = time.perf_counter()
        classifier.fit(X_train, y_train)
        time_elapsed = time.perf_counter() - start_time
        # Prediction is deliberately outside the timed region: only training
        # time is reported.
        y_pred = classifier.predict(X_test)
        acc_test = accuracy_score(y_test, y_pred)
        print(f'{classifier.__class__.__name__} test accuracy: {acc_test:.3f}, training time: {time_elapsed}')
    print()


bag of words
LogisticRegression test accuracy: 0.803, training time: 594.3867499828339
KNeighborsClassifier test accuracy: 0.286, training time: 0.002147674560546875
GaussianNB test accuracy: 0.444, training time: 3.509169578552246
RandomForestClassifier test accuracy: 0.607, training time: 120.4115481376648
LinearSVC test accuracy: 0.779, training time: 5.093305826187134
XGBClassifier test accuracy: 0.769, training time: 372.45235419273376

tf idf
LogisticRegression test accuracy: 0.802, training time: 382.71795868873596
KNeighborsClassifier test accuracy: 0.204, training time: 0.20876526832580566
GaussianNB test accuracy: 0.442, training time: 3.0113823413848877
RandomForestClassifier test accuracy: 0.601, training time: 120.8062379360199
LinearSVC test accuracy: 0.806, training time: 1.1984708309173584
XGBClassifier test accuracy: 0.757, training time: 376.30420088768005

word2vec
LogisticRegression test accuracy: 0.671, training time: 9.849668264389038
KNeighborsClassifier test acc