In [8]:
import pandas as pd


In [9]:
# Load the preprocessed dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    './assets/preprocessed.csv',
    encoding='ISO-8859-1',
)


# Models


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Estimators benchmarked against every embedding in the training loop.
# The stochastic ones (random forest, linear SVM) are seeded with the same
# value used for the train/test split so that the reported accuracies are
# reproducible across kernel restarts.
classifiers = (
    KNeighborsClassifier(n_neighbors=10),
    GaussianNB(),
    RandomForestClassifier(max_depth=20, n_estimators=1000, random_state=42),
    LinearSVC(max_iter=100000, random_state=42),
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
)


# Embeddings


In [11]:
from embed_bag_of_words import load_embedded_bag_of_words
from embed_tf_idf import load_embedded_tf_idf
from embed_word2vec import load_embedded_word2vec
from embed_glove import load_embedded_glove

# Registry of embedding loaders, keyed by a human-readable name.
# Each value is a zero-argument callable that is invoked lazily, so an
# embedding is only loaded when its entry is actually used.
embeddings = {
    'bag of words': load_embedded_bag_of_words,
    'tf idf': load_embedded_tf_idf,
    'word2vec': load_embedded_word2vec,
    'glove': load_embedded_glove,
}


# Training


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time


In [13]:
# Target vector: the 'label' column of the preprocessed frame as a NumPy array.
labels = df.loc[:, 'label'].to_numpy()


In [14]:
# Benchmark every classifier on every embedding: for each embedding, load the
# features, make a seeded 80/20 train/test split, then fit each classifier and
# report its test accuracy together with the time spent in `fit`.
# NOTE(review): assumes each loader returns features row-aligned with
# `labels` -- confirm against the embed_* modules.
for name, load_func in embeddings.items():
    print(name)
    data = load_func()
    X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=0.2,
                                                        random_state=42)
    for classifier in classifiers:
        # Request all cores only from estimators that actually expose an
        # `n_jobs` parameter; assigning the attribute blindly leaves a stray,
        # ignored attribute on the others.
        if 'n_jobs' in classifier.get_params():
            classifier.set_params(n_jobs=-1)
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted mid-run.
        start_time = time.perf_counter()
        classifier.fit(X_train, y_train)
        time_elapsed = time.perf_counter() - start_time
        y_pred = classifier.predict(X_test)
        acc_test = accuracy_score(y_test, y_pred)
        print(f'{classifier.__class__.__name__} test accuracy: {acc_test:.3f}, training time: {time_elapsed}')
    print()


bag of words
KNeighborsClassifier test accuracy: 0.286, training time: 0.0013053417205810547
GaussianNB test accuracy: 0.444, training time: 3.5050323009490967
RandomForestClassifier test accuracy: 0.609, training time: 118.93186521530151
LinearSVC test accuracy: 0.779, training time: 5.126997470855713
XGBClassifier test accuracy: 0.769, training time: 371.67400646209717

tf idf
KNeighborsClassifier test accuracy: 0.204, training time: 0.1533803939819336
GaussianNB test accuracy: 0.442, training time: 3.0362346172332764
RandomForestClassifier test accuracy: 0.603, training time: 119.82797050476074
LinearSVC test accuracy: 0.806, training time: 1.1637346744537354
XGBClassifier test accuracy: 0.757, training time: 374.7902054786682

word2vec
KNeighborsClassifier test accuracy: 0.590, training time: 0.00878286361694336
GaussianNB test accuracy: 0.509, training time: 0.09160709381103516
RandomForestClassifier test accuracy: 0.657, training time: 51.26156187057495
LinearSVC test accuracy: 0