In [1]:
import os
import pyarrow.parquet as pq

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *
from prepare_data import *

In [2]:
def fit_and_print_scores(clf, name, x_train, y_train, x_dev, y_dev):
    clf.fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    dev_score = clf.score(x_dev, y_dev)
    
    print(name, ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return clf


def run_classifiers(x_train, y_train, x_dev, y_dev):
    clf = SVC()
    fit_and_print_scores(clf, 'SVC', x_train, y_train, x_dev, y_dev)

    clf = LogisticRegression()
    fit_and_print_scores(clf, 'Logistic Regression', x_train, y_train, x_dev, y_dev)

    clf = LinearDiscriminantAnalysis()
    fit_and_print_scores(clf, 'LinearDiscriminantAnalysis', x_train, y_train, x_dev, y_dev)

    clf = GaussianNB()
    fit_and_print_scores(clf, 'GaussianNB', x_train, y_train, x_dev, y_dev)

    clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
    fit_and_print_scores(clf, 'KNeighborsClassifier', x_train, y_train, x_dev, y_dev)

    clf = DecisionTreeClassifier()
    fit_and_print_scores(clf, 'DecisionTreeClassifier', x_train, y_train, x_dev, y_dev)

    clf = RandomForestClassifier(n_estimators=100)
    fit_and_print_scores(clf, 'RandomForestClassifier', x_train, y_train, x_dev, y_dev)

    hidden_layer_size = 20
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam', 
                        alpha=0.0001, learning_rate_init=0.001, max_iter=1000)
    clf.fit(x_train, y_train)
    fit_and_print_scores(clf, 'MLPClassifier', x_train, y_train, x_dev, y_dev)

    dtrain = xgb.DMatrix(x_train, label=y_train)
    ddev = xgb.DMatrix(x_dev, label=y_dev)
    param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
    num_round = 10
    evallist = [(ddev, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist)
    y_pred = np.around(bst.predict(dtrain))
    train_score = accuracy_score(y_true=y_train, y_pred=y_pred)
    y_pred = np.around(bst.predict(ddev))
    dev_score = accuracy_score(y_true=y_dev, y_pred=y_pred)
    print('XGBoost', ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return

In [3]:
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
op_dir = os.path.join('..', 'res', 'preds')
df_columns = ['author_id', 'tweet', 'label']

## English Data

In [4]:
#en_df = create_df(data_dir, lang='en', df_columns=df_columns)
en_df = pq.read_table(os.path.join(data_dir, 'en_df.parquet')).to_pandas()
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [5]:
en_train.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [6]:
x_train, y_train, en_vec = prepare_xy(en_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                                      return_y=True, usr_len=200, is_train=True, vec=None)

x_train = x_train.toarray()
x_train.shape, y_train.shape, en_vec

((160, 21272),
 (160,),
 TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None))

In [7]:
x_dev, y_dev, _ = prepare_xy(en_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                             return_y=True, usr_len=200, is_train=False, vec=en_vec)
x_dev = x_dev.toarray()
x_dev.shape, y_dev.shape

((40, 21272), (40,))

### All Features

In [8]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.99375 |  dev =  0.775
Logistic Regression : -> train =  0.99375 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.8875 |  dev =  0.6
GaussianNB : -> train =  1.0 |  dev =  0.725
KNeighborsClassifier : -> train =  1.0 |  dev =  0.625
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.675
RandomForestClassifier : -> train =  1.0 |  dev =  0.775
MLPClassifier : -> train =  1.0 |  dev =  0.775
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.725


## PCA

In [26]:
n_components = 100
pca = PCA(n_components=n_components)
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 100) (40, 100)


array([1.88709773, 1.7627927 , 1.23424296, 1.10822803, 0.99869542,
       0.99090601, 0.98040224, 0.94619594, 0.92065637, 0.89599113,
       0.88693037, 0.88388725, 0.87080411, 0.85760221, 0.84527254,
       0.83978436, 0.82254253, 0.81792313, 0.80597571, 0.80560853,
       0.80074391, 0.7917119 , 0.78217109, 0.77573173, 0.76691145,
       0.76216104, 0.76021899, 0.75470908, 0.74620466, 0.7407742 ,
       0.73524779, 0.72004872, 0.71931262, 0.71888844, 0.70870966,
       0.70749374, 0.69398157, 0.68895129, 0.68852643, 0.68417937,
       0.67974224, 0.67787088, 0.67647547, 0.67331265, 0.6672057 ,
       0.6649778 , 0.66037367, 0.65735135, 0.65478188, 0.6535329 ,
       0.65159883, 0.64894467, 0.64432757, 0.64193421, 0.64077105,
       0.63905547, 0.63610713, 0.63435741, 0.63101386, 0.62855841,
       0.62797681, 0.6260488 , 0.62244886, 0.62152958, 0.61913268,
       0.6178433 , 0.61650063, 0.61455007, 0.61264959, 0.61129309,
       0.60879848, 0.60786659, 0.60300408, 0.60273369, 0.60221

In [27]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.94375 |  dev =  0.7
Logistic Regression : -> train =  0.8875 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.9625 |  dev =  0.75
GaussianNB : -> train =  0.79375 |  dev =  0.525
KNeighborsClassifier : -> train =  1.0 |  dev =  0.65
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.675
RandomForestClassifier : -> train =  1.0 |  dev =  0.7




MLPClassifier : -> train =  0.99375 |  dev =  0.775
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.625




### Truncated SVD

In [28]:
truncated_svd = TruncatedSVD(n_components=n_components)
x_train_tsvd = truncated_svd.fit_transform(x_train)
x_dev_tsvd = truncated_svd.transform(x_dev)
print(x_train_tsvd.shape, x_dev_tsvd.shape)
truncated_svd.singular_values_

(160, 100) (40, 100)


array([9.49571531, 1.88196962, 1.24466245, 1.10971117, 1.02573705,
       0.99300647, 0.98439712, 0.97864332, 0.94608482, 0.91155774,
       0.89587301, 0.88427002, 0.88130109, 0.86676148, 0.85701234,
       0.84008459, 0.83722951, 0.81976245, 0.8143686 , 0.80607262,
       0.80236018, 0.79948554, 0.79139165, 0.781747  , 0.77591329,
       0.76703918, 0.76216388, 0.76023475, 0.75486696, 0.74667286,
       0.74133259, 0.73442765, 0.72007386, 0.71966412, 0.71589247,
       0.70899346, 0.70734026, 0.69384813, 0.68941399, 0.686629  ,
       0.6851345 , 0.68365383, 0.67947341, 0.67686956, 0.67349235,
       0.66718253, 0.665197  , 0.66288371, 0.65992389, 0.65626649,
       0.65600389, 0.65473491, 0.64999908, 0.64714101, 0.64453676,
       0.64330558, 0.64239691, 0.6356471 , 0.63453501, 0.63243901,
       0.63063348, 0.62926087, 0.62873524, 0.62572563, 0.62352265,
       0.62178522, 0.61921941, 0.61672577, 0.6145876 , 0.61314166,
       0.61101219, 0.60890681, 0.60834885, 0.60632381, 0.60409

In [29]:
run_classifiers(x_train_tsvd, y_train, x_dev_tsvd, y_dev)

SVC : -> train =  0.91875 |  dev =  0.675
Logistic Regression : -> train =  0.88125 |  dev =  0.8
LinearDiscriminantAnalysis : -> train =  0.9375 |  dev =  0.75
GaussianNB : -> train =  0.69375 |  dev =  0.525
KNeighborsClassifier : -> train =  1.0 |  dev =  0.7
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.725
RandomForestClassifier : -> train =  1.0 |  dev =  0.775




MLPClassifier : -> train =  0.99375 |  dev =  0.8
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.7




### LDA

In [30]:
lda = LatentDirichletAllocation(n_components=n_components, batch_size=160, max_iter=100)
x_train_lda = lda.fit_transform(x_train)
x_dev_lda = lda.transform(x_dev)
print(x_train_lda.shape, x_dev_lda.shape)
lda.exp_dirichlet_component_

(160, 100) (40, 100)


array([[5.37218726e-47, 5.37218726e-47, 5.37218726e-47, ...,
        5.37218726e-47, 5.37218726e-47, 5.37218726e-47],
       [5.99606062e-47, 5.99606062e-47, 5.99606062e-47, ...,
        5.99606062e-47, 5.99606062e-47, 5.99606062e-47],
       [6.10372787e-47, 6.10372787e-47, 6.10372787e-47, ...,
        6.10372787e-47, 6.10372787e-47, 6.10372787e-47],
       ...,
       [5.58551961e-47, 5.58551961e-47, 5.58551961e-47, ...,
        5.58551961e-47, 5.58551961e-47, 5.58551961e-47],
       [6.10372787e-47, 6.10372787e-47, 6.10372787e-47, ...,
        6.10372787e-47, 6.10372787e-47, 6.10372787e-47],
       [6.10372787e-47, 6.10372787e-47, 6.10372787e-47, ...,
        6.10372787e-47, 6.10372787e-47, 6.10372787e-47]])

In [31]:
run_classifiers(x_train_lda, y_train, x_dev_lda, y_dev)

SVC : -> train =  0.76875 |  dev =  0.7
Logistic Regression : -> train =  0.76875 |  dev =  0.6
LinearDiscriminantAnalysis : -> train =  0.85 |  dev =  0.725
GaussianNB : -> train =  0.825 |  dev =  0.6
KNeighborsClassifier : -> train =  1.0 |  dev =  0.525
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.575
RandomForestClassifier : -> train =  1.0 |  dev =  0.725




MLPClassifier : -> train =  0.86875 |  dev =  0.65
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.625




## Spanish Data

In [32]:
#es_df = create_df(data_dir, lang='es', df_columns=df_columns)
es_df = pq.read_table(os.path.join(data_dir, 'es_df.parquet')).to_pandas()
es_train, es_dev = get_single_split(es_df, data_dir, lang='es')
es_train.shape, es_dev.shape

((32000, 3), (8000, 3))

In [33]:
es_train.head()

Unnamed: 0,author_id,tweet,label
0,0035a3060d075506f5b9b978a910aa1f,#USER# pasta con bichos de agua,0
1,0035a3060d075506f5b9b978a910aa1f,De verdad puto lol de mierda qué asco de juego...,0
2,0035a3060d075506f5b9b978a910aa1f,RT #USER#: me hice una pcr y ya tengo los resu...,0
3,0035a3060d075506f5b9b978a910aa1f,"Y un lomo queso de baguette entera, tranqui #URL#",0
4,0035a3060d075506f5b9b978a910aa1f,Me cambio de curro y me llegan 3 ofertas direc...,0


In [34]:
x_train, y_train, es_vec = prepare_xy(es_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                                      return_y=True, usr_len=200, is_train=True, vec=None)

x_train = x_train.toarray()
x_train.shape, y_train.shape, es_vec

((160, 34833),
 (160,),
 TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=None, strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None))

In [35]:
x_dev, y_dev, _ = prepare_xy(es_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                             return_y=True, usr_len=200, is_train=False, vec=es_vec)
x_dev = x_dev.toarray()
x_dev.shape, y_dev.shape

((40, 34833), (40,))

### All Features

In [36]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.95 |  dev =  0.75
Logistic Regression : -> train =  0.9125 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.725 |  dev =  0.75
GaussianNB : -> train =  1.0 |  dev =  0.8
KNeighborsClassifier : -> train =  1.0 |  dev =  0.725
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.4
RandomForestClassifier : -> train =  1.0 |  dev =  0.7
MLPClassifier : -> train =  1.0 |  dev =  0.85
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.625


### PCA

In [37]:
pca = PCA(n_components=n_components)
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 100) (40, 100)


array([1.88709773, 1.7627927 , 1.23424292, 1.10822854, 0.99869356,
       0.99090987, 0.98040475, 0.94619047, 0.92064974, 0.89601329,
       0.88694517, 0.88389186, 0.87079409, 0.8575744 , 0.84521959,
       0.83979441, 0.82259353, 0.81789115, 0.80595764, 0.8056426 ,
       0.80090685, 0.79171605, 0.78222771, 0.77542693, 0.76671333,
       0.76193328, 0.76046869, 0.75480901, 0.74640127, 0.74099812,
       0.73573423, 0.71945   , 0.71933315, 0.71870976, 0.708867  ,
       0.70755311, 0.6932554 , 0.68891537, 0.68858549, 0.68444116,
       0.68302118, 0.67926234, 0.6755276 , 0.67272483, 0.66734227,
       0.66586956, 0.66178657, 0.65834722, 0.65537609, 0.65408473,
       0.65173268, 0.64969978, 0.64721833, 0.64298616, 0.64278752,
       0.64124061, 0.63677594, 0.63535128, 0.63207617, 0.62964067,
       0.62641554, 0.62531174, 0.62428635, 0.6225652 , 0.62147632,
       0.61730836, 0.61436555, 0.61305073, 0.61046926, 0.61007768,
       0.60824676, 0.60479808, 0.60327472, 0.60165628, 0.59898

In [38]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.95 |  dev =  0.675
Logistic Regression : -> train =  0.89375 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.95 |  dev =  0.725
GaussianNB : -> train =  0.73125 |  dev =  0.525
KNeighborsClassifier : -> train =  1.0 |  dev =  0.625
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.675
RandomForestClassifier : -> train =  1.0 |  dev =  0.8




MLPClassifier : -> train =  0.99375 |  dev =  0.8
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf




XGBoost : -> train =  1.0 |  dev =  0.85


### Truncated SVD

In [39]:
truncated_svd = TruncatedSVD(n_components=n_components)
x_train_tsvd = truncated_svd.fit_transform(x_train)
x_dev_tsvd = truncated_svd.transform(x_dev)
print(x_train_tsvd.shape, x_dev_tsvd.shape)
truncated_svd.singular_values_

(160, 100) (40, 100)


array([9.49571531, 1.88196962, 1.24466245, 1.10971114, 1.02573677,
       0.9930061 , 0.98439716, 0.97864336, 0.94608418, 0.91155893,
       0.89587227, 0.88427394, 0.8813099 , 0.86675975, 0.85701172,
       0.84006896, 0.83721972, 0.81975396, 0.81437267, 0.80604117,
       0.80238727, 0.79944003, 0.79136321, 0.78172513, 0.77592013,
       0.7670123 , 0.76221873, 0.76032211, 0.75488975, 0.74668325,
       0.74128781, 0.73442371, 0.72008272, 0.71979144, 0.71606612,
       0.70910439, 0.70689746, 0.69392963, 0.68932446, 0.68664118,
       0.68515832, 0.68368243, 0.67898717, 0.6771241 , 0.67320021,
       0.66766008, 0.66544228, 0.66205057, 0.66048689, 0.65655634,
       0.655431  , 0.65325607, 0.65105825, 0.64734654, 0.64474357,
       0.64372649, 0.641927  , 0.63754572, 0.63559891, 0.63305792,
       0.63118656, 0.63071926, 0.62897826, 0.62609072, 0.62427225,
       0.62039222, 0.61937584, 0.61609714, 0.61481381, 0.61366451,
       0.61169276, 0.61158381, 0.61036352, 0.60749755, 0.60333

In [40]:
run_classifiers(x_train_tsvd, y_train, x_dev_tsvd, y_dev)

SVC : -> train =  0.925 |  dev =  0.675
Logistic Regression : -> train =  0.88125 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.94375 |  dev =  0.75
GaussianNB : -> train =  0.6875 |  dev =  0.525
KNeighborsClassifier : -> train =  1.0 |  dev =  0.675
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.6
RandomForestClassifier : -> train =  1.0 |  dev =  0.725




MLPClassifier : -> train =  0.99375 |  dev =  0.8
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf




[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.65


### LDA

In [41]:
lda = LatentDirichletAllocation(n_components=n_components, batch_size=160, max_iter=100)
x_train_lda = lda.fit_transform(x_train)
x_dev_lda = lda.transform(x_dev)
print(x_train_lda.shape, x_dev_lda.shape)
lda.exp_dirichlet_component_

(160, 100) (40, 100)


array([[5.92653804e-47, 5.92653804e-47, 5.92653804e-47, ...,
        5.92653804e-47, 5.92653804e-47, 5.92653804e-47],
       [6.09206459e-47, 6.09206459e-47, 6.09206459e-47, ...,
        6.09206459e-47, 6.09206459e-47, 6.09206459e-47],
       [6.04948619e-47, 6.04948619e-47, 6.04948619e-47, ...,
        6.04948619e-47, 6.04948619e-47, 6.04948619e-47],
       ...,
       [6.10372787e-47, 6.10372787e-47, 6.10372787e-47, ...,
        6.10372787e-47, 6.10372787e-47, 6.10372787e-47],
       [6.02307716e-47, 6.02307716e-47, 6.02307716e-47, ...,
        6.02307716e-47, 6.02307716e-47, 6.02307716e-47],
       [6.10372787e-47, 6.10372787e-47, 6.10372787e-47, ...,
        6.10372787e-47, 6.10372787e-47, 6.10372787e-47]])

In [42]:
run_classifiers(x_train_lda, y_train, x_dev_lda, y_dev)

SVC : -> train =  0.78125 |  dev =  0.525
Logistic Regression : -> train =  0.7125 |  dev =  0.55
LinearDiscriminantAnalysis : -> train =  0.86875 |  dev =  0.675
GaussianNB : -> train =  0.83125 |  dev =  0.675
KNeighborsClassifier : -> train =  1.0 |  dev =  0.625
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.675
RandomForestClassifier : -> train =  1.0 |  dev =  0.75




MLPClassifier : -> train =  0.86875 |  dev =  0.75
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.725


