In [1]:
import os
import pyarrow.parquet as pq

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *
from prepare_data import *
from perspective_scores_api import extract_perspective_scores_for_authors

In [2]:
def fit_and_print_scores(clf, name, x_train, y_train, x_dev, y_dev):
    clf.fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    dev_score = clf.score(x_dev, y_dev)
    
    print(name, ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return clf


def run_classifiers(x_train, y_train, x_dev, y_dev):
    clf = SVC()
    fit_and_print_scores(clf, 'SVC', x_train, y_train, x_dev, y_dev)

    clf = LogisticRegression()
    fit_and_print_scores(clf, 'Logistic Regression', x_train, y_train, x_dev, y_dev)

    clf = LinearDiscriminantAnalysis()
    fit_and_print_scores(clf, 'LinearDiscriminantAnalysis', x_train, y_train, x_dev, y_dev)

    clf = GaussianNB()
    fit_and_print_scores(clf, 'GaussianNB', x_train, y_train, x_dev, y_dev)

    clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
    fit_and_print_scores(clf, 'KNeighborsClassifier', x_train, y_train, x_dev, y_dev)

    clf = DecisionTreeClassifier()
    fit_and_print_scores(clf, 'DecisionTreeClassifier', x_train, y_train, x_dev, y_dev)

    clf = RandomForestClassifier(n_estimators=100)
    fit_and_print_scores(clf, 'RandomForestClassifier', x_train, y_train, x_dev, y_dev)

    hidden_layer_size = 20
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam', 
                        alpha=0.0001, learning_rate_init=0.001, max_iter=1000)
    clf.fit(x_train, y_train)
    fit_and_print_scores(clf, 'MLPClassifier', x_train, y_train, x_dev, y_dev)

    dtrain = xgb.DMatrix(x_train, label=y_train)
    ddev = xgb.DMatrix(x_dev, label=y_dev)
    param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
    num_round = 10
    evallist = [(ddev, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist)
    y_pred = np.around(bst.predict(dtrain))
    train_score = accuracy_score(y_true=y_train, y_pred=y_pred)
    y_pred = np.around(bst.predict(ddev))
    dev_score = accuracy_score(y_true=y_dev, y_pred=y_pred)
    print('XGBoost', ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return

In [3]:
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
op_dir = os.path.join('..', 'res', 'preds')
df_columns = ['author_id', 'tweet', 'label']

## English Data

In [4]:
#en_df = create_df(data_dir, lang='en', df_columns=df_columns)
en_df = pq.read_table(os.path.join(data_dir, 'en_df.parquet')).to_pandas()
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [5]:
en_train.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [6]:
x_train = extract_perspective_scores_for_authors(en_train, data_dir, lang='en', steps=20)
_, y_train, _ = prepare_xy(en_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_train.shape, y_train.shape

((160, 160), (160,))

In [7]:
x_dev = extract_perspective_scores_for_authors(en_dev, data_dir, lang='en', steps=20)
_, y_dev, _ = prepare_xy(en_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_dev.shape, y_dev.shape

((40, 160), (40,))

### All Features

In [8]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.91875 |  dev =  0.675
Logistic Regression : -> train =  0.875 |  dev =  0.7
LinearDiscriminantAnalysis : -> train =  1.0 |  dev =  0.525
GaussianNB : -> train =  0.6375 |  dev =  0.65
KNeighborsClassifier : -> train =  1.0 |  dev =  0.625
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.625
RandomForestClassifier : -> train =  1.0 |  dev =  0.7
MLPClassifier : -> train =  1.0 |  dev =  0.625
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf




[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.6


### PCA

In [9]:
n_components = 50
pca = PCA(n_components=n_components, svd_solver='full')
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 50) (40, 50)


array([26.19382969,  7.83508751,  7.10265551,  6.97600428,  6.55426293,
        6.36509136,  6.20802854,  5.95206833,  5.86963011,  5.77430962,
        5.56067101,  5.43460676,  5.30549088,  5.06903537,  4.90511163,
        4.81436008,  4.64481584,  4.46325598,  4.36193513,  4.22569465,
        4.09509179,  3.80768813,  3.4514503 ,  3.3350317 ,  3.14021894,
        2.99495971,  2.93743654,  2.86905819,  2.83695026,  2.71634341,
        2.68874415,  2.64934544,  2.61997657,  2.55876661,  2.50754807,
        2.47601816,  2.43672051,  2.40817307,  2.36182404,  2.29694776,
        2.2421081 ,  2.19225205,  2.15617101,  2.12744053,  2.08958321,
        2.04318623,  2.02566307,  1.98751921,  1.96672489,  1.92258272])

In [10]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.90625 |  dev =  0.725
Logistic Regression : -> train =  0.8375 |  dev =  0.7
LinearDiscriminantAnalysis : -> train =  0.83125 |  dev =  0.725
GaussianNB : -> train =  0.81875 |  dev =  0.65
KNeighborsClassifier : -> train =  1.0 |  dev =  0.625
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.55
RandomForestClassifier : -> train =  1.0 |  dev =  0.675
MLPClassifier : -> train =  1.0 |  dev =  0.75
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.775


### Kernel PCA

In [11]:
kpca = KernelPCA(n_components=n_components, kernel='rbf', fit_inverse_transform=True)
x_train_kp = kpca.fit_transform(x_train)
x_dev_kp = kpca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
kpca.lambdas_

(160, 50) (40, 50)


array([7.33777474, 0.68396385, 0.57969203, 0.54586355, 0.48594237,
       0.45215814, 0.43562222, 0.42770364, 0.39225425, 0.38463268,
       0.35917629, 0.33609168, 0.3239094 , 0.29123352, 0.27199872,
       0.27075729, 0.24904103, 0.22905834, 0.21869784, 0.21134111,
       0.19189844, 0.16899552, 0.15495735, 0.13565327, 0.12895083,
       0.11279013, 0.10534592, 0.10190451, 0.09735498, 0.09503605,
       0.08868639, 0.08787198, 0.08292633, 0.08221109, 0.07890201,
       0.07507085, 0.07315583, 0.07158336, 0.06978766, 0.06734465,
       0.06460213, 0.06101473, 0.05961286, 0.05811223, 0.05570242,
       0.05460439, 0.05295793, 0.04986242, 0.04927391, 0.04826387])

In [12]:
run_classifiers(x_train_kp, y_train, x_dev_kp, y_dev)

SVC : -> train =  0.9125 |  dev =  0.725
Logistic Regression : -> train =  0.65625 |  dev =  0.65
LinearDiscriminantAnalysis : -> train =  0.8375 |  dev =  0.725
GaussianNB : -> train =  0.79375 |  dev =  0.7
KNeighborsClassifier : -> train =  1.0 |  dev =  0.6
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.675
RandomForestClassifier : -> train =  1.0 |  dev =  0.625




MLPClassifier : -> train =  0.875 |  dev =  0.775
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.65




### NMF

In [13]:
nmf = NMF(n_components=n_components, init='random')
x_train_nmf = nmf.fit_transform(x_train)
x_dev_nmf = nmf.transform(x_dev)
print(x_train_nmf.shape, x_dev_nmf.shape)
nmf.reconstruction_err_

(160, 50) (40, 50)


11.68868645117112

In [14]:
run_classifiers(x_train_nmf, y_train, x_dev_nmf, y_dev)

SVC : -> train =  0.7625 |  dev =  0.675
Logistic Regression : -> train =  0.73125 |  dev =  0.7
LinearDiscriminantAnalysis : -> train =  0.79375 |  dev =  0.675
GaussianNB : -> train =  0.725 |  dev =  0.6
KNeighborsClassifier : -> train =  1.0 |  dev =  0.575
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.575
RandomForestClassifier : -> train =  1.0 |  dev =  0.65




MLPClassifier : -> train =  0.99375 |  dev =  0.625
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.6




## Spanish Data

In [15]:
#es_df = create_df(data_dir, lang='es', df_columns=df_columns)
es_df = pq.read_table(os.path.join(data_dir, 'es_df.parquet')).to_pandas()
es_train, es_dev = get_single_split(es_df, data_dir, lang='es')
es_train.shape, es_dev.shape

((32000, 3), (8000, 3))

In [16]:
es_train.head()

Unnamed: 0,author_id,tweet,label
0,0035a3060d075506f5b9b978a910aa1f,#USER# pasta con bichos de agua,0
1,0035a3060d075506f5b9b978a910aa1f,De verdad puto lol de mierda qué asco de juego...,0
2,0035a3060d075506f5b9b978a910aa1f,RT #USER#: me hice una pcr y ya tengo los resu...,0
3,0035a3060d075506f5b9b978a910aa1f,"Y un lomo queso de baguette entera, tranqui #URL#",0
4,0035a3060d075506f5b9b978a910aa1f,Me cambio de curro y me llegan 3 ofertas direc...,0


In [17]:
x_train = extract_perspective_scores_for_authors(es_train, data_dir, lang='es', steps=20)
_, y_train, _ = prepare_xy(es_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_train.shape, y_train.shape

((160, 120), (160,))

In [18]:
x_dev = extract_perspective_scores_for_authors(es_dev, data_dir, lang='es', steps=20)
_, y_dev, _ = prepare_xy(es_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_dev.shape, y_dev.shape

((40, 120), (40,))

### All Features

In [19]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.925 |  dev =  0.8
Logistic Regression : -> train =  0.88125 |  dev =  0.825
LinearDiscriminantAnalysis : -> train =  1.0 |  dev =  0.375
GaussianNB : -> train =  0.7375 |  dev =  0.7
KNeighborsClassifier : -> train =  1.0 |  dev =  0.75
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.775
RandomForestClassifier : -> train =  1.0 |  dev =  0.725
MLPClassifier : -> train =  1.0 |  dev =  0.775
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.675


### PCA

In [20]:
n_components = 40
pca = PCA(n_components=n_components, svd_solver='full')
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 40) (40, 40)


array([20.87730389,  7.95694644,  6.97541445,  6.93562704,  6.58450391,
        6.30114656,  6.17187826,  6.06417209,  5.89434985,  5.50202955,
        5.44674671,  5.30659403,  5.26211737,  5.08392637,  4.80990758,
        4.58096578,  4.48959968,  4.33392519,  4.16842643,  3.95374989,
        3.60460936,  3.52793679,  2.86087818,  2.82063837,  2.63969822,
        2.55722055,  2.50677374,  2.43613251,  2.31841893,  2.27574376,
        2.23359802,  2.18630093,  2.14898616,  2.09827094,  1.99493171,
        1.95634788,  1.93050352,  1.85866554,  1.78640636,  1.74362207])

In [21]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.925 |  dev =  0.825
Logistic Regression : -> train =  0.85625 |  dev =  0.825
LinearDiscriminantAnalysis : -> train =  0.875 |  dev =  0.8
GaussianNB : -> train =  0.875 |  dev =  0.825
KNeighborsClassifier : -> train =  1.0 |  dev =  0.775
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.525
RandomForestClassifier : -> train =  1.0 |  dev =  0.8
MLPClassifier : -> train =  1.0 |  dev =  0.825
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.775


### Kernel PCA

In [22]:
kpca = KernelPCA(n_components=n_components, kernel='rbf', fit_inverse_transform=True)
x_train_kp = kpca.fit_transform(x_train)
x_dev_kp = kpca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
kpca.lambdas_

(160, 40) (40, 40)


array([6.10310692, 0.94582695, 0.72475426, 0.72200725, 0.64263922,
       0.59173553, 0.56692813, 0.54697544, 0.52316042, 0.4501591 ,
       0.44627623, 0.42273616, 0.41945332, 0.39187915, 0.34692951,
       0.33026473, 0.31534916, 0.2860135 , 0.28256431, 0.25443564,
       0.23334837, 0.19239341, 0.16456481, 0.12423734, 0.12154541,
       0.10955577, 0.10246341, 0.09808585, 0.09119461, 0.08527125,
       0.08068409, 0.08042121, 0.07728154, 0.07480601, 0.06861622,
       0.06612093, 0.06320416, 0.05927497, 0.05737951, 0.05331462])

In [23]:
run_classifiers(x_train_kp, y_train, x_dev_kp, y_dev)

SVC : -> train =  0.925 |  dev =  0.8
Logistic Regression : -> train =  0.74375 |  dev =  0.725
LinearDiscriminantAnalysis : -> train =  0.90625 |  dev =  0.8
GaussianNB : -> train =  0.8875 |  dev =  0.8
KNeighborsClassifier : -> train =  1.0 |  dev =  0.75
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.625
RandomForestClassifier : -> train =  1.0 |  dev =  0.825




MLPClassifier : -> train =  0.9 |  dev =  0.75
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.725




### NMF

In [24]:
nmf = NMF(n_components=n_components, init='nndsvda')
x_train_nmf = nmf.fit_transform(x_train)
x_dev_nmf = nmf.transform(x_dev)
print(x_train_nmf.shape, x_dev_nmf.shape)
nmf.reconstruction_err_

(160, 40) (40, 40)


9.788962382287838

In [25]:
run_classifiers(x_train_nmf, y_train, x_dev_nmf, y_dev)

SVC : -> train =  0.85 |  dev =  0.7
Logistic Regression : -> train =  0.75625 |  dev =  0.6
LinearDiscriminantAnalysis : -> train =  0.88125 |  dev =  0.8
GaussianNB : -> train =  0.8125 |  dev =  0.7
KNeighborsClassifier : -> train =  1.0 |  dev =  0.6
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.775
RandomForestClassifier : -> train =  1.0 |  dev =  0.8




MLPClassifier : -> train =  0.84375 |  dev =  0.7
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.775


