In [1]:
import os
import pyarrow.parquet as pq

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *
from prepare_data import *
from perspective_scores_api import extract_perspective_scores_for_authors

In [2]:
def fit_and_print_scores(clf, name, x_train, y_train, x_dev, y_dev):
    clf.fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    dev_score = clf.score(x_dev, y_dev)
    
    print(name, ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return clf


def run_classifiers(x_train, y_train, x_dev, y_dev):
    clf = SVC()
    fit_and_print_scores(clf, 'SVC', x_train, y_train, x_dev, y_dev)

    clf = LogisticRegression()
    fit_and_print_scores(clf, 'Logistic Regression', x_train, y_train, x_dev, y_dev)

    clf = LinearDiscriminantAnalysis()
    fit_and_print_scores(clf, 'LinearDiscriminantAnalysis', x_train, y_train, x_dev, y_dev)

    clf = GaussianNB()
    fit_and_print_scores(clf, 'GaussianNB', x_train, y_train, x_dev, y_dev)

    clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
    fit_and_print_scores(clf, 'KNeighborsClassifier', x_train, y_train, x_dev, y_dev)

    clf = DecisionTreeClassifier()
    fit_and_print_scores(clf, 'DecisionTreeClassifier', x_train, y_train, x_dev, y_dev)

    clf = RandomForestClassifier(n_estimators=100)
    fit_and_print_scores(clf, 'RandomForestClassifier', x_train, y_train, x_dev, y_dev)

    hidden_layer_size = 20
    clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam', 
                        alpha=0.0001, learning_rate_init=0.001, max_iter=1000)
    clf.fit(x_train, y_train)
    fit_and_print_scores(clf, 'MLPClassifier', x_train, y_train, x_dev, y_dev)

    dtrain = xgb.DMatrix(x_train, label=y_train)
    ddev = xgb.DMatrix(x_dev, label=y_dev)
    param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
    num_round = 10
    evallist = [(ddev, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist)
    y_pred = np.around(bst.predict(dtrain))
    train_score = accuracy_score(y_true=y_train, y_pred=y_pred)
    y_pred = np.around(bst.predict(ddev))
    dev_score = accuracy_score(y_true=y_dev, y_pred=y_pred)
    print('XGBoost', ': -> train = ', train_score, '|  dev = ', dev_score)
    
    return

In [3]:
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
op_dir = os.path.join('..', 'res', 'preds')
df_columns = ['author_id', 'tweet', 'label']

## English Data

In [4]:
#en_df = create_df(data_dir, lang='en', df_columns=df_columns)
en_df = pq.read_table(os.path.join(data_dir, 'en_df.parquet')).to_pandas()
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [5]:
en_train.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [6]:
x_train_persp = extract_perspective_scores_for_authors(en_train, data_dir, lang='en', steps=20)
x_train_tfidf, y_train, en_vec = prepare_xy(en_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_train_persp.shape, x_train_tfidf.shape, y_train.shape

((160, 160), (160, 21272), (160,))

In [7]:
x_train = np.concatenate((x_train_persp, x_train_tfidf.toarray()), axis=1)
x_train.shape

(160, 21432)

In [8]:
x_dev_persp = extract_perspective_scores_for_authors(en_dev, data_dir, lang='en', steps=20)
x_dev_tfidf, y_dev, _ = prepare_xy(en_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                           return_y=True, usr_len=200, is_train=False, vec=en_vec)
x_dev_persp.shape, x_dev_tfidf.shape, y_dev.shape

((40, 160), (40, 21272), (40,))

In [9]:
x_dev = np.concatenate((x_dev_persp, x_dev_tfidf.toarray()), axis=1)
x_dev.shape

(40, 21432)

### All Features

In [10]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.75 |  dev =  0.675
Logistic Regression : -> train =  0.9375 |  dev =  0.725
LinearDiscriminantAnalysis : -> train =  0.8625 |  dev =  0.575
GaussianNB : -> train =  1.0 |  dev =  0.725
KNeighborsClassifier : -> train =  1.0 |  dev =  0.65
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.625
RandomForestClassifier : -> train =  1.0 |  dev =  0.675
MLPClassifier : -> train =  1.0 |  dev =  0.75
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.75


### PCA

In [11]:
n_components = 150
pca = PCA(n_components=n_components, svd_solver='full')
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 150) (40, 150)


array([26.29147186,  7.89351222,  7.1861139 ,  7.06663141,  6.63726251,
        6.43951781,  6.28157013,  6.03197852,  5.9558171 ,  5.8546631 ,
        5.64424455,  5.52427195,  5.39060644,  5.15839657,  4.99804256,
        4.89681144,  4.73572564,  4.55696296,  4.46976193,  4.32895296,
        4.20240271,  3.91079196,  3.60456092,  3.46442572,  3.2779889 ,
        3.15869641,  3.07719332,  3.01440373,  2.99642844,  2.866378  ,
        2.84750655,  2.80796788,  2.78055358,  2.75135562,  2.67509308,
        2.64517652,  2.61810081,  2.57066016,  2.53291551,  2.48809138,
        2.42565624,  2.3756952 ,  2.34625576,  2.30874411,  2.27306876,
        2.2567691 ,  2.22952165,  2.1823069 ,  2.16851625,  2.1358046 ,
        2.11881357,  2.09768744,  2.07380189,  2.03853751,  2.00470971,
        1.98559964,  1.93668805,  1.91900062,  1.88430523,  1.86732307,
        1.85239624,  1.80711147,  1.77064697,  1.75900136,  1.72124969,
        1.70936101,  1.69122559,  1.66495474,  1.64965366,  1.59

In [12]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.95 |  dev =  0.725
Logistic Regression : -> train =  0.9375 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  1.0 |  dev =  0.65
GaussianNB : -> train =  0.95 |  dev =  0.625
KNeighborsClassifier : -> train =  1.0 |  dev =  0.675
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.775
RandomForestClassifier : -> train =  1.0 |  dev =  0.7
MLPClassifier : -> train =  1.0 |  dev =  0.7
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.7


### PCA - PCA

In [13]:
n_components = 60
pca = PCA(n_components=n_components, svd_solver='full')
x_train_persp_p = pca.fit_transform(x_train_persp)
x_dev_persp_p = pca.transform(x_dev_persp)
print(x_train_persp_p.shape, x_dev_persp_p.shape)
print(pca.singular_values_)

n_components = 160
pca = PCA(n_components=n_components, svd_solver='full')
x_train_tfidf_p = pca.fit_transform(x_train_tfidf.toarray())
x_dev_tfidf_p = pca.transform(x_dev_tfidf.toarray())
print(x_train_tfidf_p.shape, x_dev_tfidf_p.shape)
print(pca.singular_values_)

x_train_p_p = np.concatenate((x_train_persp_p, x_train_tfidf_p), axis=1)
x_dev_p_p = np.concatenate((x_dev_persp_p, x_dev_tfidf_p), axis=1)
x_train_p_p.shape, x_dev_p_p.shape

(160, 60) (40, 60)
[26.19382969  7.83508751  7.10265551  6.97600428  6.55426293  6.36509136
  6.20802854  5.95206833  5.86963011  5.77430962  5.56067101  5.43460676
  5.30549088  5.06903537  4.90511163  4.81436008  4.64481584  4.46325598
  4.36193513  4.22569465  4.09509179  3.80768813  3.4514503   3.3350317
  3.14021894  2.99495971  2.93743654  2.86905819  2.83695026  2.71634341
  2.68874415  2.64934544  2.61997657  2.55876661  2.50754807  2.47601816
  2.43672051  2.40817307  2.36182404  2.29694776  2.2421081   2.19225205
  2.15617101  2.12744053  2.08958321  2.04318623  2.02566307  1.98751921
  1.96672489  1.92258272  1.90523377  1.89204672  1.86143007  1.8233823
  1.75946473  1.74126904  1.70622589  1.67897042  1.6426074   1.60870548]
(160, 160) (40, 160)
[2.78780047e+00 1.70684847e+00 1.46312205e+00 1.41228093e+00
 1.32122297e+00 1.29168040e+00 1.26514358e+00 1.19676899e+00
 1.17291601e+00 1.15915979e+00 1.14118061e+00 1.11859197e+00
 1.11372634e+00 1.10888053e+00 1.09084617e+00 1.

((160, 220), (40, 220))

In [14]:
run_classifiers(x_train_p_p, y_train, x_dev_p_p, y_dev)

SVC : -> train =  0.95 |  dev =  0.725
Logistic Regression : -> train =  0.925 |  dev =  0.775
LinearDiscriminantAnalysis : -> train =  0.61875 |  dev =  0.4
GaussianNB : -> train =  0.8125 |  dev =  0.725
KNeighborsClassifier : -> train =  1.0 |  dev =  0.675
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.6
RandomForestClassifier : -> train =  1.0 |  dev =  0.7
MLPClassifier : -> train =  1.0 |  dev =  0.825
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.675


## Spanish Data

In [15]:
#es_df = create_df(data_dir, lang='es', df_columns=df_columns)
es_df = pq.read_table(os.path.join(data_dir, 'es_df.parquet')).to_pandas()
es_train, es_dev = get_single_split(es_df, data_dir, lang='es')
es_train.shape, es_dev.shape

((32000, 3), (8000, 3))

In [16]:
es_train.head()

Unnamed: 0,author_id,tweet,label
0,0035a3060d075506f5b9b978a910aa1f,#USER# pasta con bichos de agua,0
1,0035a3060d075506f5b9b978a910aa1f,De verdad puto lol de mierda qué asco de juego...,0
2,0035a3060d075506f5b9b978a910aa1f,RT #USER#: me hice una pcr y ya tengo los resu...,0
3,0035a3060d075506f5b9b978a910aa1f,"Y un lomo queso de baguette entera, tranqui #URL#",0
4,0035a3060d075506f5b9b978a910aa1f,Me cambio de curro y me llegan 3 ofertas direc...,0


In [17]:
x_train_persp = extract_perspective_scores_for_authors(es_train, data_dir, lang='es', steps=20)
x_train_tfidf, y_train, es_vec = prepare_xy(es_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                           return_y=True, usr_len=200, is_train=True, vec=None)
x_train_persp.shape, x_train_tfidf.shape, y_train.shape

((160, 120), (160, 34833), (160,))

In [18]:
x_train = np.concatenate((x_train_persp, x_train_tfidf.toarray()), axis=1)
x_train.shape

(160, 34953)

In [19]:
x_dev_persp = extract_perspective_scores_for_authors(es_dev, data_dir, lang='es', steps=20)
x_dev_tfidf, y_dev, _ = prepare_xy(es_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                           return_y=True, usr_len=200, is_train=False, vec=es_vec)
x_dev_persp.shape, x_dev_tfidf.shape, y_dev.shape

((40, 120), (40, 34833), (40,))

In [20]:
x_dev = np.concatenate((x_dev_persp, x_dev_tfidf.toarray()), axis=1)
x_dev.shape

(40, 34953)

### All Features

In [21]:
run_classifiers(x_train, y_train, x_dev, y_dev)

SVC : -> train =  0.84375 |  dev =  0.825
Logistic Regression : -> train =  0.9 |  dev =  0.825
LinearDiscriminantAnalysis : -> train =  0.7375 |  dev =  0.8
GaussianNB : -> train =  1.0 |  dev =  0.8
KNeighborsClassifier : -> train =  1.0 |  dev =  0.75
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.725
RandomForestClassifier : -> train =  1.0 |  dev =  0.8
MLPClassifier : -> train =  1.0 |  dev =  0.875
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.775


### PCA

In [22]:
n_components = 150
pca = PCA(n_components=n_components, svd_solver='full')
x_train_p = pca.fit_transform(x_train)
x_dev_p = pca.transform(x_dev)
print(x_train_p.shape, x_dev_p.shape)
pca.singular_values_

(160, 150) (40, 150)


array([20.89626911,  8.09456632,  7.00516292,  6.96457061,  6.62599988,
        6.33673761,  6.20809503,  6.09879187,  5.93224175,  5.54020067,
        5.49205014,  5.34593299,  5.30326712,  5.12742299,  4.85370965,
        4.62480257,  4.54380143,  4.3820815 ,  4.21724943,  4.00674233,
        3.69482155,  3.61155168,  2.93163516,  2.90895913,  2.71891407,
        2.64328871,  2.58776863,  2.56068026,  2.41197468,  2.3642625 ,
        2.33300257,  2.28644073,  2.24930084,  2.19665349,  2.09607915,
        2.07307792,  2.03361732,  2.02524551,  1.94777377,  1.88081232,
        1.84097723,  1.83041072,  1.8127762 ,  1.79449199,  1.74446406,
        1.70551498,  1.68430999,  1.66624004,  1.61154366,  1.59354796,
        1.56040979,  1.54027011,  1.5223149 ,  1.50341524,  1.45202614,
        1.42815932,  1.42062347,  1.38969869,  1.379727  ,  1.34130924,
        1.32205039,  1.28454687,  1.25268499,  1.23075418,  1.21509387,
        1.20220478,  1.18537535,  1.16573449,  1.15069684,  1.11

In [23]:
run_classifiers(x_train_p, y_train, x_dev_p, y_dev)

SVC : -> train =  0.94375 |  dev =  0.825
Logistic Regression : -> train =  0.9 |  dev =  0.825
LinearDiscriminantAnalysis : -> train =  1.0 |  dev =  0.7
GaussianNB : -> train =  0.95 |  dev =  0.75
KNeighborsClassifier : -> train =  1.0 |  dev =  0.75
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.55
RandomForestClassifier : -> train =  1.0 |  dev =  0.7
MLPClassifier : -> train =  1.0 |  dev =  0.7
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.75


### PCA - PCA

In [24]:
n_components = 60
pca = PCA(n_components=n_components, svd_solver='full')
x_train_persp_p = pca.fit_transform(x_train_persp)
x_dev_persp_p = pca.transform(x_dev_persp)
print(x_train_persp_p.shape, x_dev_persp_p.shape)
print(pca.singular_values_)

n_components = 160
pca = PCA(n_components=n_components, svd_solver='full')
x_train_tfidf_p = pca.fit_transform(x_train_tfidf.toarray())
x_dev_tfidf_p = pca.transform(x_dev_tfidf.toarray())
print(x_train_tfidf_p.shape, x_dev_tfidf_p.shape)
print(pca.singular_values_)

x_train_p_p = np.concatenate((x_train_persp_p, x_train_tfidf_p), axis=1)
x_dev_p_p = np.concatenate((x_dev_persp_p, x_dev_tfidf_p), axis=1)
x_train_p_p.shape, x_dev_p_p.shape

(160, 60) (40, 60)
[20.87730389  7.95694644  6.97541445  6.93562704  6.58450391  6.30114656
  6.17187826  6.06417209  5.89434985  5.50202955  5.44674671  5.30659403
  5.26211737  5.08392637  4.80990758  4.58096578  4.48959968  4.33392519
  4.16842643  3.95374989  3.60460936  3.52793679  2.86087818  2.82063837
  2.63969822  2.55722055  2.50677374  2.43613251  2.31841893  2.27574376
  2.23359802  2.18630093  2.14898616  2.09827094  1.99493171  1.95634788
  1.93050352  1.85866554  1.78640636  1.74362207  1.71331985  1.69500619
  1.68049853  1.64390359  1.59230607  1.58415438  1.5600032   1.51236445
  1.44888163  1.42556953  1.38441092  1.37061986  1.36203311  1.32598334
  1.27988835  1.2580756   1.23857846  1.22413973  1.17691693  1.13808741]
(160, 160) (40, 160)
[1.88709773e+00 1.76279270e+00 1.23424314e+00 1.10822957e+00
 9.98701518e-01 9.90915002e-01 9.80410356e-01 9.46211362e-01
 9.20689040e-01 8.96050763e-01 8.86980378e-01 8.83917611e-01
 8.70842659e-01 8.57653118e-01 8.45332994e-01 

((160, 220), (40, 220))

In [25]:
run_classifiers(x_train_p_p, y_train, x_dev_p_p, y_dev)

SVC : -> train =  0.94375 |  dev =  0.8
Logistic Regression : -> train =  0.8875 |  dev =  0.825
LinearDiscriminantAnalysis : -> train =  0.7125 |  dev =  0.575
GaussianNB : -> train =  0.90625 |  dev =  0.85
KNeighborsClassifier : -> train =  1.0 |  dev =  0.75
DecisionTreeClassifier : -> train =  1.0 |  dev =  0.625
RandomForestClassifier : -> train =  1.0 |  dev =  0.875
MLPClassifier : -> train =  1.0 |  dev =  0.775
[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf
XGBoost : -> train =  1.0 |  dev =  0.775
