In [1]:
import os
import pickle
import numpy as np
import pyarrow.parquet as pq

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *
from prepare_data import *

In [2]:
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
op_dir = os.path.join('..', 'res', 'preds')
df_columns = ['author_id', 'tweet', 'label']

#en_df = create_df(data_dir, lang='en', df_columns=df_columns)
en_df = pq.read_table(os.path.join(data_dir, 'en_df.parquet')).to_pandas()
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [3]:
en_train.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [4]:
tweets = en_train['tweet'].to_list()
mod_tweets = filter_tweets(tweets)
len(tweets), len(mod_tweets)

(32000, 32000)

In [5]:
tweet_feautures, en_vec = prepare_tweets_using_tfidf(mod_tweets, lang='en', is_train=True, vec=None)
tweet_feautures.shape

(32000, 21272)

In [6]:
auth_tweets = group_tweets_by_author(mod_tweets)

labels = en_train['label'].to_numpy()
prep_labels = prepare_labels(labels)

len(auth_tweets), len(prep_labels)

(160, 160)

In [7]:
auth_tfidf_train, en_vec = prepare_tweets_using_tfidf(mod_tweets, lang='en', is_train=True, vec=None)
auth_tfidf_train.shape

(32000, 21272)

In [8]:
x_train, y_train, en_vec = prepare_xy(en_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                                      return_y=True, usr_len=200, is_train=True, vec=None)

x_train.shape, y_train.shape, en_vec

((160, 21272),
 (160,),
 TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words='english', strip_accents=None,
                 sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, use_idf=True, vocabulary=None))

In [9]:
x_dev, y_dev, _ = prepare_xy(en_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                             return_y=True, usr_len=200, is_train=False, vec=en_vec)
x_dev.shape, y_dev.shape

((40, 21272), (40,))

In [10]:
clf = SVC()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.99375

In [11]:
clf.score(x_dev, y_dev)

0.775

In [12]:
clf = LinearDiscriminantAnalysis()

# Dense matrix is required, so convert sparse to dense
x_train_dense = x_train.toarray()
x_dev_dense = x_dev.toarray()

clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)

0.8875

In [13]:
clf.score(x_dev_dense, y_dev)

0.6

In [14]:
clf = QuadraticDiscriminantAnalysis()

# Dense matrix is required, so convert sparse to dense
x_train_dense = x_train.toarray()
x_dev_dense = x_dev.toarray()

clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)



1.0

In [15]:
clf.score(x_dev_dense, y_dev)

0.6

In [16]:
clf = LogisticRegression(penalty='l2', solver='lbfgs')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.99375

In [17]:
clf.score(x_dev, y_dev)

0.775

In [18]:
clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.5

In [19]:
clf.score(x_dev, y_dev)

0.5

In [20]:
clf = GaussianNB()
clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)

1.0

In [21]:
clf.score(x_dev_dense, y_dev)

0.725

In [22]:
clf = MultinomialNB()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.9625

In [23]:
clf.score(x_dev, y_dev)

0.7

In [24]:
clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [25]:
clf.score(x_dev, y_dev)

0.625

In [26]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [27]:
clf.score(x_dev, y_dev)

0.675

In [28]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [29]:
clf.score(x_dev, y_dev)

0.6

In [30]:
hidden_layer_size = 200
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam', 
                    alpha=0.0001, learning_rate_init=0.001, max_iter=300)
clf.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=200, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [31]:
clf.score(x_train, y_train)

1.0

In [32]:
clf.score(x_dev, y_dev)

0.775

In [33]:
dtrain = xgb.DMatrix(x_train, label=y_train)
ddev = xgb.DMatrix(x_dev, label=y_dev)
param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
num_round = 10
evallist = [(ddev, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)
y_pred = np.around(bst.predict(dtrain))
accuracy_score(y_true=y_train, y_pred=y_pred)

[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf


1.0

In [34]:
y_pred = np.around(bst.predict(ddev))
accuracy_score(y_true=y_dev, y_pred=y_pred)

0.55

In [35]:
with open(os.path.join(op_dir, 'dev-preds.npy'), 'wb') as f:
    np.save(f, clf.predict(x_dev))

In [36]:
authors = en_dev['author_id'].to_list()
authlist = prepare_authlist(authors, usr_len=200)

with open(os.path.join(op_dir, '..', 'dev_auth_list.pkl'), 'wb') as f:
    pickle.dump(authlist, f)

In [37]:
es_df = pq.read_table(os.path.join(data_dir, 'es_df.parquet')).to_pandas()
es_train, es_dev = get_single_split(es_df, data_dir, 'es')

x_train, y_train, es_vec = prepare_xy(es_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='es',
                                      return_y=True, usr_len=200, is_train=True, vec=None)

x_dev, y_dev, _ = prepare_xy(es_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
                                      return_y=True, usr_len=200, is_train=False, vec=es_vec)

x_dev.shape, y_dev.shape

((40, 34833), (40,))

In [38]:
clf = SVC()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.95

In [39]:
clf.score(x_dev, y_dev)

0.75

In [40]:
clf = LinearDiscriminantAnalysis()

# Dense matrix is required, so convert sparse to dense
x_train_dense = x_train.toarray()
x_dev_dense = x_dev.toarray()

clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)

0.725

In [41]:
clf.score(x_dev_dense, y_dev)

0.75

In [42]:
clf = QuadraticDiscriminantAnalysis()

# Dense matrix is required, so convert sparse to dense
x_train_dense = x_train.toarray()
x_dev_dense = x_dev.toarray()

clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)



1.0

In [43]:
clf.score(x_dev_dense, y_dev)

0.575

In [44]:
clf = LogisticRegression(penalty='l2', solver='lbfgs')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.9125

In [45]:
clf.score(x_dev, y_dev)

0.775

In [46]:
clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.725

In [47]:
clf.score(x_dev, y_dev)

0.8

In [48]:
clf = GaussianNB()
clf.fit(x_train_dense, y_train)
clf.score(x_train_dense, y_train)

1.0

In [49]:
clf.score(x_dev_dense, y_dev)

0.8

In [50]:
clf = MultinomialNB()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.9

In [51]:
clf.score(x_dev, y_dev)

0.725

In [52]:
clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [53]:
clf.score(x_dev, y_dev)

0.725

In [54]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [55]:
clf.score(x_dev, y_dev)

0.425

In [56]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [57]:
clf.score(x_dev, y_dev)

0.75

In [58]:
hidden_layer_size = 200
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam', 
                    alpha=0.0001, learning_rate_init=0.001, max_iter=300)
clf.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=200, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [59]:
clf.score(x_train, y_train)

1.0

In [60]:
clf.score(x_dev, y_dev)

0.825

In [61]:
dtrain = xgb.DMatrix(x_train, label=y_train)
ddev = xgb.DMatrix(x_dev, label=y_dev)
param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
num_round = 10
evallist = [(ddev, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)
y_pred = np.around(bst.predict(dtrain))
accuracy_score(y_true=y_train, y_pred=y_pred)

[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf


1.0

In [62]:
y_pred = np.around(bst.predict(ddev))
accuracy_score(y_true=y_dev, y_pred=y_pred)

0.675