In [1]:
import os

import numpy as np
import pyarrow.parquet as pq

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import NMF
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *
from prepare_data import *
from perspective_scores_api import extract_perspective_scores_for_authors

In [2]:
# Column layout of the per-tweet DataFrame (one row per tweet).
df_columns = ['author_id', 'tweet', 'label']

# Input corpus directory and the directory prediction files are written to,
# both expressed relative to the notebook's working directory.
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
op_dir = os.path.join('..', 'res', 'preds')

## English Data

In [3]:
# Read the English tweet table from its parquet file. Presumably this file was
# produced once by the (now disabled) builder call below -- TODO confirm:
#en_df = create_df(data_dir, lang='en', df_columns=df_columns)
en_parquet_path = os.path.join(data_dir, 'en_df.parquet')
en_table = pq.read_table(en_parquet_path)
en_df = en_table.to_pandas()

# Split into train / dev frames; the split logic lives in `get_single_split`.
en_train, en_dev = get_single_split(en_df, data_dir, lang='en')

# Display both shapes as a sanity check on the split sizes.
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [4]:
# Preview the first five rows (pandas' default for `head`) of the tweet table.
en_df.head(n=5)

Unnamed: 0,author_id,tweet,label
0,043e2766cc6d22ae4e447ca5f2885a2a,Fuck New York #URL#,1
1,043e2766cc6d22ae4e447ca5f2885a2a,#USER# #USER# I think I'm in love,1
2,043e2766cc6d22ae4e447ca5f2885a2a,Trump is awesome #URL#,1
3,043e2766cc6d22ae4e447ca5f2885a2a,#USER# You have the greatest tweets sweetheart...,1
4,043e2766cc6d22ae4e447ca5f2885a2a,"#USER# It's free pizza Hun, just free food",1


In [20]:
# Number of principal components kept for each feature family.
N_PERSP_COMPONENTS = 60
N_TFIDF_COMPONENTS = 100


def _pca_reduce(features, n_components):
    """Fit a full-SVD PCA on `features` and project them.

    Prints the projected shape followed by the singular values of the kept
    components, then returns `(projected, fitted_pca)` so the fitted model
    stays available for transforming further data.
    """
    model = PCA(n_components=n_components, svd_solver='full')
    projected = model.fit_transform(features)
    print(projected.shape)
    print(model.singular_values_)
    return projected, model


# --- Train split: perspective-score features and TF-IDF features ------------
x_train_persp = extract_perspective_scores_for_authors(en_train, data_dir, lang='en', steps=20)
x_train_tfidf, y_train, en_vec = prepare_xy(
    en_train, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
    return_y=True, usr_len=200, is_train=True, vec=None)

# --- Dev split: same features, reusing the vectorizer returned for train ----
x_dev_persp = extract_perspective_scores_for_authors(en_dev, data_dir, lang='en', steps=20)
x_dev_tfidf, y_dev, _ = prepare_xy(
    en_dev, tweet_feature_method=prepare_tweets_using_tfidf, lang='en',
    return_y=True, usr_len=200, is_train=False, vec=en_vec)

# Stack train on top of dev so the later cross-validation sees every row.
# `.toarray()` densifies the TF-IDF matrices before stacking / PCA.
x_persp = np.concatenate((x_train_persp, x_dev_persp), axis=0)
x_tfidf = np.concatenate((x_train_tfidf.toarray(), x_dev_tfidf.toarray()), axis=0)
y = np.concatenate((y_train, y_dev), axis=0)

# NOTE(review): both PCAs are fitted on train and dev rows together, and the
# grid search then cross-validates on the already-projected matrix, so every
# validation fold has influenced the projection. If that is not intended,
# move the PCA into a Pipeline that is fitted inside the search -- confirm.
x_persp_p, pca_persp = _pca_reduce(x_persp, N_PERSP_COMPONENTS)
x_tfidf_p, pca_tfidf = _pca_reduce(x_tfidf, N_TFIDF_COMPONENTS)

# Backward-compatible aliases: the original cell left `pca` / `n_components`
# bound to the TF-IDF values, so keep them for any later cell that reads them.
pca = pca_tfidf
n_components = N_TFIDF_COMPONENTS

# Final design matrix: reduced perspective features next to reduced TF-IDF.
x = np.concatenate((x_persp_p, x_tfidf_p), axis=1)

print(x.shape, y.shape)

(200, 60)
[30.09955133  8.394896    7.76896769  7.66163536  7.17700332  6.9891127
  6.76888324  6.56223996  6.34800107  6.31203066  6.20219051  6.04994782
  5.87902222  5.68537456  5.59075626  5.30708762  5.2282202   4.99310171
  4.87025083  4.75280772  4.68379532  4.3687815   3.81053762  3.58222062
  3.45677129  3.32135212  3.27728282  3.16021462  3.10965455  3.07774252
  2.96652065  2.89334705  2.88822641  2.81111765  2.78321628  2.7486442
  2.70490065  2.6729237   2.62251254  2.59544929  2.55887945  2.52070738
  2.48740162  2.44416817  2.39041748  2.3272917   2.30225963  2.24640683
  2.23222877  2.17823462  2.14848117  2.10436325  2.08433539  2.04742693
  1.99377035  1.98672473  1.93253653  1.91636561  1.89853942  1.86439287]
(200, 100)
[3.12798826 1.93875002 1.71068712 1.50590449 1.45371905 1.37461331
 1.34522264 1.27017024 1.24104622 1.21604627 1.19844042 1.16339599
 1.1559843  1.14773407 1.13728326 1.11772955 1.1082077  1.10021238
 1.08721517 1.07565623 1.07388557 1.05457008 1.05

In [21]:
# Exhaustive hyper-parameter grid for the SVC search (full Cartesian product).
# NOTE(review): per the scikit-learn SVC docs `degree` only affects the 'poly'
# kernel and `gamma` is unused by 'linear', so many of these candidates are
# duplicate models; a list of per-kernel grids would cut the number of fits.
# Capped `max_iter` values may also stop the solver before convergence.
parameters = {
    'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
    'C': (0.0001, 0.01, 1.0, 10, 100, 1000),
    'gamma': ('scale', 'auto'),
    'degree': (2, 4, 5, 6),
    'tol': (1e-7, 1e-5, 1e-3),
    'max_iter': (-1, 100, 1000),
}

# Default cross-validation and scoring; all CPU cores; verbose progress log.
gs = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    n_jobs=-1,
    verbose=4,
)

# Fit on the combined feature matrix; the fitted search object is displayed.
gs.fit(x, y)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 296 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 2208 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 4944 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 8450 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 8625 out of 8640 | elapsed:   11.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 8640 out of 8640 | elapsed:   12.0s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': (0.0001, 0.01, 1.0, 10, 100, 1000),
                         'degree': (2, 4, 5, 6), 'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
                         'max_iter': (-1, 100, 1000),
                         'tol': (1e-07, 1e-05, 0.001)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=4)

In [22]:
# Show the refitted best model, its mean cross-validated score, and the
# hyper-parameter combination that produced it.
(
    gs.best_estimator_,
    gs.best_score_,
    gs.best_params_,
)

(SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=1e-07, verbose=False),
 0.72,
 {'C': 10,
  'degree': 2,
  'gamma': 'auto',
  'kernel': 'rbf',
  'max_iter': -1,
  'tol': 1e-07})