In [342]:
import os
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.preprocessing import minmax_scale
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from data_io import *

In [3]:
data_dir = os.path.join('..', 'data', 'pan21-author-profiling-training-2021-03-14')
en_train, en_dev = get_single_split(data_dir, 'en')
en_train.shape, en_dev.shape

((32000, 3), (8000, 3))

In [4]:
en_train.head()

Unnamed: 0,author_id,tweet,label
200,06893abba0bb8f94fed7562350233ed7,"Romanian graftbuster’s firing violated rights,...",0
201,06893abba0bb8f94fed7562350233ed7,Russian ventilators sent to U.S. made by firm ...,0
202,06893abba0bb8f94fed7562350233ed7,Hezbollah prevented ISIS from reaching Europe:...,0
203,06893abba0bb8f94fed7562350233ed7,Epidemiologist Dr Knut Wittkowski: ‘Lockdown H...,0
204,06893abba0bb8f94fed7562350233ed7,China refuses to let WHO investigate truth beh...,0


In [5]:
tweet = en_train['tweet'].iloc[0]
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(tweet)

{'neg': 0.453, 'neu': 0.547, 'pos': 0.0, 'compound': -0.7003}

In [6]:
tweets = en_train['tweet'].to_list()
mod_tweets = list()

for tweet in tweets:
    mod_tweet = tweet.replace('#URL#', 'url')
    mod_tweet = mod_tweet.replace('#USER#', 'user')
    mod_tweet = mod_tweet.replace('#HASHTAG#', '')
    mod_tweet = mod_tweet.replace('RT', '')
    mod_tweets.append(mod_tweet)

len(mod_tweets)

32000

In [11]:
vader_feats = np.zeros((32000, 4))

for i, tweet in enumerate(mod_tweets):
    feats = analyzer.polarity_scores(tweet)
    vader_feats[i][0] = feats['compound']
    vader_feats[i][1] = feats['neg']
    vader_feats[i][2] = feats['neu']
    vader_feats[i][3] = feats['pos']
    
vader_feats.shape

(32000, 4)

In [27]:
labels = en_train['label'].to_numpy()

y_labels = np.zeros(160)
for i in range(160):
    y_labels[i] = labels[i*200]
    
y_labels.sum()

80.0

In [18]:
user_feats = np.zeros((160, 600))
for i in range(160):
    vader_start = i*200
#     print(user_feats[i, :200].shape)
#     print(vader_feats[vader_start:vader_start+200, 0].shape)
    user_feats[i, :200] = vader_feats[vader_start:vader_start+200, 0]
    user_feats[i, 200:400] = vader_feats[vader_start:vader_start+200, 1]
    user_feats[i, 400:600] = vader_feats[vader_start:vader_start+200, 2]

user_feats.shape

(160, 600)

In [20]:
clf = SVC(gamma='auto')
clf

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [37]:
clf.fit(user_feats, y_labels)
clf.score(user_feats, y_labels)

0.9125

In [104]:
grp_size = 10
n_grps = int(len(tweets) / grp_size)
grp_tweets = list()

for i in range(n_grps):
    start = i*grp_size
    grp_tweet = ' '.join(tweets[start:start+grp_size])
    grp_tweets.append(grp_tweet)
    
grp_tweets[10]

"Pensioners in Bulgaria fourth-highest in EU at risk of poverty - Eurostat #URL# via #USER# Hey check this out #URL# #URL# +++ UK parliament Brexit vote — live updates +++ Greece, Bulgaria, Romania and Serbia Declare War on Islam and its Trojan... #URL# via #USER# Top bank warns clients to stop trading pound before Brexit vote #URL# May faces defeat in parliament over Brexit plan #URL# France: “Yellow Vest” Protesters Take to Streets for 9th Straight Week #URL# via #USER# Finnish President expresses his 'disgust' at migrant grooming gangs #URL# via #USER# Verona declares itself a ‘pro-life city’, and fights to prevent abortion #URL# British Lawmakers to Vote on Brexit Deal #URL#"

In [310]:
def filter_tweets(tweets):
    mod_tweets = list()
    
    for tweet in tweets:
        mod_tweet = tweet.replace('#URL#', 'url')
        mod_tweet = mod_tweet.replace('#USER#', 'user')
        mod_tweet = mod_tweet.replace('#HASHTAG#', '')
        mod_tweet = mod_tweet.replace('RT', '')
        mod_tweets.append(mod_tweet)
        
    return mod_tweets

def group_tweets(tweets, grp_size=1):
    n_grps = int(len(tweets) / grp_size)
    grouped_tweets = list()
    
    for i in range(n_grps):
        start = i*grp_size
        grouped_tweet = ' '.join(tweets[start:start+grp_size])
        grouped_tweets.append(grouped_tweet)
        
    return grouped_tweets

def get_vader_features(tweets):
    rows = len(tweets)
    vader_feats = np.zeros((rows, 4))
    analyzer = SentimentIntensityAnalyzer()
    
    for i, tweet in enumerate(tweets):
        feats = analyzer.polarity_scores(tweet)
        vader_feats[i][0] = feats['compound']
        vader_feats[i][1] = feats['neg']
        vader_feats[i][2] = feats['neu']
        vader_feats[i][3] = feats['pos']
        
    return vader_feats

def prepare_user_matrix(features, usr_len=200, n_features=3):
    n_rows = int(features.shape[0] / usr_len)
    user_feats = np.zeros((n_rows, usr_len*n_features))
    
    for i in range(n_rows):
        start = i*usr_len
        user_feats[i, : usr_len] = vader_feats[start : start+usr_len, 0]
        user_feats[i, usr_len : usr_len*2] = vader_feats[start : start+usr_len, 1]
        user_feats[i, usr_len*2 : usr_len*3] = vader_feats[start : start+usr_len, 2]
        
    return user_feats

def prepare_labels(labels, usr_len=200):
    n_vals = int(len(labels) / usr_len)
    y_labels = np.zeros(n_vals)
    
    for i in range(n_vals):
        y_labels[i] = labels[i*usr_len]
        
    return y_labels
        
def prepare_xy(df, usr_len=200, grp_size=10, n_features=3):
    tweets = df['tweet'].to_list()
    labels = df['label'].to_list()
    
    y_labels = prepare_labels(labels, usr_len=usr_len)
    mod_tweets = filter_tweets(tweets)
    grouped_tweets = group_tweets(mod_tweets, grp_size=grp_size)
    usr_len = int(usr_len / grp_size)
    vader_feats = get_vader_features(grouped_tweets)
    x_feats = prepare_user_matrix(vader_feats, usr_len=usr_len, n_features=n_features)
    
    return (x_feats, y_labels)

In [311]:
grp_size = 10

In [312]:
x_train, y_train = prepare_xy(en_train, grp_size=grp_size)
x_train.shape, y_train.shape

((160, 60), (160,))

In [313]:
clf = SVC(gamma='auto')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.74375

In [314]:
x_dev, y_dev = prepare_xy(en_dev, grp_size=grp_size)
x_dev.shape, y_dev.shape

((40, 60), (40,))

In [315]:
clf.score(x_dev, y_dev)

0.4

In [316]:
clf = LinearDiscriminantAnalysis(solver='svd', n_components=10)
clf.fit(x_train, y_train)



LinearDiscriminantAnalysis(n_components=10, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [317]:
clf.score(x_train, y_train)

0.76875

In [318]:
clf.score(x_dev, y_dev)

0.475

In [319]:
clf = LinearDiscriminantAnalysis(solver='eigen')
clf.fit(x_train, y_train)
clf.score(x_train ,y_train)

0.76875

In [320]:
clf.score(x_dev, y_dev)

0.475

In [321]:
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [322]:
clf.score(x_dev, y_dev)

0.425

In [323]:
clf = LogisticRegression(penalty='l2', solver='lbfgs')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.76875

In [324]:
clf.score(x_dev, y_dev)

0.4

In [325]:
clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.73125

In [326]:
clf.score(x_dev, y_dev)

0.35

In [327]:
clf = GaussianNB()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

0.725

In [328]:
clf.score(x_dev, y_dev)

0.425

In [329]:
clf = MultinomialNB()

# Does not accept negative feature values, so scaling needed
x_train_scaled = minmax_scale(x_train, feature_range=(0, 1))

clf.fit(x_train_scaled, y_train)
clf.score(x_train_scaled, y_train)

0.675

In [330]:
x_dev_scaled = minmax_scale(x_dev, feature_range=(0, 1))
clf.score(x_dev_scaled, y_dev)

0.425

In [331]:
clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [332]:
clf.score(x_dev, y_dev)

0.425

In [333]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [334]:
clf.score(x_dev, y_dev)

0.425

In [335]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
clf.score(x_train, y_train)

1.0

In [336]:
clf.score(x_dev, y_dev)

0.425

In [337]:
hidden_layer_size = int(x_train.shape[1] / 2)
clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='lbfgs', 
                    alpha=0.0001, max_iter=300)
clf.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=30, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [338]:
clf.score(x_train, y_train)

1.0

In [339]:
clf.score(x_dev, y_dev)

0.425

In [348]:
dtrain = xgb.DMatrix(x_train, label=y_train)
ddev = xgb.DMatrix(x_dev, label=y_dev)
param = {'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'mape', 'base_score': 0.5}
num_round = 10
evallist = [(ddev, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)
y_pred = np.around(bst.predict(dtrain))
accuracy_score(y_true=y_train, y_pred=y_pred)

[0]	eval-mape:inf	train-mape:inf
[1]	eval-mape:inf	train-mape:inf
[2]	eval-mape:inf	train-mape:inf
[3]	eval-mape:inf	train-mape:inf
[4]	eval-mape:inf	train-mape:inf
[5]	eval-mape:inf	train-mape:inf
[6]	eval-mape:inf	train-mape:inf
[7]	eval-mape:inf	train-mape:inf
[8]	eval-mape:inf	train-mape:inf
[9]	eval-mape:inf	train-mape:inf


1.0

In [349]:
y_pred = np.around(bst.predict(ddev))
accuracy_score(y_true=y_dev, y_pred=y_pred)

0.425