In [1]:
REPORT_PATH_TEMPLATE = '../out/reports/bots__band_%s__target_%s.pkl'
# or overrided REPORT_PATH_TEMPLATE
REPORT_PATH = None

BAND = '1k'
TARGET_CONFIDENCE = '50'
FOCUS_BINS = 20
TARGET_CLASS = 0

In [2]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict

from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from scripts import bots

  from numpy.core.umath_tests import inner1d


In [3]:
def load_results(datafile):
    with open(datafile, 'rb') as f:
        results = pickle.load(f)
        return results
    
def group_by_key(data, key):
    obj = defaultdict(list)
    for d in data:
        value = d[key]
        obj[value].append(d)        
    return obj

def save_fig(fig, name_template, *args, **kwargs):
    fig.savefig(os.path.join(FIGS_PATH, name_template.format(*args, **kwargs)))

In [4]:
# Load results from running adversarial search on bot dataset.
if REPORT_PATH is None:
    REPORT_PATH = REPORT_PATH_TEMPLATE % (BAND, TARGET_CONFIDENCE)
results = load_results(REPORT_PATH)

results_data = group_by_key(results, 'bins')[FOCUS_BINS][0]['search_results']

In [5]:
X_adv = np.vstack(results_data.x_adv_features.as_matrix())
adv_indices = results_data.dataset_index

Load and preprocess the data

In [6]:
# Features that will be removed.
drop_features = [
    "follower_friend_ratio",
    "tweet_frequency",
    "favourite_tweet_ratio",
]

bins = 20

X, y, feature_names = bots.load_transform_data(
    human_dataset='../data/twitter_bots/humans/humans.1k.csv',
    bot_dataset='../data/twitter_bots/bots/bots.1k.csv',
    drop_features=drop_features,
    bins=20,
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

Baseline

In [7]:
max(y_train.mean(), 1 - y_train.mean())

0.6094827586206897

Train a bunch of models, including the recreation of the original target model

In [8]:
lr = bots.fit_lr(X_train, y_train, seed=1)
lr.score(X_test, y_test)

0.875968992248062

In [9]:
svm = bots.fit_svmrbf(X_train, y_train, seed=1)
svm.score(X_test, y_test)

0.875968992248062

In [10]:
net1 = MLPClassifier(activation='relu',
                     hidden_layer_sizes=[20, 10])
net1.fit(X_train, y_train)
net1.score(X_test, y_test)

0.8527131782945736

In [11]:
net2 = MLPClassifier(activation='relu',
                     hidden_layer_sizes=[2000, 500])
net2.fit(X_train, y_train)
net2.score(X_test, y_test)

0.8449612403100775

In [12]:
gbt = GradientBoostingClassifier(n_estimators=100)
gbt.fit(X_train, y_train)
gbt.score(X_test, y_test)

0.8682170542635659

In [13]:
models = {
    'LR': lr,
    'NN-A': net1,
    'NN-B': net2,
    'GBDT': gbt,
    'SVM (RBF)': svm,
}

In [14]:
trans_df = pd.DataFrame(columns=['model', 'test accuracy', 'transferability'])
for model_name, model in models.items():
    score = model.score(X_test, y_test)
    trans = (model.predict(X_adv) == TARGET_CLASS).mean()
    trans_df = trans_df.append({
        'model': model_name,
        'test accuracy': '{:1.2f}%'.format(score * 100),
        'transferability': '{:1.2f}%'.format(trans * 100),
    }, ignore_index=True)
    
trans_df

Unnamed: 0,model,test accuracy,transferability
0,LR,87.60%,100.00%
1,NN-A,85.27%,48.78%
2,NN-B,84.50%,48.78%
3,GBDT,86.82%,75.61%
4,SVM (RBF),87.60%,73.17%


In [15]:
trans_df.to_latex()

'\\begin{tabular}{llll}\n\\toprule\n{} &      model & test accuracy & transferability \\\\\n\\midrule\n0 &         LR &        87.60\\% &         100.00\\% \\\\\n1 &       NN-A &        85.27\\% &          48.78\\% \\\\\n2 &       NN-B &        84.50\\% &          48.78\\% \\\\\n3 &       GBDT &        86.82\\% &          75.61\\% \\\\\n4 &  SVM (RBF) &        87.60\\% &          73.17\\% \\\\\n\\bottomrule\n\\end{tabular}\n'