In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm
import sklearn.naive_bayes
import xgboost
from sklearn.calibration import CalibratedClassifierCV

# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [2]:
# --- Tree ensembles ---------------------------------------------------------
rf_clf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=5,
    criterion='entropy',
    random_state=0,
)
gdbt_clf = sklearn.ensemble.GradientBoostingClassifier(n_estimators=5, random_state=0)
xgb_clf = xgboost.XGBClassifier(n_estimators=50, random_state=0)

# --- Support vector machines ------------------------------------------------
# lsvc_clf wraps a LinearSVC in CalibratedClassifierCV (cv=5); lcsvc_clf is a
# bare LinearSVC with a larger C.
lsvc_clf = CalibratedClassifierCV(
    sklearn.svm.LinearSVC(C=100, random_state=0),
    cv=5,
)
lcsvc_clf = sklearn.svm.LinearSVC(C=1000, random_state=0)
rsvc_clf = sklearn.svm.SVC(kernel='rbf', C=10, gamma='auto', random_state=0)

# --- Linear / probabilistic baselines ---------------------------------------
lr_clf = sklearn.linear_model.LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    random_state=0,
)
bnb_clf = sklearn.naive_bayes.BernoulliNB(alpha=100)

# Registry of the estimators that get evaluated; insertion order is the order
# in which models are iterated.  gdbt_clf and lcsvc_clf are defined above but
# are not registered here.
clf_models = {
    'randomforest': rf_clf,
    'SVM_linear': lsvc_clf,
    'logistic': lr_clf,
    'NB': bnb_clf,
    'xgboost': xgb_clf,
    "SVM_rbf": rsvc_clf,
}

# Candidate hyper-parameter grids, keyed by model name.
# NOTE(review): the 'SVM' key matches neither 'SVM_linear' nor 'SVM_rbf' in
# clf_models, and `params` is not referenced in the cells visible here --
# confirm the intended keys before wiring this into a parameter search.
params = {
    'randomforest': {
        'n_estimators': [30, 50, 80],
        'criterion': ['entropy', 'gini'],
    },
    'SVM': {},
    'logistic': {'C': [1, 0.5]},
    'NB': {'alpha': [100, 50, 10]},
    'xgboost': {'n_estimators': [30, 50, 80]},
}

### read active user data

In [3]:
# Load the active-user table and add a boolean "isPart" column that is True
# when any of isMod / isObserver / isPuppet holds the string "t".
df_user = pd.read_csv("./DSock/Active_Users.csv")
df_user["isPart"] = df_user[["isMod", "isObserver", "isPuppet"]].eq("t").any(axis=1)

print(df_user.shape)

(116, 57)


In [4]:
# Rows flagged by the derived isPart column.
df_part = df_user.loc[df_user["isPart"]]

# One frame per value of the Strategy column.
# NOTE(review): df_part is not filtered on Strategy, so a row could fall in
# both df_part and one of the strategy frames -- confirm the groups are
# disjoint before using them as opposing classes.
df_covert, df_overt, df_unres = (
    df_user.loc[df_user["Strategy"] == strategy]
    for strategy in ("COVERT", "OVERT", "UNRESTRICTED")
)

print(df_part.shape, df_covert.shape, df_overt.shape, df_unres.shape)

(69, 57) (22, 57) (8, 57) (5, 57)


In [5]:
def get_data(df_part, df_sock, features=None):
    """Build a binary-classification dataset from two groups of rows.

    The two frames are stacked (``df_part`` first, then ``df_sock``) and the
    selected feature columns are returned as an array together with a label
    vector that marks ``df_part`` rows as 0 and ``df_sock`` rows as 1.

    Parameters
    ----------
    df_part : pandas.DataFrame
        Rows for the negative class (label 0).
    df_sock : pandas.DataFrame
        Rows for the positive class (label 1).
    features : sequence of str, optional
        Names of the columns to use as features.  When omitted, the columns
        ``received_comments``, ``issued_posts``, ``issued_like`` and
        ``issued_comment`` are used.

    Returns
    -------
    X : numpy.ndarray
        Feature values, one row per input row and one column per feature.
    y : numpy.ndarray
        ``len(df_part)`` zeros followed by ``len(df_sock)`` ones.
    """
    if features is None:
        features = ["received_comments", "issued_posts", "issued_like", "issued_comment"]
    # list(...) so that a tuple of names is treated as several columns rather
    # than as a single (tuple) column key.
    X = pd.concat([df_part, df_sock], axis=0)[list(features)].values
    y = np.array([0] * df_part.shape[0] + [1] * df_sock.shape[0])
    return X, y

In [6]:
import warnings
# Ignore every warning category from this point on in the session.
# NOTE(review): this presumably also hides warnings raised while the models
# are cross-validated below (e.g. undefined-metric or fold-size warnings) --
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

def get_metrics(X, y):
    """Cross-validate every estimator in ``clf_models`` and average the folds.

    Each registered model is run through ``cross_validate`` with ``cv=10`` and
    scored on F1, precision, recall and accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``cross_validate``.
    y : array-like
        Target labels passed to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``clf_models`` and one row per entry of the
        ``cross_validate`` result, each cell holding the mean over folds.
    """
    scoring = ["f1", "precision", "recall", "accuracy"]
    fold_means = {}
    for name, model in clf_models.items():
        scores = cross_validate(estimator=model, X=X, y=y, cv=10, scoring=scoring)
        # cross_validate returns per-fold arrays; collapse each to its mean.
        fold_means[name] = pd.DataFrame(scores).mean()
    return pd.DataFrame(fold_means)

In [7]:
# Evaluate df_part (label 0) against each sock group (label 1), plus the
# union of all three groups under the "ALLSOCKS" key.
res = {
    label: get_metrics(*get_data(df_part, df_sock))
    for label, df_sock in {
        "COVERT": df_covert,
        "OVERT": df_overt,
        "UNRESTRICTED": df_unres,
        "ALLSOCKS": pd.concat([df_covert, df_overt, df_unres], axis=0),
    }.items()
}


In [8]:
# Put the per-scenario tables side by side; the scenario label becomes the
# outer column level, in the insertion order of `res`.
df_res = pd.concat(list(res.values()), axis=1, keys=list(res))
display(df_res)
df_res.to_csv("./res/par_socks_res2.csv")

Unnamed: 0_level_0,COVERT,COVERT,COVERT,COVERT,COVERT,COVERT,OVERT,OVERT,OVERT,OVERT,...,UNRESTRICTED,UNRESTRICTED,UNRESTRICTED,UNRESTRICTED,ALLSOCKS,ALLSOCKS,ALLSOCKS,ALLSOCKS,ALLSOCKS,ALLSOCKS
Unnamed: 0_level_1,randomforest,SVM_linear,logistic,NB,xgboost,SVM_rbf,randomforest,SVM_linear,logistic,NB,...,logistic,NB,xgboost,SVM_rbf,randomforest,SVM_linear,logistic,NB,xgboost,SVM_rbf
fit_time,0.004522,0.022295,0.003122,0.000491,0.013013,0.000856,0.006794,0.014585,0.004111,0.00049,...,0.00413,0.000496,0.004863,0.000705,0.007079,0.017814,0.002734,0.000492,0.008075,0.00095
score_time,0.001764,0.002052,0.001442,0.001458,0.002451,0.002026,0.002581,0.002142,0.001418,0.00145,...,0.001417,0.00147,0.002415,0.002016,0.002572,0.002087,0.001388,0.001445,0.002421,0.002012
test_f1,0.090476,0.14,0.057143,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.1,0.0,0.0,0.0,0.211905,0.033333,0.105,0.0,0.100794,0.053333
test_precision,0.075,0.133333,0.04,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.1,0.0,0.0,0.0,0.20303,0.05,0.175,0.0,0.103333,0.036364
test_recall,0.116667,0.15,0.1,0.0,0.0,0.0,0.0,0.0,0.1,0.0,...,0.1,0.0,0.0,0.0,0.258333,0.025,0.083333,0.0,0.1,0.1
test_accuracy,0.558889,0.758889,0.67,0.758889,0.547778,0.57,0.803571,0.898214,0.842857,0.898214,...,0.907143,0.935714,0.866071,0.866071,0.46,0.644545,0.585455,0.664545,0.46,0.46
