In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm
import sklearn.naive_bayes
import xgboost
from sklearn.calibration import CalibratedClassifierCV

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
# --- Tree-based ensembles -------------------------------------------------
rf_clf = sklearn.ensemble.RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=5)
xgb_clf = xgboost.XGBClassifier(random_state=0, n_estimators=50)
# NOTE(review): gdbt_clf is defined but never registered in clf_base below.
gdbt_clf = sklearn.ensemble.GradientBoostingClassifier(random_state=0, n_estimators=5)

# --- Linear, kernel and naive-Bayes models --------------------------------
# LinearSVC wrapped in CalibratedClassifierCV with 5 internal calibration folds.
lsvc_clf = CalibratedClassifierCV(sklearn.svm.LinearSVC(C=100, random_state=0), cv=5)
# NOTE(review): lcsvc_clf (uncalibrated, C=1000) is also not registered in clf_base.
lcsvc_clf = sklearn.svm.LinearSVC(C=1000, random_state=0)
rsvc_clf = sklearn.svm.SVC(kernel='rbf', C=10, gamma='auto', random_state=0)
lr_clf = sklearn.linear_model.LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')
bnb_clf = sklearn.naive_bayes.BernoulliNB(alpha=100)

# Display name -> estimator for the models evaluated without resampling.
clf_base = {
    'randomforest': rf_clf,
    'SVM_linear': lsvc_clf,
    'logistic': lr_clf,
    'NB': bnb_clf,
    'xgboost': xgb_clf,
    "SVM_rbf": rsvc_clf,
}

# One "<name>+SMOTE" pipeline per base model. SMOTE is seeded like every other
# estimator here; with the default random_state=None the synthetic samples,
# and therefore all "+SMOTE" scores, would differ on every run.
clf_smote = {
    model + '+SMOTE': Pipeline([("smote", SMOTE(random_state=0)), ("clf", clf_base[model])])
    for model in clf_base
}

# Candidate hyper-parameter grids (currently unused):
# params = {
#     'randomforest': {'n_estimators': [30, 50, 80], 'criterion': ['entropy', 'gini']},
#     'SVM': {},
#     'logistic': {'C': [1, 0.5],},
#     'NB': {'alpha': [100, 50, 10],},
#     'xgboost': {'n_estimators': [30, 50, 80],},
# }

# All models to evaluate: the base estimators first, then their SMOTE variants.
clf_models = {**clf_base, **clf_smote}

### read active user data

In [3]:
# Load the active-user table and add a boolean "isPart" column that is True
# only for rows whose "isMod" and "isPuppet" values are both the string "f".
df_user = pd.read_csv("./DSock/Active_Users.csv").assign(
    isPart=lambda users: users["isMod"].eq("f") & users["isPuppet"].eq("f")
)

print(df_user.shape)

(116, 57)


In [4]:
# Rows flagged by the "isPart" column.
df_part = df_user.loc[df_user["isPart"]]

# One frame per value of the "Strategy" column, in the order listed.
df_covert, df_overt, df_unres = (
    df_user.loc[df_user["Strategy"].eq(label)]
    for label in ("COVERT", "OVERT", "UNRESTRICTED")
)

# Print the four shapes on one line, separated by single spaces.
print(" ".join(str(frame.shape) for frame in (df_part, df_covert, df_overt, df_unres)))

(81, 57) (22, 57) (8, 57) (5, 57)


In [5]:
def get_data(df_part, df_sock, features=("received_comments", "issued_posts")):
    """Build a feature matrix and binary labels from two user frames.

    The two frames are stacked with ``df_part`` first; rows coming from
    ``df_part`` are labelled 0 and rows coming from ``df_sock`` are labelled 1.

    Parameters
    ----------
    df_part, df_sock : pandas.DataFrame
        Frames that both contain every column named in ``features``.
    features : str or iterable of str, optional
        Column(s) used as model inputs. Defaults to the two columns that were
        previously hard-coded, so existing two-argument calls are unchanged.
        A name listed more than once is used only once (first position wins).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(df_part) + len(df_sock), n_features)``.
    y : numpy.ndarray
        Integer labels, zeros followed by ones.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(features, str):
        features = [features]
    # dict.fromkeys drops repeated names while preserving order, so a
    # duplicated entry cannot produce a duplicated feature column.
    feature_cols = list(dict.fromkeys(features))

    X = pd.concat([df_part, df_sock], axis=0)[feature_cols].values
    y = np.repeat([0, 1], [df_part.shape[0], df_sock.shape[0]])
    return X, y

In [6]:
import warnings
# Blanket suppression keeps the cell output readable, but it also silences any
# warning about folds that fail during cross-validation; get_metrics therefore
# reports NaN-scored folds itself with print().
warnings.filterwarnings("ignore")

def get_metrics(X, y, models=None, cv=None):
    """Cross-validate every model on (X, y) and return the mean test scores.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and binary labels, passed straight to ``cross_validate``.
    models : dict of str -> estimator, optional
        Models to evaluate. Defaults to the module-level ``clf_models``.
    cv : cross-validation splitter, optional
        Defaults to ``StratifiedKFold(n_splits=5, shuffle=True, random_state=42)``.

    Returns
    -------
    pandas.DataFrame
        One column per model; rows are ``test_f1``, ``test_precision``,
        ``test_recall`` and ``test_accuracy``, each the mean over the folds.
        If any fold of a model has a NaN score, that model's means are NaN
        (``skipna=False``) rather than an average over the remaining folds,
        and a line naming the model is printed.

    Notes
    -----
    ``cross_validate`` is called with its default ``error_score``. The NaN
    entries in the recorded results suggest that failing folds are scored NaN
    in the environment this notebook was run in -- TODO confirm for the
    installed scikit-learn version.
    """
    if models is None:
        models = clf_models
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metric_names = ["f1", "precision", "recall", "accuracy"]
    score_cols = ["test_" + name for name in metric_names]

    summary = {}
    for name, estimator in models.items():
        scores = cross_validate(estimator=estimator, X=X, y=y, cv=cv, scoring=metric_names)
        fold_scores = pd.DataFrame(scores)[score_cols]

        # Surface folds that produced no usable score instead of hiding them.
        n_bad = int(fold_scores.isna().any(axis=1).sum())
        if n_bad:
            print(f"[get_metrics] {name}: {n_bad}/{len(fold_scores)} folds produced NaN scores")

        summary[name] = fold_scores.mean(skipna=False)

    return pd.DataFrame(summary)

In [7]:
# Score table per sock-puppet group, each evaluated against the same
# participant frame; "ALLSOCKS" stacks the three strategy frames together.
res = {
    label: get_metrics(*get_data(df_part, df_sock))
    for label, df_sock in (
        ("COVERT", df_covert),
        ("OVERT", df_overt),
        ("UNRESTRICTED", df_unres),
        ("ALLSOCKS", pd.concat([df_covert, df_overt, df_unres], axis=0)),
    )
}

In [9]:
# Stack the per-group score tables into one frame; the outer index level is the
# group label, kept in the insertion order of `res`.
df_res = pd.concat(res, keys=list(res), axis=0)
display(df_res)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
res_csv_path = "./res/par_socks_res_two.csv"
os.makedirs(os.path.dirname(res_csv_path), exist_ok=True)
df_res.to_csv(res_csv_path)

Unnamed: 0,Unnamed: 1,randomforest,SVM_linear,logistic,NB,xgboost,SVM_rbf,randomforest+SMOTE,SVM_linear+SMOTE,logistic+SMOTE,NB+SMOTE,xgboost+SMOTE,SVM_rbf+SMOTE
COVERT,test_f1,0.277143,0.0,0.0,0.0,0.323377,0.137143,0.443175,0.328254,0.313596,0.464995,0.385641,0.301261
COVERT,test_precision,0.44,0.0,0.0,0.0,0.35,0.3,0.382222,0.228887,0.209634,0.314266,0.341667,0.248333
COVERT,test_recall,0.23,0.0,0.0,0.0,0.31,0.09,0.54,0.64,0.63,0.92,0.45,0.45
COVERT,test_accuracy,0.727619,0.786667,0.786667,0.786667,0.756667,0.746667,0.718095,0.43381,0.395714,0.554286,0.699048,0.631905
OVERT,test_f1,0.493333,0.0,0.0,0.0,0.433333,0.466667,0.516508,0.097436,0.111538,0.26367,0.580952,0.439394
OVERT,test_precision,0.533333,0.0,0.0,0.0,0.5,0.6,0.35381,0.066667,0.066667,0.154444,0.43,0.344444
OVERT,test_recall,0.5,0.0,0.0,0.0,0.4,0.4,1.0,0.3,0.5,1.0,0.9,0.7
OVERT,test_accuracy,0.932026,0.910458,0.910458,0.910458,0.920915,0.943791,0.821569,0.515033,0.481699,0.494118,0.877124,0.843137
UNRESTRICTED,test_f1,0.133333,,0.0,0.0,0.0,0.4,,,,,,
UNRESTRICTED,test_precision,0.1,,0.0,0.0,0.0,0.4,,,,,,
