In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm
import sklearn.naive_bayes
import xgboost
from sklearn.calibration import CalibratedClassifierCV

# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [3]:
# Model registry for the experiments: every estimator is seeded with
# random_state=0 (where it accepts one) so repeated runs give the same scores.

# Tree-based ensembles.
rf_clf = sklearn.ensemble.RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=5)
xgb_clf = xgboost.XGBClassifier(random_state=0, n_estimators=50)
# Defined but not registered in clf_base below.
gdbt_clf = sklearn.ensemble.GradientBoostingClassifier(random_state=0, n_estimators=5)

# Linear SVM wrapped in CalibratedClassifierCV (5 internal folds).
lsvc_clf = CalibratedClassifierCV(sklearn.svm.LinearSVC(C=100, random_state=0), cv=5)
# Uncalibrated linear SVM; defined but not registered in clf_base below.
lcsvc_clf = sklearn.svm.LinearSVC(C=1000, random_state=0)
rsvc_clf = sklearn.svm.SVC(kernel='rbf', C=10, gamma='auto', random_state=0)
lr_clf = sklearn.linear_model.LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')
bnb_clf = sklearn.naive_bayes.BernoulliNB(alpha=100)

# Display name -> estimator. Insertion order determines the column order of
# every result table built from this mapping.
clf_base = {
    'randomforest': rf_clf,
    'SVM_linear': lsvc_clf,
    'logistic': lr_clf,
    'NB': bnb_clf,
    'xgboost': xgb_clf,
    "SVM_rbf": rsvc_clf,
}

# One "<name>+SMOTE" pipeline per base model: oversample first, then fit the
# same estimator. SMOTE is seeded like every other component; without a
# random_state the synthetic samples (and therefore the scores) would differ
# from run to run.
# NOTE(review): SMOTE keeps its default k_neighbors here; confirm that the
# smallest minority group still has enough samples per training fold.
clf_smote = {
    model + '+SMOTE': Pipeline([("smote", SMOTE(random_state=0)), ("clf", clf)])
    for model, clf in clf_base.items()
}

# Hyper-parameter grids, currently disabled (not referenced by the cells
# visible in this notebook).
# params = {
#     'randomforest': {'n_estimators': [30, 50, 80], 'criterion': ['entropy', 'gini']},
#     'SVM': {},
#     'logistic': {'C': [1, 0.5],},
#     'NB': {'alpha': [100, 50, 10],},
#     'xgboost': {'n_estimators': [30, 50, 80],},
# }

# Plain models first, then their SMOTE counterparts.
clf_models = {**clf_base, **clf_smote}

### read active user data

In [4]:
# Load the active-user table and add a boolean "isPart" column that is True
# when at least one of isMod / isObserver / isPuppet holds the string "t".
# NOTE(review): isPuppet rows count as "isPart" here; confirm they do not
# also carry a sock Strategy, otherwise the same user could be labelled both
# 0 and 1 downstream.
df_user = pd.read_csv("./DSock/Active_Users.csv").assign(
    isPart=lambda users: users[["isMod", "isObserver", "isPuppet"]].eq("t").any(axis=1)
)

print(df_user.shape)

(116, 57)


In [5]:
# Split the user table into the "isPart" rows and one frame per Strategy value.
df_part = df_user.loc[df_user["isPart"]]
df_covert = df_user.loc[df_user["Strategy"].eq("COVERT")]
df_overt = df_user.loc[df_user["Strategy"].eq("OVERT")]
df_unres = df_user.loc[df_user["Strategy"].eq("UNRESTRICTED")]

# Sanity check: print the shape of each subset on one line.
print(" ".join(str(frame.shape) for frame in (df_part, df_covert, df_overt, df_unres)))

(69, 57) (22, 57) (8, 57) (5, 57)


In [6]:
def get_data(df_part, df_sock, features=("received_comments", "issued_posts")):
    """Build a binary-classification dataset from two user frames.

    Rows of ``df_part`` get label 0 and rows of ``df_sock`` get label 1; the
    rows of ``df_part`` come first in the returned arrays.

    Parameters
    ----------
    df_part : pandas.DataFrame
        Frame whose rows form the negative class.
    df_sock : pandas.DataFrame
        Frame whose rows form the positive class.
    features : sequence of str, optional
        Columns used as features, in output order. Defaults to
        ``("received_comments", "issued_posts")``. Pass e.g.
        ``["received_comments", "issued_posts", "issued_like", "issued_comment"]``
        to try a wider feature set instead of editing this function.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(len(df_part) + len(df_sock), len(features))``.
    y : numpy.ndarray
        Integer labels, 0 for ``df_part`` rows and 1 for ``df_sock`` rows.

    Raises
    ------
    KeyError
        If either frame lacks one of the requested feature columns.
    """
    feature_cols = list(features)
    # Select the feature columns before concatenating: this avoids copying
    # every unrelated column and surfaces a missing feature as a KeyError
    # instead of silently filling it with NaN.
    X = pd.concat([df_part[feature_cols], df_sock[feature_cols]], axis=0).values
    y = np.repeat([0, 1], [df_part.shape[0], df_sock.shape[0]])
    return X, y

In [7]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only those raised during cross-validation; consider narrowing the filter.
warnings.filterwarnings("ignore")

def get_metrics(X, y):
    """Cross-validate every model in ``clf_models`` and average the folds.

    Each estimator is run through 10-fold ``cross_validate`` scored on F1,
    precision, recall and accuracy.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_validate``.
    y : label vector passed straight to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One column per model (in ``clf_models`` order) and one row per key
        reported by ``cross_validate``; each cell is the mean over the folds.
    """
    scoring = ["f1", "precision", "recall", "accuracy",]
    fold_means = {}
    for name, estimator in clf_models.items():
        folds = cross_validate(estimator=estimator, X=X, y=y, cv=10, scoring=scoring)
        fold_means[name] = pd.DataFrame(folds).mean()
    return pd.DataFrame(fold_means)

In [8]:
# Positive-class groups to evaluate against the df_part rows: each Strategy
# on its own, plus all three pooled together.
sock_groups = {
    "COVERT": df_covert,
    "OVERT": df_overt,
    "UNRESTRICTED": df_unres,
    "ALLSOCKS": pd.concat([df_covert, df_overt, df_unres], axis=0),
}

# Scenario name -> averaged cross-validation metrics for every model.
res = {
    scenario: get_metrics(*get_data(df_part, socks))
    for scenario, socks in sock_groups.items()
}


In [9]:
# Stack the per-scenario metric tables into one frame; the scenario name
# becomes the outer index level (keys are passed explicitly to keep the
# insertion order of `res`).
df_res = pd.concat(res, keys=res.keys(), axis=0)
display(df_res)

# Create the output folder first so a run on a fresh checkout does not fail
# in to_csv because ./res is missing.
res_path = "./res/par_socks_res.csv"
os.makedirs(os.path.dirname(res_path), exist_ok=True)
df_res.to_csv(res_path)

Unnamed: 0,Unnamed: 1,randomforest,SVM_linear,logistic,NB,xgboost,SVM_rbf,randomforest+SMOTE,SVM_linear+SMOTE,logistic+SMOTE,NB+SMOTE,xgboost+SMOTE,SVM_rbf+SMOTE
COVERT,fit_time,0.006351,0.02174,0.002569,0.000502,0.012508,0.000838,0.007936,0.020084,0.003491,0.001261,0.010021,0.002155
COVERT,score_time,0.002384,0.002037,0.001406,0.001446,0.002432,0.002011,0.00257,0.002024,0.001402,0.00152,0.002414,0.002025
COVERT,test_f1,0.033333,0.233333,0.066667,0.0,0.033333,0.0,0.187619,0.552143,0.395,0.484286,0.12381,0.155714
COVERT,test_precision,0.025,0.35,0.1,0.0,0.025,0.0,0.165,0.466667,0.343333,0.37,0.1,0.115833
COVERT,test_recall,0.05,0.183333,0.05,0.0,0.05,0.0,0.25,0.75,0.5,0.75,0.166667,0.266667
COVERT,test_accuracy,0.592222,0.781111,0.703333,0.758889,0.536667,0.57,0.558889,0.725556,0.681111,0.603333,0.492222,0.47
OVERT,fit_time,0.006737,0.015795,0.002785,0.000495,0.005739,0.000735,0.00767,0.020642,0.003155,0.001063,0.008408,0.001941
OVERT,score_time,0.002559,0.002066,0.001472,0.001466,0.002395,0.001988,0.002563,0.002034,0.001426,0.001448,0.002411,0.002024
OVERT,test_f1,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.1,0.066667,0.328889,0.166667,0.05
OVERT,test_precision,0.0,0.0,0.0,0.0,0.0,0.0,0.116667,0.1,0.05,0.214167,0.116667,0.033333
