In [1]:
import os, sys, argparse
import pandas as pd
import numpy as np

import pickle
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

import numba
import sklearn
import sklearn.metrics

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Names of the algorithms handled by this notebook; they are also the
# accepted values of the --alg command-line option.
alg_list = [
    'bn',
    'feagle',
    'fraudar',
    'trust',
    'rsd',
    'bad',
    'rev2',
    'rtv',
]

In [3]:
# Command-line options: output target, dataset name and (optionally) a single algorithm.
parser = argparse.ArgumentParser(description='plot creator')
parser.add_argument('-o', '--output', action='store', type=str, default='-', help='target')
parser.add_argument(
    '-d', '--data',
    type=str,
    default='alpha',
    choices=['alpha', 'amazon', 'epinions', 'otc'],
    help='data name',
)
parser.add_argument('-a', '--alg', type=str, choices=alg_list, help='alg name')

# sys.ps1 only exists in an interactive interpreter; when it is absent the
# -i interpreter flag decides. Anything else is treated as a script run.
if not bool(getattr(sys, 'ps1', sys.flags.interactive)):
    from tqdm import tqdm
    print('script mode')
    parsed = parser.parse_args(sys.argv[1:])
    # Script runs get a plain-print stand-in for display().
    display = print
else:
    from tqdm import tqdm_notebook as tqdm
    print('interactive mode')
    # Interactive runs ignore sys.argv and use a fixed dataset.
    parsed = parser.parse_args(['--data', 'alpha'])

print(parsed)
data_name = parsed.data

interactive mode
Namespace(alg=None, data='otc', output='-')


In [4]:
# Header-less edge list: source, destination, rating, timestamp (column 3 parsed as dates).
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 — confirm the
# pinned pandas version before removing it.
network_df = pd.read_csv(
    '../rev2data/%s/%s_network.csv' % (data_name, data_name),
    header=None,
    names=['src', 'dest', 'rating', 'timestamp'],
    parse_dates=[3],
    infer_datetime_format=True,
)
# Source ids of every edge, prefixed with 'u' (one entry per row, so ids may repeat).
user_list = ['u' + str(src) for src in network_df['src'].tolist()]

# Ground-truth labels keyed by the same 'u'-prefixed id.
gt_df = pd.read_csv(
    '../rev2data/%s/%s_gt.csv' % (data_name, data_name),
    header=None,
    names=['id', 'label'],
)
gt = {'u' + str(uid): label for uid, label in zip(gt_df['id'], gt_df['label'])}

In [22]:
def average_multiple(flist):
    '''Combine the per-user scores of several result files.

    Every file is read as a header-less CSV with the raw label in column 0,
    the user id in column 1 and the score in column 2. The user list and the
    labels come from the first file only; scores are summed over all files
    that can be read (the sum is not divided by the number of files).

    Raw labels are remapped as follows: a raw label of 1 becomes 0; any other
    row becomes 2 when its user id starts with 's', and 1 otherwise
    (compute_score's comments call these good / fraudster / sockpuppet).

    Parameters
    ----------
    flist : sequence of str
        Paths of the result files. The first one must be readable.

    Returns
    -------
    (ulist, ytrue, yscore) : tuple of lists
        User ids, remapped labels and summed scores, all aligned with the
        rows of the first file.

    Notes
    -----
    A file that cannot be read, lacks the expected columns, or misses one of
    the users is skipped as a whole and reported on stderr, so a failing file
    never contributes a partial update.
    '''
    results_df = pd.read_csv(flist[0], header=None)
    ulist = results_df[1].tolist()
    ytrue_old = results_df[0].tolist()
    ytrue = [
        0 if label == 1 else 2 if user[0] == 's' else 1
        for label, user in zip(ytrue_old, ulist)
    ]

    u_sum = {u: 0 for u in ulist}
    for f in flist:
        try:
            try_df = pd.read_csv(f, header=None)
            s = dict(zip(try_df[1].tolist(), try_df[2].tolist()))
            # Build the new totals separately so a failure half-way through
            # leaves u_sum exactly as it was.
            updated = {u: total + s[u] for u, total in u_sum.items()}
        except (OSError, ValueError, KeyError, TypeError) as err:
            # pandas' EmptyDataError / ParserError are ValueError subclasses.
            print('average_multiple: skipping %s (%s: %s)' % (f, type(err).__name__, err),
                  file=sys.stderr)
            continue
        u_sum = updated

    # Index by ulist (not by the de-duplicated dict keys) so the scores stay
    # aligned with ulist / ytrue even when a user id appears more than once.
    yscore = [u_sum[u] for u in ulist]
    return ulist, ytrue, yscore

def resort(ulist, ytrue, yscore):
    '''Order users by ascending score and return the aligned ids, labels and scores.

    Users are keyed by id, so a repeated id appears once in the result (at the
    position of its first occurrence, carrying the label and score of its last
    occurrence). Ties keep their original relative order.
    '''
    score_of = {user: score for user, score in zip(ulist, yscore)}
    label_of = {user: label for user, label in zip(ulist, ytrue)}

    ranked = list(score_of)
    ranked.sort(key=score_of.__getitem__)

    ranked_labels = [label_of[user] for user in ranked]
    ranked_scores = [score_of[user] for user in ranked]
    return ranked, ranked_labels, ranked_scores

def compute_score(alg_name, data_name):
    '''Load the result file(s) of one algorithm on one dataset, ranked by score.

    'rev2' and 'rtv' have several result files (one per parameter setting)
    that are combined with average_multiple; every other algorithm has a
    single file '../res/non-socks/<alg>-<data>.csv'. Files are header-less
    CSVs with the raw label in column 0, the user id in column 1 and the
    score in column 2.

    Returns
    -------
    dict or None
        {'ulist': ..., 'ytrue': ..., 'yscore': ...} ordered by ascending
        score (see resort), or None when the result file (the first one for
        multi-file algorithms) cannot be read or parsed.
    '''
    # if user is good in ground truth output 0
    # if user is fraudster in ground truth output 1
    # if user is sockpuppet output 2

    # Parameter-setting suffixes of the per-run result files.
    multi_run_suffixes = {
        'rev2': [
            '1-1-1-1-1-1-1',
            '2-1-1-1-1-1-1',
            '1-2-1-1-1-1-1',
            '1-1-2-1-1-1-1',
            '1-1-1-2-1-1-1',
        ],
        'rtv': [
            '1-1-1-1-10-2-1-1',
            '2-1-1-1-10-2-1-1',
            '1-2-1-1-10-2-1-1',
            '1-1-2-1-10-2-1-1',
            '1-1-1-2-10-2-1-1',
        ],
    }
    # Only file-access and parse failures mean "no results"; pandas'
    # EmptyDataError / ParserError are ValueError subclasses. Anything else
    # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
    unreadable = (OSError, ValueError)

    if alg_name in multi_run_suffixes:
        flist = [
            '../res/non-socks/%s-%s-%s.csv' % (alg_name, data_name, suffix)
            for suffix in multi_run_suffixes[alg_name]
        ]
        try:
            # Probe the first run: without it there is nothing to combine.
            pd.read_csv(flist[0], header=None)
        except unreadable:
            return None
        ulist, ytrue, yscore = average_multiple(flist)
    else:
        try:
            results_df = pd.read_csv('../res/non-socks/%s-%s.csv' % (alg_name, data_name), header=None)
        except unreadable:
            return None
        ytrue_old = results_df[0].tolist()
        ulist = results_df[1].tolist()
        yscore = results_df[2].tolist()
        # Raw label 1 -> 0; otherwise ids starting with 's' -> 2, the rest -> 1.
        ytrue = [
            0 if label == 1 else 2 if user[0] == 's' else 1
            for label, user in zip(ytrue_old, ulist)
        ]

    ulist, ytrue, yscore = resort(ulist, ytrue, yscore)
    return {'ulist': ulist, 'ytrue': ytrue, 'yscore': yscore}

def get_metrics(ytrue, yscore, q=(0.005, 0.01, 0.03, 0.05, 0.1)):
    '''get precision, recall and F1 at each q percentile

    The inputs are taken to be already ranked: for every fraction qq in q,
    the entries whose position is below qq * len(ytrue) are predicted
    positive (1) and all later entries negative (0). yscore is only used to
    check that it has the same length as ytrue.

    This function is deliberately not numba-jitted: it consists of
    scikit-learn calls and Python dicts, which numba cannot compile.

    Parameters
    ----------
    ytrue : array-like
        Labels in rank order, as accepted by sklearn's binary metrics.
    yscore : array-like
        Scores in the same order as ytrue.
    q : sequence of float, optional
        Fractions of the ranking to flag as positive.

    Returns
    -------
    (prec_dict, recl_dict, f1_dict) : tuple of dict
        Each maps a fraction from q to the corresponding metric value.
    '''
    assert len(ytrue) == len(yscore)
    size = len(ytrue)
    positions = np.arange(size)
    prec_dict = {}
    recl_dict = {}
    f1_dict = {}
    # Iterating a float array keeps the dict keys as numpy floats, which hash
    # and compare equal to the corresponding Python floats.
    for qq in np.asarray(q, dtype=float):
        cut = qq * size
        ypred = (positions < cut) * 1
        prec_dict[qq] = sklearn.metrics.precision_score(y_pred=ypred, y_true=ytrue)
        recl_dict[qq] = sklearn.metrics.recall_score(y_pred=ypred, y_true=ytrue)
        f1_dict[qq] = sklearn.metrics.f1_score(y_pred=ypred, y_true=ytrue)
    return prec_dict, recl_dict, f1_dict

def compute_metrics(res_dict):
    '''Compute precision / recall / F1 tables for one algorithm's ranked results.

    Parameters
    ----------
    res_dict : dict
        Must provide 'ytrue' and 'yscore' in rank order (the format returned
        by compute_score).

    Returns
    -------
    dict
        {'prec': ..., 'recl': ..., 'f1': ...}, each the per-quantile dict
        returned by get_metrics.
    '''
    yscore = np.array(res_dict['yscore'])
    # np.array copies, so the caller's label list is not modified below.
    ytrue = np.array(res_dict['ytrue'])
    # Fold every label above 1 (compute_score uses 2 for sockpuppets) into the
    # positive class so the metrics are binary.
    ytrue[ytrue > 1] = 1
    prec_dict, recl_dict, f1_dict = get_metrics(ytrue, yscore)
    return {'prec': prec_dict, 'recl': recl_dict, 'f1': f1_dict}

In [23]:
# Metric table: one column per algorithm, one row per (quantile, metric name) pair.
met_dict = {alg_name: {} for alg_name in alg_list}
results_dict = {}

for alg_name in alg_list:
    results_dict[alg_name] = compute_score(alg_name, data_name)
    if results_dict[alg_name] is None:
        # compute_score returns None when the result file could not be read;
        # leave this algorithm's column empty instead of crashing.
        print('no results for %s on %s, skipping' % (alg_name, data_name))
        continue
    mets = compute_metrics(results_dict[alg_name])
    for q in [0.005, 0.01, 0.03, 0.05, 0.1]:
        met_dict[alg_name].update({
            (q, 'Precision'): mets['prec'][q],
            (q, 'Recall'): mets['recl'][q],
            (q, 'F1'): mets['f1'][q],
        })

df = pd.DataFrame(met_dict)
display(df)
# Make sure the target directory exists so the table is not lost after the
# metrics have been computed.
os.makedirs('../res/non-socks-table', exist_ok=True)
df.to_csv('../res/non-socks-table/%s.csv' %data_name)

Unnamed: 0,Unnamed: 1,bn,feagle,fraudar,trust,rsd,bad,rev2,rtv
0.005,F1,0.034188,0.0,0.017094,0.034188,0.017094,0.034188,0.017094,0.017094
0.005,Precision,1.0,0.0,0.5,1.0,0.5,1.0,0.5,0.5
0.005,Recall,0.017391,0.0,0.008696,0.017391,0.008696,0.017391,0.008696,0.008696
0.01,F1,0.033898,0.0,0.033898,0.050847,0.016949,0.050847,0.016949,0.016949
0.01,Precision,0.666667,0.0,0.666667,1.0,0.333333,1.0,0.333333,0.333333
0.01,Recall,0.017391,0.0,0.017391,0.026087,0.008696,0.026087,0.008696,0.008696
0.03,F1,0.065041,0.03252,0.081301,0.130081,0.01626,0.130081,0.081301,0.01626
0.03,Precision,0.5,0.25,0.625,1.0,0.125,1.0,0.625,0.125
0.03,Recall,0.034783,0.017391,0.043478,0.069565,0.008696,0.069565,0.043478,0.008696
0.05,F1,0.07874,0.031496,0.141732,0.188976,0.031496,0.188976,0.125984,0.015748
