In [1]:
import os
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import time
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import glob
import numpy as np
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from scipy.stats import linregress
import statsmodels.api as sm

In [15]:
# Load the full transaction dataset from CSV (path is relative to the notebook's working directory).
frame=pd.read_csv('../FullData/totaldataset.csv')

In [16]:
def strpDateTime(data):
    '''
    Parse the string column 'trx_timestamp' into datetimes.

    Only characters 0-9 (the date) and 11-18 (the clock time) of each string
    are used; the character at position 10 and anything from position 19
    onward are discarded.
    @params:
    data is a DataFrame with a string column 'trx_timestamp'
    '''
    def _parse(raw):
        date_part = raw[:10]
        clock_part = raw[11:19]
        return datetime.strptime(date_part + clock_part, "%Y-%m-%d%H:%M:%S")

    return data['trx_timestamp'].apply(_parse)
def dropCols(data):
    '''
    Return a copy of data without the columns that the analysis does not use.
    Raises KeyError if any of the listed columns is missing.
    '''
    #d_memo and maybe block_num may have some relevance later on
    unused_columns = [
        'account_action_seq',
        'block_num',
        'd_memo',
        'authorization',
        'global_action_seq',
        'data',
        'name',
        'trx_id',
        'trx_timestamp_unix',
    ]
    return data.drop(columns=unused_columns)
def applyLabels(data):
    '''
    Add a 'bot_label' column to data IN PLACE, based on the sender in 'd_from':
    1 for accounts in the known-bot set, 0 for accounts in the known-human set,
    and None (missing) for every other account.
    @params:
    data is a DataFrame with a 'd_from' column; it is modified in place and
    nothing is returned
    '''
    # 'griffinhamza' was previously spelled with a capital G, which never matched
    # the lower-case account name present in the data; the duplicated
    # '1ffyqhg4rmbk' entry was also dropped. Sets give constant-time lookups.
    bot_accounts = frozenset([
        'edgarwinston', 'griffinhamza', 'jacksonjimmy', '1ffyqhg4rmbk',
        'powellernest', '2rezoaf4bhly', 'nckj42dit5sb', 'scottphillip',
        'oepa252sdx4p', 'myh2o4wayvxg',
    ])
    human_accounts = frozenset([
        'g44dinjygene', 'onebrother11', 'rvrkingfishr', 'iloveyoudapp',
        'huiyong12345', 'pketothemoon', 'zhshj1212123', 'vipgamedice2',
        'dldldldldldl', 'pkeniubixxxx',
    ])

    def _label(account):
        if account in bot_accounts:
            return 1
        if account in human_accounts:
            return 0
        return None

    data['bot_label'] = data['d_from'].apply(_label)
def dataProcess(data):
    '''
    Build the cleaned transaction frame: parse 'trx_timestamp', add 'hours',
    'minutes', 'seconds' and 'day' columns derived from it, add 'bot_label',
    and drop the unused columns.

    The input frame is NOT modified. Work is done on a copy so that re-running
    the calling cell does not try to re-parse already-parsed timestamps.
    @params:
    data is the raw transaction DataFrame
    '''
    processed = data.copy()
    # pd.to_datetime guarantees a datetime64 Series so the .dt accessor is available
    timestamps = pd.to_datetime(strpDateTime(processed))
    processed['trx_timestamp'] = timestamps
    processed['hours'] = timestamps.dt.hour
    processed['minutes'] = timestamps.dt.minute
    processed['seconds'] = timestamps.dt.second
    processed['day'] = timestamps.dt.day
    # applyLabels adds the 'bot_label' column to the copy in place
    applyLabels(processed)
    return dropCols(processed)

In [17]:
def timeseries_agg(data, start="3/1/2019", end="4/1/2019"):
    '''
    Pulling together all data on an hourly basis and applying count_txns and volume_eos to it
    @params:
    data is the transaction data set; it needs a datetime column 'trx_timestamp'
    and a numeric column 'd_quantity'
    start, end bound the hourly index (both inclusive); the defaults keep the
    original March 2019 window

    Returns a float DataFrame indexed by hour with columns 'count_txns' (number
    of rows in that hour) and 'volume_eos' (sum of 'd_quantity' in that hour).
    Hours without transactions are 0. Raises KeyError if a transaction falls
    outside the [start, end] window.
    '''
    # An offset object is used instead of the "H" string alias, which newer pandas deprecates.
    hour = pd.offsets.Hour()
    hour_index = pd.date_range(start=start, end=end, freq=hour)
    if len(data) == 0:
        return pd.DataFrame(0.0, index=hour_index, columns=['count_txns', 'volume_eos'])

    # Truncate every timestamp to the start of its hour.
    buckets = pd.to_datetime(data['trx_timestamp']).dt.floor(hour)
    outside = ~buckets.isin(hour_index)
    if outside.any():
        raise KeyError(
            "%d transaction(s) fall outside the hourly window %s - %s"
            % (int(outside.sum()), hour_index[0], hour_index[-1])
        )

    # Group once instead of updating the frame row by row: the previous chained
    # assignment (frame.loc[t]['col'] = value) can write to a temporary copy and
    # silently leave the frame unchanged.
    # .values avoids index alignment between the key and the grouped column.
    per_hour = data['d_quantity'].groupby(buckets.values)
    counts = per_hour.size().reindex(hour_index, fill_value=0)
    volumes = per_hour.sum().reindex(hour_index, fill_value=0)
    return pd.DataFrame(
        {'count_txns': counts, 'volume_eos': volumes}, index=hour_index
    ).astype(float)
    

In [18]:
def maxAutoCorrelation(data,user):
    '''
    Returns a tuple (largest autocorrelation, lag at which it occurs) for the
    hourly transaction counts of `user`, scanning lags 1 through 167.
    Returns (0, 0) when no lag has an autocorrelation greater than 0.
    '''
    hourly_counts = timeseries_agg(data[data['d_from']==user])['count_txns']
    best_corr, best_lag = 0, 0
    for lag in range(1,168):
        corr = hourly_counts.autocorr(lag=lag)
        # a NaN autocorrelation compares False and is therefore skipped
        if corr > best_corr:
            best_corr, best_lag = corr, lag
    return (best_corr, best_lag)

In [19]:
def stdevVolumeSent(data,user):
    '''
    Returns the standard deviation (np.std, i.e. population standard deviation)
    of the 'd_quantity' values sent by `user`.

    This definition previously read 'Account' / 'AmountDEX' columns, which no
    other code in this notebook uses, and carried a docstring copied from
    maxAutoCorrelation. It now matches the schema used by the rest of the
    notebook ('d_from' / 'd_quantity').
    '''
    sent = data[data['d_from']==user]['d_quantity']
    return np.std(sent.tolist())

In [20]:
def countTotalTransactions(data,account_name):
    '''
    Returns the number of rows in data whose 'd_from' equals account_name.
    '''
    sent_by_account = data['d_from']==account_name
    return len(data[sent_by_account])

In [21]:
def getSubSetData(data,low,high):
    '''
    Returns the rows of data sent by accounts whose total number of rows is
    strictly between low and high. The per-account total is attached to each
    returned row as the column 'user_counts'.
    '''
    per_user = data['d_from'].value_counts().rename('user_counts').to_frame()
    with_counts = data.merge(per_user, left_on='d_from', right_index=True)
    n_txns = with_counts['user_counts']
    return with_counts[(n_txns>low) & (n_txns<high)]

In [22]:
def stdevVolumeSent(data,user):
    '''
    Returns the standard deviation (np.std) of the 'd_quantity' values in the
    rows of data whose 'd_from' equals user.
    '''
    quantities = data[data['d_from']==user]['d_quantity'].tolist()
    return np.std(quantities)

In [23]:
def accountAgg(data):
    '''
    Builds one feature row per distinct 'd_from' account: the largest
    autocorrelation of its hourly transaction counts, its number of
    transactions, and the standard deviation of the volume it sent.
    The account name ends up in a column called 'index'.
    '''
    feature_names = ['maxAutoCorrelation','numberofTxns','stdevVolumeSent']
    features = {}
    for account in data['d_from'].unique():
        features[account] = [
            maxAutoCorrelation(data,account)[0],
            countTotalTransactions(data,account),
            stdevVolumeSent(data,account),
        ]

    per_account = pd.DataFrame.from_dict(features,orient='index',columns=feature_names)
    return per_account.reset_index()
    

In [24]:
# Parse timestamps, add the time-part and label columns, and drop unused columns.
clean_frame=dataProcess(frame)

In [25]:
# Keep only accounts with more than 30 and fewer than 200 transactions,
# then compute one feature row per account.
data=accountAgg(getSubSetData(clean_frame,30,200))

In [26]:
# Display the per-account feature table.
data

Unnamed: 0,index,maxAutoCorrelation,numberofTxns,stdevVolumeSent
0,edgarwinston,0.983027,31,2.775558e-17
1,raidenkeegan,0.579704,41,3.864911e-01
2,heathderrick,0.983027,31,2.775558e-17
3,jakobantoine,0.983027,31,2.775558e-17
4,bryantjaylon,0.983027,31,2.775558e-17
5,karsonskylar,0.983027,31,2.775558e-17
6,luisclarence,0.983027,31,2.775558e-17
7,griffinhamza,0.983027,31,2.775558e-17
8,miguelmalaki,0.914547,31,2.775558e-17
9,devinbeckham,0.983027,31,2.775558e-17


In [27]:
# Save the per-account features. The DataFrame's row index is written as the
# first (unnamed) column because to_csv is called with its default index setting.
data.to_csv('../Data/clusteringDataSet1.csv')

def getBotsViaAutoCorrThresh(data, threshold=0.983026):
    '''
    Returns the list of account names (column 'index') whose
    'maxAutoCorrelation' is strictly greater than threshold.
    @params:
    data is the per-account feature DataFrame with columns 'index' and
    'maxAutoCorrelation'
    threshold is the autocorrelation cut-off; the default keeps the value that
    was previously hard-coded
    '''
    # A boolean mask replaces the row-by-row iterrows loop. The column has to be
    # selected by label because data.index would be the row index, not the
    # 'index' column.
    above_threshold = data['maxAutoCorrelation'] > threshold
    return data.loc[above_threshold, 'index'].tolist()

def applyLabels2(data):
    '''
    Adds a 'bot_label' column to the per-account frame IN PLACE, keyed on the
    account name in column 'index': 1 for the hand-picked bot accounts plus
    every account returned by getBotsViaAutoCorrThresh, 0 for the hand-picked
    human accounts, None for everything else. Returns nothing.
    '''
    bot_accounts = ['edgarwinston','1ffyqhg4rmbk','nckj42dit5sb','heathderrick'] + getBotsViaAutoCorrThresh(data)
    human_accounts = ['nakajoneso33','myhappyended','bidream3idxx','h4nne5eos231','bobo12341234','praabc123abc']

    def _label(account):
        # the bot list is checked first, so it wins if a name is in both lists
        if account in bot_accounts:
            return 1
        if account in human_accounts:
            return 0
        return None

    data['bot_label'] = data['index'].apply(_label)

# Attach the 'bot_label' column to the per-account feature table (modifies data in place).
applyLabels2(data)

# Display the table again, now including 'bot_label'.
data

# Number of accounts per label; unlabeled accounts are not counted by value_counts.
data['bot_label'].value_counts()

# Observation: all accounts with an autocorrelation of 0.983027 and 31 transactions are bots.

def plot_autocorr_counttxns(data,lagnum):
    '''
    Plots the autocorrelation function of the hourly transaction counts in
    data, titled with the 'd_from' value of the first row.
    @params:
    data is the transaction data of a single account (must be non-empty, since
    the title is read from its first row)
    lagnum is the number of lags passed to plot_acf
    '''
    title_name=data['d_from'].iloc[0]
    hourly=timeseries_agg(data)
    # One explicit figure/axes pair: the earlier plt.subplots(3,0) asked for a
    # grid with zero columns and plot_acf then drew on a separate figure anyway.
    fig, ax = plt.subplots()
    sm.graphics.tsa.plot_acf(hourly['count_txns'], lags=lagnum, ax=ax)
    ax.set_title(title_name)
    plt.show()

def selectDataFromUser(data,user):
    '''
    Returns the rows of data whose 'd_from' equals user.
    '''
    sent_by_user = data['d_from']==user
    return data[sent_by_user]

# Autocorrelation plot (168 lags) for 'praabc123abc', one of the accounts
# that applyLabels2 labels as human.
plot_autocorr_counttxns(selectDataFromUser(clean_frame,'praabc123abc'),168)

from sklearn.linear_model import LogisticRegression

# Fixed random_state so the fit is repeatable between runs.
log1 = LogisticRegression(random_state=42)

# Only accounts that received a label (1 or 0) can be used for training.
train=data[data['bot_label'].notna()]

train_x=train[['maxAutoCorrelation', 'numberofTxns']]

# Pass the target as a 1-D Series. Selecting it with a boolean column mask
# produced a one-column DataFrame, which scikit-learn flags as a column-vector y.
train_y=train['bot_label']

log1.fit(train_x,train_y)

# NOTE: this is accuracy on the training rows themselves, not on held-out data.
logreg_train_acc = log1.score(train_x,train_y)

print(logreg_train_acc)

# Accounts without a label are the ones to classify.
unlabeled=data[data['bot_label'].isna()]

unlabeled_x=unlabeled[['maxAutoCorrelation', 'numberofTxns']]

log1.predict(unlabeled_x)

The training data is extremely imbalanced: 80 data points are bots and only 5 are humans.

The baseline accuracy to beat is 94%: always predicting "bot" is already correct for 80 of the 85 labeled accounts (80/85 ≈ 94%).