In [1]:
import os
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import time
from pandas.plotting import scatter_matrix

import glob

# Loading Data

Load the data from multiple JSON files into a single pandas DataFrame.

In [None]:
path = r'../endlessdice-201903' # use your path
# glob returns files in arbitrary order; sort so the concatenated row order
# is the same on every run and on every machine.
all_files = sorted(glob.glob(os.path.join(path, "*.json")))
if not all_files:
    # Fail early with a clear message instead of pd.concat's generic error.
    raise FileNotFoundError(f"No .json files found in {path!r}")

# Each file is read as line-delimited JSON (one record per line).
li = [pd.read_json(filename, lines=True) for filename in all_files]

# Stack all files row-wise with a fresh 0..n-1 index; sort=True orders the
# columns alphabetically when the files are combined.
frame = pd.concat(li, axis=0, ignore_index=True, sort=True)

In [None]:
# NOTE(review): no file path is passed to read_csv, so this cell cannot run as
# written. Once a path is supplied it rebinds `frame`, replacing whatever the
# JSON-loading cell produced -- confirm which of the two loaders is intended.
frame=pd.read_csv()

In [4]:
# (number of rows, number of columns) of the loaded frame
frame.shape

(4762541, 16)

## EDA

In [5]:
# Column names, dtypes and memory footprint of the loaded frame
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4762541 entries, 0 to 4762540
Data columns (total 16 columns):
account               object
account_action_seq    int64
authorization         object
block_num             int64
d_from                object
d_memo                object
d_quantity            float64
d_quantity_unit       object
d_to                  object
dapp_code             object
data                  object
global_action_seq     int64
name                  object
trx_id                object
trx_timestamp         object
trx_timestamp_unix    float64
dtypes: float64(2), int64(3), object(11)
memory usage: 581.4+ MB


Looking at the numerical features to understand their distributions. Based on our understanding of the features in the data description file, d_quantity appears to be the only useful numerical metric.

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
frame.describe()

Unnamed: 0,account_action_seq,block_num,d_quantity,global_action_seq,trx_timestamp_unix
count,4762541.0,4762541.0,4762537.0,4762541.0,4762541.0
mean,4302966.0,47916050.0,0.962701,5611532000.0,1552741000.0
std,1956884.0,1558004.0,41.73186,166178500.0,779766.4
min,1284860.0,45232760.0,8e-07,5326755000.0,1551398000.0
25%,2475495.0,46570300.0,0.0024,5469543000.0,1552068000.0
50%,4232553.0,47897800.0,0.098,5608729000.0,1552732000.0
75%,6018499.0,49300790.0,0.196,5756417000.0,1553434000.0
max,7804443.0,50584610.0,22360.03,5907187000.0,1554077000.0


How many unique accounts are there, and how many transactions does each of them make?

In [7]:
# Number of rows per distinct `d_from` value, most frequent first
frame['d_from'].value_counts()

endlessdicex    3048248
endlessbank1     776259
nmslnmslnmsl     254893
eossuperplay     202279
endlesscrash     175291
bimawen11111      17832
thegrintsch1      14983
kaidenmatias      14179
g4ztemjugyge      12963
everettlayne      12767
andrewcortez      12459
ticklish2424      11152
skdi12312311       9662
danieldamari       9164
adawalletwin       8151
stardlullaby       7464
xuminggangok       7450
twinswallet1       6344
g44temrwgage       5922
g44dmnrxg4ge       5791
iyeoseos1eee       5453
leiniao12345       4887
asdfghjklz14       4799
wobuzhidaoya       4425
benzilimbenb       3981
gy3tamrxgqge       3838
zahuhuo1o1o2       3635
xulingbookok       3463
gyydinbrgige       3247
ge4tinrzhage       2975
                 ...   
jonathankoch          1
soowangkilky          1
hazdsmrwgqge          1
wosdateos154          1
thecoinusio3          1
jxjdsidsijsk          1
eosers.x              1
seabattlemem          1
wenzhenxiang          1
zhujinyong11          1
bearjie12345    

Across the 1667 accounts, the distribution is highly non-uniform, with a long tail of accounts that have only one transaction. Could all of the single-transaction accounts be bots?

In [8]:
# Peek at the first 10 rows to see what the raw records look like
frame.head(10)

Unnamed: 0,account,account_action_seq,authorization,block_num,d_from,d_memo,d_quantity,d_quantity_unit,d_to,dapp_code,data,global_action_seq,name,trx_id,trx_timestamp,trx_timestamp_unix
0,eosio.token,4092172,"[{'actor': 'justiceariel', 'permission': 'acti...",46268353,justiceariel,96-5db37a4363ab3ea2bec1ebc2858ffb4a8e39648d3ba...,0.1,EOS,endlessdicex,Endless Dice,,5437311955,transfer,1a67bec15c41f19a16c52095484179da52f99954c7666b...,2019-03-07T00:00:02.000,1551917000.0
1,eosio.token,4092173,"[{'actor': 'eossuperplay', 'permission': 'acti...",46268355,eossuperplay,96-a61d1f367685a2aa8b1da4b78ade17e166e5be9d256...,0.1,EOS,endlessdicex,Endless Dice,,5437312292,transfer,9dba0bbf8240c888f69eae8479705fa0be5c40de7eaeab...,2019-03-07T00:00:03.000,1551917000.0
2,eosio.token,4092174,"[{'actor': 'nmslnmslnmsl', 'permission': 'acti...",46268358,nmslnmslnmsl,96-db123609ce06d9da8ed163a15babe723ee6a6944a11...,0.2,EOS,endlessdicex,Endless Dice,,5437313093,transfer,a6ab394b9d2a5f29d4ab55d5015ae102f0da0726449bfd...,2019-03-07T00:00:04.500,1551917000.0
3,eosio.token,4092175,"[{'actor': 'edgarwinston', 'permission': 'acti...",46268360,edgarwinston,96-4cb99f6c294c85e824f1668f790608f6a90d003ec58...,0.1,EOS,endlessdicex,Endless Dice,,5437313534,transfer,adfc46a0bc27b1347a9636756728e4bdecce28bf601bc6...,2019-03-07T00:00:06.000,1551917000.0
4,eosio.token,4092176,"[{'actor': 'raidenkeegan', 'permission': 'acti...",46268366,raidenkeegan,96-8cde107767044401cb03912b9464a9af653c8e1ccaa...,0.1,EOS,endlessdicex,Endless Dice,,5437314911,transfer,d68cdfa9bd39d2f6fcdabaf37439cdbb24f2f9f55f4670...,2019-03-07T00:00:09.000,1551917000.0
5,eosio.token,4092177,"[{'actor': 'endlessdicex', 'permission': 'acti...",46268370,endlessdicex,bet id:13612230 player: edgarwinston send to b...,0.098,EOS,endlessbank1,Endless Dice,,5437315482,transfer,a67cb0f4106d1fe4facb695828de02b231ade8238991f3...,2019-03-07T00:00:11.000,1551917000.0
6,eosio.token,4092179,"[{'actor': 'endlessdicex', 'permission': 'acti...",46268370,endlessdicex,bet id:13612230 player: edgarwinston dividend ...,0.0008,EOS,endlessdivdn,Endless Dice,,5437315485,transfer,a67cb0f4106d1fe4facb695828de02b231ade8238991f3...,2019-03-07T00:00:11.000,1551917000.0
7,eosio.token,4092181,"[{'actor': 'endlessdicex', 'permission': 'acti...",46268370,endlessdicex,bet id:13612230 player: edgarwinston send to c...,0.0012,EOS,endlessoptex,Endless Dice,,5437315488,transfer,a67cb0f4106d1fe4facb695828de02b231ade8238991f3...,2019-03-07T00:00:11.000,1551917000.0
8,eosio.token,4092182,"[{'actor': 'endlessdicex', 'permission': 'acti...",46268370,endlessdicex,bet id:13612231 player: raidenkeegan send to b...,0.098,EOS,endlessbank1,Endless Dice,,5437315499,transfer,a67cb0f4106d1fe4facb695828de02b231ade8238991f3...,2019-03-07T00:00:11.000,1551917000.0
9,eosio.token,4092184,"[{'actor': 'endlessdicex', 'permission': 'acti...",46268370,endlessdicex,bet id:13612231 player: raidenkeegan dividend ...,0.0008,EOS,endlessdivdn,Endless Dice,,5437315502,transfer,a67cb0f4106d1fe4facb695828de02b231ade8238991f3...,2019-03-07T00:00:11.000,1551917000.0


In [11]:
# Count rows per `d_from` once and reuse the result for the filter, rather
# than running value_counts() twice over the whole frame.
d_from_counts = frame['d_from'].value_counts()
# Keep only the accounts that appear more than 1000 times.
d_from_counts[d_from_counts > 1000]

endlessdicex    3048248
endlessbank1     776259
nmslnmslnmsl     254893
eossuperplay     202279
endlesscrash     175291
bimawen11111      17832
thegrintsch1      14983
kaidenmatias      14179
g4ztemjugyge      12963
everettlayne      12767
andrewcortez      12459
ticklish2424      11152
skdi12312311       9662
danieldamari       9164
adawalletwin       8151
stardlullaby       7464
xuminggangok       7450
twinswallet1       6344
g44temrwgage       5922
g44dmnrxg4ge       5791
iyeoseos1eee       5453
leiniao12345       4887
asdfghjklz14       4799
wobuzhidaoya       4425
benzilimbenb       3981
gy3tamrxgqge       3838
zahuhuo1o1o2       3635
xulingbookok       3463
gyydinbrgige       3247
ge4tinrzhage       2975
babybaby3355       2787
zzxzzxzzxzzx       2745
lt5555lt5555       2685
gu3domagenes       2496
perolehsitos       2041
gu3dsnbqhege       1879
azxsdcvf1111       1794
gi4tqnbrgene       1740
eostxleos111       1739
gu2dcmjqgqge       1721
55113335hgqu       1692
txltxltxl111    

## Initial PreProcessing for EDA

Make a timestamp object to explore timeseries data

In [5]:
def strpDateTime(data):
    """Parse the ``trx_timestamp`` strings of ``data`` into datetimes.

    Each value is read positionally: characters 0-9 are the date, character
    10 (the separator) is skipped, and characters 11-18 are the time.
    Anything after the seconds (e.g. ``.500``) is discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``trx_timestamp``, e.g.
        ``2019-03-07T00:00:04.500``.

    Returns
    -------
    pandas.Series
        Datetime values on the same index as ``data``. Missing timestamps
        become ``NaT`` instead of raising.
    """
    raw = data['trx_timestamp']
    # Vectorised form of x[:10] + x[11:19]: one pass over the column instead
    # of a Python-level strptime call per row.
    date_and_time = raw.str.slice(0, 10) + raw.str.slice(11, 19)
    return pd.to_datetime(date_and_time, format="%Y-%m-%d%H:%M:%S")

Drop the columns deemed not useful, based on our understanding of the features.

In [6]:
def dropcols(data):
    """Return a new frame without the columns judged not useful for EDA.

    ``data`` itself is left unchanged. A ``KeyError`` is raised if any of
    the listed columns is absent.
    """
    # d_memo and maybe block_num may have some relevance later on
    unused_columns = [
        'account_action_seq',
        'block_num',
        'd_memo',
        'authorization',
        'global_action_seq',
        'data',
        'name',
        'trx_id',
        'trx_timestamp_unix',
    ]
    return data.drop(columns=unused_columns)

These are the labels given to us by our advisor. The function below applies them to the data.

In [7]:
def applyLabels(data):
    """Add a ``bot_label`` column to ``data`` in place.

    The label is looked up from ``d_from``: 1 for accounts in the bot list,
    0 for accounts in the human list, and NaN for everything else. Matching
    is exact and case-sensitive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``d_from`` column; it is modified in place.

    Returns
    -------
    None
    """
    # NOTE(review): '1ffyqhg4rmbk' appears twice and 'Griffinhamza' is the
    # only name with an uppercase letter -- confirm both against the
    # advisor's original list.
    list_bots=['edgarwinston','Griffinhamza','jacksonjimmy','1ffyqhg4rmbk','1ffyqhg4rmbk','powellernest','2rezoaf4bhly','nckj42dit5sb','scottphillip','oepa252sdx4p','myh2o4wayvxg']
    list_humans=['g44dinjygene','onebrother11','rvrkingfishr','iloveyoudapp','huiyong12345','pketothemoon','zhshj1212123','vipgamedice2','dldldldldldl','pkeniubixxxx']
    # One dict lookup per row instead of scanning two lists per row. Bots are
    # written last so they win if a name were ever in both lists, matching
    # the original bot-first check.
    label_by_account = {account: 0 for account in list_humans}
    label_by_account.update({account: 1 for account in list_bots})
    # Accounts missing from the dict map to NaN, so the column is float.
    data['bot_label'] = data['d_from'].map(label_by_account)

Aggregate data-processing function that applies all of the steps above.

In [8]:
def dataprocess(data):
    """Run the full EDA preprocessing on ``data`` and return a new frame.

    Steps: drop the unused columns, parse ``trx_timestamp`` into datetimes,
    and add the ``bot_label`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw transaction frame. It is not modified, so this function can be
        called again on the same input.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with parsed timestamps and a ``bot_label`` column.
    """
    # Drop first: DataFrame.drop returns a new frame, so the assignments
    # below never touch the caller's `data`.
    cleaned = dropcols(data)
    # Parse from the frame that was passed in, not from the notebook-level
    # `frame` global.
    cleaned['trx_timestamp'] = strpDateTime(cleaned)
    applyLabels(cleaned)
    return cleaned

In [9]:
# Build the cleaned frame used by the rest of the analysis
clean_frame=dataprocess(frame)

NameError: name 'frame' is not defined

In [15]:
# Display the processed frame to check the remaining columns and the new bot_label
clean_frame

Unnamed: 0,account,d_from,d_quantity,d_quantity_unit,d_to,dapp_code,trx_timestamp,bot_label
0,eosio.token,justiceariel,0.1000,EOS,endlessdicex,Endless Dice,2019-03-07 00:00:02,
1,eosio.token,eossuperplay,0.1000,EOS,endlessdicex,Endless Dice,2019-03-07 00:00:03,
2,eosio.token,nmslnmslnmsl,0.2000,EOS,endlessdicex,Endless Dice,2019-03-07 00:00:04,
3,eosio.token,edgarwinston,0.1000,EOS,endlessdicex,Endless Dice,2019-03-07 00:00:06,1.0
4,eosio.token,raidenkeegan,0.1000,EOS,endlessdicex,Endless Dice,2019-03-07 00:00:09,
5,eosio.token,endlessdicex,0.0980,EOS,endlessbank1,Endless Dice,2019-03-07 00:00:11,
6,eosio.token,endlessdicex,0.0008,EOS,endlessdivdn,Endless Dice,2019-03-07 00:00:11,
7,eosio.token,endlessdicex,0.0012,EOS,endlessoptex,Endless Dice,2019-03-07 00:00:11,
8,eosio.token,endlessdicex,0.0980,EOS,endlessbank1,Endless Dice,2019-03-07 00:00:11,
9,eosio.token,endlessdicex,0.0008,EOS,endlessdivdn,Endless Dice,2019-03-07 00:00:11,


## Labeled Bots and Humans in Training Data

Let's see how many bots and humans there are in our data!

In [3]:

# Account names for the labeled bots and humans; `list_bots` is matched
# against `d_from` when selecting the labeled bot rows.
# NOTE(review): '1ffyqhg4rmbk' is listed twice, so there are only 10 distinct
# bot names -- confirm whether an eleventh account was intended.
# NOTE(review): 'Griffinhamza' is the only entry containing an uppercase
# letter; matching is case-sensitive, so confirm the spelling against the data.
list_bots=['edgarwinston','Griffinhamza','jacksonjimmy','1ffyqhg4rmbk','1ffyqhg4rmbk','powellernest','2rezoaf4bhly','nckj42dit5sb','scottphillip','oepa252sdx4p','myh2o4wayvxg']
list_humans=['g44dinjygene','onebrother11','rvrkingfishr','iloveyoudapp','huiyong12345','pketothemoon','zhshj1212123','vipgamedice2','dldldldldldl','pkeniubixxxx']


In [4]:
# Select the rows whose sender is one of the labeled bot accounts. Building
# the mask directly on the column keeps it on clean_frame's own index, and
# avoids the positional `.any(1)` call that pandas 2.x rejects.
training_bots = clean_frame[clean_frame['d_from'].isin(list_bots)]
training_bots

NameError: name 'clean_frame' is not defined

### There are zero labeled humans in our given data! This is an issue: it looks like we will have to label a lot more data.

In [21]:
# Number of rows per labeled bot account
training_bots['d_from'].value_counts()

1ffyqhg4rmbk    32
nckj42dit5sb    32
edgarwinston    31
Name: d_from, dtype: int64

Only 3 bots with 95 transactions total

In [19]:
# Distribution of d_quantity values among the labeled bot rows
training_bots['d_quantity'].value_counts()

0.1    95
Name: d_quantity, dtype: int64

Interestingly, all of the bot-labeled transactions have a d_quantity of 0.1 EOS.

### All labeled bots trade 0.1 d_quantity per transaction. Could this be a baseline heuristic? Our advisor says no, it is just a coincidence.