In [1]:
%%time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

CPU times: user 1.37 s, sys: 667 ms, total: 2.04 s
Wall time: 3.65 s


In [2]:
%%time
print("#\tReading cleaned data...")
file_name = "training_data"
dtype={"address": str, "solidity": str, "opcode": str, "function_names": list}
udf = pd.read_csv(file_name, sep='\t')

#	Reading cleaned data...
CPU times: user 19.5 s, sys: 9.96 s, total: 29.5 s
Wall time: 36.2 s


In [3]:
%%time
def write_dataset_into_file(potential, potential_non, file_name_potential, file_name_non_potential):
    sample_size = min(potential_non.shape[0], potential.shape[0])
    potential.head(sample_size).to_csv(file_name_potential, sep=',', index=False, header='opcode')
    potential_non.sample(n=sample_size).to_csv(file_name_non_potential, sep=',', index=False, header='opcode')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 7.87 µs


#### ERC20 Example

In [19]:
%%time
print("#\tCreating erc20-nonerc20 dataset.")

potential_erc20 = df.loc[
    lambda data: data.function_names.apply(
        lambda l: 'balanceof' in str(l) and
                  'totalsupply' in str(l) and 
                  'transfer' in str(l) and
                  'transferfrom' in str(l) and
                  'approve' in str(l) and
                  'allowance' in str(l)
    )
]
print("#\tNumber of potential erc20 contracts found: {}".format(potential_erc20.shape[0]))

potential_non_erc20 = df.loc[
    lambda data: data.function_names.apply(
        lambda l: 'balanceof' not in str(l) and
                  'totalsupply' not in str(l) and 
                  'transfer' not in str(l) and
                  'transferfrom' not in str(l) and
                  'approve' not in str(l) and
                  'allowance' not in str(l)
    )
]
print("#\tNumber of potential non-erc20 contracts found: {}".format(potential_non_erc20.shape[0]))

print("#\tWriting the potential erc20-nonerc20 contracts into file with an equal distribution...")
write_dataset_into_file(potential_erc20, potential_non_erc20, "erc20.csv", "non-erc20.csv")

#	Creating erc20-nonerc20 dataset.
#	Number of potential erc20 contracts found: 1684
#	Number of potential non-erc20 contracts found: 1601
#	Writing the potential erc20-nonerc20 contracts into file with an equal distribution...
CPU times: user 3.16 s, sys: 186 ms, total: 3.35 s
Wall time: 3.96 s


In [16]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

# Creating training data sets

# Application Patterns

## Voting

In [7]:
%%time
print("#\tCreating voting-nonvoting dataset.")

potential_voting = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'vote' in str(l) or
                  'ballot' in str(l) or
                  'voting' in str(l)
    )
]

print("#\tNumber of potential vote contracts found: {}".format(potential_voting.shape[0]))
potential_non_voting = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'vote' not in str(l) and
                  'voting' not in str(l) and
                  'ballot' not in str(l)
    )
]
potential_non_voting = potential_non_voting.drop_duplicates(subset='function_names', keep="first")
print("#\tNumber of potential non-vote contracts found: {}".format(potential_non_voting.shape[0]))

print( "#\tWriting the potential voting-nonvoting contracts into file with an equal distribution...")
write_dataset_into_file(potential_voting, potential_non_voting, "voting.csv", "non-voting.csv")

#	Creating voting-nonvoting dataset.
#	Number of potential vote contracts found: 1113
#	Number of potential non-vote contracts found: 41436
#	Writing the potential voting-nonvoting contracts into file with an equal distribution...
CPU times: user 3.08 s, sys: 321 ms, total: 3.4 s
Wall time: 3.54 s


## Auction

In [9]:
%%time
print("#\tCreating auction-nonauciton dataset.")

potential_auction = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'auction' in str(l) and
                  'bid' in str(l)
    )
]

print("#\tNumber of potential auction contracts found: {}".format(potential_auction.shape[0]))

potential_non_auction = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'auction' not in str(l) and
                  'bid' not in str(l)
    )
]
potential_non_auction = potential_non_auction.drop_duplicates(subset='function_names', keep="first")
print("#\tNumber of potential non-auction contracts found: {}".format(potential_non_auction.shape[0]))

print("#\tWriting the potential auction-nonauction contracts into file with an equal distribution...")
write_dataset_into_file(potential_auction, potential_non_auction, "auction.csv", "non-auction.csv")

#	Creating auction-nonauciton dataset.
#	Number of potential auction contracts found: 433
#	Number of potential non-auction contracts found: 41373
#	Writing the potential auction-nonauction contracts into file with an equal distribution...
CPU times: user 1.26 s, sys: 89.1 ms, total: 1.35 s
Wall time: 1.4 s


## Entity Management

In [10]:
%%time
print("#\tCreating entitymgm-nonentitymgm dataset.")

potential_entitymgm = udf.loc[
    lambda data: data.function_names.apply(
        lambda l:(('register' in str(l) or 'add' in str(l)) and ('update' in str(l) or 'edit' in str(l)) and 'delete' in str(l))
                or ('addorg' in str(l) or 'updateorg' in str(l) or 'removeorg' in str(l))
                or ('addmember' in str(l) or 'updatemember' in str(l) or 'removemember' in str(l) or 'getmember' in str(l) and 'getmembershipprice' not in str(l))
                or 'updateplayer' in str(l) and ('addplayer' in str(l) or 'removeplayer' in str(l))
                or 'updateuser' in str(l) and ('adduser' in str(l) or ('removeuser' in str(l) or 'deleteuser' in str(l)))
    )
]

print("#\tNumber of potential entitymgm contracts found: {}".format(potential_entitymgm.shape[0]))

potential_non_entitymgm = udf.loc[
   lambda data: data.function_names.apply(
        lambda l: 'add' not in str(l) and
                  'update' not in str(l) and 
                  'delete' not in str(l) and
                  'remove' not in str(l) and 
                  'get' not in str(l) and 
                  'setowner' not in str(l)
    )
]
potential_non_entitymgm = potential_non_entitymgm.drop_duplicates(subset='function_names', keep="first")
print("#\tNumber of potential non-entitymgm contracts found: {}".format(potential_non_entitymgm.shape[0]))

print("#\tWriting the potential auction-nonauction contracts into file with an equal distribution...")
write_dataset_into_file(potential_entitymgm, potential_non_entitymgm, "entitymgm.csv", "non-entitymgm.csv")

#	Creating entitymgm-nonentitymgm dataset.
#	Number of potential entitymgm contracts found: 799
#	Number of potential non-entitymgm contracts found: 10348
#	Writing the potential auction-nonauction contracts into file with an equal distribution...
CPU times: user 2.64 s, sys: 166 ms, total: 2.8 s
Wall time: 2.87 s


## Renting

In [11]:
%%time
print("#\tCreating renting-nonrenting dataset.")

potential_renting = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: ('rent' in str(l) and 'parent' not in str(l) and 'current' not in str(l) and 'userentr' not in str(l) and
                  'rentity' not in str(l) and 'prentftoken' not in str(l) and 'thorentiumtoken' not in str(l) and 
                  'different' not in str(l) and 'cyberentry' not in str(l) and 'transferentirestake' not in str(l) and
                  'advisorentitlement' not in str(l) and 'apprentice' not in str(l) and 'torrent' not in str(l) and 
                  'childrenteam' not in str(l)) or
                  (('lease' in str(l) or 'lessee' in str(l)) and 'release' not in str(l) and 'please' not in str(l)) or
                  ('hire' in str(l) and 'kushiresul' not in str(l) and 'hiregotoken' not in str(l))
    )
]

print("#\tNumber of potential renting contracts found: {}".format(potential_renting.shape[0]))

potential_non_renting = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'rent' not in str(l) and
                  'lease' not in str(l) and
                  'hire' not in str(l)
    )
]

potential_non_renting = potential_non_renting.drop_duplicates(subset='function_names', keep="first")

print("#\tNumber of potential non-renting contracts found: {}".format(potential_non_renting.shape[0]))

print("#\tWriting the potential renting-nonrenting contracts into file with an equal distribution...")
write_dataset_into_file(potential_renting, potential_non_renting, "renting.csv", "non-renting.csv")

#	Creating renting-nonrenting dataset.
#	Number of potential renting contracts found: 60
#	Number of potential non-renting contracts found: 37919
#	Writing the potential renting-nonrenting contracts into file with an equal distribution...
CPU times: user 505 ms, sys: 23 ms, total: 528 ms
Wall time: 536 ms


## Trading

In [12]:
%%time
print("#\tCreating trading-nontrading dataset.")

potential_trading = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: (('buy' in str(l) or 'sell' in str(l) or 'purchase' in str(l)) and 'transferownership' in str(l) and 'asset' in str(l))
                  or ('trade' in str(l) and 'trademark' not in str(l))
    )
]
potential_trading = potential_trading.drop_duplicates(subset='function_names', keep="first")
print("#\tNumber of potential trading contracts found: {}".format(potential_trading.shape[0]))


potential_non_trading = udf.loc[
    lambda data: data.function_names.apply(
        lambda l: 'ownership' not in str(l) and
                  'transfer' not in str(l) and
                  'trade' not in str(l) and
                  'sell'not in str(l) and
                  'buy' not in str(l) and
                  'asset' not in str(l) and
                  'own' not in str(l)
    )
]

potential_non_trading = potential_non_trading.drop_duplicates(subset='function_names', keep="first")
print("#\tNumber of potential non-trading contracts found: {}".format(potential_non_trading.shape[0]))

print("#\tWriting the potential trading-nontrading contracts into file with an equal distribution...")
write_dataset_into_file(potential_trading, potential_non_trading, "trading_not_annotated.csv", "non-trading.csv")

#	Creating trading-nontrading dataset.
#	Number of potential trading contracts found: 1117
#	Number of potential non-trading contracts found: 3337
#	Writing the potential trading-nontrading contracts into file with an equal distribution...
CPU times: user 2.4 s, sys: 196 ms, total: 2.6 s
Wall time: 2.66 s
