### Imports 

In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt 
import numpy as np 
import tensorflow as tf 
from tensorflow import keras

In [2]:
df = pd.read_csv("transaction_dataset.csv", skipinitialspace=True)

In [3]:
df.shape

(9841, 51)

### Split the data 

In [4]:
def split_data(data, percentages):
    
    n_rows = data.shape[0]
    n_rows1 = np.round(percentages[0]*data.shape[0])
    n_rows2 = np.round((percentages[0]+percentages[1])*data.shape[0])
    
    n_rows1 = n_rows1.astype(np.int64)
    n_rows2 = n_rows2.astype(np.int64)

    print((percentages[0]+percentages[1])*data.shape[0])
    
    data_train = data.iloc[:n_rows1,:]
    data_validation = data.iloc[n_rows1:n_rows2,:]
    data_test = data.iloc[n_rows2:,:]
    
    return data_train, data_validation, data_test

In [5]:
df_train, df_val, df_test = split_data(df, [0.8,0.1,0.1])

8856.9


### Preprocessing 

In [6]:
df_train

Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.000000,1.683100e+07,271779.920000,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809e+00,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token
2,2,3,0x0002bda54cb772d040f779e88eb453cac0daa244,0,246194.54,2434.02,516729.30,2,10,0,...,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,8.0,,XENON
3,3,4,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,10219.60,15785.09,397555.90,25,9,0,...,100.000000,9.029231e+03,3804.076893,0.0,0.0,0.0,1.0,11.0,Raiden,XENON
4,4,5,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,36.61,10707.77,382472.42,4598,20,1,...,0.000000,4.500000e+04,13726.659220,0.0,0.0,0.0,6.0,27.0,StatusNetwork,EOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7868,7868,207,0x16c26eb6051fe13273c634cb6df70a26befa9ec3,1,754.55,310.70,24145.55,18,34,0,...,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0,,OCoin
7869,7869,208,0x1707bc69d91f86a6000d318e85d07b00747152fe,1,0.00,0.00,1330.48,1,1,0,...,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
7870,7870,209,0x1708a04876fec1ca8a482e72afb340c362d3ec20,1,0.00,0.00,0.00,0,0,0,...,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0,,Blockwell say NOTSAFU
7871,7871,210,0x1754cebea65afe5cfe90ee7d393adce1568d2ba4,1,0.00,461.94,268014.63,1,10,0,...,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,4.0,,GSENetwork


In [7]:
# we remove the columns that contains text instead of figures 
cols_to_drop = [
    'Address',
    'Index',
    'Unnamed: 0', 
    "ERC20 most sent token type",
    "ERC20_most_rec_token_type"
]

df_train = df_train.drop(cols_to_drop, axis = 1)
df_test = df_test.drop(cols_to_drop, axis = 1)
df_val = df_val.drop(cols_to_drop, axis = 1)


In [8]:
df_train

Unnamed: 0,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,Unique Received From Addresses,Unique Sent To Addresses,min value received,...,ERC20 max val rec,ERC20 avg val rec,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name
0,0,844.26,1093.71,704785.63,721,89,0,40,118,0.000000,...,1.500000e+07,265586.147600,0.000000,1.683100e+07,271779.920000,0.0,0.0,0.0,39.0,57.0
1,0,12709.07,2958.44,1218216.73,94,8,0,5,14,0.000000,...,3.650000e+02,57.632615,2.260809,2.260809e+00,2.260809,0.0,0.0,0.0,1.0,7.0
2,0,246194.54,2434.02,516729.30,2,10,0,10,2,0.113119,...,4.428198e+02,65.189009,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,8.0
3,0,10219.60,15785.09,397555.90,25,9,0,7,13,0.000000,...,1.141223e+04,1555.550174,100.000000,9.029231e+03,3804.076893,0.0,0.0,0.0,1.0,11.0
4,0,36.61,10707.77,382472.42,4598,20,1,7,19,0.000000,...,9.000000e+04,4934.232147,0.000000,4.500000e+04,13726.659220,0.0,0.0,0.0,6.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7868,1,754.55,310.70,24145.55,18,34,0,25,15,0.000010,...,3.000000e+00,3.000000,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0
7869,1,0.00,0.00,1330.48,1,1,0,1,1,0.647108,...,1.337000e+01,13.370000,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0
7870,1,0.00,0.00,0.00,0,0,0,0,0,0.000000,...,1.337000e+01,13.370000,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,1.0
7871,1,0.00,461.94,268014.63,1,10,0,9,1,0.000593,...,9.900000e+01,28.092500,0.000000,0.000000e+00,0.000000,0.0,0.0,0.0,0.0,4.0


### Training the models 

In [18]:
cols_to_drop = [
    ' erc20 most sent token type',
    ' erc20_most_rec_token_type',
    'address',
    'index',
    'unnamed: 0'
]

features = [x for x in df.columns if (x != 'flag' and x not in cols_to_drop)]

unique_values = df.nunique()
# not a good practive if we have more data
features = [x for x in features if x in unique_values.loc[(unique_values>1)]]

In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras import Input
#create model
model = Sequential()
#add model layers
model.add(Input(shape=(len(features),)))

model.add(Dense(len(features), activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 44)                1980      
                                                                 
 dense_17 (Dense)            (None, 20)                900       
                                                                 
 dense_18 (Dense)            (None, 5)                 105       
                                                                 
 dense_19 (Dense)            (None, 2)                 12        
                                                                 
Total params: 2,997
Trainable params: 2,997
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 263, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 44), found shape=(None, 45)


In [12]:
y_train = df_train["FLAG"]
X_train = df_train.drop("FLAG", axis = 1)

y_test = df_test["FLAG"]
X_test = df_test.drop("FLAG", axis = 1)

y_val = df_val["FLAG"]
X_val = df_val.drop("FLAG", axis = 1)

In [17]:
from keras.models import Sequential
from keras.layers import Dense
from keras import Input
#create model
model = Sequential()
#add model layers
model.add(Input(shape=(X_train.shape[1],)))

model.add(Dense(X_train.shape[1], activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 45)                2070      
                                                                 
 dense_13 (Dense)            (None, 20)                920       
                                                                 
 dense_14 (Dense)            (None, 5)                 105       
                                                                 
 dense_15 (Dense)            (None, 2)                 12        
                                                                 
Total params: 3,107
Trainable params: 3,107
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\losses.py", line 1664, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\tziya\anaconda3\lib\site-packages\keras\backend.py", line 4994, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 2) are incompatible
