# ETHEREUM FRAUD DETECTION                                                                              
##  PROJECT IN FEATURE ENGINEERING

### TRAINING AND TESTING DATASET

    Given
        History of Transactions
        List of Etherum Accounts
        Training set -> with flags(0-> normal account 1-> fraud account)
        Testing set -> To be predicted
        

### TRANSACTION DATASET

    Given 
        From account
        To account
        Transaction time -> "yyyy-mm-dd HH:MM:SS"
        Value -> ammount of ETH to transfer from sender to receipient(in WEI)( 0 -> TOKEN TRANSFER )
        gas -> ammount of computational power
        gas_price -> Price of each gas_unit

In [1]:
# Importing Pandas
import pandas as pd
# Importing Numpy
import re
import numpy as np
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

In [2]:
# Importing Train and test dataset
train = pd.read_csv("./train_accounts.csv")
test = pd.read_csv("test_accounts.csv")

In [3]:
# Importing Transaction Dataset
trans_acc = pd.read_csv("transactions.csv")

In [4]:
### Take a part of the dataset and apply the following
# trans_acc = trans_acc.sample(frac=0.01,random_state=1)

### Now, we take out all the NaN values from the dataset

In [5]:
trans_acc.shape

(5826604, 6)

In [6]:
# Theres only account whose value is null
trans_acc["value"].isnull().sum()

0

In [7]:
# Dropping that particular column -> This maybe fraudulent so use the 
trans_acc = trans_acc.dropna(axis=0)

In [8]:
trans_acc.dtypes
print(trans_acc.shape)

(5826604, 6)


### To take the token transfers away from the value

In [9]:
# Applied Binary Encoding
trans_acc["token_transfer"] = trans_acc["value"].apply(lambda row:1 if row == "0" else 0)

In [10]:
trans_acc.shape

(5826604, 7)

### Transform the value from object to float

In [11]:
trans_acc["value"] = trans_acc["value"].astype("float")

In [12]:
trans_acc.dtypes

from_account             object
to_account               object
transaction_time_utc     object
value                   float64
gas                       int64
gas_price                 int64
token_transfer            int64
dtype: object

### Transforming the Date object to Pandas Date Object

In [13]:
trans_acc["trans_time_conv"] = pd.to_datetime(trans_acc["transaction_time_utc"])

### Now we can drop the Date object

In [14]:
trans_acc = trans_acc.drop("transaction_time_utc",axis=1)

In [15]:
trans_acc.shape

(5826604, 7)

### As the Variance of value attribute is very high we convert into ether

In [16]:
trans_acc["value_log"] = trans_acc["value"].apply(lambda row : row * 1.0e-18)
trans_acc["gas_price_log"] = trans_acc["gas_price"].apply(lambda row : row * 1.0e-9)

### We can now drop the value and gasprice

In [17]:
trans_acc = trans_acc.drop(["value","gas_price"],axis=1)

### To take the Year, Month, Time from the time

In [18]:
trans_acc["month"] = trans_acc["trans_time_conv"].apply(lambda row : row.month)
trans_acc["day"] = trans_acc["trans_time_conv"].apply(lambda row : row.date().day)
trans_acc["year"] = trans_acc["trans_time_conv"].apply(lambda row : row.year)
trans_acc["hour"] = trans_acc["trans_time_conv"].apply(lambda row : row.time().hour)
trans_acc["minutes"] = trans_acc["trans_time_conv"].apply(lambda row : row.time().minute)
trans_acc["seconds"] = trans_acc["trans_time_conv"].apply(lambda row : row.time().second)

### As we have taken out the date we can now delete the trans_time_conv

In [19]:
trans_acc = trans_acc.drop(["trans_time_conv"],axis=1)

In [20]:
trans_acc.corr()

Unnamed: 0,gas,token_transfer,value_log,gas_price_log,month,day,year,hour,minutes,seconds
gas,1.0,0.185005,-0.0074,0.014938,0.008084,0.004585,0.053809,0.01456,-0.012228,0.006588
token_transfer,0.185005,1.0,-0.03767,0.038727,-0.006035,-0.002554,0.207131,-0.006075,-0.017555,0.004475
value_log,-0.0074,-0.03767,1.0,0.002357,-1.9e-05,-0.001103,-0.009571,0.001244,2.1e-05,0.000645
gas_price_log,0.014938,0.038727,0.002357,1.0,0.016553,-0.013945,0.170488,0.014445,-0.00101,0.000894
month,0.008084,-0.006035,-1.9e-05,0.016553,1.0,0.031594,-0.177706,-0.002325,0.002679,4e-06
day,0.004585,-0.002554,-0.001103,-0.013945,0.031594,1.0,-0.02661,0.001343,-0.008496,-0.002817
year,0.053809,0.207131,-0.009571,0.170488,-0.177706,-0.02661,1.0,-0.006006,-0.012938,0.002058
hour,0.01456,-0.006075,0.001244,0.014445,-0.002325,0.001343,-0.006006,1.0,-0.009465,0.000829
minutes,-0.012228,-0.017555,2.1e-05,-0.00101,0.002679,-0.008496,-0.012938,-0.009465,1.0,0.002308
seconds,0.006588,0.004475,0.000645,0.000894,4e-06,-0.002817,0.002058,0.000829,0.002308,1.0


### FEATURE ENGINEERING THROUGH ACCOUNTS AND FLAGS (FROM and TO ACCOUNTS)

In [21]:
# x = pd.DataFrame(trans_acc.pivot_table(columns=['from_account',"to_account"], aggfunc='size'),columns=["no_of transactions"])

In [22]:
trans_acc["account"] = trans_acc["from_account"]

In [23]:
trans_acc = trans_acc.drop(["from_account","to_account"],axis=1)

In [24]:
trans_acc = pd.merge(trans_acc,train)

In [25]:
trans_acc[trans_acc["flag"]==1].size

832068

In [26]:
trans_acc

Unnamed: 0,gas,token_transfer,value_log,gas_price_log,month,day,year,hour,minutes,seconds,account,flag
0,72585,1,0.000000,11.50,5,4,2020,14,54,3,a00996,0
1,73747,1,0.000000,10.35,5,5,2020,5,14,4,a00996,0
2,52747,1,0.000000,12.65,5,6,2020,8,10,14,a00996,0
3,72585,1,0.000000,23.00,5,6,2020,13,55,41,a00996,0
4,74391,1,0.000000,20.70,5,6,2020,16,8,51,a00996,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2205344,21000,0,0.080000,60.00,5,3,2020,14,33,23,a11569,0
2205345,21000,0,1.109931,3.30,5,3,2020,22,24,52,a17328,1
2205346,21000,0,2.389127,10.89,5,4,2020,8,54,34,a24708,1
2205347,21000,0,20.623694,9.90,5,4,2020,8,58,13,a04245,1


### FEATURE ENGINEERING BY CHANGING THE ACCOUNTS TO FLOAT

In [27]:
trans_acc["account"] = trans_acc["account"].astype("string")

In [28]:
def to_float(acc):
    my_str = acc[1:]
    return my_str

In [29]:
trans_acc["account"] = trans_acc["account"].apply(lambda row : to_float(row))

In [30]:
trans_acc["account"] = trans_acc["account"].astype("float")

In [31]:
Fraud = trans_acc[trans_acc['flag']==1]

Valid = trans_acc[trans_acc['flag']==0]

outlier_fraction = len(Fraud)/float(len(Valid))

## MODELLING AND PREDICTING

In [32]:
train["account"] = train["account"].apply(lambda row : to_float(row)).astype("float")

In [33]:
#Create independent and Dependent Features
columns = train.columns.tolist()
# Filter the columns to remove data we do not want 
columns = [c for c in columns if c not in ["flag"]]
# Store the variable we are predicting 
target = "flag"
# Define a random state 
state = np.random.RandomState(42)
X = train[columns]
Y = train[target]
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

(25198, 1)
(25198,)


In [34]:
classifiers = {
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X), 
                                       contamination=outlier_fraction,random_state=state, verbose=0),
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
                                              leaf_size=30, metric='minkowski',
                                              p=2, metric_params=None, contamination=outlier_fraction),
    "Support Vector Machine":OneClassSVM(kernel='rbf', degree=3, gamma=0.1,nu=0.05, 
                                         max_iter=-1)
   
}

In [35]:
n_outliers = len(Fraud)
for i, (clf_name,clf) in enumerate(classifiers.items()):
    #Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    elif clf_name == "Support Vector Machine":
        clf.fit(X)
        y_pred = clf.predict(X)
    else:    
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_pred = clf.predict(X)
    #Reshape the prediction values to 0 for Valid transactions , 1 for Fraud transactions
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Run Classification Metrics
    print("{}: {}".format(clf_name,n_errors))
    print("Accuracy Score :")
    print(accuracy_score(Y,y_pred))
    print("Classification Report :")
    print(classification_report(Y,y_pred))



Isolation Forest: 3127
Accuracy Score :
0.8759028494324946
Classification Report :
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     22743
           1       0.09      0.03      0.04      2455

    accuracy                           0.88     25198
   macro avg       0.50      0.50      0.49     25198
weighted avg       0.82      0.88      0.85     25198

Local Outlier Factor: 3095
Accuracy Score :
0.8771727914913882
Classification Report :
              precision    recall  f1-score   support

           0       0.90      0.97      0.93     22743
           1       0.11      0.04      0.05      2455

    accuracy                           0.88     25198
   macro avg       0.51      0.50      0.49     25198
weighted avg       0.83      0.88      0.85     25198

Support Vector Machine: 11750
Accuracy Score :
0.5336931502500198
Classification Report :
              precision    recall  f1-score   support

           0       0.90      0.

### PREDICTING THE MODEL

In [36]:
test = pd.read_csv("test_accounts.csv")

In [37]:
test_acc = test["account"]
test["account"] = test["account"].apply(lambda row : to_float(row)).astype("float")

In [38]:
test_pred = pd.DataFrame(clf.predict(test))

In [39]:
test_pred["flag"] = test_pred[0].apply(lambda row : 0 if row==-1 else 1)

In [40]:
test_pred = test_pred.drop(0,axis=1)

In [41]:
test["flag"] = test_pred["flag"]

In [42]:
test["account"] = test_acc

In [43]:
result = test

In [44]:
result

Unnamed: 0,account,flag
0,a27890,0
1,a29649,0
2,a28243,1
3,a07155,0
4,a15576,0
...,...,...
6295,a19941,0
6296,a09327,0
6297,a10254,1
6298,a08928,0


### save the RESULT in the csv file

In [45]:
result.to_csv('submission.csv',, encoding='utf-8')