In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the read-only input directory,
# then print one path per line.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**1.0 Data loading**

Extract the zip files and load the data into pandas DataFrames.

In [None]:
# Extract zip files
import zipfile
def extract_images(filePath, dest="."):
    """Extract every member of a zip archive into a directory.

    Despite the name, nothing here is image-specific: the archive is
    unpacked as-is (this notebook uses it for the zipped CSV files).

    Parameters
    ----------
    filePath : str or path-like
        Location of the zip archive to unpack.
    dest : str or path-like, default "."
        Directory that receives the extracted members. The default keeps
        the original behaviour of extracting into the current directory.

    Raises
    ------
    FileNotFoundError
        If ``filePath`` does not exist.
    zipfile.BadZipFile
        If ``filePath`` is not a valid zip archive.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(filePath, "r") as archive:
        archive.extractall(dest)
# Unpack the three competition archives into the current working directory.
for archive_path in (
    '/kaggle/input/facebook-recruiting-iv-human-or-bot/train.csv.zip',
    '/kaggle/input/facebook-recruiting-iv-human-or-bot/bids.csv.zip',
    '/kaggle/input/facebook-recruiting-iv-human-or-bot/test.csv.zip',
):
    extract_images(archive_path)

In [None]:
train_df = pd.read_csv('/kaggle/working/train.csv')
bids_df = pd.read_csv('/kaggle/working/bids.csv')
test_df = pd.read_csv('/kaggle/working/test.csv')
test_df.head()

In [None]:
bids_df.head()

In [None]:
train_df.head()

**2.0 Analyze Dataset**
The dataset is provided in two parts:
* train.csv: the base bidder dataset with labels. Each row contains a unique bidder_id followed by payment_account, address and outcome (the label);
* bids.csv: the bid-level dataset. Each row is identified by the unique key bid_id and contains the corresponding bidder_id, auction, merchandise, device, time, country, ip, and url. The time and url values are encrypted.

Since both files contain the key bidder_id, we first join them on this key. Our objective is to determine whether each bid was made by a human or a robot; thus, we only consider data that is labelled and has bid information. We will left join the bids onto train.csv and check the joined data for missing bid information.

In [None]:
# Populate train and test data with bid details
def populate_bids(df):
    """Attach bid records to the bidder rows of ``df``.

    Left-joins ``df`` with the notebook-level ``bids_df`` on ``bidder_id``,
    so every row of ``df`` is kept; bidders that have no bids end up with
    missing values in the bid columns.
    """
    return df.merge(bids_df, how='left', on='bidder_id')

In [None]:
test = populate_bids(test_df)
test.isna().sum()

In [None]:
train = populate_bids(train_df)
train.isna().sum()

**Data Cleaning**
Since all the missing values are in the variables that come from the bids data, we want to check whether some bidder_ids do not have any bids.

In [None]:
nobids_train = train[pd.isnull(train['bid_id'])]['bidder_id'].unique()
len(nobids_train)

In [None]:
nobids_test = test[pd.isnull(test['bid_id'])]['bidder_id'].unique()
len(nobids_test)

In [None]:
# Check outcome for these in the train
train[train['bidder_id'].isin(nobids_train)]['outcome']

There are 29 bidders in the training data set with no bid data. After checking their labels, they are all marked as human. Since we aim to detect robots, we will just ignore them. We will update the `populate_bids` function.

In [None]:
# Populate train and test data with bid details
def populate_bids_with_data_cleanup(df):
    """Join bid records onto ``df`` and drop bidders that have no bids.

    ``df`` is left-joined with the notebook-level ``bids_df`` on
    ``bidder_id``. Any bidder that owns a row with a missing ``bid_id``
    is removed from the result entirely.
    """
    merged = df.merge(bids_df, how='left', on='bidder_id')
    # Bidders for which the join produced a row without a bid.
    bidders_without_bids = merged.loc[merged['bid_id'].isna(), 'bidder_id'].unique()
    has_bids = ~merged['bidder_id'].isin(bidders_without_bids)
    return merged[has_bids]

In [None]:
test_data = populate_bids_with_data_cleanup(test_df)
test_data.isna().sum()

In [None]:
train_data = populate_bids_with_data_cleanup(train_df)
train_data.isna().sum()

In [None]:
# check the missing value ratio of the country entry in train data
missing_country_train = train_data['country'].isna().sum()
count_miss_ratio_train = missing_country_train / len(train_data) * 100
print(" %0.2f %% observations missing country entry in train data."
      % float(count_miss_ratio_train))

In [None]:
# check the missing value ratio of the country entry in test data
missing_country_test = test_data['country'].isna().sum()
count_miss_ratio_test = missing_country_test / len(test_data) * 100
print(" %0.2f %% observations missing country entry in test data."
      % float(count_miss_ratio_test))

For the full data set, only a few observations are missing the country entry. We will simply drop the observations with a missing country.

In [None]:
# Populate train and test data with bid details
def populate_bids_with_data_cleanup_final(df):
    """Join bid records onto ``df`` and return only complete rows.

    Steps:
        1. left-join ``df`` with the notebook-level ``bids_df`` on
           ``bidder_id``;
        2. remove every bidder that owns a row with a missing ``bid_id``;
        3. drop every remaining row that has a missing value in any column
           (in this notebook that is the rows without a country).
    """
    merged = df.merge(bids_df, how='left', on='bidder_id')
    bidders_without_bids = merged.loc[merged['bid_id'].isna(), 'bidder_id'].unique()
    with_bids = merged[~merged['bidder_id'].isin(bidders_without_bids)]
    return with_bids.dropna()

In [None]:
train = populate_bids_with_data_cleanup_final(train_df)
train.isna().sum()

In [None]:
test = populate_bids_with_data_cleanup_final(test_df)
test.isna().sum()

**3.0 Data Exploration**

In [None]:
# check the unique number of each feature in train
print(f"total row in train:          {len(train)}")
# (message prefix, column) pairs; the prefixes carry their own padding.
unique_count_labels = (
    ("total bids in train:         ", 'bid_id'),
    ("total bidder in train:      ", 'bidder_id'),
    ("total payment in train:     ", 'payment_account'),
    ("total address in train:     ", 'address'),
    ("total auction in train:     ", 'auction'),
    ("total merchandise in train: ", 'merchandise'),
    ("total device in train:      ", 'device'),
    ("total country in train:     ", 'country'),
    ("total ip in train:          ", 'ip'),
    ("total url in train:         ", 'url'),
)
for prefix, column in unique_count_labels:
    print(f"{prefix}{len(train[column].unique())}")

From this output, we can see that payment_account and address are in one-to-one correspondence with bidder_id. Since our outcome, whether a bid is made by a human or a robot, is labelled per bidder, we can ignore these two features in further analysis. Since the numbers of distinct merchandise and country values are comparatively low, we can treat each of them directly as a feature for analysis.

For the other features, let's check the ratios to robots and human. Before that, let's check the robots vs. human first.

In [None]:
# Count the bids of each class once and derive the ratio from the data, so
# the printed ratio cannot drift away from the counts printed above it.
n_bot_bids = int((train['outcome'] == 1).sum())
n_human_bids = int((train['outcome'] == 0).sum())
print(f"total bids made by robots in data set: {n_bot_bids}")
print(f"total bids made by human in data set:  {n_human_bids}")
if n_bot_bids:
    # Expressed as 1:x, i.e. x human bids for every robot bid.
    print("the ratio of bids made by robot vs. bids made by human in data set: "
          f"1:{n_human_bids / n_bot_bids:.1f}")
else:
    print("no bids made by robots in data set")

The training data set has an unbalanced number of robots and humans:

We might need to do resampling before training the model.
For the evaluation metrics, we will focus on AUC and precision vs. recall.
Next, we check the per-feature ratios for robots and humans:

In [None]:
def get_metrics(bid):
    """Print the average number of bids per distinct entity, by outcome.

    For each of bidder, auction, device, ip and url, the number of rows
    with ``outcome == 1`` (robots) is divided by the number of distinct
    values of that column among those rows, and the same is then done for
    ``outcome == 0`` (humans). One line is printed per (column, outcome)
    pair; nothing is returned.
    """
    robot_bids = bid[bid['outcome'] == 1]
    human_bids = bid[bid['outcome'] == 0]

    # (column, robot message, human message); messages are %-templates.
    report = (
        ('bidder_id',
         "average bids per robot: %.0f",
         "average bids per human:  %.0f"),
        ('auction',
         "average bids per auction by robot: %.0f",
         "average bids per auction by human:  %.0f"),
        ('device',
         "average bids per device by robot: %.0f",
         "average bids per device by human:  %.0f"),
        ('ip',
         "average bids per ip by robots: %.2f",
         "average bids per ip by human:  %.2f"),
        ('url',
         "average bids per url by robot: %.2f",
         "average bids per url by human:  %.2f"),
    )
    for column, robot_template, human_template in report:
        for subset, template in ((robot_bids, robot_template),
                                 (human_bids, human_template)):
            print(template % float(len(subset) / len(subset[column].unique())))

get_metrics(train)

From the above, we can see a distinct difference between bids made by humans and bots per auction and per device. Features related to the number of bids, auctions and devices draw our attention.

**4. Feature Engineering**

In [None]:
# bidding time difference per user (bidder_id)
def calculate_bidding_time_difference_per_user(df):
    """Return a time-sorted copy of ``df`` with a per-bidder ``timediffs`` column.

    ``timediffs`` is the difference between a row's ``time`` and the
    ``time`` of the same bidder's previous row in time order. The first
    row of every bidder has no predecessor and therefore gets NaN.
    The input frame is not modified.
    """
    ordered = df.sort_values(by=['time'])
    ordered['timediffs'] = ordered.groupby('bidder_id')['time'].diff()
    return ordered

In [None]:
# number of bids a user made per auction
def bids_user_made_per_auction(df):
    """Count the rows of ``df`` for every (auction, bidder) pair.

    Returns a one-column DataFrame indexed by ``(auction, bidder_id)``;
    the count column is unnamed and therefore carries the label ``0``.
    """
    return pd.DataFrame(df.groupby(['auction', 'bidder_id']).size())

In [None]:
# proportion of bots for each country
def proportion_of_bots_for_each_country(df):
    """Share of rows with ``outcome == 1`` within every country.

    Countries that have no ``outcome == 1`` rows get a share of 0.
    Returns a one-column DataFrame indexed by ``country``; the column is
    unnamed and therefore carries the label ``0``.
    """
    is_bot = df['outcome'] == 1
    bot_rows = df[is_bot].groupby('country').size()
    all_rows = df.groupby('country').size()
    # Countries missing from bot_rows yield NaN after alignment -> 0.
    share = bot_rows.div(all_rows).fillna(0)
    return share.to_frame()

In [None]:
# proportion of bots per device
def proportion_of_bots_per_device(df):
    """Share of rows with ``outcome == 1`` within every device.

    Devices that have no ``outcome == 1`` rows get a share of 0.
    Returns a one-column DataFrame indexed by ``device``; the column is
    unnamed and therefore carries the label ``0``.
    """
    is_bot = df['outcome'] == 1
    bot_rows = df[is_bot].groupby('device').size()
    all_rows = df.groupby('device').size()
    # Devices missing from bot_rows yield NaN after alignment -> 0.
    share = bot_rows.div(all_rows).fillna(0)
    return share.to_frame()

In [None]:
# number of unique ip to number of bids ratio
def number_of_unique_ip_to_number_of_bids_ratio(df):
    """Distinct ips divided by distinct bid ids, per bidder.

    Returns a one-column DataFrame indexed by ``bidder_id``; the ratio
    column is unnamed and therefore carries the label ``0``.
    """
    per_bidder = df.groupby('bidder_id')
    distinct_ips = per_bidder['ip'].nunique()
    distinct_bids = per_bidder['bid_id'].nunique()
    return distinct_ips.div(distinct_bids).to_frame()

In [None]:
from scipy import stats

def ent(data):
    """
    Entropy of the empirical distribution of the values in ``data``.

    Parameters
    ----------
    data : pandas.Series
        Observations whose value frequencies define the distribution.

    Returns
    -------
    output: float
        Entropy as computed by ``scipy.stats.entropy`` with its default
        settings.

    Notes
    -----
    The relative frequency of every distinct value (its count divided by
    ``len(data)``) is passed to ``scipy.stats.entropy``.
    """
    frequencies = data.value_counts()
    probabilities = frequencies / len(data)
    return stats.entropy(probabilities)

In [None]:
# mean per auction url entropy for each user
def mean_per_auctio_url_entropy_for_each_user(df):
    """Average, per bidder, of the url entropy of each of their auctions.

    ``df`` is a DataFrame with ``auction``, ``bidder_id`` and ``url``
    columns. ``ent`` is applied to the urls of every (auction, bidder)
    pair, and the resulting values are averaged per bidder. The result is
    a DataFrame with the columns ``bidder_id`` and ``url`` (the latter
    holding the mean entropy).
    """
    entropy_per_pair = df.groupby(['auction', 'bidder_id'])['url'].apply(ent)
    mean_per_bidder = entropy_per_pair.groupby(level='bidder_id').mean()
    return mean_per_bidder.reset_index()

In [None]:
def feature_engineering(df):
    """Append the engineered bid features to a labelled, bid-level frame.

    The following columns are appended, in this order: ``timediffs``,
    ``bids_per_auction``, ``pbots_country``, ``pbots_device``,
    ``ip_bids_ratio`` and ``auction_url_entropy``.

    The feature helpers return their values under a generic label (``0``
    or ``url``). Merging those frames unchanged makes pandas add
    ``_x``/``_y`` suffixes; the suffixed labels collide on the fourth merge
    (recent pandas versions raise ``MergeError`` for that) and the caller
    has to rename the columns by position afterwards. Each feature is
    therefore given its final name before it is merged.

    Parameters
    ----------
    df : pandas.DataFrame
        Bid-level rows with at least the columns ``bidder_id``,
        ``auction``, ``time``, ``country``, ``device``, ``ip``, ``bid_id``,
        ``url`` and ``outcome`` (the bot proportions need the label).

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by ``time``, with the six feature columns
        appended after the original ones.
    """
    def _named(feature, name):
        # Every helper keeps its values in the last column of its result.
        return feature.rename(columns={feature.columns[-1]: name})

    # (helper, merge key(s), final column name), in output column order.
    feature_steps = (
        (bids_user_made_per_auction, ['auction', 'bidder_id'], 'bids_per_auction'),
        (proportion_of_bots_for_each_country, 'country', 'pbots_country'),
        (proportion_of_bots_per_device, 'device', 'pbots_device'),
        (number_of_unique_ip_to_number_of_bids_ratio, 'bidder_id', 'ip_bids_ratio'),
        (mean_per_auctio_url_entropy_for_each_user, 'bidder_id', 'auction_url_entropy'),
    )

    # Reuse the dedicated helper instead of repeating its two statements.
    df = calculate_bidding_time_difference_per_user(df)
    for build_feature, keys, name in feature_steps:
        df = pd.merge(df, _named(build_feature(df), name), on=keys, how='left')
    return df

In [None]:
train = feature_engineering(train)
train.isnull().sum()

In [None]:
train.columns

In [None]:
# set column names
# NOTE: the labels are assigned purely by position. The list holds 18 names:
# the 12 joined train/bids columns, then timediffs, then the five features in
# the order feature_engineering merges them. Compare with the `train.columns`
# output of the previous cell before trusting the labels.
train.columns = ['bidder_id', 'payment_account', 'address', 'outcome',
               'bid_id', 'auction', 'merchandise', 'device', 'time', 'country',
               'ip', 'url', 'timediffs', 'bids_per_auction', 'pbots_country', 'pbots_device',
               'ip_bids_ratio', 'auction_url_entropy']
train.head()

In [None]:
# Replace missing time gaps with 0 so the column has no NaN left.
train['timediffs'] = train['timediffs'].fillna(0)
train.head()

**5. Data Visualization**
To check whether the new features could help us distinguish humans from robots, we plot the distribution of each feature for the two types of bidders.

In [None]:
!pip install beautifultable

In [None]:
# data visualization package
import matplotlib.pyplot as plt
import seaborn as sns
# from ggplot import *
from beautifultable import BeautifulTable
# ignore the warning
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
bots = train.loc[train.outcome == 1]
human = train.loc[train.outcome == 0]
bots.shape, human.shape

In [None]:
# Kernel density of every engineered feature, bots (blue) vs. humans (red).
# Changes from the copy-pasted version:
#   * the first panel used the whole `train` frame instead of `bots`;
#   * each panel now has its own title and legend (the single plt.legend call
#     only labelled the last panel);
#   * sns.distplot is deprecated, and with hist=False it only drew the KDE,
#     so sns.kdeplot is called directly.
feature_panels = ['bids_per_auction', 'pbots_country', 'pbots_device',
                  'ip_bids_ratio', 'auction_url_entropy', 'timediffs']

fig, axes = plt.subplots(3, 2, figsize=(14, 12), sharex=False)
# axes.flat walks the grid row by row, which keeps the original panel layout.
for ax, feature in zip(axes.flat, feature_panels):
    sns.kdeplot(bots[feature], color='darkblue', linewidth=1.5,
                label='bots', ax=ax)
    sns.kdeplot(human[feature], color='darkred', linewidth=1.5,
                label='human', ax=ax)
    ax.set_title(feature)
    ax.legend()
plt.show()

The variable timediffs (the time difference between consecutive bids of a user) shows a similar distribution for robots and humans. It also has no clear correlation with our outcome. Therefore, this variable may not be very helpful when fitting a model; on the contrary, we may even run into overfitting problems if we incorporate it. We will therefore ignore the variable timediffs during the modeling process.


In [None]:
# Plot the correlation matrix for the numerical values
# Only numeric columns can be correlated; selecting them explicitly also
# works on pandas versions where DataFrame.corr no longer drops string
# columns silently.
corr_matrix = train.select_dtypes(include='number').corr()
# Plot the correlation matrix itself. The previous code called .corr() on it
# again and therefore showed the correlation *of the correlations*.
# (fmt was dropped: it only affects annotations, which are not drawn.)
sns.heatmap(corr_matrix,
            xticklabels=corr_matrix.columns,
            yticklabels=corr_matrix.columns,
            cmap="Blues")

This plot is the correlation matrix heatmap for our engineered features. Dark blue in this heatmap means that two variables are highly positively correlated with each other, while white indicates that two features are highly negatively correlated with each other. From this plot, we can infer that the proportion of bots per device, the proportion of bots for each country and the ratio of the number of unique ips to the number of bids are highly positively correlated with our outcome, while the mean per-auction url entropy for each user is highly negatively correlated with our outcome. However, the time difference per user does not show a clear correlation with our outcome.


**6. Process imbalanced data**

In [None]:
# for train and test data set split
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Split by bidder instead of by row. The label and two of the features
# (ip_bids_ratio, auction_url_entropy) are constant per bidder, so a row-level
# split puts bids of the same bidder on both sides and the hold-out score
# mostly measures how well a bidder is recognised again.
# test_size is therefore the share of *bidders* held out, and the seed makes
# the split reproducible.
# NOTE: pbots_country / pbots_device were computed from the labels of the
# whole frame before this split, so they still carry label information into
# the hold-out set.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(train, groups=train['bidder_id']))
bid_train, bid_test = train.iloc[train_idx], train.iloc[test_idx]

**Undersampling**

From the data preprocessing part, we saw that the ratio of robot bids to human bids is 3:20; thus, we need to undersample the human samples in the training set to keep the data balanced.



In [None]:
# Balance train data
bots_train = bid_train.loc[bid_train.outcome == 1]
human_train = bid_train.loc[bid_train.outcome == 0]
# Draw as many human rows as there are bot rows. The fixed seed keeps the
# sample, and everything trained on it, identical across runs.
human_sample = human_train.sample(n=len(bots_train), random_state=42)
bid_train_balance = pd.concat([bots_train, human_sample])
bid_train_balance.head()

In [None]:
# Model inputs: the five engineered features (timediffs is left out).
train_columns = ['bids_per_auction', 'pbots_country', 'pbots_device',
                 'ip_bids_ratio', 'auction_url_entropy']

# Balanced rows for fitting, untouched hold-out rows for evaluation.
X_train, y_train = bid_train_balance[train_columns], bid_train_balance['outcome']
X_test, y_test = bid_test[train_columns], bid_test['outcome']
X_train.shape, y_train.shape

**7. Modeling**

**First Model : Decision Tree**

In [None]:
# for decision tree model
from sklearn import tree
# for grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# hyperparameter tuning
dt = tree.DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 6),
    'max_leaf_nodes': range(10, 15),
    'min_samples_split': range(2, 6)
}

dt_cv = GridSearchCV(estimator=dt,
                     param_grid=param_grid,
                     cv=5)
dt_cv.fit(X_train, y_train)
print(dt_cv.best_params_)

In [None]:
import time
# fit the model
# Decision Tree
start_time = time.time()
kwargs_regularize = dict(criterion='gini',
                         max_depth=5,
                         max_leaf_nodes=14,
                         min_samples_split=2)
# Fixed seed: ties between equally good splits are broken randomly.
dt = tree.DecisionTreeClassifier(random_state=42, **kwargs_regularize)
dt.fit(X_train, y_train)

dt_time = time.time() - start_time
# Report the stored value, so the printed time is the one used in the
# model comparison later on.
print("--- %s seconds ---" % dt_time)


**Tree Visualization**

In [None]:
%pip install pydotplus

In [None]:
# for decision tree visualizaiton
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
export_graphviz(dt, out_file=dot_data,
                filled=True, rounded=True,
                feature_names=X_train.columns.values,
                class_names=['human', 'bot'],
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Check feature importance and display in bar plot.
print('Feature importance of Decision Tree Model')
plt.style.use('ggplot')
fig = plt.figure(figsize=(5, 5))
feat_importances = pd.Series(dt.feature_importances_, index=X_train.columns)
feat_importances.nsmallest(5).plot(kind='barh', alpha=0.7)
fig.savefig('dt_feature.png')

**Evaluation Metric**

In [None]:
# predict
y_dt_pred = dt.predict(X_test)

**Accuracy**

In [None]:
# for evaluation metric
# accuracy
from sklearn.metrics import accuracy_score
# accuracy score
print(f"Decision Tree Accuracy: {accuracy_score(y_dt_pred, y_test):.3f}")

**AUC**

In [None]:
# AUC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pylab import rcParams

# Plot ROC in one graph
y_dt_score = dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _dt = roc_curve(y_test, y_dt_score)
roc_dt_auc = auc(fpr_dt, tpr_dt)

fig = plt.figure(figsize=(5, 5))
plt.plot(fpr_dt, tpr_dt, label='DT ROC curve (area = %0.2f)' % roc_dt_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.005])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
fig.savefig('roc_dt_auc.png')

**Precision vs Recall**

In [None]:
print('Classification Report of Decision Tree Model')
print(classification_report(y_test, y_dt_pred))

**Second Model: Random Forest**

In [None]:
# for random search
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50)
param_grid = {
    'max_depth': range(3, 6),
    'max_leaf_nodes': range(8, 12),
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', and
    # current scikit-learn releases reject it, so every fit using it fails.
    'max_features': ['sqrt', 'log2']
}

rf_cv = GridSearchCV(estimator=rf,
                     param_grid=param_grid,
                     cv=5)
rf_cv.fit(X_train, y_train)
print(rf_cv.best_params_)

In [None]:
start_time = time.time()

# Fixed seed: bootstrap samples and feature subsets are drawn randomly.
rf = RandomForestClassifier(n_estimators=100, max_depth=5,
                            max_leaf_nodes=11, max_features='log2',
                            bootstrap=True, oob_score=True,
                            random_state=42)
rf.fit(X_train, y_train)

rf_time = time.time() - start_time
# Report the stored value, so the printed time is the one used in the
# model comparison later on.
print("--- %s seconds ---" % rf_time)

In [None]:
# Check feature importance and display in bar plot.
print('Feature importance of Random Forest Model')
plt.style.use('ggplot')
fig = plt.figure(figsize=(5, 5))
feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_importances.nsmallest(5).plot(kind='barh', alpha=0.7)
fig.savefig('rf_feature.png')

**Evaluation Metric**

Accuracy

In [None]:
y_rf_pred = rf.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_rf_pred, y_test):.3f}")

AUC

In [None]:
# Plot ROC in one graph
y_rf_score = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _rf = roc_curve(y_test, y_rf_score)
roc_rf_auc = auc(fpr_rf, tpr_rf)

# Bind the new figure to `fig`. Without the assignment, `fig` still refers to
# the figure of an earlier cell and savefig writes that one to disk.
fig = plt.figure(figsize=(5, 5))
plt.plot(fpr_rf, tpr_rf, label='RF ROC curve (area = %0.2f)' % roc_rf_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.005])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before showing: some backends release the figure on show().
fig.savefig('roc_rf_auc.png')
plt.show()

Precision vs. Recall

In [None]:
print('Classification Report of Random Forest Model')
print(classification_report(y_test, y_rf_pred))

**Third Model: Gradient Boosting**

In [None]:
# for gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=10)
param_grid = {
    'max_depth': range(3, 6),
    'max_leaf_nodes': range(8, 11)
}

gb_cv = GridSearchCV(estimator=gb,
                     param_grid=param_grid,
                     cv=5)
gb_cv.fit(X_train, y_train)
print(gb_cv.best_params_)

In [None]:
start_time = time.time()

# Fixed seed: max_features='sqrt' makes every split consider a random
# subset of the features.
gb = GradientBoostingClassifier(n_estimators=100, max_depth=5, max_features='sqrt',
                                max_leaf_nodes=9, random_state=42)
gb.fit(X_train, y_train)

gb_time = time.time() - start_time
# Report the stored value, so the printed time is the one used in the
# model comparison later on.
print("--- %s seconds ---" % gb_time)

In [None]:
# Check feature importance and display in bar plot.
print('Feature importance of Gradient Boosting Model')
plt.style.use('ggplot')
fig = plt.figure(figsize=(5, 5))
feat_importances = pd.Series(gb.feature_importances_, index=X_train.columns)
feat_importances.nsmallest(5).plot(kind='barh', alpha=0.7)
fig.savefig('gb_feature.png')

**Evaluation Metrics**

Accuracy

In [None]:
y_gb_pred = gb.predict(X_test)
print(f"Gradient Boosting Accuracy: {accuracy_score(y_gb_pred, y_test):.3f}")

AUC

In [None]:
# Plot ROC in one graph
y_gb_score = gb.predict_proba(X_test)[:, 1]
fpr_gb, tpr_gb, _gb = roc_curve(y_test, y_gb_score)
roc_gb_auc = auc(fpr_gb, tpr_gb)

# Bind the new figure to `fig`. Without the assignment, `fig` still refers to
# the figure of an earlier cell and savefig writes that one to disk.
fig = plt.figure(figsize=(5, 5))
plt.plot(fpr_gb, tpr_gb, label='GB ROC curve (area = %0.2f)' % roc_gb_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.005])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before showing: some backends release the figure on show().
fig.savefig('roc_gb_auc.png')
plt.show()

Precision vs. Recall

In [None]:
print('Classification Report of Gradient Boosting Model')
print(classification_report(y_test, y_gb_pred))

**8 Compare Different Models**

In [None]:
# for model comparison
from sklearn import metrics


def _score_model(y_true, y_pred, y_score, fit_seconds):
    """Return [accuracy, precision, recall, F1, ROC AUC, fit time] for one model.

    ``y_pred`` holds the hard class predictions and ``y_score`` the predicted
    probability of the positive class. ROC AUC is computed from ``y_score``:
    with hard predictions the curve has a single operating point, and the
    value would not match the ROC curves plotted for each model.
    """
    return [metrics.accuracy_score(y_true, y_pred),
            metrics.precision_score(y_true, y_pred),
            metrics.recall_score(y_true, y_pred),
            metrics.f1_score(y_true, y_pred),
            metrics.roc_auc_score(y_true, y_score),
            fit_seconds]


dt_metrics = _score_model(y_test, y_dt_pred, y_dt_score, dt_time)
rf_metrics = _score_model(y_test, y_rf_pred, y_rf_score, rf_time)
gb_metrics = _score_model(y_test, y_gb_pred, y_gb_score, gb_time)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
index = np.arange(5)
width = 0.2
b1 = plt.bar(index, dt_metrics[0:5], width,
             alpha=0.4, color='grey', label='decision tree')
b2 = plt.bar(index+width, rf_metrics[0:5], width,
             alpha=0.8, color='powderblue', label='random forest')
b3 = plt.bar(index+2*width, gb_metrics[0:5], width,
             alpha=0.8, color='pink', label='gradient boosting')
plt.title('Model Comparison')
plt.ylabel('score')
plt.xticks(index+width, ('accuracy', 'precision', 'recall', 'F1', 'ROC AUC'))
plt.legend(loc=8, ncol=3, mode="expand", borderaxespad=0.)
plt.show()
fig.savefig('model_comparison.png')

In [None]:
label = ["Accuracy_score", "Precision_score",
         "Recall_Score", "F1_score", "ROC_AUC_score", "Time(s)"]
# One row per model, one column per metric ...
table = pd.DataFrame([dt_metrics, rf_metrics, gb_metrics],
                     index=['Decision Tree', 'Random Forest', 'Gradient Boosting'],
                     columns=label)
# ... displayed with the metrics as rows and the models as columns.
table.transpose().round(3)

**Conclusion**

From the feature importance, we can see that in online bidding, humans are more likely to arrive at a given auction from a random url, whereas robots are more likely to come from a few specific urls. Compared with humans, robots are also more likely to use many different ip addresses for online bidding. Thus, if a user uses multiple ip addresses and comes from only a few specific urls, we might consider it a robot.