In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
bids_df = pd.read_csv('../input/facebook-recruiting-iv-human-or-bot/bids.csv.zip')
train_df = pd.read_csv('../input/facebook-recruiting-iv-human-or-bot/train.csv.zip')
test_df = pd.read_csv('../input/facebook-recruiting-iv-human-or-bot/test.csv.zip')

# Data Cleaning

In [None]:
# Preview the first rows of the bids table.
bids_df.head()

In [None]:
# Preview the first rows of the labelled bidders table.
train_df.head()

### Missing values

In [None]:
# (rows, columns) of the bids table.
bids_df.shape

In [None]:
# Number of missing values in each column of the bids table.
bids_df.isnull().sum()

In [None]:
# Share of bids with a missing country: the mean of the boolean null mask.
missing_percent = bids_df['country'].isnull().mean()
# The original format spec ": .2f" used the space sign flag, which printed a
# stray extra space before the number; plain ":.2f" gives the intended output.
print(f"Percentage of missing data in country column: {missing_percent * 100:.2f}%")

We find that only the country column has missing values and it is only a small proportion of the entire dataset.

In [None]:
# Number of bids per country (value_counts sorts most frequent first and ignores NaN).
unique_countries = bids_df['country'].value_counts()

fig, ax = plt.subplots(figsize=(18, 4.8))
countries_distribution = sns.barplot(
    x=unique_countries.index,
    y=unique_countries.values,
    ax=ax,
)
# Hide the x axis: the plot is about the shape of the distribution, not the labels.
ax.xaxis.set_visible(False)

plt.show()

We check the distribution of the countries and find that one country appears significantly more often than the rest. Hence, mode imputation may be a good approach.

In [None]:
# Replace missing countries with the most frequent country value.
most_common_country = bids_df['country'].mode()[0]
bids_df['country'] = bids_df['country'].fillna(most_common_country)

# Feature Engineering

### Features of unique counts using .nunique()

In [None]:
# For every bidder, count the distinct values found in each other column of bids_df.
bidder_unique = bids_df.groupby("bidder_id").nunique().reset_index()
bidder_unique.head()

In [None]:
# Feature name given to the distinct-value count of each raw bids column.
unique_count_names = {
    'bid_id': 'num_bids',
    'auction': 'num_auct',
    'merchandise': 'num_merch_type',
    'device': 'num_device_type',
    'time': 'num_time',
    'country': 'num_ctry',
    'ip': 'num_ip',
    'url': 'num_url',
}


def attach_unique_counts(bidders):
    """Left-join the per-bidder distinct counts from bidder_unique onto `bidders`.

    The payment_account and address columns are dropped when present, the
    count columns are renamed to their feature names, and every missing
    value (e.g. bidders absent from bidder_unique) is replaced by 0.
    """
    merged = bidders.merge(bidder_unique, how='left', on='bidder_id')
    merged = merged.drop(columns=['payment_account', 'address'], errors='ignore')
    return merged.rename(columns=unique_count_names).fillna(0)


train_set = attach_unique_counts(train_df)
test_set = attach_unique_counts(test_df)

train_set.head()

### Features generated from first-differencing using .diff()

The competition mentioned that the relative order and scales of time are preserved. Hence, it can be useful for comparisons between behavior using time.

In [None]:
# Order the bids chronologically within each bidder (sort_values returns a new frame).
time_df = bids_df.sort_values(['bidder_id', 'time'])
time_df.head(10)

We can generate features by looking at the time difference between every two consecutive bids of each bidder. A bidder's first bid has no previous bid to take a difference with, so its value is missing. As these missing values do not have any interpretation, we drop them.

In [None]:
# Time elapsed since the same bidder's previous bid; NaN on each bidder's first bid.
firstdiff = time_df.groupby('bidder_id')[['time']].diff()
firstdiff.head(10)

In [None]:
# Attach the gaps to the sorted bids (assignment aligns on the shared row index).
time_df['first_diff'] = firstdiff
# Keep only bidder and gap, discarding rows where the gap is undefined.
firstdiff_feat = time_df[['bidder_id', 'first_diff']].dropna()
firstdiff_feat.head()

We can generate some features using common aggregate functions such as mean, median, minimum and maximum.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the bid gaps per bidder.
bid_intervals = firstdiff_feat.groupby('bidder_id')[['first_diff']].describe().reset_index()
# Drop the outer 'first_diff' column level; this leaves the bidder_id column with an empty name.
bid_intervals = bid_intervals.droplevel(axis=1, level=0)
# Restore the bidder_id name, rename the statistics to feature names and replace NaN with 0
# (presumably the std of bidders with a single gap — TODO confirm).
bid_intervals = bid_intervals.rename(columns = {'': 'bidder_id', 'mean': 'mean_diff', 'std': 'std_diff', '50%': 'median_diff', 'min': 'min_diff', 'max': 'max_diff'}).fillna(0)
# Interquartile range of the gaps.
bid_intervals['iqr_diff'] = bid_intervals['75%'] - bid_intervals['25%']
# The quartiles and the count are not kept as features.
bid_intervals = bid_intervals.drop(['25%', '75%', 'count'], axis = 1)
bid_intervals.head()

When adding the above features to our training and test datasets, we fill missing values for the concurrent bids with 0 simply because the bidders do not have any.

For the first-difference features, we fill the missing values with the median values because these bidders do not have any bid information.

In [None]:
# Bidders missing from bid_intervals get NaN from the left join; those gaps
# are filled with the column medians.
# numeric_only=True is required because the frames still carry the string
# column bidder_id, on which DataFrame.median() raises TypeError in
# pandas >= 2.0 (older versions silently skipped it).
train_set = train_set.merge(bid_intervals, on='bidder_id', how='left')
train_set = train_set.fillna(train_set.median(numeric_only=True))

test_set = test_set.merge(bid_intervals, on='bidder_id', how='left')
test_set = test_set.fillna(test_set.median(numeric_only=True))

One interesting observation is that some bidders have zero lag time between two bids. Intuitively, bots may be able to achieve this more than an average human, so we can look at the values where the time difference equals 0. We label such bids as concurrent bids.

In [None]:
# Gaps of exactly zero: bids placed at the same timestamp as the bidder's previous bid.
zero_gap_bids = firstdiff_feat.loc[firstdiff_feat['first_diff'].eq(0)]
concurrent_bids = (
    zero_gap_bids.groupby('bidder_id')
    .count()
    .rename(columns={'first_diff': 'num_concurrent_bids'})
    .reset_index()
)
concurrent_bids.head()

In [None]:
# Bidders absent from concurrent_bids get NaN from the left join, replaced by 0.
train_set = train_set.merge(concurrent_bids, how='left', on='bidder_id')
train_set = train_set.fillna(0)

test_set = test_set.merge(concurrent_bids, how='left', on='bidder_id')
test_set = test_set.fillna(0)

### Features generated from time using .first()

Time can be used to indicate the behavior of a bidder relative to the others. The easiest comparison is the number of times a bidder is the first or last in an auction. Intuitively, we fill missing values with 0.

In [None]:
# Opening bid of every auction: order bids by time within each auction and take
# each column's first non-null value per auction.
opening_bids = (
    bids_df.sort_values(['auction', 'time'])
    .groupby('auction')
    .first()
    .reset_index()
)
# Number of auctions in which each bidder placed the opening bid.
first_bid = (
    opening_bids.groupby('bidder_id')['bid_id']
    .count()
    .reset_index()
    .rename(columns={'bid_id': 'num_first_bid'})
)
first_bid.head()

In [None]:
# Closing bid of every auction: same as the opening bid, but with time sorted
# in descending order inside each auction.
closing_bids = (
    bids_df.sort_values(['auction', 'time'], ascending=[True, False])
    .groupby('auction')
    .first()
    .reset_index()
)
# Number of auctions in which each bidder placed the closing bid.
last_bid = (
    closing_bids.groupby('bidder_id')['bid_id']
    .count()
    .reset_index()
    .rename(columns={'bid_id': 'num_last_bid'})
)
last_bid.head()

In [None]:
# Join the opening-bid and then the closing-bid counts; bidders without any get 0.
for bid_counts in (first_bid, last_bid):
    train_set = train_set.merge(bid_counts, on='bidder_id', how='left').fillna(0)
    test_set = test_set.merge(bid_counts, on='bidder_id', how='left').fillna(0)

### Features by other hypotheses

##### Ratio of bids in first half to second half of auction

We define the duration of the auction to be the difference between the first and last bid of that auction. We assume that the bots will bid more towards the end of an auction because bidding early does not ensure that the bots will not be outbid.

In [None]:
# Latest and earliest bid time of every auction.
# The aggregations are passed by name ('max', 'min'): passing the Python
# builtins max/min to agg is deprecated in pandas >= 2.1. No sorting is needed
# because max/min do not depend on row order.
auct_duration = (
    bids_df.groupby('auction')['time']
    .agg(['max', 'min'])
    .reset_index()
)
# Auction duration = time between its first and last bid.
auct_duration['auct_duration'] = auct_duration['max'] - auct_duration['min']
auct_duration.head()

In [None]:
# Every bid (bidder, auction, time) ordered by auction and time, annotated with
# its auction's max/min bid time and duration.
bid_times = bids_df[['bidder_id', 'auction', 'time']].sort_values(['auction', 'time'])
time_ratio = bid_times.merge(auct_duration, how='left', on='auction')
time_ratio.head()

In [None]:
# Bid time shifted back by half of the auction's duration.
time_ratio['temp'] = time_ratio['time'] - time_ratio['auct_duration'] / 2
time_ratio.head()

In [None]:
# True when time - duration / 2 < first bid time, i.e. the bid falls before the auction's midpoint.
time_ratio['firsthalf'] = time_ratio['temp'] < time_ratio['min']
time_ratio.head()

In [None]:
# Per bidder: total number of bids and number of first-half bids (sum of the
# boolean flag). 'sum' is passed by name because handing the Python builtin
# sum to agg is deprecated in pandas >= 2.1.
ratio_firsthalf = (
    time_ratio.groupby('bidder_id')['firsthalf']
    .agg(['count', 'sum'])
    .reset_index()
    .rename(columns={'count': 'num_total_bids', 'sum': 'num_firsthalf_bids'})
)
ratio_firsthalf['num_secondhalf_bids'] = ratio_firsthalf['num_total_bids'] - ratio_firsthalf['num_firsthalf_bids']
ratio_firsthalf['percent_firsthalf_bids'] = ratio_firsthalf['num_firsthalf_bids'] / ratio_firsthalf['num_total_bids']
ratio_firsthalf['percent_secondhalf_bids'] = ratio_firsthalf['num_secondhalf_bids'] / ratio_firsthalf['num_total_bids']
# The total is only needed to derive the shares above.
ratio_firsthalf = ratio_firsthalf.drop('num_total_bids', axis = 1)
ratio_firsthalf.head()

In [None]:
# Bidders absent from ratio_firsthalf get NaN from the left join, replaced by 0.
train_set = train_set.merge(ratio_firsthalf, how='left', on='bidder_id')
train_set = train_set.fillna(0)

test_set = test_set.merge(ratio_firsthalf, how='left', on='bidder_id')
test_set = test_set.fillna(0)

##### Max number of bids in an auction

Since the bots' aim is to win the auction, by assuming that a bot will not give up an auction, the bot should realistically make more bids for any auction.

In [None]:
# Number of bids each bidder placed in each auction they took part in.
bids_per_auction = (
    bids_df.groupby(['bidder_id', 'auction'])['bid_id']
    .count()
    .rename('max_bids_in_auct')
    .reset_index()
)
# Largest per-auction bid count of every bidder.
max_bids_in_auct = (
    bids_per_auction.groupby('bidder_id')['max_bids_in_auct']
    .max()
    .reset_index()
)
max_bids_in_auct.head()

In [None]:
# Bidders absent from max_bids_in_auct get NaN from the left join, replaced by 0.
train_set = train_set.merge(max_bids_in_auct, how='left', on='bidder_id')
train_set = train_set.fillna(0)

test_set = test_set.merge(max_bids_in_auct, how='left', on='bidder_id')
test_set = test_set.fillna(0)

##### Max number of bids across devices

We find out the maximum number of bids made using the same device by a bidder and find out whether humans and bots have different behavior when it comes to switching devices.

In [None]:
# For every (bidder, device) pair: number of distinct bids and distinct auctions on that device.
device = bids_df.groupby(['bidder_id', 'device']).nunique()[['bid_id', 'auction']].reset_index().rename(columns = {'bid_id': 'max_bids_per_device', 'auction': 'num_auct_per_device'})
# Per bidder, the column-wise maximum over all of that bidder's devices.
device = device.groupby('bidder_id').max().reset_index()
# NOTE(review): the two maxima above are taken independently, so the numerator and
# denominator of this ratio may come from different devices — confirm this is intended.
device['max_bids_per_device_per_auct'] = device['max_bids_per_device'] / device['num_auct_per_device']
# Keep only the two features that are merged into the train/test sets.
device = device[['bidder_id', 'max_bids_per_device', 'max_bids_per_device_per_auct']]
device.head()

In [None]:
# Bidders absent from the device features get NaN from the left join, replaced by 0.
train_set = train_set.merge(device, how='left', on='bidder_id')
train_set = train_set.fillna(0)

test_set = test_set.merge(device, how='left', on='bidder_id')
test_set = test_set.fillna(0)

### Feature transformations

We attempt to create some features that make sense by intuition. For example, bids_per_auct may be a better feature than just num_bids and num_auct separately because we standardize the number of bids made by a bidder with respect to the total number of auctions they participated in. The same can be said for the other features.
    
Again, we fill missing values with 0 because they indicate those bidders with no bid information.

In [None]:
def add_ratio_features(features):
    """Return `features` with the ratio columns appended and every NaN replaced by 0.

    Each ratio is computed from columns already present in `features`;
    divisions of 0 by 0 (bidders with no bids) produce NaN, which the final
    fillna turns into 0.
    """
    with_ratios = features.assign(
        percent_concurrent_bids=features['num_concurrent_bids'] / features['num_bids'],
        bids_per_auct=features['num_bids'] / features['num_auct'],
        bids_per_device=features['num_bids'] / features['num_device_type'],
        bids_per_url=features['num_bids'] / features['num_url'],
        device_per_auct=features['num_device_type'] / features['num_auct'],
        ip_per_ctry=features['num_ip'] / features['num_ctry'],
        percent_max_bids=features['max_bids_per_device'] / features['num_bids'],
    )
    return with_ratios.fillna(0)


train_set = add_ratio_features(train_set)

test_set = add_ratio_features(test_set)

### Outliers

There are outliers, with 5 bots having only a single bid. We remove them as they may affect model performance badly.

In [None]:
# The ten bots (outcome == 1) with the fewest bids.
least_active_bots = train_set[train_set['outcome'] == 1].sort_values('num_bids').head(10)

fig, ax = plt.subplots()
num_bids_per_bot = sns.barplot(data=least_active_bots, x='bidder_id', y='num_bids', ax=ax)
# Print the bid count on top of every bar.
ax.bar_label(ax.containers[0])
plt.xticks(rotation=90)

plt.show()

In [None]:
# The six bots with the fewest bids, shown with all their features.
train_set[train_set['outcome'] == 1].sort_values('num_bids').head(6)

In [None]:
# Remove the bots that placed exactly one bid.
# They are selected by condition instead of by hard-coded row labels, so the
# cell can be re-run safely (dropping fixed labels twice raises KeyError) and
# does not depend on the row order of the training file.
single_bid_bots = (train_set['outcome'] == 1) & (train_set['num_bids'] == 1)
train_set = train_set[~single_bid_bots]

### Checking the significance of features

We compute the mean of each feature for humans and for bots. One interesting observation is that both humans and bots only bid on one type of merchandise. The value for humans is skewed due to humans with no bid data.

We can investigate the feature, merchandise, further to see if encoding can be performed.

In [None]:
# Mean of every numeric feature for humans (outcome 0) and bots (outcome 1),
# transposed so that features are rows. numeric_only=True skips the string
# column bidder_id, on which mean() raises TypeError in pandas >= 2.0.
train_set.groupby('outcome').mean(numeric_only=True).T

The top 3 merchandise categories bid on by both humans and robots are the same, namely sporting goods, mobile and jewelry. Some merchandise categories like home goods and autoparts receive no bids from robots. However, they make up a small proportion of the human bids. We choose not to include categorical variables in our model.

In [None]:
# Bids joined with bidder labels; dropna removes rows with any missing value,
# which includes bids whose bidder has no label in train_df.
temp_df = bids_df.merge(train_df, how='left', on='bidder_id').dropna()
# Number of bids per (outcome, merchandise), largest outcome and count first.
merchandise_bid_counts = temp_df.groupby(['outcome', 'merchandise'])[['bid_id']].count()
merchandise_bid_counts.sort_values(['outcome', 'bid_id'], ascending=False)

In [None]:
# Remove the merchandise-count feature from both sets.
train_set = train_set.drop(columns='num_merch_type')

test_set = test_set.drop(columns='num_merch_type')

In [None]:
# Every column of train_set except the identifier and the label.
all_features = train_set.drop(columns=['bidder_id', 'outcome']).columns
all_features

Another way to check the significance of the features is to use a density plot. From the plots below, the features num_url, num_concurrent_bids, num_firsthalf_bids, max_bids_per_device have similar distributions for both human and bot. Hence, we choose to exclude them from our model as they may not help to differentiate between a human and a bot well.

In [None]:
# One panel per feature, two panels per row. The row count is derived from the
# number of features (ceiling division) instead of being hard-coded, so the
# grid neither overflows nor silently depends on there being exactly 30 features.
ncols = 2
nrows = -(-len(all_features) // ncols)
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, figsize = (18, 4 * nrows), squeeze = False)
panels = axes.ravel()

# (outcome value, colour, legend label) of the two classes drawn in every panel.
class_styles = [(0, 'blue', 'Human'), (1, 'red', 'Bot')]

for panel, column in zip(panels, all_features):
    for outcome, color, label in class_styles:
        sns.kdeplot(data = train_set[train_set['outcome'] == outcome],
                    x = column,
                    ax = panel,
                    color = color,
                    fill = True,
                    alpha = 0.1,
                    linewidth = 2,
                    label = label)
        # Same call order as before: the left limit is pinned to 0 after each curve.
        panel.set_xlim(left = 0)
    panel.legend()

# Hide panels left over when the feature count is not a multiple of ncols.
for panel in panels[len(all_features):]:
    panel.set_visible(False)

plt.show()

# Model Validation

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from time import time
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

### Imbalanced data

In [None]:
# Number of training rows per outcome value (class balance check).
train_set['outcome'].value_counts()

Due to imbalanced data, we perform over-sampling using RandomOverSampler.

Afterwards, we choose to do ensemble averaging of multiple Random Forest models to reduce the variance of our predictions. We do that by setting different random_state for each model. We then perform hyperparameter tuning for each model separately using GridSearchCV.

### Initialize training/test data and models

In [None]:
# Columns that are not model inputs: the identifier, the label, and the four
# features excluded after inspecting the density plots.
excluded_columns = [
    'bidder_id', 'outcome',
    'num_url', 'num_concurrent_bids', 'num_firsthalf_bids', 'max_bids_per_device',
]
feature_col = train_set.columns.drop(excluded_columns)
print(feature_col)

# Training inputs and labels.
X = train_set[feature_col]
y = train_set['outcome']

# Inputs of the bidders to predict for the submission.
X_kaggle = test_set[feature_col]

We first initialize the base models and find out the AUC as reference.

In [None]:
# Five default random forests that differ only in their random seed.
forest_seeds = (0, 123, 456, 789, 999)
rf1, rf2, rf3, rf4, rf5 = (RandomForestClassifier(random_state = seed) for seed in forest_seeds)

# Single over-sampler instance placed in front of every forest.
ros = RandomOverSampler(sampling_strategy = 0.1, random_state = 456)

pp1, pp2, pp3, pp4, pp5 = (make_pipeline(ros, forest) for forest in (rf1, rf2, rf3, rf4, rf5))

base_models = [pp1, pp2, pp3, pp4, pp5]

In [None]:
def cv(models, X, y, n_splits = 10, n_repeats = 3, random_state = 456):
    """Cross-validate the averaged prediction of `models` with ROC AUC.

    For every split of a repeated stratified k-fold, each model is fitted on
    the training part (the passed-in objects are refitted in place), the
    positive-class probabilities of all models are averaged, and the ROC AUC
    of that average on the held-out part is recorded. The elapsed time and
    the mean AUC are printed.

    Parameters
    ----------
    models : list
        Estimators exposing fit and predict_proba.
    X, y :
        Features and labels; both are indexed positionally with .iloc.
    n_splits, n_repeats, random_state :
        Passed to RepeatedStratifiedKFold. The defaults reproduce the
        previously hard-coded 10-fold, 3-repeat scheme.

    Returns
    -------
    list of float
        One AUC per split (n_splits * n_repeats values).
    """
    start = time()

    rskfold = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats = n_repeats, random_state = random_state)
    k_fold_AUC = []

    for train_index, test_index in rskfold.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Positive-class probability of every model on the held-out fold.
        fold_probas = []
        for model in models:
            model.fit(X_train, y_train)
            fold_probas.append(model.predict_proba(X_test)[:, 1])
        # Ensemble prediction: element-wise mean across the models.
        y_proba = np.mean(fold_probas, axis = 0)

        k_fold_AUC.append(roc_auc_score(y_test, y_proba))

    mean_AUC = np.mean(k_fold_AUC)

    end = time()

    print(f"Time elapsed: {(end - start):.4f} seconds")
    print(f"AUC:{mean_AUC:.4f}")

    return k_fold_AUC

### One individual model before hyperparameter tuning

In [None]:
# Per-fold AUCs of a single untuned pipeline (third entry of base_models).
base_individual_AUCs = cv([base_models[2]], X, y) # model with random_state = 456 used as reference

### Ensemble averaging before hyperparameter tuning

In [None]:
# Per-fold AUCs of the averaged predictions of all five untuned pipelines.
base_average_AUCs = cv(base_models, X, y)

### Hyperparameter tuning with GridSearchCV

In [None]:
# Hyperparameter grid for the random forest step of each pipeline
# (3 * 4 * 3 * 3 = 108 combinations). The 'randomforestclassifier__' prefix
# addresses the forest step inside the pipeline built by make_pipeline.
grid = {'randomforestclassifier__n_estimators': [100, 200, 300],
        'randomforestclassifier__max_depth': [None, 5, 8, 10],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
        'randomforestclassifier__min_samples_leaf': [1, 2, 4],
       }

In [None]:
# Settings shared by every grid search: 3-fold CV scored by ROC AUC, on all cores.
search_options = dict(param_grid = grid,
                      scoring = 'roc_auc',
                      cv = 3, # default is StratifiedKFold as estimator is classifier
                      verbose = 2,
                      n_jobs = -1)

start = time()

# Tune every base pipeline separately and keep its best refitted estimator.
best_models = []
for model in base_models:
    search = GridSearchCV(estimator = model, **search_options)
    search.fit(X, y)
    best_models.append(search.best_estimator_)

end = time()

print(f"Time Elapsed: {(end - start):.4f} seconds")

In [None]:
# Show the tuned pipelines and the hyperparameters selected for each.
print(best_models)

### One individual model after hyperparameter tuning

In [None]:
# Per-fold AUCs of a single tuned pipeline (third entry of best_models).
individual_AUCs = cv([best_models[2]], X, y) # again, model with random_state = 456 used as reference

### Ensemble averaging after hyperparameter tuning

In [None]:
# Per-fold AUCs of the averaged predictions of all five tuned pipelines.
average_AUCs =cv(best_models, X, y)

### Summary

We plot the distribution of the ROC AUC scores to compare the differences from ensemble averaging and hyperparameter tuning.

In [None]:
# (legend label, per-fold AUC list) for the four configurations being compared.
model_AUCs_list = [('Base individual RF', base_individual_AUCs), ('Base average RFs', base_average_AUCs),
                   ('Tuned individual RF', individual_AUCs), ('Tuned average RFs', average_AUCs)]

fig, ax = plt.subplots(figsize = (18, 6))

# Colours of the active property cycle, read through the public rcParams API.
# The private ax._get_lines.prop_cycler used before was removed in Matplotlib 3.8.
cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

for i, (model_name, model_AUCs) in enumerate(model_AUCs_list):
    color = cycle_colors[i % len(cycle_colors)]

    sns.kdeplot(x = model_AUCs,
                ax = ax,
                label = model_name,
                color = color
               )

    # The density curve just drawn is the last line on the axes; mark its peak
    # with a dashed vertical line of the same colour.
    x_coord = ax.lines[-1].get_xdata()
    y_coord = ax.lines[-1].get_ydata()
    index_of_max_y = np.argmax(y_coord)
    ax.axvline(x_coord[index_of_max_y], linestyle = '--', linewidth = 1, color = color)

ax.legend(loc = 'upper left', fontsize = 'x-large')
plt.title('ROC AUC score distribution across 4 models', fontsize = 'xx-large')
plt.xlabel('ROC AUC score', fontsize = 'x-large')
plt.ylabel('Density', fontsize = 'x-large')
# AUC cannot exceed 1, so cut the density tail there.
plt.xlim(right = 1)

plt.show()

It is evident that hyperparameter tuning reduced the variance of ROC AUC scores as indicated by the spread of the green and red curves.

Also, the ROC AUC score for tuned ensemble average of random forests (red vertical line) is more likely to be higher compared to the 3 other models, as seen by the vertical line indicators of their respective peaks.

# Final Model

In [None]:
# The tuned pipelines are used as the final ensemble (same list object, not a copy).
final_models = best_models

In [None]:
# Refit every final pipeline on the full training data and collect its
# positive-class probabilities for the Kaggle test bidders.
proba = []

for pipeline in final_models:
    pipeline.fit(X, y)
    proba.append(pipeline.predict_proba(X_kaggle)[:, 1])

# Ensemble prediction: element-wise mean across the pipelines.
result = np.mean(proba, axis = 0)
result

In [None]:
# Submission table: one row per test bidder with the averaged bot probability.
output_dataframe = test_set[['bidder_id']].assign(prediction=result)
output_dataframe.to_csv('my_predictions.csv', index=False)

---