In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

%cd '/content/gdrive/My Drive/self_case_study_1'
import pywt
import warnings
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score as roc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

Mounted at /content/gdrive
/content/gdrive/My Drive/self_case_study_1


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
bids = pd.read_pickle('cleaned_bids.pkl')
df = pd.read_pickle('dataset.pkl')

# Merchandise

In [3]:
unique_merchandise_bidded = (bids.groupby(['bidder_id','auction'])['merchandise']
                                .first()
                                .reset_index()
                                .drop('auction',axis=1)
                                .groupby(['bidder_id','merchandise'])
                                .size()
                                .unstack('merchandise',fill_value=0)
                                .reset_index())

unique_merchandise_bidded.columns = ['bidder_id']+['unique_'+ix for ix in unique_merchandise_bidded.columns[1:]]

total_merchandise_bids = (bids.groupby(['bidder_id','merchandise'])
                                .size()
                                .unstack('merchandise',fill_value=0)
                                .reset_index())

total_merchandise_bids.columns = ['bidder_id']+['total_'+ix for ix in total_merchandise_bids.columns[1:]]

merchandise_features = unique_merchandise_bidded.merge(total_merchandise_bids,on = 'bidder_id',how = 'left')
merchandise_features.head()

Unnamed: 0,bidder_id,unique_auto parts,unique_books and music,unique_clothing,unique_computers,unique_furniture,unique_home goods,unique_jewelry,unique_mobile,unique_office equipment,unique_sporting goods,total_auto parts,total_books and music,total_clothing,total_computers,total_furniture,total_home goods,total_jewelry,total_mobile,total_office equipment,total_sporting goods
0,001068c415025a009fee375a12cff4fcnht8y,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,002d229ffb247009810828f648afc2ef593rb,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0
4,00486a11dff552c4bd7696265724ff81yeo9v,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,20,0,0,0,0


# Country

## Response Encoding Features

In [4]:
#rE: Response Encoding
#Flag: We have set a threshold of >0.5 meaning that this attribute tends to point towards bots
country_rE = ((df[df['outcome']==1]['country'].value_counts()/df['country'].value_counts())
                                              .fillna(0).reset_index()
                                              .rename(columns = {'index':'country','country':'country_rE'})
                                              .merge(bids, on = 'country',how = 'right'))
avg_country_rE = (country_rE.groupby('bidder_id')['country_rE']
                      .mean()
                      .rename('avg_country_rE')
                      .reset_index())
country_rE_flags = (country_rE[country_rE['country_rE']>0.5].groupby(['bidder_id'])['country_rE']
                                                            .size()
                                                            .rename('flagCountry_rE')
                                                            .reset_index())
country_rE_flags_norm = ((country_rE[country_rE['country_rE']>0.5]['bidder_id'].value_counts()/country_rE['bidder_id'].value_counts())
                                                                               .fillna(0).reset_index()
                                                                               .rename(columns = {'index':'bidder_id','bidder_id':'flagCountry_rE_norm'}))
country_rE_features = avg_country_rE.merge(country_rE_flags,on = 'bidder_id',how = 'left').merge(country_rE_flags_norm,on = 'bidder_id',how = 'left').fillna(0)

country_rE_features.head()

Unnamed: 0,bidder_id,avg_country_rE,flagCountry_rE,flagCountry_rE_norm
0,001068c415025a009fee375a12cff4fcnht8y,0.050393,0.0,0.0
1,002d229ffb247009810828f648afc2ef593rb,0.289463,0.0,0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.053973,0.0,0.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.117759,0.0,0.0
4,00486a11dff552c4bd7696265724ff81yeo9v,0.019456,0.0,0.0


## Unique Country

In [5]:
country_counts = (bids.groupby(['bidder_id'])['country']
                  .nunique()
                  .rename('unique_country_per_bidder')
                  .reset_index())

avg_country_counts_per_auction = (bids.groupby(['bidder_id','auction'])['country']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['country']
                  .mean()
                  .rename('avg_unique_country_per_auction')
                  .reset_index())

max_country_counts_per_auction = (bids.groupby(['bidder_id','auction'])['country']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['country']
                  .max()
                  .rename('max_unique_country_per_auction')
                  .reset_index())

unique_country_features = country_counts.merge(avg_country_counts_per_auction,on = 'bidder_id',how = 'left').merge(max_country_counts_per_auction,on = 'bidder_id',how = 'left')
unique_country_features.head()

Unnamed: 0,bidder_id,unique_country_per_bidder,avg_unique_country_per_auction,max_unique_country_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0,1
1,002d229ffb247009810828f648afc2ef593rb,1,1.0,1
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1.0,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1,1.0,1
4,00486a11dff552c4bd7696265724ff81yeo9v,1,1.0,1


# Device

## Response Encoding Features

In [6]:
#rE: Response Encoding
#Flag: We have set a threshold of >0.5 meaning that this attribute tends to point towards bots
device_rE = ((df[df['outcome']==1]['device'].value_counts()/df['device'].value_counts())
                                              .fillna(0).reset_index()
                                              .rename(columns = {'index':'device','device':'device_rE'})
                                              .merge(bids, on = 'device',how = 'right'))
avg_device_rE = (device_rE.groupby('bidder_id')['device_rE']
                      .mean()
                      .rename('avg_device_rE')
                      .reset_index())
device_rE_flags = (device_rE[device_rE['device_rE']>0.5].groupby(['bidder_id'])['device_rE']
                                                            .size()
                                                            .rename('flagdevice_rE')
                                                            .reset_index())
device_rE_flags_norm = ((device_rE[device_rE['device_rE']>0.5]['bidder_id'].value_counts()/device_rE['bidder_id'].value_counts())
                                                                               .fillna(0).reset_index()
                                                                               .rename(columns = {'index':'bidder_id','bidder_id':'flagdevice_rE_norm'}))
device_rE_features = avg_device_rE.merge(device_rE_flags,on = 'bidder_id',how = 'left').merge(device_rE_flags_norm,on = 'bidder_id',how = 'left').fillna(0)

device_rE_features.head()

Unnamed: 0,bidder_id,avg_device_rE,flagdevice_rE,flagdevice_rE_norm
0,001068c415025a009fee375a12cff4fcnht8y,0.013193,0.0,0.0
1,002d229ffb247009810828f648afc2ef593rb,0.269304,0.0,0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.035387,0.0,0.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.070734,0.0,0.0
4,00486a11dff552c4bd7696265724ff81yeo9v,0.052702,0.0,0.0


## Unique Device

In [7]:
device_counts = (bids.groupby(['bidder_id'])['device']
                  .nunique()
                  .rename('unique_device_per_bidder')
                  .reset_index())

avg_device_counts_per_auction = (bids.groupby(['bidder_id','auction'])['device']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['device']
                  .mean()
                  .rename('avg_unique_device_per_auction')
                  .reset_index())

max_device_counts_per_auction = (bids.groupby(['bidder_id','auction'])['device']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['device']
                  .max()
                  .rename('max_unique_device_per_auction')
                  .reset_index())

unique_device_features = device_counts.merge(avg_device_counts_per_auction,on = 'bidder_id',how = 'left').merge(max_device_counts_per_auction,on = 'bidder_id',how = 'left')

unique_device_features.head()

Unnamed: 0,bidder_id,unique_device_per_bidder,avg_unique_device_per_auction,max_unique_device_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0,1
1,002d229ffb247009810828f648afc2ef593rb,2,2.0,2
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1.0,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1.0,1
4,00486a11dff552c4bd7696265724ff81yeo9v,8,1.538462,3


# URL

## Response Encoding Features

In [8]:
#rE: Response Encoding
#Flag: We have set a threshold of >0.5 meaning that this attribute tends to point towards bots
url_rE = ((df[df['outcome']==1]['url'].value_counts()/df['url'].value_counts())
                                              .fillna(0).reset_index()
                                              .rename(columns = {'index':'url','url':'url_rE'})
                                              .merge(bids, on = 'url',how = 'right'))
avg_url_rE = (url_rE.groupby('bidder_id')['url_rE']
                      .mean()
                      .rename('avg_url_rE')
                      .reset_index())
url_rE_flags = (url_rE[url_rE['url_rE']>0.5].groupby(['bidder_id'])['url_rE']
                                                            .size()
                                                            .rename('flagurl_rE')
                                                            .reset_index())
url_rE_flags_norm = ((url_rE[url_rE['url_rE']>0.5]['bidder_id'].value_counts()/url_rE['bidder_id'].value_counts())
                                                                               .fillna(0).reset_index()
                                                                               .rename(columns = {'index':'bidder_id','bidder_id':'flagurl_rE_norm'}))
url_rE_features = avg_url_rE.merge(url_rE_flags,on = 'bidder_id',how = 'left').merge(url_rE_flags_norm,on = 'bidder_id',how = 'left').fillna(0)

url_rE_features.head()

Unnamed: 0,bidder_id,avg_url_rE,flagurl_rE,flagurl_rE_norm
0,001068c415025a009fee375a12cff4fcnht8y,0.189083,0.0,0.0
1,002d229ffb247009810828f648afc2ef593rb,0.189083,0.0,0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.0,0.0,0.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.189083,0.0,0.0
4,00486a11dff552c4bd7696265724ff81yeo9v,0.189083,0.0,0.0


## Unique URL

In [9]:
url_counts = (bids.groupby(['bidder_id'])['url']
                  .nunique()
                  .rename('unique_url_per_bidder')
                  .reset_index())

avg_url_counts_per_auction = (bids.groupby(['bidder_id','auction'])['url']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['url']
                  .mean()
                  .rename('avg_unique_url_per_auction')
                  .reset_index())

max_url_counts_per_auction = (bids.groupby(['bidder_id','auction'])['url']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['url']
                  .max()
                  .rename('max_unique_url_per_auction')
                  .reset_index())

unique_url_features = url_counts.merge(avg_url_counts_per_auction,on = 'bidder_id',how = 'left').merge(max_url_counts_per_auction,on = 'bidder_id',how = 'left')

unique_url_features.head()

Unnamed: 0,bidder_id,unique_url_per_bidder,avg_unique_url_per_auction,max_unique_url_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0,1
1,002d229ffb247009810828f648afc2ef593rb,1,1.0,1
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1.0,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,2,1.0,1
4,00486a11dff552c4bd7696265724ff81yeo9v,7,1.307692,3


## Main URL(vasstdc27m7nks3) Usage

In [10]:
main_url_count = (bids[bids['url']=="vasstdc27m7nks3"]
                    .groupby(['bidder_id'])
                    .size()
                    .rename('main_url_count')
                    .reset_index()
                    .fillna(0))
norm_main_url_count = ((bids[bids['url']=="vasstdc27m7nks3"]['bidder_id'].value_counts()/bids['bidder_id'].value_counts())
                                                                        .fillna(0).reset_index()
                                                                        .rename(columns = {'index':'bidder_id','bidder_id':'norm_main_url_count'}))


main_url_features = main_url_count.merge(norm_main_url_count,on = 'bidder_id',how = 'left')

main_url_features.head()

Unnamed: 0,bidder_id,main_url_count,norm_main_url_count
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0
1,002d229ffb247009810828f648afc2ef593rb,2,1.0
2,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1,0.333333
3,00486a11dff552c4bd7696265724ff81yeo9v,11,0.55
4,0051aef3fdeacdadba664b9b3b07e04e4coc6,53,0.779412


# IP Address(Network)

In [11]:
# Rather than dealing with IP Address, it makes more sense to us Ip Network rather than the complete IP Address

ip_address = bids['ip'].str.split('.', expand = True)
bids['ip_network'] = ip_address[0]+'.'+ip_address[1]

ip_address = df['ip'].str.split('.', expand = True)
df['ip_network'] = ip_address[0]+'.'+ip_address[1]

## Response Encoding Features

In [12]:
#rE: Response Encoding
#Flag: We have set a threshold of >0.5 meaning that this attribute tends to point towards bots
ip_network_rE = ((df[df['outcome']==1]['ip_network'].value_counts()/df['ip_network'].value_counts())
                                              .fillna(0).reset_index()
                                              .rename(columns = {'index':'ip_network','ip_network':'ip_network_rE'})
                                              .merge(bids, on = 'ip_network',how = 'right'))
avg_ip_network_rE = (ip_network_rE.groupby('bidder_id')['ip_network_rE']
                      .mean()
                      .rename('avg_ip_network_rE')
                      .reset_index())
ip_network_rE_flags = (ip_network_rE[ip_network_rE['ip_network_rE']>0.5].groupby(['bidder_id'])['ip_network_rE']
                                                            .size()
                                                            .rename('flagip_network_rE')
                                                            .reset_index())
ip_network_rE_flags_norm = ((ip_network_rE[ip_network_rE['ip_network_rE']>0.5]['bidder_id'].value_counts()/ip_network_rE['bidder_id'].value_counts())
                                                                               .fillna(0).reset_index()
                                                                               .rename(columns = {'index':'bidder_id','bidder_id':'flagip_network_rE_norm'}))
ip_network_rE_features = avg_ip_network_rE.merge(ip_network_rE_flags,on = 'bidder_id',how = 'left').merge(ip_network_rE_flags_norm,on = 'bidder_id',how = 'left').fillna(0)

ip_network_rE_features.head()

Unnamed: 0,bidder_id,avg_ip_network_rE,flagip_network_rE,flagip_network_rE_norm
0,001068c415025a009fee375a12cff4fcnht8y,0.032663,0.0,0.0
1,002d229ffb247009810828f648afc2ef593rb,0.1,0.0,0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.052052,0.0,0.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.026455,0.0,0.0
4,00486a11dff552c4bd7696265724ff81yeo9v,0.018827,0.0,0.0


## Unique IP Network

In [13]:
ip_network_counts = (bids.groupby(['bidder_id'])['ip_network']
                  .nunique()
                  .rename('unique_ip_network_per_bidder')
                  .reset_index())

avg_ip_network_counts_per_auction = (bids.groupby(['bidder_id','auction'])['ip_network']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['ip_network']
                  .mean()
                  .rename('avg_unique_ip_network_per_auction')
                  .reset_index())

max_ip_network_counts_per_auction = (bids.groupby(['bidder_id','auction'])['ip_network']
                  .nunique()
                  .reset_index()
                  .groupby('bidder_id')['ip_network']
                  .max()
                  .rename('max_unique_ip_network_per_auction')
                  .reset_index())

unique_ip_network_features = ip_network_counts.merge(avg_ip_network_counts_per_auction,on = 'bidder_id',how = 'left').merge(max_ip_network_counts_per_auction,on = 'bidder_id',how = 'left')
unique_ip_network_features.head()

Unnamed: 0,bidder_id,unique_ip_network_per_bidder,avg_unique_ip_network_per_auction,max_unique_ip_network_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0,1
1,002d229ffb247009810828f648afc2ef593rb,1,1.0,1
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1.0,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1.0,1
4,00486a11dff552c4bd7696265724ff81yeo9v,10,1.538462,3


# Auction

In [14]:
auction_counts = (bids.groupby(['bidder_id'])['auction']
                  .nunique()
                  .rename('unique_auction_per_bidder')
                  .reset_index())

# It is safe to believe that the last bid in an auction is the winner of the auction
auction_won = (bids.sort_values(by = 'time')
                  .groupby('auction')
                  .tail(1)
                  .reset_index()
                  .groupby('bidder_id')
                  .size()
                  .rename('auction_won')
                  .reset_index())

auction_features = auction_counts.merge(auction_won,on = 'bidder_id',how = 'left').fillna(0)
auction_features.head()

Unnamed: 0,bidder_id,unique_auction_per_bidder,auction_won
0,001068c415025a009fee375a12cff4fcnht8y,1,0.0
1,002d229ffb247009810828f648afc2ef593rb,1,0.0
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,0.0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,0.0
4,00486a11dff552c4bd7696265724ff81yeo9v,13,0.0


# Bids

## Number of Bids

In [15]:
bids_counts = (bids.groupby(['bidder_id'])
                .size()
                .rename('no_of_bids')
                .reset_index())

bids_counts_avg = (bids.groupby(['bidder_id','auction'])
                .size()
                .rename('avg_bids_per_auction')
                .reset_index()
                .groupby('bidder_id')['avg_bids_per_auction']
                .mean()
                .reset_index())

bids_counts_min = (bids.groupby(['bidder_id','auction'])
                .size()
                .rename('min_bids_per_auction')
                .reset_index()
                .groupby('bidder_id')['min_bids_per_auction']
                .min()
                .reset_index())

bids_counts_max = (bids.groupby(['bidder_id','auction'])
                .size()
                .rename('max_bids_per_auction')
                .reset_index()
                .groupby('bidder_id')['max_bids_per_auction']
                .max()
                .reset_index())

bids_features = bids_counts.merge(bids_counts_avg,on = 'bidder_id',how = 'left').merge(bids_counts_min,on = 'bidder_id',how = 'left').merge(bids_counts_max,on = 'bidder_id',how = 'left')
bids_features.head()

Unnamed: 0,bidder_id,no_of_bids,avg_bids_per_auction,min_bids_per_auction,max_bids_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1,1.0,1,1
1,002d229ffb247009810828f648afc2ef593rb,2,2.0,2,2
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1.0,1,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1.0,1,1
4,00486a11dff552c4bd7696265724ff81yeo9v,20,1.538462,1,3


## Bid Order/Value

In [16]:
bid_order_dict = {ix:1 for ix in bids['auction'].unique()}
auction_count = bids['auction'].value_counts()

def bid_update(auction):
  val = bid_order_dict[auction]
  norm_val = val/auction_count[auction]
  bid_order_dict[auction]+=1
  return norm_val

bids['bid_order_norm'] = bids['auction'].apply(lambda x: bid_update(x))

In [17]:
bid_order_norm_avg = (bids.groupby(['bidder_id'])['bid_order_norm']
                  .mean()
                  .rename('bid_order_norm_avg')
                  .reset_index())

bid_order_norm_min = (bids.groupby(['bidder_id'])['bid_order_norm']
                  .min()
                  .rename('bid_order_norm_min')
                  .reset_index()) 

bid_order_norm_max = (bids.groupby(['bidder_id'])['bid_order_norm']
                  .max()
                  .rename('bid_order_norm_max')
                  .reset_index())

bid_order_features = bid_order_norm_avg.merge(bid_order_norm_min,on = 'bidder_id',how = 'left').merge(bid_order_norm_max,on = 'bidder_id',how = 'left')
bid_order_features.head()

Unnamed: 0,bidder_id,bid_order_norm_avg,bid_order_norm_min,bid_order_norm_max
0,001068c415025a009fee375a12cff4fcnht8y,0.935007,0.935007,0.935007
1,002d229ffb247009810828f648afc2ef593rb,0.287709,0.284916,0.290503
2,0030a2dd87ad2733e0873062e4f83954mkj86,0.849812,0.849812,0.849812
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.392574,0.229167,0.628173
4,00486a11dff552c4bd7696265724ff81yeo9v,0.392629,0.054337,0.983966


# Time

## Time Difference

In [18]:
time_diff_dict = {ix:iy for ix,iy in zip(sorted(bids['auction'].unique()),bids.groupby('auction')['time'].min())}
def time_update(row):
  val = row[1]-time_diff_dict[row[0]]
  time_diff_dict[row[0]] = row[1]
  return val

bids['time_diff'] = bids[['auction','time']].sort_values(by = 'time').apply(lambda x: time_update(x),axis=1)

#Normalized Time Difference
min_max_scaler = preprocessing.MinMaxScaler()
bids['time_diff_norm'] = min_max_scaler.fit_transform(bids['time_diff'].to_numpy().reshape(-1,1))

In [19]:
time_diff_norm_avg = (bids.groupby(['bidder_id'])['time_diff_norm']
                  .mean()
                  .rename('time_diff_norm_avg')
                  .reset_index())

time_diff_norm_min = (bids.groupby(['bidder_id'])['time_diff_norm']
                  .min()
                  .rename('time_diff_norm_min')
                  .reset_index()) 

time_diff_norm_max = (bids.groupby(['bidder_id'])['time_diff_norm']
                  .max()
                  .rename('time_diff_norm_max')
                  .reset_index())

time_diff_features = time_diff_norm_avg.merge(time_diff_norm_min, on='bidder_id', how='left').merge(time_diff_norm_max, on='bidder_id', how='left')

time_diff_features.head()

Unnamed: 0,bidder_id,time_diff_norm_avg,time_diff_norm_min,time_diff_norm_max
0,001068c415025a009fee375a12cff4fcnht8y,5.1e-05,5.067484e-05,5.1e-05
1,002d229ffb247009810828f648afc2ef593rb,1e-06,6.941759e-07,1e-06
2,0030a2dd87ad2733e0873062e4f83954mkj86,1e-05,9.718463e-06,1e-05
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,2.3e-05,6.941759e-06,5e-05
4,00486a11dff552c4bd7696265724ff81yeo9v,3.6e-05,6.941759e-07,0.000128


## Activity in Time Slots

In [20]:
bin_range = np.ceil((max(bids['time'])-min(bids['time']))/10000)
bids['time_slot'] = np.floor((bids['time']-min(bids['time']))/bin_range).astype(int)

In [21]:
activity_time_slot = (bids.groupby(['bidder_id','time_slot'])
                        .size()
                        .unstack('time_slot', fill_value=0)
                        .reset_index())

activity_time_slot.head()

time_slot,bidder_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,001068c415025a009fee375a12cff4fcnht8y,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,002d229ffb247009810828f648afc2ef593rb,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0030a2dd87ad2733e0873062e4f83954mkj86,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00486a11dff552c4bd7696265724ff81yeo9v,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The activity of each bidder is observed to be dominant in one of the slices of time slots in most cases as shown below, each of 968 bins length. We could say the activity to be divided for 3 different days, where the 968 bins define activity in each day.

*   0:967
*   4516:5483
*   9032:9999

### Dominant Time Series

In [22]:
activity_time_slot['day1_activity'] = activity_time_slot[[ix for ix in range(0,968)]].sum(axis=1)
activity_time_slot['day2_activity'] = activity_time_slot[[ix for ix in range(4516,5484)]].sum(axis=1)
activity_time_slot['day3_activity'] = activity_time_slot[[ix for ix in range(9032,10000)]].sum(axis=1)

print('Day 1 dominating bidders:',activity_time_slot[activity_time_slot['day1_activity']>activity_time_slot['day2_activity']+activity_time_slot['day3_activity']].shape[0])
print('Day 2 dominating bidders:',activity_time_slot[activity_time_slot['day2_activity']>activity_time_slot['day1_activity']+activity_time_slot['day3_activity']].shape[0])
print('Day 3 dominating bidders:',activity_time_slot[activity_time_slot['day3_activity']>activity_time_slot['day2_activity']+activity_time_slot['day1_activity']].shape[0])
print('Less Bidding Bidders(Mostly Humans):',activity_time_slot[(activity_time_slot['day2_activity']==activity_time_slot['day1_activity']) & (activity_time_slot['day3_activity']<activity_time_slot['day1_activity'])].shape[0])
print('Total Bidders:',activity_time_slot.shape[0])


print('*'*50)
print('')
print('The above four categories add up to all the total bidders, and so we are going to only pick dominant day for our features, for our last category we will pick day 1 as default')

Day 1 dominating bidders: 1734
Day 2 dominating bidders: 1741
Day 3 dominating bidders: 2954
Less Bidding Bidders(Mostly Humans): 185
Total Bidders: 6614
**************************************************

The above four categories add up to all the total bidders, and so we are going to only pick dominant day for our features, for our last category we will pick day 1 as default


In [23]:
dominant_day_ts = np.zeros((6614,972)) # 972 columns = 968(Dominating bins) + 3(sum of activity on the 3 days) + 1(Flag if it has dominant day)
k=0
for _, row in tqdm(activity_time_slot.iterrows()):
    if (row['day1_activity']>row['day2_activity']+row['day3_activity']):
        dominant_day_ts[k]= [row[ix] for ix in range(0,968)] + [row['day1_activity'] , row['day2_activity'], row['day3_activity']] + [1] 
    
    elif (row['day2_activity']>row['day1_activity']+row['day3_activity']):
        dominant_day_ts[k]= [row[ix] for ix in range(4516,5484)] + [row['day1_activity'] , row['day2_activity'], row['day3_activity']] + [1]
    
    elif (row['day3_activity']>row['day1_activity']+row['day2_activity']):
        dominant_day_ts[k]= [row[ix] for ix in range(9032,10000)] + [row['day1_activity'] , row['day2_activity'], row['day3_activity']] + [1]
    
    elif (row['day1_activity']==row['day2_activity']):
        dominant_day_ts[k]= [row[ix] for ix in range(0,968)] + [row['day1_activity'] , row['day2_activity'], row['day3_activity']] + [0]
    
    else:
      print('ERROR')
    k+=1

6614it [00:33, 199.66it/s]


In [24]:
cols = ['ts_'+str(ix) for ix in range(0,968)] + ['day1_activity','day2_activity','day3_activity'] + ['dominant_day_exists']
dom_activity_timeSeries = pd.DataFrame(dominant_day_ts, columns = cols)
dom_activity_timeSeries['bidder_id'] = activity_time_slot['bidder_id']

dom_activity_timeSeries['median_ts'] = dom_activity_timeSeries[['ts_'+str(ix) for ix in range(0,968)]].median(axis=1)
dom_activity_timeSeries['mean_ts'] = dom_activity_timeSeries[['ts_'+str(ix) for ix in range(0,968)]].mean(axis=1)
dom_activity_timeSeries['max_ts'] = dom_activity_timeSeries[['ts_'+str(ix) for ix in range(0,968)]].max(axis=1)

dom_activity_timeSeries.head()

Unnamed: 0,ts_0,ts_1,ts_2,ts_3,ts_4,ts_5,ts_6,ts_7,ts_8,ts_9,ts_10,ts_11,ts_12,ts_13,ts_14,ts_15,ts_16,ts_17,ts_18,ts_19,ts_20,ts_21,ts_22,ts_23,ts_24,ts_25,ts_26,ts_27,ts_28,ts_29,ts_30,ts_31,ts_32,ts_33,ts_34,ts_35,ts_36,ts_37,ts_38,ts_39,...,ts_936,ts_937,ts_938,ts_939,ts_940,ts_941,ts_942,ts_943,ts_944,ts_945,ts_946,ts_947,ts_948,ts_949,ts_950,ts_951,ts_952,ts_953,ts_954,ts_955,ts_956,ts_957,ts_958,ts_959,ts_960,ts_961,ts_962,ts_963,ts_964,ts_965,ts_966,ts_967,day1_activity,day2_activity,day3_activity,dominant_day_exists,bidder_id,median_ts,mean_ts,max_ts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,001068c415025a009fee375a12cff4fcnht8y,0.0,0.001033,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,002d229ffb247009810828f648afc2ef593rb,0.0,0.002066,2.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0030a2dd87ad2733e0873062e4f83954mkj86,0.0,0.001033,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,0.0,0.002066,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,5.0,0.0,1.0,00486a11dff552c4bd7696265724ff81yeo9v,0.0,0.015496,2.0


### Wavelt Decomposition

In [25]:
# db(wavelet type): Daubechies family
# Wavelet Output: [cA_n, cD_n, cD_n-1, …, cD2, cD1] Ordered list of coefficients arrays where n denotes the level of decomposition. The first element (cA_n) of the result is approximation coefficients array and the following elements (cD_n - cD_1) are details coefficients arrays.

def waveletDecomposition(row):
    signal = row[['ts_'+str(ix) for ix in range(0,968)]]
    waves = pywt.wavedec(signal,'db7',mode='sym',level=5)
    output_norm = [np.linalg.norm(wave) for wave in waves]

    row['wavelet_amp'],row['wavelat_1'],row['wavelet_2'],row['wavelat_3'],row['wavelet_4'],row['wavelat_5']  = output_norm[0], output_norm[1], output_norm[2], output_norm[3], output_norm[4], output_norm[5]
    return row

dom_activity_timeSeries = dom_activity_timeSeries.apply(lambda x: waveletDecomposition(x),axis=1)
# dom_activity_timeSeries = dom_activity_timeSeries.drop(['ts_'+str(ix) for ix in range(0,968)],axis=1)
# dom_activity_timeSeries.head()

# **Feature Set**

In [26]:
feature_set = merchandise_features.merge(country_rE_features,on='bidder_id',how = 'left')\
                                    .merge(unique_country_features,on='bidder_id',how = 'left')\
                                    .merge(device_rE_features,on='bidder_id',how = 'left')\
                                    .merge(unique_device_features, on='bidder_id',how = 'left')\
                                    .merge(url_rE_features, on='bidder_id',how = 'left')\
                                    .merge(unique_url_features,on='bidder_id',how = 'left')\
                                    .merge(main_url_features,on='bidder_id',how = 'left')\
                                    .merge(ip_network_rE_features,on='bidder_id',how = 'left')\
                                    .merge(unique_ip_network_features,on='bidder_id',how = 'left')\
                                    .merge(auction_features,on='bidder_id',how = 'left')\
                                    .merge(bids_features,on='bidder_id',how = 'left')\
                                    .merge(bid_order_features,on='bidder_id',how = 'left')\
                                    .merge(time_diff_features,on='bidder_id',how = 'left')\
                                    .merge(dom_activity_timeSeries,on='bidder_id',how = 'left')

with open('feature_set.pkl', 'wb') as output_file:
    pkl.dump(feature_set, output_file)