In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from calendar import weekday, day_name

In [None]:
# Input files, resolved relative to the notebook's working directory.
BIDS_CSV = 'bids.csv'
BIDDER_CSV = 'bottrain.csv'

# Load both tables; they are joined on `bidder_id` later in the notebook.
bids = pd.read_csv(BIDS_CSV)
bidder = pd.read_csv(BIDDER_CSV)

In [None]:
# Preview the first five rows of the bidder table.
bidder.head(n=5)

## Data Processing 

In [None]:
# Parse the `time` column into pandas datetimes.
# NOTE(review): if `time` holds plain integers, pd.to_datetime reads them as
# nanoseconds since the Unix epoch by default — confirm that is the intended unit.
bids['time'] = bids['time'].pipe(pd.to_datetime)

In [None]:
# Derive the weekday name (e.g. 'Monday') of every bid timestamp.
# The vectorised `.dt` accessor replaces a row-by-row Python lambda and
# yields NaN for missing timestamps instead of raising.
bids['weekday'] = bids['time'].dt.day_name()

In [None]:
# Left-join the bids onto the bidder table: every bidder row is kept, and
# bidders that have no matching bid get NaN in the bid columns.
bidInfo = bidder.merge(bids, how='left', on='bidder_id')
bidInfo.head(n=5)

In [None]:
# Number of missing values per column after the join.
bidInfo.isnull().sum()

In [None]:
# Remove every row that contains at least one missing value, then confirm
# that no column has any nulls left.
bidInfo.dropna(axis=0, how='any', inplace=True)
bidInfo.isnull().sum()

In [None]:
# Relative frequency of each outcome value within every country.
outcome_by_country = bidInfo.groupby('country')['outcome']
grouped_co = outcome_by_country.value_counts(normalize=True)
grouped_co
# Disabled follow-up (kept for reference): rank countries by the outcome == 1 share.
# robot_Country = grouped_co.loc[:, 1].sort_values(ascending=False)
# robot_Country

In [None]:
# Preview the merged frame before dropping columns.
bidInfo.head(n=5)

In [None]:
# Drop the columns that are not useful for the prediction.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
UNUSED_COLUMNS = ['url', 'ip', 'address', 'payment_account']
bidInfo = bidInfo.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [None]:
# Preview the frame after the unused columns were removed.
bidInfo.head(n=5)

In [None]:
# Subset of rows whose outcome equals 1 (the notebook's text refers to these as bots).
is_bot_row = bidInfo['outcome'].eq(1)
bots = bidInfo[is_bot_row]

# EDA

In [None]:
# Top 10 countries by number of bid rows, with bars split by outcome.
top_countries = bidInfo['country'].value_counts().head(10).index
sns.countplot(x='country', hue='outcome', order=top_countries, data=bidInfo)

 1. "in" and "ng" appear to have the highest numbers of human bids among all countries.  
 2. Among the bots, however, "in" and "us" appear to be the most popular. 

In [None]:
# Top 10 merchandise categories by number of bid rows, with bars split by outcome.
top_merchandise = bidInfo['merchandise'].value_counts().head(10).index
sns.countplot(x='merchandise', hue='outcome', order=top_merchandise, data=bidInfo)
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=60)

1. Among all merchandise categories, the highest numbers of human bids are observed in sporting goods and jewelry; bots likewise place most of their bids in sporting goods, mobile, and jewelry.
2. Jewelry seems to be the most authentic and valuable category, since it has more human outcomes than bot outcomes.
3. Evidently, bots show no interest in furniture.

In [None]:
# Five bidder ids with the most rows in `bots`, drawn as horizontal bars.
top_bot_ids = bots['bidder_id'].value_counts().head(5).index
sns.countplot(data=bots, y="bidder_id", order=top_bot_ids, hue='outcome')

In [None]:
# Influence of weekday on merchandise: bid counts per merchandise category,
# one panel per weekday.
#
# `merchandise` is a categorical (string) column, so a count plot is used
# instead of a 20-bin histogram; this gives exactly one bar per category,
# in the same position in every panel.
sns.set(style='ticks')

# Show only the weekdays that occur in the data, in calendar order
# (Monday first) rather than in order of first appearance.
present_days = set(bidInfo['weekday'].unique())
weekday_order = [day for day in day_name if day in present_days]

# Most frequent categories first, shared across all panels.
merchandise_order = bidInfo['merchandise'].value_counts().index

g = sns.catplot(
    data=bidInfo,
    x='merchandise',
    col='weekday',
    kind='count',
    order=merchandise_order,
    col_order=weekday_order,
    height=3,
)
g.set_xticklabels(rotation=90)

There seems to be a decreasing trend in the number of bids for each merchandise category as the weekend approaches. We could assume that bidders behave in the same way on each weekday. Furthermore, Wednesday has the highest number of bids for sporting goods, while Friday has the lowest. 



In [None]:
# create new feature auction_duration: for every auction, the time between
# the earliest and the latest bid that is present in bidInfo. Only rows that
# are in bidInfo are considered, so this is the observed bid span, not
# necessarily the auction's real opening/closing times.
auction_duration = (
    bidInfo.groupby('auction')['time']
    .agg(['min', 'max'])
    .rename(columns={'min': 'start_time', 'max': 'end_time'})
)
auction_duration['duration'] = auction_duration['end_time'] - auction_duration['start_time']

# merge auction_duration to bidInfo.
# Columns added by a previous run of this cell are dropped first; otherwise a
# re-run would produce duration_x / duration_y and the conversion below
# would fail with KeyError.
derived_columns = list(auction_duration.columns)
bidInfo = (
    bidInfo
    .drop(columns=derived_columns, errors='ignore')
    .merge(auction_duration, on='auction', how='left')
)
# Store the duration as a float number of seconds instead of a Timedelta.
bidInfo['duration'] = bidInfo['duration'].dt.total_seconds()
bidInfo.head()

In [None]:
# Ten bidder ids with the most rows among outcome == 0 (human) records.
df_human = bidInfo[bidInfo['outcome'] == 0]
human_bid_counts = df_human.pivot_table(
    index='bidder_id',
    values='outcome',
    aggfunc='count',
    fill_value=0,
)
top10_human = human_bid_counts.sort_values(by='outcome', ascending=False).head(10)


In [None]:
# Ten bidder ids with the largest summed `outcome`; rows with outcome 0
# contribute nothing to the sum, so this ranks bidders by their outcome == 1 rows.
outcome_totals = bidInfo.pivot_table(
    index='bidder_id',
    values='outcome',
    aggfunc='sum',
    fill_value=0,
)
top10_robot = outcome_totals.sort_values(by='outcome', ascending=False).head(10)


In [None]:
# All rows belonging to any bidder id from either top-10 list (up to 20 ids).
top_bidder_ids = top10_robot.index.union(top10_human.index)
top10_bidder = bidInfo[bidInfo['bidder_id'].isin(top_bidder_ids)]
top10_bidder

In [None]:
# Histogram of `duration` for the selected top bidders, coloured by outcome,
# with a box plot per outcome drawn in the margin.
px.histogram(
    top10_bidder,
    x='duration',
    color='outcome',
    marginal='box',
    nbins=90,
    title='duration distribution for human and robot bidders',
)

For the human case:
The lower 25% of durations fall between 0 and 13.4093k seconds, the upper 25% fall between 76k and 77k seconds, and the median is 64.53k seconds. This shows that most human bidding is concentrated in the period from 64k to 76k seconds.

For the bot case:
The lower 25% of durations fall between 0 and 12.94k seconds, and the median is 74.56k seconds. There is a wide dispersion between 12.94k and 77.305k seconds, with a high density from 74.56k to 77.305k seconds.

It is interesting to see that humans and robots have similar bidding patterns. However, robots are more active in bidding during the period from 74k to 77k seconds. Most importantly, we observe that robots do not bid at all around 60k seconds, even though there are a few human bids during that period; this could be a useful signal for telling robots apart from humans.