In [1]:
## import required packages
import numpy as np
import pandas as pd

## import the training data set, including bidder_id and type of the agent ('outcome')
train_data = pd.read_csv("train.csv")
## bidding history, including both training ant test sets
bid_history = pd.read_csv("bids.csv")

In [2]:
def GetBlacklist():
    blacklist_bidder = train_data[train_data['outcome'] == 1].bidder_id
    blacklist_ip = bid_history[bid_history['bidder_id'].isin(blacklist_bidder)].ip.unique()
    return blacklist_ip

In [3]:
def ConvertTime(history):
    
    SECONDS_PER_MINUTE = 60
    SECONDS_PER_HOUR = SECONDS_PER_MINUTE * 60
    SECONDS_PER_DAY  = SECONDS_PER_HOUR * 24
    # convert time to seconds
    history['time'] /= 52631580
    # time start with 0 second
    history['time'] -= min(history['time'])
    history['day'] = (history['time'] / SECONDS_PER_DAY).astype(int)
    history['hour'] = ((history['time'] % SECONDS_PER_DAY) / SECONDS_PER_HOUR).astype(int)
    history['minute'] = (((history['time'] % SECONDS_PER_DAY) % SECONDS_PER_HOUR) / SECONDS_PER_MINUTE).astype(int)
    history['second'] = (((history['time'] % SECONDS_PER_DAY) % SECONDS_PER_HOUR) % SECONDS_PER_MINUTE)

In [4]:
ConvertTime(bid_history)

In [5]:
def GenerateFeature(bidders, history, training):
    """
    Read the data set and generate the corresponding features
    @param bidders: dataframe including fields of 'bidder_id'. If it is a training set, it also includes the label of the bidder
    @param history: bidding history of all bidders
    @param training: Boolean value. True if data is a training set; False if data is a test set
    @return df: data frame with features
    """
    if training:
        df = bidders.loc[:,['bidder_id','outcome']]
    else:
        df = bidders.loc[:,['bidder_id']]
    
    # total number of bids performed by each bidder
    num_bids = history['bidder_id'].value_counts().to_frame()
    num_bids.columns = ['num_bids']
    df = bidders.loc[:,['bidder_id','outcome']].join(num_bids, on = 'bidder_id', lsuffix='_l')
    print "Get num_bids"
    # discard bidders who did not submit any bid
    # df = df[~df.num_bids.isnull()]
    
    
    history = history.sort_values(by=['bidder_id', 'time', 'auction'])
    history['time_diff_same_auction'] = history.groupby(['bidder_id','auction'])['time'].diff()
    min_time_diff_same_auction = history.groupby(['bidder_id'])['time_diff_same_auction'].median()
    min_time_diff_same_auction = min_time_diff_same_auction.fillna(min_time_diff_same_auction.max()*100)
    min_time_diff_same_auction = pd.DataFrame({'bidder_id': min_time_diff_same_auction.index, 'min_time_diff_same_auction': min_time_diff_same_auction.values})
    df = pd.merge(df, min_time_diff_same_auction, on = 'bidder_id', how = 'left')
    print "Get min_time_diff_same_auction"
    
    # get the max number of bids by each bidder in 20 minute
    
    TIME_SEGMENT = 20
    history['segement_id_20min'] = (history['minute'] / TIME_SEGMENT).astype(int)
    numbids_per_auction_20min = history.groupby(['bidder_id','auction', 'segement_id_20min']).apply(len)
    max_numbids_per_auction_20min = numbids_per_auction_20min.max(level=[0])
    max_numbids_per_auction_20min = pd.DataFrame({'bidder_id': max_numbids_per_auction_20min.index, 'max_numbids_per_auction_20min': max_numbids_per_auction_20min.values})
    df = pd.merge(df, max_numbids_per_auction_20min, on = 'bidder_id', how = 'left')
    print "Get max_numbids_per_auction_20min"
     
    
    # get the number of auctions participated by a bidder
    num_auction = bid_history.groupby('bidder_id')['auction'].unique().apply(len)
    num_auction = pd.DataFrame({'bidder_id': num_auction.index, 'num_auction': num_auction.values})
    df = pd.merge(df,num_auction, on = 'bidder_id', how = 'left')
    print "Get num_auction"
    

    # get the number of urls used by a bidder
    num_url = bid_history.groupby('bidder_id')['url'].unique().apply(len)
    num_url = pd.DataFrame({'bidder_id': num_url.index, 'num_url': num_url.values})
    df = pd.merge(df,num_url, on = 'bidder_id', how = 'left')
    print "Get num_url"
    
    
    # get the number of urls used by a bidder
    #num_url_each_device = bid_history.groupby(['bidder_id','device'])['url'].unique().apply(len)
    #std_num_url_each_device = num_url_each_device.std(level=[0])
    #std_num_url_each_device = pd.DataFrame({'bidder_id': std_num_url_each_device.index, 'std_num_url_each_device': std_num_url_each_device.values})
    #df = pd.merge(df, std_num_url_each_device, on = 'bidder_id', how = 'left')
    #print "Get std_num_url_each_device"
    
    
    # get the number of countries that a bidder is located
    num_country = bid_history.groupby('bidder_id')['country'].unique().apply(len)
    num_country = pd.DataFrame({'bidder_id': num_country.index, 'num_country': num_country.values})
    df = pd.merge(df,num_country, on = 'bidder_id', how = 'left')
    print "Get num_country"
    
    # get the number of devices used by a bidder
    num_device = bid_history.groupby('bidder_id')['device'].unique().apply(len)
    num_device = pd.DataFrame({'bidder_id': num_device.index, 'num_device': num_device.values})
    df = pd.merge(df,num_device, on = 'bidder_id', how = 'left')
    print "Get num_device"
    
    # get the number of ip addresses used by a bidder
    num_ip = bid_history.groupby('bidder_id')['ip'].unique().apply(len)
    num_ip = pd.DataFrame({'bidder_id': num_ip.index, 'num_ip': num_ip.values})
    df = pd.merge(df,num_ip, on = 'bidder_id', how = 'left')
    print "Get num_ip"
    
    
    """
    # get blacklist ip count for each bidder
    blacklist_ip = GetBlackList()
    history['is_blacklist_ip'] = history['ip'].isin(blacklist_ip)
    count_blacklist_ip = history.groupby(['bidder_id'])['is_blacklist_ip'].sum()
    count_blacklist_ip = pd.DataFrame({'bidder_id': count_blacklist_ip.index, 'count_blacklist_ip': count_blacklist_ip.values})
    df = pd.merge(df,count_blacklist_ip, on = 'bidder_id', how = 'left')
    df['percent_blacklist_per_bidder'] = df['count_blacklist_ip'] / df['num_bids']
    df = df.drop('count_blacklist_ip', 1)
    print "Get percent_blacklist_per_bidder"
    """
    
    # get the probability that a bidder wins an auction
    winner_id = bid_history.loc[bid_history.groupby('auction')['time'].idxmax()].bidder_id
    win_counts = pd.DataFrame({'bidder_id': winner_id.value_counts().index, 'win_counts': winner_id.value_counts().values})
    df = pd.merge(df,win_counts, on = 'bidder_id', how = 'left')
    df.win_counts = df.win_counts.fillna(0)
    df['win_prob'] = df['win_counts'] / df['num_auction']
    df = df.drop('win_counts', 1)
    print "Get win_prob"
    
    # get the average number of urls used by a device
    df['avg_num_url_per_device'] = df.num_url / df.num_device
    df['avg_num_url_per_device'] = df['avg_num_url_per_device'].fillna(0)
    
    
    # For users not
    df = df.fillna(0)
    return df




    

In [6]:
df = GenerateFeature(train_data, bid_history, training = True)

Get num_bids
Get min_time_diff_same_auction
Get max_numbids_per_auction_20min
Get num_auction
Get num_url
Get num_country
Get num_device
Get num_ip
Get win_prob


In [7]:
df_human = df[df['outcome'] == 0]
bidder_maxbids_human = df_human.ix[df_human['num_bids'].idxmax()].bidder_id
df_robot = df[df['outcome'] == 1]
bidder_maxbids_robot = df_robot.ix[df_robot['num_bids'].idxmax()].bidder_id

In [8]:
bid_history.groupby(['bidder_id','day']).apply(len)[bidder_maxbids_human]

day
0     117367
1     115310
2     136371
13         1
14     44522
15     44356
16     57106
dtype: int64

In [9]:
bid_history.groupby(['bidder_id','auction','day']).apply(len)[bidder_maxbids_robot]

auction  day
00270    14      17
         15     278
         16     532
021em    15       9
         16      82
030jb    14       2
         16       1
05bab    14       1
05x67    15      34
         16       8
06dre    14       1
074j7    15      37
         16       7
0bnvn    1        1
0bpbx    1        1
0cuy6    14      38
0d5z1    14      34
         15      37
0i42l    1        1
0jy69    14      18
         15      11
         16       8
0kr63    0        3
0lm5o    15       2
         16       1
0lou5    1        2
         2        3
0mlsx    14     412
         15     315
         16     474
               ... 
zlh83    1       14
         2        9
zlz1o    2        1
         14       1
zm30m    1        1
         14       2
         15       6
         16       3
zm7mm    14       5
         15       2
         16       7
zoodn    14       3
         15       1
zprwa    0        7
         1        5
         2       11
ztr30    16       1
zu7ij    14       4
       

In [10]:
df['avg_num_url_per_device'] = df.num_url / df.num_device
df['avg_num_url_per_device'] = df['avg_num_url_per_device'].fillna(0)
df[(df['outcome'] == 1)].describe()

Unnamed: 0,outcome,num_bids,min_time_diff_same_auction,max_numbids_per_auction_20min,num_auction,num_url,num_country,num_device,num_ip,win_prob,avg_num_url_per_device
count,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0
mean,1.0,4004.038835,6640188.0,340.912621,145.038835,544.582524,26.475728,163.61165,2387.796117,0.043726,7.611989
std,0.0,16370.097403,29533480.0,2082.187157,195.103186,1163.909786,31.158622,222.811854,11269.674137,0.113448,29.596003
min,1.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.008065
25%,1.0,288.0,100.0,24.0,23.0,4.5,3.0,4.5,34.0,0.0,0.607692
50%,1.0,716.0,609.0,55.0,74.0,88.0,13.0,78.0,290.0,0.009901,2.0
75%,1.0,2332.5,1405.75,137.0,170.5,591.0,41.5,219.0,1089.0,0.039779,4.195547
max,1.0,161935.0,136754200.0,21104.0,1018.0,8551.0,179.0,1144.0,111918.0,1.0,248.666667


In [11]:
df[(df['outcome'] == 0)].describe()

Unnamed: 0,outcome,num_bids,min_time_diff_same_auction,max_numbids_per_auction_20min,num_auction,num_url,num_country,num_device,num_ip,win_prob,avg_num_url_per_device
count,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0
mean,0.0,1392.046073,40271830.0,70.293717,57.189005,330.097906,12.484293,72.824607,572.43089,0.011359,1.234794
std,0.0,14487.265477,62314040.0,446.081612,142.021381,2714.979379,22.748897,183.376886,4109.721882,0.047504,2.094095
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3.0,2704.375,1.0,2.0,1.0,1.0,2.0,2.0,0.0,0.5
50%,0.0,14.0,24986.0,2.0,9.0,4.0,3.0,7.0,10.0,0.0,1.0
75%,0.0,136.0,136754200.0,11.0,40.0,33.0,10.0,50.0,86.75,0.0,1.0
max,0.0,515033.0,136754200.0,8543.0,1623.0,81376.0,164.0,2618.0,109159.0,0.916667,39.14366


In [12]:
X_all = df.loc[:,'num_bids':]
y_all = df['outcome']
num_features = X_all.shape[1]


In [13]:
#from sklearn.cross_validation import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, stratify = y_all, test_size=0.2, random_state=42)

In [15]:
from sklearn.grid_search import GridSearchCV 
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestRegressor


def PredictMultiRandomForest(X_all, seed):
    
    
    num_features = X_all.shape[1]
    
    # Create the parameters list you wish to tune
    parameters = {'max_features': range(1,num_features+1), 'max_depth':range(1,10)}

    # Initialize the classifier
    clf = RandomForestRegressor(random_state = seed)

    # Make an f1 scoring function using 'make_scorer' 
    roc_auc_scorer = make_scorer(roc_auc_score)

    # Perform grid search on the classifier using the f1_scorer as the scoring method
    grid_obj = GridSearchCV(clf, parameters, scoring = roc_auc_scorer, cv = 5)

    # Fit the grid search object to the training data and find the optimal parameters
    grid_obj = grid_obj.fit(X_all, y_all)

    # Get the estimator
    clf = grid_obj.best_estimator_
    
    return clf

seed = [0, 10, 20, 30, 40];
clf_list = [0] * len(seed)
i = 0

for s in seed:
    clf_list[i] = PredictMultiRandomForest(X_all, s)
    #y_pred_train = clf.predict(X_train)
    #y_pred_test += clf_list[i].predict(X_test) * 1.0 / len(seed)
    i += 1
# Report the final AUC score for training and testing after parameter tuning
#print "Tuned model has a training AUC score of {:.4f}.".format(roc_auc_score(y_train, y_pred_train))
#print "Tuned model has a testing AUC score of {:.4f}.".format(roc_auc_score(y_test, y_pred_test))

In [16]:
test_data = pd.read_csv("test.csv")
df_test = GenerateFeature(test_data, bid_history, training = False)

Get num_bids
Get min_time_diff_same_auction
Get max_numbids_per_auction_20min
Get num_auction
Get num_url
Get num_country
Get num_device
Get num_ip
Get win_prob


In [17]:
X_test_all = df_test.loc[:,'num_bids':]

In [18]:
print len(clf_list)
final_pred =[0] * X_test_all.shape[0]
for clf in clf_list:
    final_pred += clf.predict(X_test_all) * 1.0 / len(clf_list)


5


In [19]:
max(final_pred)

0.77284615322121175

In [22]:
output_dataframe = pd.DataFrame({'bidder_id':df_test.bidder_id,'prediction':final_pred})

In [23]:
output_dataframe.to_csv('out.csv', index = False)