In [1]:
## import required packages
import numpy as np
import pandas as pd

## import the training data set, including bidder_id and type of the agent ('outcome')
train_data = pd.read_csv("train.csv")
## bidding history, including both training ant test sets
bid_history = pd.read_csv("bids.csv")

In [2]:
def GetBlacklist():
    blacklist_bidder = train_data[train_data['outcome'] == 1].bidder_id
    blacklist_ip = bid_history[bid_history['bidder_id'].isin(blacklist_bidder)].ip.unique()
    return blacklist_ip

In [3]:
def ConvertTime(history):
    
    SECONDS_PER_MINUTE = 60
    SECONDS_PER_HOUR = SECONDS_PER_MINUTE * 60
    SECONDS_PER_DAY  = SECONDS_PER_HOUR * 24
    # convert time to seconds
    history['time'] /= 52631580
    # time start with 0 second
    history['time'] -= min(history['time'])
    history['day'] = (history['time'] / SECONDS_PER_DAY).astype(int)
    history['hour'] = ((history['time'] % SECONDS_PER_DAY) / SECONDS_PER_HOUR).astype(int)
    history['minute'] = (((history['time'] % SECONDS_PER_DAY) % SECONDS_PER_HOUR) / SECONDS_PER_MINUTE).astype(int)
    history['second'] = (((history['time'] % SECONDS_PER_DAY) % SECONDS_PER_HOUR) % SECONDS_PER_MINUTE)

In [4]:
def entropy(arr):
    pk = arr / sum(arr)
    return -sum(pk * np.log(pk))

In [5]:
ConvertTime(bid_history)
blacklist_ip = GetBlacklist()

In [6]:
def GenerateFeature(bidders, history, training):
    """
    Read the data set and generate the corresponding features
    @param bidders: dataframe including fields of 'bidder_id'. If it is a training set, it also includes the label of the bidder
    @param history: bidding history of all bidders
    @param training: Boolean value. True if data is a training set; False if data is a test set
    @return df: data frame with features
    """
    if training:
        df = bidders.loc[:,['bidder_id','outcome']]
    else:
        df = bidders.loc[:,['bidder_id']]
    
    # total number of bids performed by each bidder
    num_bids = history['bidder_id'].value_counts().to_frame()
    num_bids.columns = ['num_bids']
    df = bidders.loc[:,['bidder_id','outcome']].join(num_bids, on = 'bidder_id', lsuffix='_l')
    print "Get num_bids"
    # discard bidders who did not submit any bid
    # df = df[~df.num_bids.isnull()]
    
    
    history = history.sort_values(by=['bidder_id', 'time', 'auction'])
    history['time_diff_same_auction'] = history.groupby(['bidder_id','auction'])['time'].diff()
    min_time_diff_same_auction = history.groupby(['bidder_id'])['time_diff_same_auction'].median()
    min_time_diff_same_auction = min_time_diff_same_auction.fillna(min_time_diff_same_auction.max()*100)
    min_time_diff_same_auction = pd.DataFrame({'bidder_id': min_time_diff_same_auction.index, 'min_time_diff_same_auction': min_time_diff_same_auction.values})
    df = pd.merge(df, min_time_diff_same_auction, on = 'bidder_id', how = 'left')
    print "Get min_time_diff_same_auction"
    
    # get the max number of bids by each bidder in 20 minute
    
    TIME_SEGMENT = 20
    history['segement_id_20min'] = (history['minute'] / TIME_SEGMENT).astype(int)
    numbids_per_auction_20min = history.groupby(['bidder_id','auction', 'segement_id_20min']).apply(len)
    max_numbids_per_auction_20min = numbids_per_auction_20min.max(level=[0])
    max_numbids_per_auction_20min = pd.DataFrame({'bidder_id': max_numbids_per_auction_20min.index, 'max_numbids_per_auction_20min': max_numbids_per_auction_20min.values})
    df = pd.merge(df, max_numbids_per_auction_20min, on = 'bidder_id', how = 'left')
    print "Get max_numbids_per_auction_20min"
     
    
    # get the number of auctions participated by a bidder
    num_auction = history.groupby('bidder_id')['auction'].unique().apply(len)
    num_auction = pd.DataFrame({'bidder_id': num_auction.index, 'num_auction': num_auction.values})
    df = pd.merge(df,num_auction, on = 'bidder_id', how = 'left')
    print "Get num_auction"
    

    # get the number of urls used by a bidder
    num_url = history.groupby('bidder_id')['url'].unique().apply(len)
    num_url = pd.DataFrame({'bidder_id': num_url.index, 'num_url': num_url.values})
    df = pd.merge(df,num_url, on = 'bidder_id', how = 'left')
    print "Get num_url"
    
    
    # get the number of urls used by a bidder
    #num_url_each_device = bid_history.groupby(['bidder_id','device'])['url'].unique().apply(len)
    #std_num_url_each_device = num_url_each_device.std(level=[0])
    #std_num_url_each_device = pd.DataFrame({'bidder_id': std_num_url_each_device.index, 'std_num_url_each_device': std_num_url_each_device.values})
    #df = pd.merge(df, std_num_url_each_device, on = 'bidder_id', how = 'left')
    #print "Get std_num_url_each_device"
    
    
    # get the number of countries that a bidder is located
    num_country = history.groupby('bidder_id')['country'].unique().apply(len)
    num_country = pd.DataFrame({'bidder_id': num_country.index, 'num_country': num_country.values})
    df = pd.merge(df,num_country, on = 'bidder_id', how = 'left')
    print "Get num_country"
    
    # get the number of devices used by a bidder
    num_device = history.groupby('bidder_id')['device'].unique().apply(len)
    num_device = pd.DataFrame({'bidder_id': num_device.index, 'num_device': num_device.values})
    df = pd.merge(df,num_device, on = 'bidder_id', how = 'left')
    print "Get num_device"
    
    # get the number of ip addresses used by a bidder
    num_ip = history.groupby('bidder_id')['ip'].unique().apply(len)
    num_ip = pd.DataFrame({'bidder_id': num_ip.index, 'num_ip': num_ip.values})
    df = pd.merge(df,num_ip, on = 'bidder_id', how = 'left')
    print "Get num_ip"
    
    
    """
    # get blacklist ip count for each bidder

    history['is_blacklist_ip'] = history['ip'].isin(blacklist_ip)
    count_blacklist_ip = history.groupby(['bidder_id'])['is_blacklist_ip'].sum()
    count_blacklist_ip = pd.DataFrame({'bidder_id': count_blacklist_ip.index, 'count_blacklist_ip': count_blacklist_ip.values})
    df = pd.merge(df,count_blacklist_ip, on = 'bidder_id', how = 'left')
    df['percent_blacklist_per_bidder'] = df['count_blacklist_ip'] / df['num_bids']
    df = df.drop('count_blacklist_ip', 1)
    print "Get percent_blacklist_per_bidder"
    """
    
    
    # get ip entropy
    grouped_ip_count = history.groupby(['bidder_id','ip']).apply(len)
    ip_entropy = grouped_ip_count.groupby(level = 0).apply(entropy)
    ip_entropy = pd.DataFrame({'bidder_id': ip_entropy.index, 'ip_entropy': ip_entropy.values})
    df = pd.merge(df,ip_entropy, on = 'bidder_id', how = 'left')
    print("Get ip_entropy")
    
    # get url entropy
    grouped_url_count = history.groupby(['bidder_id','url']).apply(len)
    url_entropy = grouped_url_count.groupby(level = 0).apply(entropy)
    url_entropy = pd.DataFrame({'bidder_id': url_entropy.index, 'url_entropy': url_entropy.values})
    df = pd.merge(df,url_entropy, on = 'bidder_id', how = 'left')
    print("Get url_entropy")
    
    # get device entropy
    grouped_device_count = history.groupby(['bidder_id','device']).apply(len)
    device_entropy = grouped_device_count.groupby(level = 0).apply(entropy)
    device_entropy = pd.DataFrame({'bidder_id': device_entropy.index, 'device_entropy': device_entropy.values})
    df = pd.merge(df,device_entropy, on = 'bidder_id', how = 'left')
    print("Get device_entropy")
    
    # get average hour entropy (average over countries)
    
    """
    grouped_hour_count = history.groupby(['bidder_id','hour']).apply(len)
    hour_entropy = grouped_hour_count.groupby(level = 0).apply(entropy)
    """
    hour_count_per_country = bid_history.groupby(['bidder_id','country','hour']).apply(len)
    hour_entropy_per_country = hour_count_per_country.groupby(level = [0,1]).apply(entropy)
    hour_entropy = hour_entropy_per_country.groupby(level = 0).apply(np.mean)
    hour_entropy = pd.DataFrame({'bidder_id': hour_entropy.index, 'hour_entropy': hour_entropy.values})
    df = pd.merge(df,hour_entropy, on = 'bidder_id', how = 'left')
    print("Get hour_entropy")

    
    """
    grouped_ip_count = history.groupby(['bidder_id','auction','ip']).apply(len)
    ip_entropy = grouped_ip_count.groupby(level = [0,1]).apply(entropy)
    mean_ip_entropy = ip_entropy.groupby(level = 0).apply(np.mean)
    mean_ip_entropy = pd.DataFrame({'bidder_id': mean_ip_entropy.index, 'mean_ip_entropy': mean_ip_entropy.values})
    df = pd.merge(df,mean_ip_entropy, on = 'bidder_id', how = 'left')
    print "Get mean_ip_entropy"
    """
    
    # get the probability that a bidder wins an auction
    winner_id = history.loc[bid_history.groupby('auction')['time'].idxmax()].bidder_id
    win_counts = pd.DataFrame({'bidder_id': winner_id.value_counts().index, 'win_counts': winner_id.value_counts().values})
    df = pd.merge(df,win_counts, on = 'bidder_id', how = 'left')
    df.win_counts = df.win_counts.fillna(0)
    df['win_prob'] = df['win_counts'] / df['num_auction']
    df = df.drop('win_counts', 1)
    print("Get win_prob")
    
    # get the average number of urls used by a device
    df['avg_num_url_per_device'] = df.num_url / df.num_device
    df['avg_num_url_per_device'] = df['avg_num_url_per_device'].fillna(0)
    
    
    # For users not
    df = df.fillna(0)
    return df




    

In [7]:
df = GenerateFeature(train_data, bid_history, training = True)

Get num_bids
Get min_time_diff_same_auction
Get max_numbids_per_auction_20min
Get num_auction
Get num_url
Get num_country
Get num_device
Get num_ip
Get ip_entropy
Get url_entropy
Get device_entropy
Get hour_entropy
Get win_prob


In [8]:
df_human = df[df['outcome'] == 0]
bidder_maxbids_human = df_human.ix[df_human['num_bids'].idxmax()].bidder_id
df_robot = df[df['outcome'] == 1]
bidder_maxbids_robot = df_robot.ix[df_robot['num_bids'].idxmax()].bidder_id

In [9]:
import matplotlib.pyplot as plt

In [10]:
bid_history.groupby(['bidder_id','auction','day']).apply(len)[bidder_maxbids_robot]

auction  day
00270    14      17
         15     278
         16     532
021em    15       9
         16      82
030jb    14       2
         16       1
05bab    14       1
05x67    15      34
         16       8
06dre    14       1
074j7    15      37
         16       7
0bnvn    1        1
0bpbx    1        1
0cuy6    14      38
0d5z1    14      34
         15      37
0i42l    1        1
0jy69    14      18
         15      11
         16       8
0kr63    0        3
0lm5o    15       2
         16       1
0lou5    1        2
         2        3
0mlsx    14     412
         15     315
         16     474
               ... 
zlh83    1       14
         2        9
zlz1o    2        1
         14       1
zm30m    1        1
         14       2
         15       6
         16       3
zm7mm    14       5
         15       2
         16       7
zoodn    14       3
         15       1
zprwa    0        7
         1        5
         2       11
ztr30    16       1
zu7ij    14       4
       

In [11]:
df['avg_num_url_per_device'] = df.num_url / df.num_device
df['avg_num_url_per_device'] = df['avg_num_url_per_device'].fillna(0)
df[(df['outcome'] == 1)].describe()

Unnamed: 0,outcome,num_bids,min_time_diff_same_auction,max_numbids_per_auction_20min,num_auction,num_url,num_country,num_device,num_ip,ip_entropy,url_entropy,device_entropy,hour_entropy,win_prob,avg_num_url_per_device
count,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0
mean,1.0,4004.038835,6640188.0,340.912621,145.038835,544.582524,26.475728,163.61165,2387.796117,4.599,2.638433,2.75817,1.075393,0.043726,7.611989
std,0.0,16370.097403,29533480.0,2082.187157,195.103186,1163.909786,31.158622,222.811854,11269.674137,2.687466,2.347788,1.765078,0.601961,0.113448,29.596003
min,1.0,1.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.0,-0.0,-0.0,0.0,0.0,0.008065
25%,1.0,288.0,100.0,24.0,23.0,4.5,3.0,4.5,34.0,2.155149,0.197655,1.101417,0.679991,0.0,0.607692
50%,1.0,716.0,609.0,55.0,74.0,88.0,13.0,78.0,290.0,5.137297,2.34579,3.270671,1.052194,0.009901,2.0
75%,1.0,2332.5,1405.75,137.0,170.5,591.0,41.5,219.0,1089.0,6.645241,5.033401,4.400002,1.41223,0.039779,4.195547
max,1.0,161935.0,136754200.0,21104.0,1018.0,8551.0,179.0,1144.0,111918.0,11.254165,6.658489,5.162447,3.078652,1.0,248.666667


In [12]:
df[(df['outcome'] == 0)].describe()

Unnamed: 0,outcome,num_bids,min_time_diff_same_auction,max_numbids_per_auction_20min,num_auction,num_url,num_country,num_device,num_ip,ip_entropy,url_entropy,device_entropy,hour_entropy,win_prob,avg_num_url_per_device
count,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0,1910.0
mean,0.0,1392.046073,40271830.0,70.293717,57.189005,330.097906,12.484293,72.824607,572.43089,2.68329,1.56055,2.018459,0.700911,0.011359,1.234794
std,0.0,14487.265477,62314040.0,446.081612,142.021381,2714.979379,22.748897,183.376886,4109.721882,2.288652,1.624869,1.650189,0.628644,0.047504,2.094095
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
25%,0.0,3.0,2704.375,1.0,2.0,1.0,1.0,2.0,2.0,0.693147,-0.0,0.636514,0.126346,0.0,0.5
50%,0.0,14.0,24986.0,2.0,9.0,4.0,3.0,7.0,10.0,2.271869,1.070967,1.816563,0.654323,0.0,1.0
75%,0.0,136.0,136754200.0,11.0,40.0,33.0,10.0,50.0,86.75,4.246183,2.487472,3.433878,1.030862,0.0,1.0
max,0.0,515033.0,136754200.0,8543.0,1623.0,81376.0,164.0,2618.0,109159.0,10.106916,8.942253,5.924976,3.054211,0.916667,39.14366


In [None]:
df

In [13]:
X_all = df.loc[:,'num_bids':]
y_all = df['outcome']
num_features = X_all.shape[1]


In [14]:
#from sklearn.cross_validation import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, stratify = y_all, test_size=0.2, random_state=42)

In [15]:
from sklearn.grid_search import GridSearchCV 
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.ensemble import RandomForestRegressor


def PredictMultiRandomForest(X_all, seed):
    
    
    num_features = X_all.shape[1]
    
    # Create the parameters list you wish to tune
    parameters = {'max_features': range(1,num_features+1), 'max_depth':range(1,10)}

    # Initialize the classifier
    clf = RandomForestRegressor(n_estimators= 20, random_state = seed)

    # Make an f1 scoring function using 'make_scorer' 
    roc_auc_scorer = make_scorer(roc_auc_score)

    # Perform grid search on the classifier using the f1_scorer as the scoring method
    grid_obj = GridSearchCV(clf, parameters, scoring = roc_auc_scorer, cv = 5)

    # Fit the grid search object to the training data and find the optimal parameters
    grid_obj = grid_obj.fit(X_all, y_all)

    # Get the estimator
    clf = grid_obj.best_estimator_
    
    print clf.feature_importances_
    
    return clf

seed = [0, 10, 20, 30, 40, 50, 60, 70, 80];
clf_list = [0] * len(seed)
i = 0

for s in seed:
    clf_list[i] = PredictMultiRandomForest(X_all, s)
    #y_pred_train = clf.predict(X_train)
    #y_pred_test += clf_list[i].predict(X_test) * 1.0 / len(seed)
    i += 1
# Report the final AUC score for training and testing after parameter tuning
#print "Tuned model has a training AUC score of {:.4f}.".format(roc_auc_score(y_train, y_pred_train))
#print "Tuned model has a testing AUC score of {:.4f}.".format(roc_auc_score(y_test, y_pred_test))



[ 0.17260413  0.09492206  0.11737857  0.02273279  0.04978224  0.0216935
  0.11570427  0.03228585  0.0572761   0.08899564  0.11300234  0.04698967
  0.01324987  0.05338296]
[ 0.12281355  0.08565146  0.07799633  0.07561114  0.06660152  0.05154258
  0.07099101  0.06220725  0.06538525  0.087898    0.07034845  0.04756915
  0.04091438  0.07446993]
[ 0.10909745  0.08357934  0.17426606  0.04259872  0.05817605  0.05146162
  0.06151469  0.05384113  0.064787    0.08069753  0.06308924  0.0489729
  0.02983983  0.07807843]
[ 0.14175165  0.11454418  0.11091924  0.04597318  0.03542773  0.03066105
  0.05992491  0.02314367  0.09686546  0.0651744   0.10216197  0.03580017
  0.0399844   0.09766798]
[ 0.11865647  0.06941477  0.13483289  0.04793999  0.06029043  0.02266894
  0.09419854  0.04739753  0.10218524  0.05908784  0.07998585  0.03044413
  0.024182    0.10871536]
[ 0.11444168  0.08621671  0.13587945  0.04806832  0.04229227  0.02143098
  0.07358755  0.04519323  0.09978992  0.06114271  0.11292126  0.04482

In [22]:
train_pred =[0] * X_all.shape[0]

for clf in clf_list:
    train_pred += clf.predict(X_all) * 1.0 / len(clf_list)
print "Tuned model has a training AUC score of {:.4f}.".format(roc_auc_score(y_all, train_pred))

Tuned model has a training AUC score of 0.9766.


In [16]:
test_data = pd.read_csv("test.csv")
df_test = GenerateFeature(test_data, bid_history, training = False)

Get num_bids
Get min_time_diff_same_auction
Get max_numbids_per_auction_20min
Get num_auction
Get num_url
Get num_country
Get num_device
Get num_ip
Get ip_entropy
Get url_entropy
Get device_entropy
Get hour_entropy
Get win_prob


In [17]:
X_test_all = df_test.loc[:,'num_bids':]

In [18]:
print len(clf_list)
final_pred =[0] * X_test_all.shape[0]
for clf in clf_list:
    final_pred += clf.predict(X_test_all) * 1.0 / len(clf_list)


9


In [19]:
max(final_pred)

0.73422270037791992

In [20]:
output_dataframe = pd.DataFrame({'bidder_id':df_test.bidder_id,'prediction':final_pred})

In [21]:
output_dataframe.to_csv('out.csv', index = False)