#Bot detection in Online bidding - Kaggle (Facebook)

###Overview

Data: Bids data that contains metadata about 7.6 million bids and bidder data with around 4000 labeled rows.

Task: Predict if an online bid is made by a machine or a human.

Implementation: Extracted several features from the dataset, performed oversampling and developed Random Forest and Logistic Regression models to predict if bidder is a bot or a human.

Results: Achieved an AUC of 0.90342 on the test dataset.

###Imports

In [1]:
import os
os.chdir('UnbalancedDataset/')
import unbalanced_dataset
import pandas as pd
import numpy as np
import pickle
import sys
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA

In [2]:
from unbalanced_dataset import SMOTE, SMOTEENN, SMOTETomek

###Extract data into dataframes & clean the data

In [3]:
bidsData = pd.read_csv('../bids.csv')
trainData = pd.read_csv('../train.csv')
testData = pd.read_csv('../test.csv')
trainLabels = trainData['outcome']

In [4]:
botData = trainData[trainData['outcome'] == 1]
botIds = list(botData['bidder_id'])
bidsData['outcome'] = pickle.load(open( "bidsLabels.p", "rb" ))

In [8]:
oversampledData = trainData
oversampledLabels = trainData['outcome']

In [5]:
botDupData = pd.concat([botData]*9)
print len(botDupData)
humanData = trainData[trainData['outcome'] == 0]
print len(humanData)
oversampledData = pd.concat([humanData,botDupData])
print len(oversampledData)

927
1910
2837


In [6]:
ratio = float(np.count_nonzero(trainLabels==1)) / float(np.count_nonzero(trainLabels==0))

In [7]:
ratio

0.05392670157068063

In [15]:
def convertStringToHash(string):
    return hash(string)

In [9]:
def convertStringToHashStr(string):
    return str(hash(string))

In [16]:
print bidsData.columns
print oversampledData.columns

Index([u'bid_id', u'bidder_id', u'auction', u'merchandise', u'device', u'time', u'country', u'ip', u'url', u'outcome'], dtype='object')
Index([u'bidder_id', u'payment_account', u'address', u'outcome'], dtype='object')


In [6]:
cols = ['bidder_id','payment_account','address']
oversampledData = oversampledData[cols]

In [79]:
slicedDataTest = []
slicedLabelsTest = []
indicesList = range(0,2940)
for i in range(0,1000):
    idx = np.random.randint(0,len(oversampledData)-1)
    #print idx
    if idx in indicesList:
        indicesList.remove(idx)
    slicedDataTest.append(list(oversampledData.iloc[idx]))
    slicedLabelsTest.append(oversampledLabels.iloc[idx])

In [77]:
print len(slicedDataTest),len(indicesList)

1000 2087


In [6]:
oversampledLabels = pd.concat([humanData,botDupData])['outcome']
print len(oversampledLabels)

2837


In [86]:
oversampledLabels = list(oversampledLabels)
slicedDataTrain = oversampledData.iloc[indicesList]
slicedLabelsTrain = [oversampledLabels[idx] for idx in indicesList]

In [19]:
trainDataSliced = trainData[['bidder_id','outcome']]
trainDataSliced['bidder_id'] = trainDataSliced['bidder_id'].apply(convertStringToHash)
trainDataSlicedDF = trainDataSliced.copy(deep=True)
trainDataSliced = np.array(trainDataSliced)
smote = SMOTE(ratio=1/ratio, verbose=True)
smox, smoy = smote.fit_transform(trainDataSliced, trainLabels)

Determining classes statistics... 2 classes detected: {0.0: 1910, 1.0: 103}
Finding the 5 nearest neighbours...done!
Creating synthetic samples...Generated 1910 new samples ...
done!


Try using .loc[row_index,col_indexer] = value instead
  from IPython.kernel.zmq import kernelapp as app


In [10]:
trainDataSlicedDF['bidder_id'] = trainDataSlicedDF['bidder_id'].apply(convertStringToHashStr)

In [12]:
bidsData['bidder_id'] = bidsData['bidder_id'].apply(convertStringToHashStr)
bidsData['bid_id'] = bidsData['bid_id'].apply(convertStringToHashStr)
bidsData['auction'] = bidsData['auction'].apply(convertStringToHashStr)
bidsData['merchandise'] = bidsData['merchandise'].apply(convertStringToHashStr)
bidsData['device'] = bidsData['device'].apply(convertStringToHashStr)
bidsData['time'] = bidsData['time'].apply(convertStringToHashStr)
bidsData['country'] = bidsData['country'].apply(convertStringToHashStr)
bidsData['ip'] = bidsData['ip'].apply(convertStringToHashStr)
bidsData['url'] = bidsData['url'].apply(convertStringToHashStr)
bidsData['outcome'] = pickle.load(open( "bidsLabels.p", "rb" ))

In [8]:
bidsTrainDF = bidsData[bidsData['outcome'] != -1]
bidsTestDF = bidsData[bidsData['outcome'] == -1]
bidsColumns = ['bidder_id', 'bid_id', 'auction', 'merchandise', 'device', 'time', 'country', 'ip', 'url']
bidsTrainLabels = bidsTrainDF['outcome']
bidsTrainDF = bidsTrainDF[bidsColumns]
bidsTestDF = bidsTestDF[bidsColumns]
bidsTrainDF['bidder_id'] = bidsTrainDF['bidder_id'].apply(convertStringToHash)
bidsTrainDF['bid_id'] = bidsTrainDF['bid_id'].apply(convertStringToHash)
bidsTrainDF['auction'] = bidsTrainDF['auction'].apply(convertStringToHash)
bidsTrainDF['merchandise'] = bidsTrainDF['merchandise'].apply(convertStringToHash)
bidsTrainDF['device'] = bidsTrainDF['device'].apply(convertStringToHash)
bidsTrainDF['time'] = bidsTrainDF['time'].apply(convertStringToHash)
bidsTrainDF['country'] = bidsTrainDF['country'].apply(convertStringToHash)
bidsTrainDF['ip'] = bidsTrainDF['ip'].apply(convertStringToHash)
bidsTrainDF['url'] = bidsTrainDF['url'].apply(convertStringToHash)

In [65]:
bidsTestDF['bidder_id'] = bidsTestDF['bidder_id'].apply(convertStringToHash)
bidsTestDF['bid_id'] = bidsTestDF['bid_id'].apply(convertStringToHash)
bidsTestDF['auction'] = bidsTestDF['auction'].apply(convertStringToHash)
bidsTestDF['merchandise'] = bidsTestDF['merchandise'].apply(convertStringToHash)
bidsTestDF['device'] = bidsTestDF['device'].apply(convertStringToHash)
bidsTestDF['time'] = bidsTestDF['time'].apply(convertStringToHash)
bidsTestDF['country'] = bidsTestDF['country'].apply(convertStringToHash)
bidsTestDF['ip'] = bidsTestDF['ip'].apply(convertStringToHash)
bidsTestDF['url'] = bidsTestDF['url'].apply(convertStringToHash)

In [43]:
#ratio = float(np.count_nonzero(bidsTrainLabels==1)) / float(np.count_nonzero(bidsTrainLabels==0))
#print ratio

0.15511311836


In [33]:
nameSet = set()
for name,grp in bidderGroups:
    nameSet.add(name)

###Parse the data and construct features

In [35]:
#bidsData['bidder_id'] = bidsData['bidder_id'].apply(convertStringToHash)
bidderGroups = bidsData.groupby("bidder_id")
bidderFreqTrain = []
smoxSet = [smox[i][0] for i in range(0,len(smox))]
for smoxi in smoxSet:
    try:
        auc = bidderGroups.get_group(int(smoxi))
        freqList = []
        auctionGroups = auc.groupby('auction')
        bcount, mcount , tcount , ipcount , ucount, countrycount, dcount = 0, 0, 0, 0, 0, 0, 0
        for name,grp in auctionGroups:
            bcount = bcount + len(auctionGroups.get_group(name).groupby('bid_id'))
            mcount = mcount + len(auctionGroups.get_group(name).groupby('merchandise'))
            tcount = tcount + len(auctionGroups.get_group(name).groupby('time'))
            ipcount = ipcount + len(auctionGroups.get_group(name).groupby('ip'))
            ucount = ucount + len(auctionGroups.get_group(name).groupby('url'))
            countrycount = countrycount + len(auctionGroups.get_group(name).groupby('country'))
            dcount = dcount + len(auctionGroups.get_group(name).groupby('device'))
        freqList.append(float(bcount)/float(len(auctionGroups)))
        freqList.append(float(mcount)/float(len(auctionGroups)))
        freqList.append(float(tcount)/float(len(auctionGroups)))
        freqList.append(float(ipcount)/float(len(auctionGroups)))
        freqList.append(float(ucount)/float(len(auctionGroups)))
        freqList.append(float(countrycount)/float(len(auctionGroups)))
        freqList.append(float(dcount)/float(len(auctionGroups)))
        bidderFreqTrain.append(freqList)
    except:
        bidderFreqTrain.append([0.,0.,0.,0.,0.,0.,0.])
        pass

In [7]:
len(trainData)

2013

In [22]:
bidderGroups = bidsData.groupby("bidder_id")
smoxSet = set(smox[i][0] for i in range(0,len(smox)))
bidderFreqTrain = []
# print len(bidderGroups)
# #auc = bidderGroups.get_group(str(smox[0][0])).groupby('auction')
# bidderGroups = dict(list(bidderGroups))
# key = smox[0][0]
# print bidderGroups
# if key in bidderGroups.keys():
#     print bidderGroups[key]
i = 0
print len(bidderGroups)
for i in range(0,len(trainData)):
    bidder_id = trainData.bidder_id[i]
    bidderGrp = None
    
    for name,grp in bidderGroups:
        if name == bidder_id:
            #print "yay", name
            bidderGrp =grp
            break
    
    if bidderGrp is not None:
        freqList = []
        auctionGroups = bidderGrp.groupby('auction')
        bcount, mcount , tcount , ipcount , ucount, countrycount, dcount = 0, 0, 0, 0, 0, 0, 0
        for name,grp in auctionGroups:
            bcount = bcount + len(auctionGroups.get_group(name).groupby('bid_id'))
            mcount = mcount + len(auctionGroups.get_group(name).groupby('merchandise'))
            tcount = tcount + len(auctionGroups.get_group(name).groupby('time'))
            ipcount = ipcount + len(auctionGroups.get_group(name).groupby('ip'))
            ucount = ucount + len(auctionGroups.get_group(name).groupby('url'))
            countrycount = countrycount + len(auctionGroups.get_group(name).groupby('country'))
            dcount = dcount + len(auctionGroups.get_group(name).groupby('device'))
        freqList.append(float(bcount)/float(len(auctionGroups)))
        freqList.append(float(mcount)/float(len(auctionGroups)))
        freqList.append(float(tcount)/float(len(auctionGroups)))
        freqList.append(float(ipcount)/float(len(auctionGroups)))
        freqList.append(float(ucount)/float(len(auctionGroups)))
        freqList.append(float(countrycount)/float(len(auctionGroups)))
        freqList.append(float(dcount)/float(len(auctionGroups)))
        bidderFreqTrain.append(freqList)
    else:
        bidderFreqTrain.append([0.,0.,0.,0.,0.,0.,0.,0.])

NameError: name 'smox' is not defined

In [221]:
bidderFreqFeatures = {}
bidderGroups = bidsData.groupby("bidder_id")
for i in range(0,len(trainData)):
    freqList = []
    try:
        auctionGroups = bidderGroups.get_group(trainData.bidder_id[i]).groupby('auction')
        bcount, mcount , tcount , ipcount , ucount, countrycount, dcount = 0, 0, 0, 0, 0, 0, 0
        for name,grp in auctionGroups:
            bcount = bcount + len(auctionGroups.get_group(name).groupby('bid_id'))
            mcount = mcount + len(auctionGroups.get_group(name).groupby('merchandise'))
            tcount = tcount + len(auctionGroups.get_group(name).groupby('time'))
            ipcount = ipcount + len(auctionGroups.get_group(name).groupby('ip'))
            ucount = ucount + len(auctionGroups.get_group(name).groupby('url'))
            countrycount = countrycount + len(auctionGroups.get_group(name).groupby('country'))
            dcount = dcount + len(auctionGroups.get_group(name).groupby('device'))
        freqList.append(float(bcount)/float(len(auctionGroups)))
        freqList.append(float(mcount)/float(len(auctionGroups)))
        freqList.append(float(tcount)/float(len(auctionGroups)))
        freqList.append(float(ipcount)/float(len(auctionGroups)))
        freqList.append(float(ucount)/float(len(auctionGroups)))
        freqList.append(float(countrycount)/float(len(auctionGroups)))
        freqList.append(float(dcount)/float(len(auctionGroups)))
        bidderFreqFeatures[trainData.bidder_id[i]] = freqList
    except:
        #print sys.exc_info()[0]
        bidderFreqFeatures[trainData.bidder_id[i]] = [0.,0.,0.,0.,0.,0.,0.]
        pass

In [11]:
len(oversampledData.bidder_id)

2837

In [6]:
bidderFreqTrain = []
bidderGroups = bidsData.groupby("bidder_id")
#for i in range(0,len(oversampledData)):
for row in oversampledData.iterrows():
    #print row[1][0]
    #break
    freqList = []
    idHash = row[1][0]
    try:
        numAuctions = len(bidderGroups.get_group(idHash).groupby('auction'))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        freqList = [float(item)/float(numAuctions) for item in freqList]
        #freqList = [float(item)/float(max(freqList)) for item in freqList]
        bidderFreqTrain.append(freqList)
    except:
        #print "Unexpected error:", sys.exc_info()[0]
        bidderFreqTrain.append([0.,0.,0.,0.,0.,0.,0.])
        pass

In [7]:
bidderFreqTrainArr = np.array(bidderFreqTrain)
import pickle
pickle.dump(bidderFreqTrainArr, open('trainFeatures.p','wb'))

In [25]:
bidderFreqTest = []
for row in testData.iterrows():
    #print row[1][0]
    #break
    freqList = []
    idHash = row[1][0]
    try:
        numAuctions = len(bidderGroups.get_group(idHash).groupby('auction'))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        freqList = [float(item)/float(numAuctions) for item in freqList]
        #freqList = [float(item)/float(max(freqList)) for item in freqList]
        bidderFreqTest.append(freqList)
    except:
        #print "Unexpected error:", sys.exc_info()[0]
        bidderFreqTest.append([0.,0.,0.,0.,0.,0.,0.])
        pass

In [26]:
bidderFreqTestArr = np.array(bidderFreqTest)
import pickle
pickle.dump(bidderFreqTestArr, open('testFeatures.p','wb'))

In [9]:
trainLabelsArr = np.array(oversampledLabels)
pickle.dump(trainLabelsArr, open('trainLabels.p', 'wb'))

In [21]:
trainLabelsArr

array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [70]:
#oversampledData = np.array(oversampledData)
#bidderFreqFeatures = {}
bidderFreqTrain = []
bidderGroups = bidsData.groupby("bidder_id")
for i in range(0,len(oversampledData)):
    freqList = []
    idHash = oversampledData[i][0]
    try:
        freqList.append(len(bidderGroups.get_group(idHash).groupby('auction')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        #freqList = [float(item)/float(numAuctions) for item in freqList]
        if np.std(freqList) == 0:
            freqList = [0. for item in freqList]
        else:
            freqList = [float(item)-np.mean(freqList)/float(np.std(freqList)) for item in freqList]
        bidderFreqTrain.append(freqList)
    except:
        bidderFreqTrain.append([0.,0.,0.,0.,0.,0.,0.,0.])
        pass

In [96]:
slicedDataTrain = np.array(slicedDataTrain)
#bidderFreqFeatures = {}
bidderFreqTrain = []
bidderGroups = bidsData.groupby("bidder_id")
for i in range(0,len(slicedDataTrain)):
    freqList = []
    idHash = slicedDataTrain[i][0]
    try:
        #freqList.append(len(bidderGroups.get_group(idHash).groupby('auction')))
        numAuctions = len(bidderGroups.get_group(idHash).groupby('auction'))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        #freqList = [float(item)/float(numAuctions) for item in freqList]
        freqList = [float(item)/float(numAuctions) for item in freqList]
        bidderFreqTrain.append(freqList)
    except:
        bidderFreqTrain.append([0.,0.,0.,0.,0.,0.,0.])
        pass

In [71]:
#bidderFreqFeaturesTest = {}
bidderFreqTest = []
bidderGroups = bidsData.groupby("bidder_id")
for i in range(0,len(testData)):
    freqList = []
    idHash = testData.bidder_id[i]
    try:
        freqList.append(len(bidderGroups.get_group(idHash).groupby('auction')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        #freqList = [float(item)/float(numAuctions) for item in freqList]
        if np.std(freqList) == 0:
            freqList = [0. for item in freqList]
        else:
            freqList = [float(item)-np.mean(freqList)/float(np.std(freqList)) for item in freqList]
        bidderFreqTest.append(freqList)
    except:
        bidderFreqTest.append([0.,0.,0.,0.,0.,0.,0.,0.])
        pass

In [11]:
type(oversampledData)

numpy.ndarray

In [111]:
oversampledDF = pd.DataFrame(oversampledData,columns=['bidder_id','payment_account','address'])
#oversampledDF['bidder_id'] = oversampledDF['bidder_id'].apply(convertStringToHash)
oversampledDF['payment_account'] = oversampledDF['payment_account'].apply(convertStringToHash)
oversampledDF['address'] = oversampledDF['address'].apply(convertStringToHash)
trainData = pd.concat([oversampledDF,pd.DataFrame(bidderFreqTrain)],axis = 1)
trainData

  return -res
  return -res


Unnamed: 0,bidder_id,payment_account,address,0,1,2,3,4,5,6,7
0.0,91a3c57b13234af24875c56fb7e2b2f4rb56a,931616462463833716,-5086406051828771048,16.504608,22.504608,-0.495392,22.504608,18.504608,-0.495392,4.504608,12.504608
1.0,624f258b49e77713fc34034560f93fb3hu3jo,574007729285576591,-4397292638540504490,-1.309401,0.690599,-1.309401,0.690599,0.690599,-0.309401,-1.309401,-0.309401
2.0,1c5f4fc669099bfbfac515cd26997bd12ruaj,6711196979030722927,7273737260077819105,1.883049,1.883049,-1.116951,1.883049,1.883049,-0.116951,-1.116951,-0.116951
3.0,4bee9aba2abda51bf43d639013d6efe12iycd,4984552371081301766,-5057817965675053521,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4.0,4ab12bc61c82ddd9c2d65e60555808acqgos1,4050386565501899261,-846087952996735143,21.755616,153.755616,-0.244384,153.755616,121.755616,89.755616,0.755616,51.755616
5.0,7eaefc97fbf6af12e930528151f86eb91bafh,8798815051332061309,3615563784774484079,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6.0,25558d24bca82beef0f9db4ba1fe2045ynnvq,7762475301981469917,-5108273119361717315,6.466752,6.466752,-0.533248,6.466752,1.466752,6.466752,-0.533248,0.466752
7.0,88ae7a35e374a6fddd079ebb28c822eeohwse,-3167949981768272239,7616995667260086732,8.992740,32.992740,-0.007260,32.992740,30.992740,-0.007260,1.992740,1.992740
8.0,57db69e32163f3e486dc6ef7d615aa12usje6,422103476398979380,6750285072029618256,-2.041452,-2.041452,-3.041452,-2.041452,-2.041452,-2.041452,-3.041452,-2.041452
9.0,d1be739798ba0745a1fd72ac918a9f1929hei,-5351326307197685800,-4497620196521003581,7.845777,11.845777,-1.154223,11.845777,9.845777,4.845777,2.845777,8.845777


In [143]:
len(set(testData['payment_account']))

4700

In [132]:
#oversampledTestDF = pd.DataFrame(oversampledData)
#testData['bidder_id'] = testData['bidder_id'].apply(convertStringToHash)
testData['payment_account'] = testData['payment_account'].apply(convertStringToHash)
testData['address'] = testData['address'].apply(convertStringToHash)
testData2 = pd.concat([testData,pd.DataFrame(bidderFreqTest)],axis = 1)
testData2

  return -res
  return -res


Unnamed: 0,bidder_id,payment_account,address,0,1,2,3,4,5,6,7
0.0,49bb5a3c944b8fc337981cc7a9ccae41u31d7,5541475518607987058,-1011944063549235223,0.000000,1.000000,-2.000000,1.000000,1.000000,0.000000,0.000000,-1.000000
1.0,a921612b85a1494456e74c09393ccb65ylp4y,2792115275379746347,8678545645572533409,-0.722179,0.277821,-1.722179,0.277821,-0.722179,-1.722179,-0.722179,0.277821
2.0,6b601e72a4d264dab9ace9d7b229b47479v6i,-1298663123734603250,-4572198111325225731,12.810329,15.810329,-0.189671,15.810329,2.810329,0.810329,1.810329,2.810329
3.0,eaf0ed0afc9689779417274b4791726cn5udi,-329480397667736151,8417942082977304833,88.353712,146.353712,-0.646288,146.353712,127.353712,78.353712,12.353712,79.353712
4.0,cdecd8d02ed8c6037e38042c7745f688mx5sf,-8026921228716048020,8368280834780830364,18.600207,21.600207,-0.399793,21.600207,15.600207,-0.399793,0.600207,15.600207
5.0,d4aed439bdc854a56fc6cc3bdb986775w7hxw,3573881728470575486,7767803076247426218,94.509759,230.509759,-0.490241,229.509759,138.509759,151.509759,19.509759,85.509759
6.0,ed591299b162a19ff77f0479495831b31hl1q,-4026895228683048128,-1115239057421464036,1.498149,1.498149,-1.501851,1.498149,1.498149,1.498149,-1.501851,1.498149
7.0,eebdee08b0f67283126ef60307f49680sb9va,6806267584069628064,2693869730285310074,156.689014,381.689014,-0.310986,381.689014,258.689014,97.689014,24.689014,139.689014
8.0,6887f0abc4eb4c79eb0e23c48ceea186vjfih,-4086479052316771935,5983010404429134639,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9.0,37eb6e2979e66d4ce29a74ac1c8bc6a5lqs6t,-6510622772038343548,-7245098282477094993,15.774298,125.774298,-0.225702,125.774298,117.774298,25.774298,19.774298,54.774298


In [97]:
#bidderFreqFeaturesTest = {}
bidderFreqTest = []
bidderGroups = bidsData.groupby("bidder_id")
for i in range(0,len(slicedDataTest)):
    freqList = []
    idHash = slicedDataTest[i][0]
    try:
        #freqList.append(len(bidderGroups.get_group(idHash).groupby('auction')))
        numAuctions = len(bidderGroups.get_group(idHash).groupby('auction'))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('bid_id')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('merchandise')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('time')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('ip')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('url')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('country')))
        freqList.append(len(bidderGroups.get_group(idHash).groupby('device')))
        #freqList = [float(item)/float(numAuctions) for item in freqList]
        freqList = [float(item)/float(numAuctions) for item in freqList]
        bidderFreqTest.append(freqList)
    except:
        bidderFreqTest.append([0.,0.,0.,0.,0.,0.,0.])
        pass

In [26]:
bidderFreqTrain = []
for bidder_id in bidderFreqFeatures.keys():
    bidderFreqTrain.append(bidderFreqFeatures[bidder_id])
bidderFreqTrain = np.array(bidderFreqTrain)

In [69]:
bidderFreqTest = []
for bidder_id in bidderFreqFeaturesTest.keys():
    bidderFreqTest.append(bidderFreqFeaturesTest[bidder_id])
bidderFreqTest = np.array(bidderFreqTest)

In [30]:
len(oversampledData)

2940

In [230]:
smote = SMOTE(ratio=1/ratio, verbose=True)
smox, smoy = smote.fit_transform(trainDataSliced, trainLabels)

Determining classes statistics... 2 classes detected: {0.0: 1910, 1.0: 103}
Finding the 5 nearest neighbours...done!
Creating synthetic samples...Generated 1910 new samples ...
done!


In [222]:
smote = SMOTE(ratio=0.6/ratio, verbose=True)
smox, smoy = smote.fit_transform(bidderFreqTrain, trainLabels)

Determining classes statistics... 2 classes detected: {0.0: 1910, 1.0: 103}
Finding the 5 nearest neighbours...done!
Creating synthetic samples...Generated 1146 new samples ...
done!


In [162]:
print len(bidderFreqTrain), len(smox)

2013 2586


In [163]:
sum(smoy),sum(trainLabels)

(676.0, 103.0)

In [176]:
smoxTest = smox[1386:2386,:]
smoyTest = smoy[1386:2386]
print sum(smoyTest)
smox1 = smox[:1386]
np.append(smox1,smox[2386:,:])
smoy1 = smoy[:1386]
np.append(smoy1,smoy[2386:])
smox = smox1
smoy = smoy1

413.0


In [34]:
bidderFreqTrain = np.array(bidderFreqTrain)

In [134]:
trainData = pd.DataFrame(trainData)
testData2.columns

Index([u'bidder_id', u'payment_account', u'address', 0, 1, 2, 3, 4, 5, 6, 7], dtype='object')

In [92]:
cols = [2,3,4,5,6,7,8,9]
trainData = trainData[cols]

In [126]:
#trainData = pd.DataFrame(trainData)
cols = [3,4,5,6,7,8,9,10]
trainData2 = trainData[cols]

In [91]:
cols = [1,2,3,4,5,6,7]
testData2 = testData2[cols]

In [77]:
cols = ['payment_account',0,1,2,3,4,5,6,7]
testData2 = testData2[cols]

In [135]:
cols = [3,4,5,6,7,8,9,10]
testData2 = testData2[cols]

###Stratified K-fold cross-validation with RandomForestClassifier

In [139]:
#clf1 = ExtraTreesClassifier(n_estimators=500, max_features=None, n_jobs=1, criterion='gini')
#clf1 = LogisticRegression()
#clf1 = SVC(probability=True)
#clf1 = SVC(gamma=2, C=1, probability=True)
oversampledLabels = np.array(oversampledLabels)
trainData2 = np.array(trainData2)
clf1 = RandomForestClassifier()
#clf1 = GaussianNB()
#clf1 = KNeighborsClassifier(n_neighbors=5)
#clf1=QDA()
#kf = cross_validation.KFold(len(oversampledLabels),n_folds=3,shuffle=True)
y_prob = np.zeros((len(oversampledLabels),2))
y_pred = np.zeros(len(oversampledLabels))
skf = cross_validation.StratifiedKFold(oversampledLabels, n_folds=5)
slicedLabelsTrain = np.array(oversampledLabels)
# Iterate through folds
for train_index, test_index in skf:
    print train_index, len(test_index)
    X_train, X_test = trainData2[train_index], trainData2[test_index]
    y_train,y_test = slicedLabelsTrain[train_index], slicedLabelsTrain[test_index]
    #clf1 = ExtraTreesClassifier(n_estimators=500, max_features=None, min_samples_leaf=9, 
                             #min_samples_split=10, n_jobs=1, criterion='gini')
    clf1.fit(X_train,y_train)
    y_prob[test_index] = clf1.predict_proba(X_test)
    y_pred[test_index] = clf1.predict(X_test)
    print "Accuracy:", accuracy_score(y_test, y_pred[test_index])
    print "Confusion Matrix:", confusion_matrix(y_test, y_pred[test_index])
    botProb = [prob[1] for prob in y_prob[test_index]]
    print "AUC:", roc_auc_score(y_test, botProb)
    
print "Accuracy:", accuracy_score(slicedLabelsTrain, y_pred)
print "Confusion Matrix:", confusion_matrix(slicedLabelsTrain, y_pred)
botProb = [prob[1] for prob in y_prob]
print "AUC:", roc_auc_score(slicedLabelsTrain, botProb)

[   2    3    4 ..., 2834 2835 2836] 568
Accuracy: 0.672535211268
Confusion Matrix: [[337  45]
 [141  45]]
AUC: 0.791898890953
[   0    1    2 ..., 2833 2834 2835] 568
Accuracy: 0.700704225352
Confusion Matrix: [[358  24]
 [146  40]]
AUC: 0.801574902888
[   0    1    2 ..., 2834 2835 2836] 567
Accuracy: 0.731922398589
Confusion Matrix: [[339  43]
 [109  76]]
AUC: 0.827048252441
[   0    1    2 ..., 2834 2835 2836] 567
Accuracy: 0.712522045855
Confusion Matrix: [[357  25]
 [138  47]]
AUC: 0.831250884392
[   0    1    3 ..., 2832 2833 2836] 567
Accuracy: 0.675485008818
Confusion Matrix: [[338  44]
 [140  45]]
AUC: 0.805405405405
Accuracy: 0.698625308424
Confusion Matrix: [[1729  181]
 [ 674  253]]
AUC: 0.796516376082


In [130]:
bidderFreqTest[7]

[156.68901377855406,
 381.68901377855406,
 -0.310986221445952,
 381.68901377855406,
 258.68901377855406,
 97.689013778554042,
 24.689013778554049,
 139.68901377855406]

In [42]:
#clf1 = SVC(gamma=2, C=1, probability=True)
#clf1.fit(smox,smoy)
#clf1.fit(bidderFreqTrain,oversampledLabels)
scores = cross_validation.cross_val_score(clf1,trainData, oversampledLabels, n_jobs=1,scoring='roc_auc',cv=5)
scores

array([ 0.99866295,  1.        ,  1.        ,  1.        ,  1.        ])

In [47]:
scores

array([ 0.97736291,  0.97138326,  0.97891071])

###Predict on test data

In [115]:
#clf1.fit(bidderFreqTrain,trainLabels)
bidderFreqTest = np.array(bidderFreqTest)
pred_labels = clf1.predict(bidderFreqTest)
pred_probs = clf1.predict_proba(bidderFreqTest)
print "Accuracy:", accuracy_score(slicedLabelsTest, pred_labels)
print "Confusion Matrix:", confusion_matrix(slicedLabelsTest, pred_labels)
botProb = [prob[1] for prob in pred_probs]
print "AUC:", roc_auc_score(slicedLabelsTest, botProb)

Accuracy: 0.968
Confusion Matrix: [[636  13]
 [ 19 332]]
AUC: 0.988448149465


In [140]:
#clf1.fit(smox,smoy)
pred_labels = clf1.predict(testData2)
pred_probs = clf1.predict_proba(testData2)

In [141]:
pred_probs

array([[  1.00000000e+00,   8.25566647e-15],
       [  1.00000000e+00,   1.60062500e-11],
       [  8.72631307e-01,   1.27368693e-01],
       ..., 
       [  8.54964051e-03,   9.91450359e-01],
       [  9.99969724e-01,   3.02763819e-05],
       [  8.95441861e-01,   1.04558139e-01]])

###Write results into a csv file

In [142]:
i = 0
with open('results_oversamplingdup_NB.csv','w') as f:
    f.write("bidder_id,prediction\n")
    for testRow in testData.iterrows():
        f.write(str(testRow[1][0]) + "," + str(pred_probs[i][1]) + "\n")
        i = i + 1