# AdTracking Fraud Detection Challenge

In [1]:
# Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Load the training sample with explicit unsigned-integer dtypes per column
# (presumably chosen to keep the memory footprint small).
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)
train = pd.read_csv('train_sample.csv', dtype=dtypes)
# The test set is not loaded yet:
# test = pd.read_csv("test.csv", dtype=dtypes)
print(train.shape)
train.head()

(100000, 8)


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [3]:
# Check missing data: number of null entries in each column.
# attributed_time is NaN when the app isn't downloaded after clicking on the ad
# https://www.kaggle.com/pranav84/xgboost-histogram-optimized-version/code
# ==> that kernel does not use 'attributed_time'. Open question: should we?
# NOTE(review): since attributed_time is only set when a download happened, it
# looks like it would leak the is_attributed label if used as a feature -- confirm.
# Vectorized count; avoids Python's builtin sum() walking every element of
# every column through apply().
train.isnull().sum()

ip                     0
app                    0
device                 0
os                     0
channel                0
click_time             0
attributed_time    99773
is_attributed          0
dtype: int64

In [4]:
# Number of distinct values per column.
# dropna=False keeps NaN counted as its own value, as len(x.unique()) does.
train.nunique(dropna=False)

ip                 34857
app                  161
device               100
os                   130
channel              161
click_time         80350
attributed_time      228
is_attributed          2
dtype: int64

In [5]:
# Inspect the dtype of every column. read_csv is called without parse_dates,
# so click_time and attributed_time come back as object columns.
train.dtypes

ip                 uint32
app                uint16
device             uint16
os                 uint16
channel            uint16
click_time         object
attributed_time    object
is_attributed       uint8
dtype: object

In [6]:
# Extract hour, day of month and day of the week from click_time.
# Parse the timestamp column once and reuse it, instead of re-running
# pd.to_datetime over the whole column for each derived feature.
click_ts = pd.to_datetime(train['click_time'])
train['hour'] = click_ts.dt.hour.astype('uint8')
train['day'] = click_ts.dt.day.astype('uint8')
train['dow'] = click_ts.dt.dayofweek.astype('uint8')  # Monday=0 ... Sunday=6
del click_ts

In [7]:
# Number of clicks for each ip-day-hour combination.
# transform('count') broadcasts each group's row count back onto its member
# rows, so the feature is assigned directly instead of going through a
# groupby + merge round trip. Re-running the cell simply overwrites 'qty'
# rather than producing duplicated 'qty_x' / 'qty_y' columns from a second merge.
train['qty'] = train.groupby(['ip', 'day', 'hour'])['channel'].transform('count')

In [8]:
# Number of clicks for each ip-app combination.
# transform('count') broadcasts each group's row count back onto its member
# rows, so the feature is assigned directly instead of going through a
# groupby + merge round trip. Re-running the cell simply overwrites
# 'ip_app_count' rather than producing '_x' / '_y' duplicates from a second merge.
train['ip_app_count'] = train.groupby(['ip', 'app'])['channel'].transform('count')

In [9]:
# Preview the first rows to check the newly added feature columns
# (hour, day, dow, qty, ip_app_count).
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,dow,qty,ip_app_count
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,9,7,1,1,3
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,13,7,1,4,4
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,18,7,1,1,1
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,4,7,1,1,1
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,9,9,3,1,2
