In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
from sklearn import svm
from sklearn import preprocessing
import timeit
import math


In [3]:
# get data in basic table form
df = pd.read_csv("./train_sample.csv")
df

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0
5,93663,3,1,17,115,2017-11-09 01:22:13,,0
6,17059,1,1,17,135,2017-11-09 01:17:58,,0
7,121505,9,1,25,442,2017-11-07 10:01:53,,0
8,192967,2,2,22,364,2017-11-08 09:35:17,,0
9,143636,3,1,19,135,2017-11-08 12:35:26,,0


### Time to Attribution, hour of day, day of week, minute of hour

In [4]:
df['time_to_attribution'] = df.apply(lambda row: datetime.strptime(row['attributed_time'], '%Y-%m-%d %H:%M:%S') - datetime.strptime(row['click_time'], '%Y-%m-%d %H:%M:%S') if isinstance(row['attributed_time'], str) else '', axis=1)

df['hour_of_day'] = df.apply(lambda row: datetime.strptime(row['click_time'], '%Y-%m-%d %H:%M:%S').hour, axis=1)
df['day_of_week'] = df.apply(lambda row: datetime.strptime(row['click_time'], '%Y-%m-%d %H:%M:%S').weekday(), axis=1)
df['minute_of_hour'] = df.apply(lambda row: datetime.strptime(row['click_time'], '%Y-%m-%d %H:%M:%S').minute, axis=1)

print(df['day_of_week'].max())
print(df['day_of_week'].min())
print(df['hour_of_day'].max())
print(df['hour_of_day'].min())
print(df['minute_of_hour'].max())
print(df['minute_of_hour'].min())

3
0
23
0
59
0


### Click Frequency

In [5]:
df['click_time_dt'] = df.apply(lambda row: (dateutil.parser.parse(row['click_time'], dayfirst=True) - datetime.utcfromtimestamp(0)).total_seconds() , axis=1)


print(df['ip'].nunique())
ip = df['ip'].unique().tolist()
min_time = df.groupby('ip')["click_time_dt"].min().tolist()
max_time = df.groupby('ip')["click_time_dt"].max().tolist()
num_clicks = df.groupby('ip')["click_time_dt"].count().tolist()
ip



34857


[87540,
 105560,
 101424,
 94584,
 68413,
 93663,
 17059,
 121505,
 192967,
 143636,
 73839,
 34812,
 114809,
 114220,
 36150,
 72116,
 5314,
 106598,
 72065,
 37301,
 28735,
 66918,
 25761,
 8362,
 45257,
 145896,
 162976,
 52432,
 135690,
 139137,
 48846,
 70747,
 10831,
 89242,
 140138,
 28411,
 127888,
 75943,
 87879,
 250933,
 133933,
 35096,
 49431,
 53365,
 92599,
 107148,
 116677,
 11051,
 48240,
 92488,
 74999,
 123221,
 4019,
 40168,
 328403,
 134466,
 122354,
 104645,
 184757,
 3488,
 63994,
 52076,
 88604,
 9073,
 105861,
 25114,
 25876,
 61941,
 79827,
 99641,
 171729,
 257816,
 100494,
 5348,
 137208,
 81356,
 58128,
 71483,
 7521,
 100066,
 259930,
 30240,
 63109,
 36311,
 44494,
 111021,
 210555,
 6414,
 123729,
 73040,
 334558,
 123994,
 94536,
 127351,
 2600,
 99926,
 114878,
 173847,
 145747,
 94081,
 74725,
 51895,
 75431,
 104922,
 173247,
 124006,
 77310,
 61667,
 118349,
 46650,
 337099,
 34067,
 61635,
 145260,
 110245,
 77730,
 108831,
 124794,
 61706,
 91175,


In [11]:
# calculating click_frequency and statistics

freq_dict = {}
for (i, m, M, n) in zip(ip, min_time, max_time, num_clicks):
    # skip if only 1 click occurred
    if M - m == 0:
        freq_dict[i] = 0
    else:
        freq_dict[i] = n / (M - m)
        
df['click_freq_for_ip'] = df.apply(lambda row: (freq_dict[row["ip"]]) , axis=1)

# data analysis on frequency list
freqs = list(freq_dict.values())
average_freq = sum(freqs)/len(freq_dict)
average_period = 1/average_freq
arr = np.array(freqs)
stdev = np.std(arr)
outliers = arr[(arr - np.mean(arr)) > 2 * np.std(arr)]
print(outliers)
#np.max(arr)

[0.125      0.08       1.         2.         2.         0.07407407
 2.         0.14285714 2.         0.08       0.05555556 0.08695652
 0.22222222 0.22222222 0.15384615 0.33333333 0.0625     0.0952381
 0.08       0.28571429 2.        ]


### Click time delta


In [21]:
ip_prev_click_time = {}

def hash_id(row, attributes):
    return ''.join(str(row[a]) for a in attributes)


def click_time_delta(row, attributes):
    uid = hash_id(row, attributes)
    prev_click_time = ip_prev_click_time.get(uid)
    ip_prev_click_time[uid] = row["click_time_dt"]
    if prev_click_time:
        return abs(row["click_time_dt"] - prev_click_time)
    else:
        return row["click_time_dt"]

df = df.sort_values(by='click_time_dt')
df['prev_click_time_delta_for_ip'] = df.apply(lambda row: click_time_delta(row, ["ip"]) , axis=1)
df['prev_click_time_delta_for_ip_os_device'] = df.apply(lambda row: click_time_delta(row, ["ip", "os", "device"]) , axis=1)
df['prev_click_time_delta_for_ip_app_os_device'] = df.apply(lambda row: click_time_delta(row, ["ip", "app", "os", "device"]) , axis=1)
df['prev_click_time_delta_for_ip_app'] = df.apply(lambda row: click_time_delta(row, ["ip", "app"]) , axis=1)



In [22]:
ip_prev_click_time = {}

df = df.sort_values(by='click_time_dt', ascending=False)
df['next_click_time_delta_for_ip'] = df.apply(lambda row: click_time_delta(row, ["ip"]) , axis=1)
df['next_click_time_delta_for_ip_os_device'] = df.apply(lambda row: click_time_delta(row, ["ip", "os", "device"]) , axis=1)
df['next_click_time_delta_for_ip_app_os_device'] = df.apply(lambda row: click_time_delta(row, ["ip", "app", "os", "device"]) , axis=1)
df['next_click_time_delta_for_ip_app'] = df.apply(lambda row: click_time_delta(row, ["ip", "app"]) , axis=1)

df

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,time_to_attribution,hour_of_day,...,click_time_dt,click_freq_for_ip,prev_click_time_delta_for_ip,prev_click_time_delta_for_ip_os_device,prev_click_time_delta_for_ip_app_os_device,prev_click_time_delta_for_ip_app,next_click_time_delta_for_ip,next_click_time_delta_for_ip_os_device,next_click_time_delta_for_ip_app_os_device,next_click_time_delta_for_ip_app
23038,44018,13,1,19,477,2017-11-09 15:59:51,,0,,15,...,1.505146e+09,0.000000e+00,3.350700e+04,3.350700e+04,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
41817,11911,1,1,22,115,2017-11-09 15:59:46,,0,,15,...,1.505146e+09,0.000000e+00,2.691545e+06,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
17634,81374,14,1,25,118,2017-11-09 15:59:44,,0,,15,...,1.505146e+09,9.335306e-07,5.375884e+06,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
83818,91779,18,1,41,379,2017-11-09 15:59:43,,0,,15,...,1.505146e+09,7.597260e-07,5.411012e+06,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
75024,85592,12,1,13,145,2017-11-09 15:59:42,,0,,15,...,1.505146e+09,0.000000e+00,1.127500e+04,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
40708,45248,11,1,17,173,2017-11-09 15:59:41,,0,,15,...,1.505146e+09,5.146747e-05,1.954700e+04,3.044000e+04,1.505146e+09,5.401308e+06,1.505146e+09,1.505146e+09,1.505146e+09,5.401308e+06
54485,110811,8,1,13,145,2017-11-09 15:59:39,,0,,15,...,1.505146e+09,0.000000e+00,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
89994,80992,20,1,13,259,2017-11-09 15:59:36,,0,,15,...,1.505146e+09,0.000000e+00,2.683613e+06,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
69075,48282,18,1,13,449,2017-11-09 15:59:34,,0,,15,...,1.505146e+09,3.729248e-07,1.200000e+03,2.866000e+03,1.505146e+09,6.180000e+03,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
15653,346195,7,1,13,101,2017-11-09 15:59:33,,0,,15,...,1.505146e+09,1.855005e-06,1.840400e+04,1.840400e+04,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09,1.505146e+09
