In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Make every figure in the notebook default to a wide 14x6 canvas.
# The rc setting is applied to all figures created afterwards, so no explicit
# plt.figure() call is needed here (it would only open an empty figure).
sns.set(rc={'figure.figsize': (14, 6)});

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

The datasets in this competition are huge (184.9m rows in the training dataset and 18.7m rows in the testing dataset), so I'll only use some chunks of the training data.
I am reading the training dataset in chunks of 4m rows (about 46 chunks) and keeping 1 chunk out of every 6. This means I'll only use 8 chunks of the training data (32m rows).

In [None]:
# Manually setting the types of columns reduces the memory usage by ~x2.7
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32' # for test data
}

# Sampling parameters: read the file in chunks of CHUNK_SIZE rows and keep
# one chunk out of every CHUNK_STEP (chunks 0, 6, 12, ...).
CHUNK_SIZE = 4000000
CHUNK_STEP = 6

# Read the training data as chunks of 4m
print('Reading the train.csv..')
reader = pd.read_csv('../input/train.csv', dtype=dtypes, chunksize=CHUNK_SIZE,
                     usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
                     parse_dates=['click_time'])

# Filter while streaming: chunks that are not selected are discarded as soon as
# the iterator moves on, so the whole file is never held in memory at once.
print('Selecting the train chunks to use..')
chunks_to_use = [chunk for chunk_no, chunk in enumerate(reader) if chunk_no % CHUNK_STEP == 0]
print('Selected {} chunks.'.format(len(chunks_to_use)))
train_df = pd.concat(chunks_to_use, ignore_index=True)
print('train_df created.')
# info() prints its report itself and returns None, so it must not be wrapped in print().
train_df.info()

del reader, chunks_to_use
gc.collect()

**Percentage of attributed clicks**

In [None]:
# Share of training clicks that ended in a download, as a percentage.
attributed_clicks = (train_df['is_attributed'] == 1).sum()
print('~{:.2f}%'.format(attributed_clicks * 100 / len(train_df)))

Only ~0.24% of clicks were attributed, which is very low.

**Merging the test dataset with the train dataset**

I saw a lot of kernels merging the test dataset with the train dataset, so I'll do that too.
*I read some discussions about this and they say it's "okay" (if it is not, I would appreciate it if somebody could explain why).*

In [None]:
test_df = pd.read_csv('../input/test.csv', dtype=dtypes, parse_dates=['click_time'])

# Stack train on top of test. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; pd.concat is the supported way to do the same thing.
# ignore_index=True gives the combined frame unique row labels instead of
# repeating 0..n-1 from each input; train rows still come first, so
# train_df_len can be used to split the frame again.
# Columns that exist in only one input (is_attributed, click_id) are filled
# with NaN for the rows coming from the other input.
data = pd.concat([train_df, test_df], ignore_index=True, sort=False)
print('data created.')

train_df_len = len(train_df)
del train_df, test_df
gc.collect()

In [None]:
# Print a summary of the combined train+test frame (columns, dtypes, memory usage).
data.info()

**Missing values**

In [None]:
# Count missing entries in each of the raw click columns.
data.loc[:, ['app', 'channel', 'click_time', 'device', 'ip', 'os']].isna().sum()

There are no missing values in our data, this is good!

**Unique values count**

Calculate the count of unique values 

In [None]:
# Number of distinct values in each identifier column
# (dropna=False so a missing value would be counted as its own value).
unique_counts = data[['ip', 'app', 'channel', 'device', 'os']].nunique(dropna=False)
print(unique_counts)
plt.bar(unique_counts.index.values, unique_counts)

# The counts are only needed for the plot above.
del unique_counts
gc.collect()

There are only 210217 ips, which means that a lot of clicks come from the same ip. This raises the possibility that some ips were used to make fraudulent clicks, but it is not strong evidence, since multiple people can share the same IP (for example, people in the same house behind the same router).

**Extracting time features**

Before we begin analyzing the data, I'll extract some time features from click_time (taking the local time into account).

In [None]:
# click_time is parsed without a timezone; interpret it as UTC and convert it to
# local time in Shanghai. pandas accepts timezone names directly, so no separate
# timezone import is needed in the middle of the notebook.
# The converted timestamps are kept in a standalone Series rather than written
# into `data`, which avoids adding (and then dropping) a wide temporary column
# on the full frame.
local_click_time = data['click_time'].dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai')

# Day of month and hour of day both fit in an unsigned byte.
data['click_day'] = local_click_time.dt.day.astype('uint8')
data['click_hour'] = local_click_time.dt.hour.astype('uint8')

# The raw timestamp is no longer needed once the features are extracted.
data = data.drop(columns=['click_time'])
del local_click_time
print('Extracted time features.')

**Number of clicks per ip**

Let's see if there is anything weird about the ips.

In [None]:
# The 20 ips with the most clicks (value_counts sorts by count, descending).
# head() is used instead of [:20] so the selection is unambiguously positional
# even though the index holds integer ip values.
clicks_per_ip = data['ip'].value_counts().head(20)

# seaborn >= 0.12 accepts x and y as keyword arguments only.
ax = sns.barplot(x=clicks_per_ip.index.values, y=clicks_per_ip.values)
ax.set(xlabel='ip', ylabel='clicks');

Ips 5348 and 5314 have a huge number of clicks (350k+), let's see how they are distributed throughout the hours.

In [None]:
# The two ips with the highest click counts, one subplot each.
most_clicked_ips = clicks_per_ip.index.values[:2]
fig, axes = plt.subplots(1, 2)

for ax, ip in zip(axes, most_clicked_ips):
    # Clicks of this ip only, counted per hour of the day.
    ip_hours = data.loc[data['ip'] == ip, ['ip', 'click_hour']]
    sns.countplot(x='click_hour', data=ip_hours, ax=ax)
    ax.set_title(ip)

These ips are generating a lot of clicks almost every hour (*they also seem to have almost the same number of clicks per hour, weird..*). 

**Downloads count for ips 5348 and 5314**

Since these are the ips that generate the most clicks, let's see how many times they actually downloaded the app.

In [None]:
# Clicks and downloads for the two busiest ips, plus the resulting download rate in percent.
ips_download_counts = (
    data.loc[data['ip'].isin(most_clicked_ips), ['ip', 'app', 'is_attributed']]
    .groupby('ip')
    .agg({'app': 'count', 'is_attributed': 'sum'})
    .rename(columns={'app': 'click_count', 'is_attributed': 'download_count'})
)
ips_download_counts['download_rate'] = (
    100 * ips_download_counts['download_count'] / ips_download_counts['click_count']
)
ips_download_counts

So out of 374k and 406k clicks, they only downloaded the app 4xx times. This doesn't look legit. How many devices did these ips use?

In [None]:
# Number of distinct devices seen for each of the two busiest ips.
data.loc[data['ip'].isin(most_clicked_ips), ['ip', 'device']].groupby('ip')['device'].nunique()

They only use 303 and 289 devices respectively...

**[★ New Feature] Number of clicks per ip**

In [None]:
# Total number of clicks per ip, attached to every row as `ip_count`.
ip_counts = (
    data.groupby('ip')['channel']
    .count()
    .rename('ip_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, ip_counts, how='left', on='ip')

# The lookup table is no longer needed after the merge.
del ip_counts
gc.collect()

**Number of clicks per day per ip**

In [None]:
# Daily click counts for the two busiest ips, drawn as grouped bars (one bar per ip).
clicks_per_day_per_ip = (
    data.loc[data['ip'].isin(most_clicked_ips), ['click_day', 'ip', 'channel']]
    .groupby(['click_day', 'ip'])
    .count()
    .rename(columns={'channel': 'count'})
)
clicks_per_day_per_ip.unstack().plot(kind='bar')

**Number of clicks per hour per ip**

In [None]:
# Hourly click counts for the two busiest ips, drawn as grouped bars (one bar per ip).
clicks_per_hour_per_ip = (
    data.loc[data['ip'].isin(most_clicked_ips), ['click_hour', 'ip', 'channel']]
    .groupby(['click_hour', 'ip'])
    .count()
    .rename(columns={'channel': 'count'})
)
clicks_per_hour_per_ip.unstack().plot(kind='bar')

**[★ New Feature] Number of clicks per day per hour per ip**

In [None]:
# Number of clicks per (day, hour, ip), attached to every row as `day_hour_ip_count`.
day_hour_ip_keys = ['click_day', 'click_hour', 'ip']
day_hour_ip_counts = (
    data.groupby(day_hour_ip_keys)['channel']
    .count()
    .rename('day_hour_ip_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, day_hour_ip_counts, how='left', on=day_hour_ip_keys)

# The lookup table is no longer needed after the merge.
del day_hour_ip_counts
gc.collect()

In [None]:
# Drop the intermediate results of the ip exploration to free memory.
del clicks_per_ip
del ips_download_counts
del clicks_per_day_per_ip
del clicks_per_hour_per_ip
gc.collect()

**What about the devices?**

As we saw, the two most used ips (5348 and 5314) only use 303 and 289 devices respectively, out of 2552. Considering the number of clicks they generated, this doesn't look trustworthy. Let's look at the devices more closely.

**Number of clicks per device**

In [None]:
# The 10 devices with the most clicks (value_counts sorts by count, descending).
# head() is used instead of [:10] so the selection is unambiguously positional
# even though the index holds integer device ids.
clicks_per_device = data['device'].value_counts().head(10)
print(clicks_per_device)

# seaborn >= 0.12 accepts x and y as keyword arguments only.
ax = sns.barplot(x=clicks_per_device.index.values, y=clicks_per_device.values)
ax.set(xlabel='device', ylabel='clicks');

Device 1 is the most used (47.4m) followed by Device 2 (2.5m).

**Devices used by the most used ips**

In [None]:
# Device usage of the two busiest ips, restricted to (and ordered by) the overall top devices.
most_used_devices = clicks_per_device.index.values
fig, axes = plt.subplots(2, 1)

for ax, ip in zip(axes, most_clicked_ips):
    ip_devices = data.loc[data['ip'] == ip, ['ip', 'device']]
    sns.countplot(x='device', data=ip_devices, ax=ax, order=most_used_devices)
    ax.set_title(ip)

Almost all their clicks are done using Device 1 and Device 2. I'll go ahead and add a feature representing the number of clicks per ip per device.

In [None]:
# Preview the click counts per (ip, device) pair, largest first.
# A groupby() on its own only builds a lazy GroupBy object and displays its repr;
# an aggregation has to be applied to obtain actual numbers.
data.groupby(['ip', 'device'])['channel'].count().sort_values(ascending=False).head(10)

**Download counts for the top 10 devices**

In [None]:
# Clicks and downloads for the most used devices, plus the resulting download rate in percent.
devices_download_counts = (
    data.loc[data['device'].isin(most_used_devices), ['device', 'ip', 'is_attributed']]
    .groupby('device')
    .agg({'ip': 'count', 'is_attributed': 'sum'})
    .rename(columns={'ip': 'click_count', 'is_attributed': 'download_count'})
)
devices_download_counts['download_rate'] = (
    100 * devices_download_counts['download_count'] / devices_download_counts['click_count']
)
devices_download_counts

47.4m clicks on Device 1 but only 0.1% downloads, 2.5m clicks on Device 2 but only 0.01% downloads. Not only are the ips we suspect using these devices a lot, but the download rate of these devices is also very low.

In [None]:
# Drop the intermediate results of the device exploration to free memory.
del clicks_per_device
del devices_download_counts
gc.collect()

**[★ New Feature] Number of clicks per ip and device**

In [None]:
# Number of clicks per (ip, device), attached to every row as `ip_device_count`.
ip_device_keys = ['ip', 'device']
ip_device_counts = (
    data.groupby(ip_device_keys)['channel']
    .count()
    .rename('ip_device_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, ip_device_counts, how='left', on=ip_device_keys)

# The lookup table is no longer needed after the merge.
del ip_device_counts
gc.collect()

**What about the apps?**

If there is someone who's generating fraudulent clicks, usually they'll focus on one app, let's see if that's true.

In [None]:
# Click count of every app, sorted from most to least clicked.
clicks_per_app = (
    data.loc[:, ['app', 'channel']]
    .groupby('app')
    .count()
    .sort_values('channel', ascending=False)
    .loc[:, 'channel']
)
print('Top 10', clicks_per_app[:10])

# One point per app: app id on the x axis, click count on the y axis.
ax = plt.gca()
ax.scatter(clicks_per_app.index, clicks_per_app)
ax.set_xlabel('app')
ax.set_ylabel('count')

As we can see, some of the apps have a lot more clicks than the others. Either they are very popular apps or targeted apps.

**Number of clicks per day per app (top 10 apps)**

In [None]:
# Ids of the 10 most clicked apps (clicks_per_app is already sorted by count).
most_used_apps = clicks_per_app.index.values[:10]

# Daily click counts for those apps, drawn as grouped bars (one bar per app).
clicks_per_day_per_app = (
    data.loc[data['app'].isin(most_used_apps), ['click_day', 'app', 'ip']]
    .groupby(['click_day', 'app'])
    .count()
    .rename(columns={'ip': 'count'})
)
clicks_per_day_per_app.unstack().plot(kind='bar')

**Number of clicks per hour per app (6 most used apps)**

In [None]:
# Hourly click counts for the 6 most clicked apps, drawn as grouped bars (one bar per app).
clicks_per_hour_per_app = (
    data.loc[data['app'].isin(most_used_apps[:6]), ['click_hour', 'app', 'ip']]
    .groupby(['click_hour', 'app'])
    .count()
    .rename(columns={'ip': 'count'})
)
clicks_per_hour_per_app.unstack().plot(kind='bar')

**Download counts for the top 10 apps**

In [None]:
# Clicks and downloads for the most used apps, plus the resulting download rate in percent.
apps_download_counts = (
    data.loc[data['app'].isin(most_used_apps), ['app', 'ip', 'is_attributed']]
    .groupby('app')
    .agg({'ip': 'count', 'is_attributed': 'sum'})
    .rename(columns={'ip': 'click_count', 'is_attributed': 'download_count'})
)
apps_download_counts['download_rate'] = (
    100 * apps_download_counts['download_count'] / apps_download_counts['click_count']
)
# Show the busiest apps first.
apps_download_counts.sort_values('click_count', ascending=False)

The most used apps have a very low download rate (especially App 12 with only 0.006% out of 6.5m clicks). This reinforces the guess that these apps are being targeted for fraud.

**[★ New Feature] Number of clicks per day per hour per app**

In [None]:
# Number of clicks per (day, hour, app), attached to every row as `day_hour_app_count`.
day_hour_app_keys = ['click_day', 'click_hour', 'app']
day_hour_app_counts = (
    data.groupby(day_hour_app_keys)['channel']
    .count()
    .rename('day_hour_app_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, day_hour_app_counts, how='left', on=day_hour_app_keys)

# The lookup table is no longer needed after the merge.
del day_hour_app_counts
gc.collect()

In [None]:
# Drop the intermediate results of the app exploration to free memory.
del clicks_per_app
del clicks_per_day_per_app
del clicks_per_hour_per_app
del apps_download_counts
gc.collect()

**Are the channels the same as apps? Are there channels way more used than the rest?**

In [None]:
# Click count of every channel, sorted from most to least clicked.
clicks_per_channel = (
    data.loc[:, ['app', 'channel']]
    .groupby('channel')
    .count()
    .sort_values('app', ascending=False)
    .loc[:, 'app']
)
print('Top 10', clicks_per_channel[:10])

# One point per channel: channel id on the x axis, click count on the y axis.
ax = plt.gca()
ax.scatter(clicks_per_channel.index, clicks_per_channel)
ax.set_xlabel('channel')
ax.set_ylabel('count')

Some of the channels have more clicks than the others (mainly 280, 107).

**Number of clicks per app per channel**

In [None]:
# Ids of the 10 most clicked channels (clicks_per_channel is already sorted by count).
most_used_channels = clicks_per_channel.index.values[:10]

# Build a single row mask for "one of the 6 busiest channels AND one of the top apps".
# Chaining two boolean selections (df[mask_a][mask_b]) applies a full-length mask to
# an already filtered frame, which makes pandas reindex the mask (with a warning)
# and creates an extra intermediate copy.
top_channel_app_mask = (
    data['channel'].isin(most_used_channels[:6]) & data['app'].isin(most_used_apps)
)

# Click counts per (app, channel), drawn as grouped bars (one bar per channel).
clicks_per_app_per_channel = (
    data.loc[top_channel_app_mask, ['app', 'channel', 'ip']]
    .groupby(['app', 'channel'])
    .count()
    .rename(columns={'ip': 'count'})
)
clicks_per_app_per_channel.unstack().plot(kind='bar')

Looks like some channels are only used by some apps.

**Number of downloads per channel**

In [None]:
# Clicks and downloads for the most used channels, plus the resulting download rate in percent.
channel_download_counts = (
    data.loc[data['channel'].isin(most_used_channels), ['channel', 'ip', 'is_attributed']]
    .groupby('channel')
    .agg({'ip': 'count', 'is_attributed': 'sum'})
    .rename(columns={'ip': 'click_count', 'is_attributed': 'download_count'})
)
channel_download_counts['download_rate'] = (
    100 * channel_download_counts['download_count'] / channel_download_counts['click_count']
)
# Show the busiest channels first.
channel_download_counts.sort_values('click_count', ascending=False)

**[★ New Feature] Number of clicks per app per channel**

In [None]:
# Number of clicks per (app, channel), attached to every row as `app_channel_count`.
app_channel_keys = ['app', 'channel']
app_channel_counts = (
    data.groupby(app_channel_keys)['ip']
    .count()
    .rename('app_channel_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, app_channel_counts, how='left', on=app_channel_keys)

# The lookup table is no longer needed after the merge.
del app_channel_counts
gc.collect()

In [None]:
# Drop the intermediate results of the channel exploration to free memory.
del clicks_per_channel
del clicks_per_app_per_channel
del channel_download_counts
gc.collect()

**Number of clicks per os**

In [None]:
# Click count of every os, sorted from most to least clicked.
clicks_per_os = (
    data.loc[:, ['os', 'channel']]
    .groupby('os')
    .count()
    .sort_values('channel', ascending=False)
    .loc[:, 'channel']
)
print('Top 10', clicks_per_os[:10])

# One point per os: os id on the x axis, click count on the y axis.
ax = plt.gca()
ax.scatter(clicks_per_os.index, clicks_per_os)
ax.set_xlabel('os')
ax.set_ylabel('count')

Right off the bat we see 2 os having way more clicks than all the rest (13 and 19).

**Number of clicks per os (top 2) per device**

In [None]:
# Ids of the 2 most clicked os (clicks_per_os is already sorted by count).
most_used_os = clicks_per_os.index.values[:2]
fig, axes = plt.subplots(2, 1)

# One subplot per os: device usage restricted to (and ordered by) the overall top devices.
for idx, ax in enumerate(axes):
    os_id = most_used_os[idx]
    os_devices = data.loc[data['os'] == os_id, ['os', 'device']]
    sns.countplot(x='device', data=os_devices, ax=ax, order=most_used_devices)
    ax.set_title(os_id)

These are the same devices (1 and 2) used A LOT by the ips we suspect are generating fraudulent clicks (5348 and 5314). They also seem to use one of these os (13 or 19).

**[★ New Feature] Number of clicks per os per device**

In [None]:
# Number of clicks per (os, device), attached to every row as `os_device_count`.
os_device_keys = ['os', 'device']
os_device_counts = (
    data.groupby(os_device_keys)['channel']
    .count()
    .rename('os_device_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, os_device_counts, how='left', on=os_device_keys)

# The lookup table is no longer needed after the merge.
del os_device_counts
gc.collect()

**What os are the most used apps on?**

In [None]:
fig, axes = plt.subplots(2, 1)

# The "is one of the top apps" mask does not depend on the os, so compute it once.
top_app_mask = data['app'].isin(most_used_apps)

# One subplot per os: how often each of the top apps is clicked on that os.
for idx, ax in enumerate(axes):
    os_id = most_used_os[idx]
    # Combine both conditions into one mask. Chaining two boolean selections
    # (df[mask_a][mask_b]) applies a full-length mask to an already filtered frame,
    # which makes pandas reindex the mask (with a warning) and copy the data twice.
    os_apps = data.loc[(data['os'] == os_id) & top_app_mask, ['os', 'app']]
    sns.countplot(x='app', data=os_apps, ax=ax, order=most_used_apps)
    ax.set_title(os_id)

**[★ New Feature] Number of clicks per os per app per channel**

In [None]:
# Number of clicks per (os, app, channel), attached to every row as `os_app_channel_count`.
os_app_channel_keys = ['os', 'app', 'channel']
os_app_channel_counts = (
    data.groupby(os_app_channel_keys)['ip']
    .count()
    .rename('os_app_channel_count')
    .reset_index()
    .astype('uint32')
)
data = pd.merge(data, os_app_channel_counts, how='left', on=os_app_channel_keys)

# The lookup table is no longer needed after the merge.
del os_app_channel_counts
gc.collect()

In [None]:
# Number of clicks recorded for each of the two most used os.
data.loc[data['os'].isin(most_used_os), ['ip', 'os']].groupby('os').count()

In [None]:
fig, axes = plt.subplots(2, 1)

# One subplot per suspicious ip: which of the 10 most used os its clicks come from.
for idx, ax in enumerate(axes):
    ip = most_clicked_ips[idx]
    ip_os = data.loc[data['ip'] == ip, ['os', 'ip']]
    sns.countplot(x='os', data=ip_os, ax=ax, order=clicks_per_os[:10].index.values)
    ax.set_title(ip)

In [None]:
# Build a single row mask for "one of the two busiest ips AND one of the top apps".
# Chaining two boolean selections (df[mask_a][mask_b]) applies a full-length mask to
# an already filtered frame, which makes pandas reindex the mask (with a warning)
# and creates an extra intermediate copy.
top_ip_app_mask = data['ip'].isin(most_clicked_ips) & data['app'].isin(most_used_apps)

# Click counts per (app, ip); the app level is reordered to follow most_used_apps
# so the bars appear from the most to the least clicked app.
clicks_per_app_per_ip = (
    data.loc[top_ip_app_mask, ['app', 'channel', 'ip']]
    .groupby(['app', 'ip'])
    .count()
    .rename(columns={'channel': 'count'})
    .reindex(most_used_apps, level='app')
)
clicks_per_app_per_ip.unstack().plot(kind='bar')

**Conclusion**

This was a wonderful opportunity for me to learn how EDA works, how Kaggle competitions work and, most importantly, how awesome Kaggle's community is and how everyone helps each other. I appreciate every author of every kernel I have read. 
The best result I got is 0.9682 (LB) and 0.9692 (PB).