In [None]:
import numpy as np
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import gc
# Compact unsigned dtypes for the integer-coded columns keep the 60M-row read small.
# Keys that are not present in the CSV header are simply ignored by read_csv.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

# Read a 60M-row window of train.csv, then keep a 1M-row random sample of it.
# skiprows starts at 1 so that row 0 (the header) is still parsed.
# random_state pins the sample so that "Restart & Run All" reproduces the same
# rows (and therefore the same tables and figures) on every run.
click_data = pd.read_csv(
    '../input/talkingdata-adtracking-fraud-detection/train.csv',
    parse_dates=['click_time', 'attributed_time'],
    dtype=dtypes,
    skiprows=range(1, 122991234),
    nrows=60000000,
).sample(n=1000000, random_state=42)
print("Reading Training data... Done")

In [None]:
# Cast the integer-coded fields (and the target) to pandas 'category' dtype in
# a single astype call, then show the summary of the resulting frame.
columns = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed']
click_data = click_data.astype({name: 'category' for name in columns})
click_data.describe()

In [None]:
import seaborn as sns
# Bar chart (log-scaled y axis) of how many distinct values each feature takes
# in the sampled training data.
plt.figure(figsize=(10, 6))
train_columns = ['ip', 'app', 'device', 'os', 'channel']
unique_count = [click_data[column].nunique() for column in train_columns]
sns.set(font_scale=1.2)
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The log scale is set on the axes directly so it does not depend
# on how barplot forwards extra keyword arguments.
axis = sns.barplot(x=train_columns, y=unique_count)
axis.set_yscale('log')
axis.set(xlabel='Feature', ylabel='log(Unique_Count)', title='Number of unique values per feature (Train_Sample)')
for p_feat, uniq_cnt in zip(axis.patches, unique_count):
    height = p_feat.get_height()
    # On a log axis a fixed additive offset is invisible for tall bars, so the
    # label is placed a constant *factor* above each bar instead.
    axis.text(p_feat.get_x() + p_feat.get_width() / 2, height * 1.1, uniq_cnt, ha="center", va="bottom")

In [None]:
#Checking whether we have any bad data
# Select the rows flagged as attributed and the two relevant columns in a
# single .loc call, then summarise them.
downloaded_mask = click_data['is_attributed'] == 1
click_data.loc[downloaded_mask, ['attributed_time', 'is_attributed']].describe()

In [None]:
# Share of clicks that led to a download vs. those that did not.
plt.figure(figsize=(6,6))
# The target was cast to 'category' earlier; convert it back to int so that
# .mean() yields the fraction of attributed clicks.
click_data['is_attributed']=click_data['is_attributed'].astype(int)
mean = click_data['is_attributed'].mean()
# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
axis = sns.barplot(x=['Downloaded (1)', 'Not Downloaded (0)'], y=[mean, 1-mean])
axis.set(ylabel='Percentage', title='Downloaded vs Not Downloaded')
for p_feat, share in zip(axis.patches, [mean, 1-mean]):
    height = p_feat.get_height()
    # Annotate each bar with its share expressed as a percentage.
    axis.text(p_feat.get_x()+p_feat.get_width()/2.,height+0.02,'{}%'.format(round(share * 100, 3)),ha="center")

In [None]:
# Summary of the 'ip' column restricted to clicks that were attributed.
click_data.loc[click_data['is_attributed'] == 1, 'ip'].describe()

In [None]:
#Plotting Conversion Rates Vs Most Popular Features

In [None]:
#Conversion Rates Vs Most Popular Apps
# One pass over the data: per-app click count and mean of is_attributed,
# ranked by click count (most-clicked app first). reset_index() restores 'app'
# as a column and gives a 0..n-1 rank index, which becomes the plot's x axis.
merge = (
    click_data.groupby('app')['is_attributed']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'click_cnt', 'mean': 'conv_percent'})
    .sort_values('click_cnt', ascending=False)
    .reset_index()
)

# Create the axes explicitly and hand them to DataFrame.plot. A bare
# plt.figure() followed by DataFrame.plot() leaves an empty figure behind and
# the requested figsize is never applied to the actual chart.
fig, count_ax = plt.subplots(figsize=(25, 20))
axis = merge.head(50)[['click_cnt', 'conv_percent']].plot(ax=count_ax, secondary_y='conv_percent')
axis.set(title='Conversion Rates Vs 50 Most Popular Apps', ylabel='Click Count')
# pandas exposes the secondary (right-hand) axes as .right_ax; label it
# directly instead of relying on whichever axes is "current".
axis.right_ax.set_ylabel('% Downloaded')
plt.show()
del fig, count_ax, axis
gc.collect()

In [None]:
#Conversion Rates Vs Most Popular OS
# One pass over the data: per-os click count and mean of is_attributed,
# ranked by click count (most-clicked os first). reset_index() restores 'os'
# as a column and gives a 0..n-1 rank index, which becomes the plot's x axis.
merge = (
    click_data.groupby('os')['is_attributed']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'click_cnt', 'mean': 'conv_percent'})
    .sort_values('click_cnt', ascending=False)
    .reset_index()
)

# Create the axes explicitly and hand them to DataFrame.plot. A bare
# plt.figure() followed by DataFrame.plot() leaves an empty figure behind and
# the requested figsize is never applied to the actual chart.
fig, count_ax = plt.subplots(figsize=(25, 20))
axis = merge.head(50)[['click_cnt', 'conv_percent']].plot(ax=count_ax, secondary_y='conv_percent')
axis.set(title='Conversion Rates Vs 50 Most Popular OS', ylabel='Click Count')
# pandas exposes the secondary (right-hand) axes as .right_ax; label it
# directly instead of relying on whichever axes is "current".
axis.right_ax.set_ylabel('% Downloaded')
plt.show()
del fig, count_ax, axis
gc.collect()

In [None]:
#Conversion Rates Vs Most Popular Devices
# One pass over the data: per-device click count and mean of is_attributed,
# ranked by click count (most-clicked device first). reset_index() restores
# 'device' as a column and gives a 0..n-1 rank index, which becomes the plot's
# x axis.
merge = (
    click_data.groupby('device')['is_attributed']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'click_cnt', 'mean': 'conv_percent'})
    .sort_values('click_cnt', ascending=False)
    .reset_index()
)

# Create the axes explicitly and hand them to DataFrame.plot. A bare
# plt.figure() followed by DataFrame.plot() leaves an empty figure behind and
# the requested figsize is never applied to the actual chart.
fig, count_ax = plt.subplots(figsize=(25, 20))
axis = merge.head(50)[['click_cnt', 'conv_percent']].plot(ax=count_ax, secondary_y='conv_percent')
axis.set(title='Conversion Rates Vs 50 Most Popular Devices', ylabel='Click Count')
# pandas exposes the secondary (right-hand) axes as .right_ax; label it
# directly instead of relying on whichever axes is "current".
axis.right_ax.set_ylabel('% Downloaded')
plt.show()
del fig, count_ax, axis
gc.collect()

In [None]:
#Conversion Rates Vs Most Popular IPs
# One pass over the data: per-ip click count and mean of is_attributed,
# ranked by click count (most-clicked ip first). reset_index() restores 'ip'
# as a column and gives a 0..n-1 rank index, which becomes the plot's x axis.
merge = (
    click_data.groupby('ip')['is_attributed']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'click_cnt', 'mean': 'conv_percent'})
    .sort_values('click_cnt', ascending=False)
    .reset_index()
)

# Create the axes explicitly and hand them to DataFrame.plot. A bare
# plt.figure() followed by DataFrame.plot() leaves an empty figure behind and
# the requested figsize is never applied to the actual chart.
fig, count_ax = plt.subplots(figsize=(25, 20))
# IPs are far more numerous than the other features, so the top 200 are shown.
axis = merge.head(200)[['click_cnt', 'conv_percent']].plot(ax=count_ax, secondary_y='conv_percent')
axis.set(title='Conversion Rates Vs 200 Most Popular IPs', ylabel='Click Count')
# pandas exposes the secondary (right-hand) axes as .right_ax; label it
# directly instead of relying on whichever axes is "current".
axis.right_ax.set_ylabel('% Downloaded')
plt.show()
del fig, count_ax, axis
gc.collect()

In [None]:
#Conversion Rates Vs Most Popular Channels
# One pass over the data: per-channel click count and mean of is_attributed,
# ranked by click count (most-clicked channel first). reset_index() restores
# 'channel' as a column and gives a 0..n-1 rank index, which becomes the plot's
# x axis.
merge = (
    click_data.groupby('channel')['is_attributed']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'click_cnt', 'mean': 'conv_percent'})
    .sort_values('click_cnt', ascending=False)
    .reset_index()
)

# Create the axes explicitly and hand them to DataFrame.plot. A bare
# plt.figure() followed by DataFrame.plot() leaves an empty figure behind and
# the requested figsize is never applied to the actual chart.
fig, count_ax = plt.subplots(figsize=(25, 20))
axis = merge.head(50)[['click_cnt', 'conv_percent']].plot(ax=count_ax, secondary_y='conv_percent')
axis.set(title='Conversion Rates Vs 50 Most Popular Channels', ylabel='Click Count')
# pandas exposes the secondary (right-hand) axes as .right_ax; label it
# directly instead of relying on whichever axes is "current".
axis.right_ax.set_ylabel('% Downloaded')
plt.show()
del fig, count_ax, axis
gc.collect()

In [None]:
#Let us check if we can see any correlations with time
# Derive the weekday name and hour of day from click_time. Nothing is plotted
# in this cell, so no figure is created here (a plt.figure() call would only
# render an empty canvas in the output).
click_data['click_weekday'] = click_data['click_time'].dt.day_name()
click_data['click_hr'] = click_data['click_time'].dt.hour
click_data.head()

In [None]:
# Summary statistics of the derived weekday-name column.
click_data.click_weekday.describe()

In [None]:
# Summary statistics of the derived hour-of-day column.
click_data.click_hr.describe()

In [None]:
#Weekday Vs Click count and Conversion Rate
# groupby sorts weekday *names* alphabetically, which would make the line plot
# connect days out of calendar order; re-order the rows Monday..Sunday, keeping
# only the weekdays that actually occur in the data.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
temp = click_data.groupby('click_weekday')['is_attributed'].agg(['mean', 'count'])
temp = temp.loc[[day for day in weekday_order if day in temp.index]].reset_index()
x = temp['click_weekday']
y_mean = temp['mean']    # mean of is_attributed per weekday
y_count = temp['count']  # number of clicks per weekday
print(temp)

# Two y axes sharing one x axis: conversions on the left, click volume on the right.
plot = plt.figure()
sub_plot = plot.add_subplot(111)
addon = sub_plot.twinx()

sub_plot.set_xlabel("Weekday")
sub_plot.set_ylabel("% Conversions")
addon.set_ylabel("Count of Clicks")

plot1, = sub_plot.plot(x, y_mean, color="#75a1a6",label="% Conversions")
plot2, = addon.plot(x, y_count, color="#a675a1", label="Count of Clicks")
# The lines live on different axes, so collect both handles for a single legend.
lines = [plot1, plot2]
sub_plot.legend(handles=lines, loc='best')

# Colour each axis label like the line it belongs to.
sub_plot.yaxis.label.set_color("#75a1a6")
addon.yaxis.label.set_color("#a675a1")

del temp
gc.collect()

In [None]:
#Click Hour Vs Click count and Conversion Rate
# Per-hour mean of is_attributed and number of clicks, computed in a single
# aggregation; reset_index() turns the hour back into a regular column.
hourly = (
    click_data.groupby('click_hr')['is_attributed']
    .agg(['mean', 'count'])
    .reset_index()
)
x, y_mean, y_count = hourly['click_hr'], hourly['mean'], hourly['count']

# Left axis carries the conversion line, the twinned right axis the click volume.
plot, sub_plot = plt.subplots()
addon = sub_plot.twinx()

plot1, = sub_plot.plot(x, y_mean, color="#75a1a6", label="% Conversions")
plot2, = addon.plot(x, y_count, color="#a675a1", label="Count of Clicks")

sub_plot.set_xlabel("Hour of the day")
for axes, text, colour in (
    (sub_plot, "% Conversions", "#75a1a6"),
    (addon, "Count of Clicks", "#a675a1"),
):
    # Label each y axis and tint the label like its line.
    axes.set_ylabel(text)
    axes.yaxis.label.set_color(colour)

# Both line handles are passed explicitly because they sit on different axes.
lines = [plot1, plot2]
sub_plot.legend(handles=lines, loc='best')

del hourly
gc.collect()

In [None]:
# Preview the first five rows, including the derived time columns.
click_data.head(5)