In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# NOTE(review): opens a 20x20-inch figure that nothing in this cell draws on. With the
# inline notebook backend, figures are normally closed when their cell finishes, so
# later plots would not inherit this size — confirm, and drop this cell if so.
plt.figure(figsize=(20,20))

In [3]:
# Load the sampled click log; the path is relative to the notebook's working directory.
train_csv_path = '../input/train_sample.csv'
trainsample = pd.read_csv(train_csv_path)

### Train Sample Shape 

In [4]:
# (number of rows, number of columns) of the loaded sample
trainsample.shape

### Train Sample Feature and Types
is_attributed is the target variable

In [5]:
# Per-column dtypes of the loaded frame
trainsample.dtypes

### Preprocessing: changing data types

In [6]:
# Hold the target and the id-like columns as generic objects rather than numbers.
for label_col in ['is_attributed', 'app', 'os', 'device', 'channel']:
    trainsample[label_col] = trainsample[label_col].astype('object')

# Parse both timestamp columns using the explicit 'YYYY-MM-DD HH:MM:SS' layout.
for time_col in ['click_time', 'attributed_time']:
    trainsample[time_col] = pd.to_datetime(trainsample[time_col], format='%Y-%m-%d %H:%M:%S')



### Missing Values  
Attributed_Time seems to have a lot of missing values

In [7]:
# Number of missing values in each column
trainsample.isnull().sum()

### Target Variable Counts
Totally imbalanced dataset

In [8]:
# Number of rows per value of the target column
trainsample.is_attributed.value_counts()

### Grouping and Counts of independent variables

In [9]:
def GroupByColumns(columns, df=None):
    """Count rows for every distinct combination of values in ``columns``.

    Parameters
    ----------
    columns : str or list of str
        Column name(s) to group by.
    df : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level ``trainsample`` so
        existing ``GroupByColumns(columns)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per combination, holding the grouping columns followed by a
        count column labelled ``0``, ordered from most to least frequent.
    """
    if df is None:
        # Looked up at call time, so the helper always sees the current frame.
        df = trainsample
    return (
        df.groupby(columns)
        .size()
        .sort_values(ascending=False)
        .reset_index()
    )

In [10]:
columns=['app','device','os']

In [11]:
GroupByColumns(columns)

In [12]:
columns=['app','device','os','is_attributed']

In [13]:
GroupByColumns(columns)

In [14]:
columns=['app','device','os','is_attributed']

In [15]:
groupCol = trainsample.groupby(columns)\
                    .size()\
                    .sort_values(ascending=False)\
                    .reset_index()

In [16]:
groupCol

In [17]:
# Keep only the converted clicks. Take an explicit copy so that adding columns to
# AttributedDF never writes into a slice of trainsample (SettingWithCopyWarning).
AttributedDF = trainsample[trainsample.is_attributed == 1].copy()

In [18]:
def timeBtwDwnld(df):
    """Return the click-to-download delay, in seconds, for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``click_time`` and ``attributed_time``.

    Returns
    -------
    list of float
        ``attributed_time - click_time`` in seconds, one entry per row, in row
        order. A fresh list is built on every call, so re-running a cell
        cannot append to values left over from an earlier call.
    """
    delay = df['attributed_time'] - df['click_time']
    # total_seconds() includes whole days; Timedelta.seconds is only the sub-day
    # remainder and would fold a 25-hour delay down to one hour.
    return delay.dt.total_seconds().tolist()
    

In [19]:
# Store the per-row click-to-download delay as a new column.
AttributedDF['timediff'] = timeBtwDwnld(df=AttributedDF)

In [20]:
# Distribution of the click-to-download delay, labelled so the chart stands on its own.
fig, ax = plt.subplots()
ax.hist(AttributedDF['timediff'], bins=30)
ax.set(title='Time from click to download',
       xlabel='timediff (seconds)',
       ylabel='Number of downloads')
plt.show()

### Some clicks turned into downloads more than an hour (3600 s) after the click

In [21]:
# Shape of the subset of downloads whose delay exceeds 3600 seconds
slow_download_mask = AttributedDF['timediff'] > 3600
AttributedDF[slow_download_mask].shape

### Plots

In [22]:
appseries=trainsample.app.value_counts().nlargest(20)
plt.figure(figsize=(20,20))

In [23]:
appseries.plot.bar()
plt.show()

In [24]:
osseries=trainsample.os.value_counts().nlargest(20)
plt.figure(figsize=(20,20))

In [25]:
osseries.plot.bar()
plt.show()

In [26]:
deviceseries=trainsample.device.value_counts().nlargest(20)
plt.figure(figsize=(20,20))

In [27]:
deviceseries.plot.bar()
plt.show()

In [28]:
channelseries=trainsample.channel.value_counts().nlargest(20)
plt.figure(figsize=(20,20))

In [29]:
channelseries.plot.bar()
plt.show()

In [30]:
times = pd.DatetimeIndex(trainsample.click_time)
plt.figure(figsize=(20,20))

In [31]:
grouped = trainsample.groupby([times.day])['ip'].count()

In [32]:
grouped.plot.bar()
plt.show()

In [33]:
grouped = trainsample.groupby([times.hour])['ip'].count()
plt.figure(figsize=(20,20))

In [34]:
grouped.plot.bar()
plt.show()

In [35]:
times = pd.DatetimeIndex(trainsample.attributed_time)

In [36]:
grouped = trainsample.groupby([times.day])['ip'].count()
plt.figure(figsize=(20,20))

In [37]:
grouped.plot.bar()
plt.show()

In [38]:
grouped = trainsample.groupby([times.hour])['ip'].count()
plt.figure(figsize=(20,20))

In [39]:
grouped.plot.bar()
plt.show()

In [40]:
grouped = trainsample.groupby([times.minute])['ip'].count()

In [41]:
plt.figure(figsize=(20,20))

In [42]:
grouped.plot.bar()
plt.show()

In [43]:
from imblearn.over_sampling import RandomOverSampler

# Fixed seed so the same rows are duplicated on every run.
ros = RandomOverSampler(random_state=0)

orig_x_data=trainsample[['ip','os','device','app','channel']]
# Pass a 1-D integer target. is_attributed is held as object dtype at this point,
# which scikit-learn's target-type check may reject as class labels — TODO confirm
# against the installed version.
orig_y_data=trainsample['is_attributed'].astype('int')

# fit_resample is the sampler API; the older fit_sample alias has been removed
# from imbalanced-learn.
x_oversampled, y_oversampled = ros.fit_resample(orig_x_data, orig_y_data)

In [44]:
from collections import Counter
# Tally of the resampled target values, printed in sorted order
class_counts = Counter(y_oversampled)
print(sorted(class_counts.items()))


In [45]:
trainsample['clicktimemins'] =trainsample.click_time.map(lambda t: t.strftime('%Y-%m-%d %H:%M'))

In [46]:
clickcountbymin=trainsample.groupby(['clicktimemins'])['ip'].count()
plt.figure(figsize=(15,15))

In [47]:
clickcountbymin.plot()
plt.show()

In [48]:
from pandas import Series
from pandas import DataFrame
from pandas import Grouper
import numpy as np

In [49]:
series=Series(clickcountbymin)
n = 480

In [50]:
newdf=trainsample.set_index(['click_time'])

In [51]:
dtresampler=newdf['ip'].resample('60s')

In [52]:
trainsample['clickdate'] =trainsample.click_time.map(lambda t: t.strftime('%Y-%m-%d'))

In [53]:
trainsample['timeofclick'] =trainsample.click_time.map(lambda t: t.strftime('%H:%M:%S'))

In [54]:
uniqdates=trainsample['clickdate'].unique()

In [55]:
df6th=trainsample[trainsample.clickdate=='2017-11-06']
df6th=df6th.set_index(['click_time'])

In [56]:
df7th=trainsample[trainsample.clickdate=='2017-11-07']
df7th=df7th.set_index(['click_time'])

In [57]:
df8th=trainsample[trainsample.clickdate=='2017-11-08']
df8th=df8th.set_index(['click_time'])

In [58]:
df9th=trainsample[trainsample.clickdate=='2017-11-09']
df9th=df9th.set_index(['click_time'])

In [59]:

# Build and draw the whole figure in a single cell: with the inline backend a figure is
# rendered and closed when its cell finishes, so axes filled in by later cells are never shown.
# figsize goes to subplots() itself; a separate plt.figure() call only opens a second, empty figure.
fig, ax = plt.subplots(4, 1, figsize=(20, 20), sharex='row', sharey=False, squeeze=False)
for row, day_df in enumerate([df6th, df7th, df8th, df9th]):
    # Number of clicks in each 60-second bin of that day's frame
    ax[row][0].plot(day_df['ip'].resample('60s').count())
plt.show()

In [66]:
columns=['ip','app','device','os']

In [67]:
GroupByColumns(columns)

In [68]:
# Hour of day of each click, kept as a feature column
trainsample['click_hour'] = trainsample['click_time'].dt.hour
# Target back to integers so it can be averaged per group
trainsample['is_attributed'] = trainsample['is_attributed'].astype('int')

In [69]:
#thanks to yuliagm for the conversion rate idea
# Per-IP download proportion (mean of the 0/1 target) and click count (row count),
# joined so that the most-clicked IPs come first.
proportion = trainsample[['ip', 'is_attributed']].groupby('ip', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = trainsample[['ip', 'is_attributed']].groupby('ip', as_index=False).count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='ip', how='left')
merge.columns = ['ip', 'click_count', 'prop_downloaded']

# Plot only the two metrics: 'ip' is itself numeric, so a bare .plot() would also
# draw the raw IP ids as a line on the click-count axis.
ax = merge[:300].plot(y=['click_count', 'prop_downloaded'], secondary_y='prop_downloaded')
plt.title('Conversion Rates  300 Most Popular IPs')
ax.set(ylabel='Click Count')
# Label the secondary axis explicitly rather than relying on which axes is "current".
ax.right_ax.set_ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular IPs')
print(merge[:30])

In [70]:
trainsample['click_hour']=trainsample['click_time'].dt.hour
sns.barplot('click_hour', 'is_attributed', data=trainsample)
plt.title('HOURLY CONVERSION RATIO');
plt.ylabel('Converted Ratio');

In [71]:
plt.show()

In [72]:
# Per-app download proportion (mean of the 0/1 target) and click count (row count),
# joined so that the most-clicked apps come first.
proportion = trainsample[['app', 'is_attributed']].groupby('app', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = trainsample[['app', 'is_attributed']].groupby('app', as_index=False).count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='app', how='left')
merge.columns = ['app', 'click_count', 'prop_downloaded']

# Name the plotted columns explicitly so the 'app' id column can never be drawn as a
# data line (pandas may infer an object column of integers as numeric when plotting).
ax = merge[:100].plot(y=['click_count', 'prop_downloaded'], secondary_y='prop_downloaded')
plt.title('Conversion Rates over Counts of 100 Most Popular Apps')
ax.set(ylabel='Click Count')
# Label the secondary axis explicitly rather than relying on which axes is "current".
ax.right_ax.set_ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular Apps')
print(merge[:20])

In [73]:
# Per-os download proportion (mean of the 0/1 target) and click count (row count),
# joined so that the most-clicked os values come first.
proportion = trainsample[['os', 'is_attributed']].groupby('os', as_index=False).mean().sort_values('is_attributed', ascending=False)
counts = trainsample[['os', 'is_attributed']].groupby('os', as_index=False).count().sort_values('is_attributed', ascending=False)
merge = counts.merge(proportion, on='os', how='left')
merge.columns = ['os', 'click_count', 'prop_downloaded']

# Name the plotted columns explicitly so the 'os' id column can never be drawn as a
# data line (pandas may infer an object column of integers as numeric when plotting).
ax = merge[:100].plot(y=['click_count', 'prop_downloaded'], secondary_y='prop_downloaded')
plt.title('Conversion Rates ')
ax.set(ylabel='Click Count')
# Label the secondary axis explicitly rather than relying on which axes is "current".
ax.right_ax.set_ylabel('Proportion Downloaded')
plt.show()

print('Counversion Rates over Counts of Most Popular os')
print(merge[:20])