In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interactive
%matplotlib inline

In [2]:
from tqdm import tqdm
from collections import defaultdict

In [3]:
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
%matplotlib inline

In [4]:
# Load the train and test user tables (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_users.csv', '../data/test_users.csv')
)

In [5]:
# Stack the train rows (without their `country_destination` column) on top of
# the test rows and renumber the index from zero.
users = pd.concat(
    [train_df.drop(labels='country_destination', axis='columns'), test_df],
    axis=0,
    ignore_index=True,
)

In [6]:
# Break the account-creation date into calendar parts, stored as new columns
# on `users` (added in year, month, day, weekday order).
date_account_created = pd.DatetimeIndex(users['date_account_created'])
_calendar_parts = {
    'account_created_year': date_account_created.year,
    'account_created_month': date_account_created.month,
    'account_created_day': date_account_created.day,
    'account_created_weekday': date_account_created.weekday,
}
for _column, _values in _calendar_parts.items():
    users[_column] = _values

In [7]:
# Load the expanded sessions table from disk.
# NOTE(review): the preview of this frame shows two leftover columns,
# 'Unnamed: 0.1' and 'Unnamed: 0' -- presumably row indexes written out by
# earlier saves; confirm before relying on column positions.
# NOTE(review): its `cumulate_secs`, `cumulate_days` and `session_date` columns
# match what the string-disabled cell that follows would compute, so that code
# presumably generated this CSV -- confirm.
sessions = pd.read_csv('../data/sessions_expanded.csv')

In [8]:
"""
sessions = sessions.rename(columns = {'user_id':'id'})
sessions = sessions.dropna(subset = ['id'])
sessions['secs_elapsed'] = sessions['secs_elapsed'].fillna(0)

sessions_ids = sessions.id.unique()
from_train = train_df.loc[train_df.id.isin(sessions_ids), ['id', 'date_account_created']]
from_test = test_df.loc[test_df.id.isin(sessions_ids), ['id', 'date_account_created']]
account_created_date = pd.concat([from_train, from_test], axis = 0)

sessions['cumulate_secs'] = sessions.groupby('id')['secs_elapsed'].cumsum()
sessions['cumulate_days'] = sessions['cumulate_secs']/60/60//24

account_date = defaultdict()
for _id, _date in zip(account_created_date['id'], account_created_date['date_account_created']):
    account_date[_id] = pd.to_datetime(_date)
    
def return_session_date(row):
    _id = row['id']
    _delta = row['cumulate_days']
    
    return account_date[_id] + pd.DateOffset(_delta)

tqdm.pandas()
sessions['session_date'] = sessions.loc[:, ['id', 'cumulate_days']].progress_apply(return_session_date, axis = 1)
"""

"\nsessions = sessions.rename(columns = {'user_id':'id'})\nsessions = sessions.dropna(subset = ['id'])\nsessions['secs_elapsed'] = sessions['secs_elapsed'].fillna(0)\n\nsessions_ids = sessions.id.unique()\nfrom_train = train_df.loc[train_df.id.isin(sessions_ids), ['id', 'date_account_created']]\nfrom_test = test_df.loc[test_df.id.isin(sessions_ids), ['id', 'date_account_created']]\naccount_created_date = pd.concat([from_train, from_test], axis = 0)\n\nsessions['cumulate_secs'] = sessions.groupby('id')['secs_elapsed'].cumsum()\nsessions['cumulate_days'] = sessions['cumulate_secs']/60/60//24\n\naccount_date = defaultdict()\nfor _id, _date in zip(account_created_date['id'], account_created_date['date_account_created']):\n    account_date[_id] = pd.to_datetime(_date)\n    \ndef return_session_date(row):\n    _id = row['id']\n    _delta = row['cumulate_days']\n    \n    return account_date[_id] + pd.DateOffset(_delta)\n\ntqdm.pandas()\nsessions['session_date'] = sessions.loc[:, ['id', 'cumu

In [9]:
# Preview the first four session rows, including the derived
# cumulate_secs / cumulate_days / session_date columns.
sessions.head(4)

Unnamed: 0.1,Unnamed: 0,id,action,action_type,action_detail,device_type,secs_elapsed,cumulate_secs,cumulate_days,session_date
0,0,d1mm9tcy42,lookup,,,Windows Desktop,319.0,319.0,0.0,2014-01-01
1,1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,68072.0,0.0,2014-01-01
2,2,d1mm9tcy42,lookup,,,Windows Desktop,301.0,68373.0,0.0,2014-01-01
3,3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,90514.0,1.0,2014-01-02


In [10]:
@widgets.interact(year = (2010, 2014))
def date(year=2010):
    """Plot how many accounts were created in each month of ``year``.

    Reads the module-level ``users`` frame; the decorator exposes ``year`` as
    an integer slider covering 2010 through 2014.
    """
    in_year = users.account_created_year == year
    created_months = users.loc[in_year, 'account_created_month']
    # Series indexed by month number, ordered by month rather than by count.
    monthly_counts = created_months.value_counts().sort_index()

    plt.figure(figsize=(12, 8))
    plt.plot(monthly_counts)
    # Put ticks only on months that actually occur in the selected year.
    plt.xticks(created_months.unique())
    # Fixed y-range so the scale does not change as the year changes.
    plt.ylim(0, 24000)
    plt.grid(alpha=0.3)

interactive(children=(IntSlider(value=2010, description='year', max=2014, min=2010), Output()), _dom_classes=(…

In [11]:
# Preview the combined users frame, including the account_created_* columns.
users.head(3)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,account_created_year,account_created_month,account_created_day,account_created_weekday
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,2010,6,28,0
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,2011,5,25,2
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,2010,9,28,1


In [20]:
# Columns that together identify one calendar month of sign-ups.
columns = ['account_created_year', 'account_created_month']

# Range slider whose options are every "YYYY-MM" present in `users`; the
# initial selection runs from the first option to the twelfth.
range_slider = widgets.SelectionRangeSlider(
    options=[
        '{}-{:02}'.format(*year_month)
        for year_month in users[columns].groupby(columns).agg(len).index
    ],
    index=(0, 11),
    description='Dates',
    layout={'width': '500px'},
    disabled=False,
)

In [21]:
# Display the slider widget by itself as this cell's output.
range_slider

SelectionRangeSlider(description='Dates', index=(0, 11), layout=Layout(width='500px'), options=('2010-01', '20…

In [23]:
def return_selected_range(start, end):
    """Return the rows of ``users`` created between two months, inclusive.

    Parameters
    ----------
    start, end : str
        Month labels in ``'%Y-%m'`` form (for example ``'2010-01'``). Every
        day of both the start month and the end month is included.

    Returns
    -------
    pandas.DataFrame
        The matching subset of the module-level ``users`` frame.
    """
    start = pd.to_datetime(start, format='%Y-%m')
    # A '%Y-%m' label parses to the first day of that month, so comparing with
    # `<= end` would keep only day 1 of the final month. Use an exclusive upper
    # bound at the start of the following month instead.
    end_exclusive = pd.to_datetime(end, format='%Y-%m') + pd.DateOffset(months=1)

    # Parse the creation dates once instead of once per comparison.
    created = pd.to_datetime(users.date_account_created)
    date_mask = (created >= start) & (created < end_exclusive)
    return users.loc[date_mask]
def print_date_range(date_range):
    """Draw a bar chart of accounts created per month within ``date_range``.

    Parameters
    ----------
    date_range : sequence of two str
        ``(start, end)`` month labels in ``'%Y-%m'`` form; both are passed to
        ``return_selected_range`` to pick the rows to count.
    """
    start, end = date_range[0], date_range[1]

    data = return_selected_range(start, end)
    columns = ['account_created_year', 'account_created_month']
    # `.size()` yields one row count per (year, month) group as a Series.
    # Aggregating `data[columns]` grouped by those same columns leaves no value
    # columns, which can produce a zero-column frame (depending on the pandas
    # version) that cannot serve as bar heights.
    y = data.groupby(columns).size()
    x = ['{}-{:02}'.format(year, month) for (year, month) in y.index]

    plt.figure(figsize = (16, 8))
    plt.bar(x, y.values)
    plt.xticks(rotation = 90)
    plt.show()
    
# Hook the range slider up to the bar chart; the trailing `;` keeps the
# call's return value from being echoed as cell output.
widgets.interact(print_date_range, date_range=range_slider);

interactive(children=(SelectionRangeSlider(description='Dates', index=(0, 56), layout=Layout(width='500px'), o…

In [40]:
# Single-month picker over every "YYYY-MM" present in `users` (reuses the
# `columns` list defined with the range slider); continuous_update is disabled.
month_select = widgets.SelectionSlider(
    options=[
        '{}-{:02}'.format(*year_month)
        for year_month in users[columns].groupby(columns).agg(len).index
    ],
    description='Select Month',
    orientation='horizontal',
    readout=True,
    continuous_update=False,
    disabled=False,
)

In [93]:
def return_new_account(year, month):
    """Count the rows of ``users`` whose account was created in ``year``-``month``.

    Compares against the ``account_created_year`` and
    ``account_created_month`` columns of the module-level ``users`` frame and
    returns the number of matching rows as an ``int``.
    """
    created_in_month = (
        (users.account_created_year == year)
        & (users.account_created_month == month)
    )
    # Summing a boolean mask counts its True entries, i.e. the matching rows.
    return int(created_in_month.sum())

In [94]:
def return_new_booking(year, month):
    """Count the rows of ``users`` whose first booking fell in ``year``-``month``.

    Parses ``users.date_first_booking`` on each call and returns the number of
    rows whose parsed year and month both match, as an ``int``. Rows without a
    booking date never match.
    """
    first_booking = pd.DatetimeIndex(users.date_first_booking)
    in_month = (first_booking.year == year) & (first_booking.month == month)
    # Summing a boolean mask counts its True entries, i.e. the matching rows.
    return int(in_month.sum())

In [97]:
def print_date(date):
    """Print the new-account and new-first-booking counts for one month.

    ``date`` is a ``'YYYY-MM'`` label; it is split on ``'-'`` and both parts
    are converted to ``int`` before being handed to the two counting helpers.
    """
    year, month = (int(part) for part in date.split('-'))

    # Each count is computed immediately before its line is printed.
    reports = (
        ('New Account : ', return_new_account),
        ('New First Booking : ', return_new_booking),
    )
    for label, count_for_month in reports:
        print(label, count_for_month(year, month))

In [98]:
  widgets.interact(
    print_date,
    date = month_select
);

interactive(children=(SelectionSlider(continuous_update=False, description='Select Month', index=37, options=(…