In [2]:
%matplotlib inline
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Folder that every read_csv call in this notebook prefixes to its file name.
DATA_DIR = '../input/'
# Presumably the name of the prediction target column; it is not referenced
# in the cells visible here.
target_col = 'deal_probability'
# Last expression of the cell: displays the files available under DATA_DIR.
os.listdir(DATA_DIR)

In [3]:
import time
from contextlib import contextmanager
from functools import lru_cache
# Ask OpenMP-backed libraries to use 4 threads.
# NOTE(review): numpy and pandas are already imported by the previous cell;
# libraries that read this variable only when they are loaded may not pick
# the value up — confirm it takes effect, or set it before those imports.
os.environ['OMP_NUM_THREADS'] = '4'

@contextmanager
def timer(name):
    """Context manager that reports how long the wrapped block took.

    On normal exit of the ``with`` body it prints ``[<name>] done in <secs> s``
    with the elapsed wall-clock time rounded to one decimal. If the body
    raises, the exception propagates and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print('[{}] done in {:.1f} s'.format(name, elapsed))

def reduce_memory(df):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Integer columns (any numpy signed/unsigned width, not only the platform
    default ``int``) are narrowed to the smallest of 8/16/32 bits that holds
    their values: unsigned types when the column has no negative value,
    signed types otherwise. ``float64`` columns become ``float32``, which
    lowers precision. Columns of any other dtype, empty integer columns and
    integer columns that do not fit in 32 bits are left untouched. A column
    is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    signed_steps = ((2**7, 'int8'), (2**15, 'int16'), (2**31, 'int32'))
    unsigned_steps = ((2**8, 'uint8'), (2**16, 'uint16'), (2**32, 'uint32'))

    for c in df.columns:
        col = df[c]
        dtype = col.dtype

        # Plain numpy integer dtypes only; nullable extension dtypes are skipped.
        if isinstance(dtype, np.dtype) and dtype.kind in 'iu':
            if len(col) == 0:
                continue
            # Python ints cannot overflow, unlike abs() on a fixed-width column
            # that contains the most negative representable value.
            lo, hi = int(col.min()), int(col.max())
            if lo < 0:
                bound, steps = max(-lo, hi), signed_steps
            else:
                bound, steps = hi, unsigned_steps
            target = next((name for limit, name in steps if bound < limit), None)
            # Only cast when it actually saves memory.
            if target is not None and np.dtype(target).itemsize < dtype.itemsize:
                df[c] = col.astype(target)
        elif dtype == 'float64':
            df[c] = col.astype('float32')
    return df

def compare_set(a, b):
    """Print, on one line, the size of ``a``, of ``b`` and of their intersection."""
    sizes = [len(group) for group in (a, b, a & b)]
    print(*sizes)
    
# Cache of raw date value -> day of year, shared across all calls.
date_map = {}
# Separate cache of raw date value -> parsed timestamp, so that the two
# output modes of get_datetime can never hand each other's values back.
_datetime_map = {}


def to_datetime(x):
    """Parse a single raw date value with ``pd.to_datetime``."""
    return pd.to_datetime(x)


def get_datetime(df, date_col, to_dayofyear=True):
    """Replace ``df[date_col]`` with parsed dates, parsing each distinct value once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    date_col : str
        Name of the column with raw date values.
    to_dayofyear : bool, default True
        If True the column is replaced by the ``dayofyear`` attribute of the
        parsed value, otherwise by the parsed value itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    Parsed values are memoised in module-level dictionaries, one per output
    mode. Values whose lookup returns nothing come out as missing.
    """
    cache = date_map if to_dayofyear else _datetime_map
    for raw in df[date_col].unique().tolist():
        if raw not in cache:
            parsed = to_datetime(raw)
            cache[raw] = parsed.dayofyear if to_dayofyear else parsed
    df[date_col] = df[date_col].map(cache.get)
    return df
def fillna_date(df, na_col='activation_date', diff_col='date_from'):
    """Fill missing ``na_col`` values from ``diff_col`` using the median gap.

    The gap ``diff_col - na_col`` is measured on the rows where ``na_col`` is
    present; its median, truncated to an int, is printed and then subtracted
    from ``diff_col`` on the rows where ``na_col`` is missing. Finally
    ``na_col`` is cast to ``int``. ``df`` is modified in place and returned.
    """
    missing = df[na_col].isnull()
    known_rows = df.loc[~missing]
    median_gap = int((known_rows[diff_col] - known_rows[na_col]).median())
    print('median diff days', median_gap)
    df.loc[missing, na_col] = df.loc[missing, diff_col] - median_gap
    df[na_col] = df[na_col].astype('int')
    return df
def get_eval_user_set():
    """Return the set of every ``user_id`` found in train.csv or test.csv.

    Only the ``user_id`` column of each file under ``DATA_DIR`` is read; each
    read is timed with ``timer``. The size of the resulting set is printed.
    """
    sources = (('Loading train', 'train.csv'), ('Loading test', 'test.csv'))
    frames = []
    for label, file_name in sources:
        with timer(label):
            frames.append(pd.read_csv(DATA_DIR+file_name, usecols=['user_id']))
    users = set()
    for frame in frames:
        users.update(frame['user_id'].values)
    print(len(users), 'in total')
    return users
def get_common_active_item_set(train_active_comm, test_active_comm):
    """Return the union of the ``item_id`` values of both frames.

    The number of distinct ids is printed before the set is returned.
    """
    train_items = set(train_active_comm['item_id'].values)
    items = train_items.union(test_active_comm['item_id'].values)
    print(len(items), 'in total')
    return items

In [4]:
# Columns read from the two periods files.
periods_cols = ['item_id', 'activation_date', 'date_from', 'date_to']
train_periods = pd.read_csv(DATA_DIR+'periods_train.csv', usecols=periods_cols)
test_periods = pd.read_csv(DATA_DIR+'periods_test.csv', usecols=periods_cols)

# Every column except the item_id key is run through get_datetime, which
# replaces it with day-of-year values (its default mode).
date_cols = periods_cols[1:]
with timer('Loading train periods date'):
    for col in date_cols:
        train_periods = get_datetime(train_periods, col)
with timer('Loading test periods date'):
    for col in date_cols:
        test_periods = get_datetime(test_periods, col)

In [5]:
# Day-of-year thresholds (74 and 102) derived from the two reference dates
# instead of being typed in by hand.
train_start_doy = pd.to_datetime('2017-03-15').dayofyear
test_start_doy = pd.to_datetime('2017-04-12').dayofyear
# How many period rows were activated before each reference day, next to
# the total row count.
print((train_periods['activation_date'] < train_start_doy).sum(), len(train_periods))
print((test_periods['activation_date'] < test_start_doy).sum(), len(test_periods))

In [6]:
# Side-by-side histograms of (date_from - activation_date): train on the
# left, test on the right.
f = plt.figure(figsize=(12, 4))
ax_train = f.add_subplot(1, 2, 1)
train_gap = train_periods['date_from'] - train_periods['activation_date']
train_gap.hist(ax=ax_train)
ax_test = f.add_subplot(1, 2, 2)
test_gap = test_periods['date_from'] - test_periods['activation_date']
test_gap.hist(ax=ax_test)
plt.show()

In [7]:
# Fill missing activation_date values in both periods frames (train first,
# then test), using fillna_date's default columns.
train_periods, test_periods = (
    fillna_date(periods) for periods in (train_periods, test_periods)
)

In [8]:
# Downcast the numeric columns of both periods frames (train first, then test).
train_periods, test_periods = (
    reduce_memory(periods) for periods in (train_periods, test_periods)
)

In [9]:
# Set of every user_id present in train.csv or test.csv; used below to
# filter the *_active frames.
eval_users = get_eval_user_set()

In [10]:
def get_common_active(df_active, eval_users):
    """Keep only the rows of ``df_active`` whose ``user_id`` is in ``eval_users``.

    Parameters
    ----------
    df_active : pandas.DataFrame
        Frame with a ``user_id`` column. It is not modified.
    eval_users : set
        User ids to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Its columns are ``user_id``,
        then ``index`` (the row's label in ``df_active``), then the remaining
        input columns in their original order. Rows keep their original
        relative order, so the result is reproducible across runs. The frame
        is empty if no user matches.

    The shapes before and after filtering are printed.
    """
    print('All shape', df_active.shape)
    keep = df_active['user_id'].isin(eval_users)
    # reset_index() turns the original row labels into the 'index' column.
    df_active = df_active.loc[keep].reset_index()
    ordered_cols = ['user_id'] + [c for c in df_active.columns if c != 'user_id']
    df_active = df_active[ordered_cols]
    print('Common only shape', df_active.shape)
    return df_active
def get_common_period(df_period, active_items):
    """Keep only the rows of ``df_period`` whose ``item_id`` is in ``active_items``.

    Parameters
    ----------
    df_period : pandas.DataFrame
        Frame with an ``item_id`` column. It is not modified.
    active_items : set
        Item ids to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Its columns are ``item_id``,
        then ``index`` (the row's label in ``df_period``), then the remaining
        input columns in their original order. Rows keep their original
        relative order, so the result is reproducible across runs. The frame
        is empty if no item matches.

    The shapes before and after filtering are printed.
    """
    print('All shape', df_period.shape)
    keep = df_period['item_id'].isin(active_items)
    # reset_index() turns the original row labels into the 'index' column.
    df_period = df_period.loc[keep].reset_index()
    ordered_cols = ['item_id'] + [c for c in df_period.columns if c != 'item_id']
    df_period = df_period[ordered_cols]
    print('Common only shape', df_period.shape)
    return df_period

In [11]:
# The same three columns are read from all four files.
id_date_cols = ['user_id', 'item_id', 'activation_date']

with timer('Loading train_active'):
    train_active = pd.read_csv(DATA_DIR+'train_active.csv', usecols=id_date_cols)
with timer('Loading test_active'):
    test_active = pd.read_csv(DATA_DIR+'test_active.csv', usecols=id_date_cols)

# For train/test the date conversion is done inside the timed block as well.
with timer('Loading train'):
    train = get_datetime(
        pd.read_csv(DATA_DIR+'train.csv', usecols=id_date_cols),
        'activation_date')
with timer('Loading test'):
    test = get_datetime(
        pd.read_csv(DATA_DIR+'test.csv', usecols=id_date_cols),
        'activation_date')

In [12]:
# Restrict each *_active frame to the users in eval_users, then convert its
# activation_date column with get_datetime.
with timer('Getting common users and day of year for train_active'):
    train_active = get_datetime(
        get_common_active(train_active, eval_users), 'activation_date')
with timer('Getting common users and day of year for test_active'):
    test_active = get_datetime(
        get_common_active(test_active, eval_users), 'activation_date')

In [13]:
# Union of the item_id values of both filtered *_active frames; used below
# to filter the periods frames.
active_items = get_common_active_item_set(train_active, test_active)

In [14]:
# Keep only the period rows whose item_id is in active_items (see
# get_common_period); each frame is filtered and timed separately.
with timer('Getting common [items] train period'):
    train_periods = get_common_period(train_periods, active_items)
with timer('Getting common [items] test period'):
    test_periods = get_common_period(test_periods, active_items)

In [15]:
# Frames that carry an 'index' column, paired position-wise with the .npy
# file that column is written to.
idx_to_save = [train_active, test_active, train_periods, test_periods]
idx_save_path = [
    'train_active_raw_index.npy',
    'test_active_raw_index.npy',
    'train_periods_raw_index.npy',
    'test_periods_raw_index.npy',
]
for path, df in zip(idx_save_path, idx_to_save):
    df = reduce_memory(df)
    idx = df['index'].values
    np.save(path, idx)
    # The column is dropped from the frame once it has been written, so this
    # cell cannot be re-run without rebuilding the frames first.
    del df['index']
    gc.collect()
    print(path, 'saved')

In [None]:
# Report the day-of-year of the two reference dates.
for split_name, first_day in (('train', '2017-03-15'), ('test', '2017-04-12')):
    print(split_name + ' activation date start from No.',
          pd.to_datetime(first_day).dayofyear,
          'day of year')

# One panel per date column, each overlaying train_periods and test_periods.
f = plt.figure(figsize=(18, 12))
x_range = np.arange(0, 125, 5)
for panel, col in enumerate(('activation_date', 'date_from', 'date_to'), start=1):
    f.add_subplot(3, 1, panel)
    for periods in (train_periods, test_periods):
        sns.distplot(periods[col]).set_xticks(x_range)
    plt.legend(['train_periods', 'test_periods'], loc=1)
    plt.grid()

In [None]:
# activation_date distribution of train_active and test_active on shared axes.
f = plt.figure(figsize=(18, 4))
x_range = np.arange(0, 125, 5)
for active in (train_active, test_active):
    sns.distplot(active['activation_date']).set_xticks(x_range)
plt.legend(['train_active', 'test_active'], loc=1)
plt.grid()

In [16]:
# activation_date distribution of train and test on shared axes.
f = plt.figure(figsize=(18, 4))
x_range = np.arange(0, 125, 5)
for labelled in (train, test):
    sns.distplot(labelled['activation_date']).set_xticks(x_range)
plt.legend(['train', 'test'], loc=1)
plt.grid()

In [17]:
def get_unique_stats(grp_size):
    """Tabulate the distinct values of ``grp_size`` and how often each occurs.

    Returns a DataFrame with one row per distinct value, sorted by value, and
    two columns: ``unique`` (the value) and ``unique_counts`` (its number of
    occurrences).
    """
    values, counts = np.unique(grp_size, return_counts=True)
    table = np.array((values, counts)).transpose()
    return pd.DataFrame(table, columns=['unique', 'unique_counts'])

In [18]:
# Group the train period rows by item_id.
train_periods_item_grp = train_periods.groupby('item_id')
with timer('Getting groups [items] train period'):
    # Number of period rows per item_id.
    train_periods_item_size = train_periods_item_grp.size()
# NOTE: if the cells ran top to bottom, train_periods was already filtered by
# get_common_period, so this is the filtered shape.
print('original shape\n', train_periods.shape)
# Table of distinct rows-per-item counts and how many items have each count.
train_periods_item_uniqstats = get_unique_stats(train_periods_item_size)
# Cell output: summary statistics of that table, one row per column.
train_periods_item_uniqstats.describe().T

In [19]:
# Group the test period rows by item_id.
test_periods_item_grp = test_periods.groupby('item_id')
with timer('Getting groups [items] test period'):
    # Number of period rows per item_id.
    test_periods_item_size = test_periods_item_grp.size()
# NOTE: if the cells ran top to bottom, test_periods was already filtered by
# get_common_period, so this is the filtered shape.
print('original shape\n', test_periods.shape)
# Table of distinct rows-per-item counts and how many items have each count.
test_periods_item_uniqstats = get_unique_stats(test_periods_item_size)
# Cell output: summary statistics of that table, one row per column.
test_periods_item_uniqstats.describe().T

In [20]:
# Group the train_active rows by user_id.
train_active_user_grp = train_active.groupby('user_id')
with timer('Getting groups [users] train active'):
    # Number of rows per user_id.
    train_active_user_size = train_active_user_grp.size()
# NOTE: if the cells ran top to bottom, train_active was already filtered by
# get_common_active, so this is the filtered shape.
print('original shape\n', train_active.shape)
# Table of distinct rows-per-user counts and how many users have each count.
train_active_user_uniqstats = get_unique_stats(train_active_user_size)
# Cell output: summary statistics of that table, one row per column.
train_active_user_uniqstats.describe().T

In [21]:
# Group the test_active rows by user_id.
test_active_user_grp = test_active.groupby('user_id')
with timer('Getting groups [users] test active'):
    # Number of rows per user_id.
    test_active_user_size = test_active_user_grp.size()
# NOTE: if the cells ran top to bottom, test_active was already filtered by
# get_common_active, so this is the filtered shape.
print('original shape\n', test_active.shape)
# Table of distinct rows-per-user counts and how many users have each count.
test_active_user_uniqstats = get_unique_stats(test_active_user_size)
# Cell output: summary statistics of that table, one row per column.
test_active_user_uniqstats.describe().T

In [22]:
# Group the train rows by user_id.
train_user_grp = train.groupby('user_id')
with timer('Getting groups [users] train'):
    # Number of rows per user_id.
    train_user_size = train_user_grp.size()
print('original shape\n', train.shape)
# Table of distinct rows-per-user counts and how many users have each count.
train_user_uniqstats = get_unique_stats(train_user_size)
# Cell output: summary statistics of that table, one row per column.
train_user_uniqstats.describe().T

In [23]:
# Group the test rows by user_id.
test_user_grp = test.groupby('user_id')
with timer('Getting groups [users] test'):
    # Number of rows per user_id.
    test_user_size = test_user_grp.size()
print('original shape\n', test.shape)
# Table of distinct rows-per-user counts and how many users have each count.
test_user_uniqstats = get_unique_stats(test_user_size)
# Cell output: summary statistics of that table, one row per column.
test_user_uniqstats.describe().T

In [27]:
# Turn each group-size Series into a two-column frame: the grouping key plus
# a named size column. The names are rebound, so this cell is meant to run
# once per kernel session.
train_user_size = train_user_size.reset_index(name='user_id_size')
test_user_size = test_user_size.reset_index(name='user_id_size')
train_active_user_size = train_active_user_size.reset_index(name='user_id_size')
test_active_user_size = test_active_user_size.reset_index(name='user_id_size')
train_periods_item_size = train_periods_item_size.reset_index(name='item_id_size')
test_periods_item_size = test_periods_item_size.reset_index(name='item_id_size')

In [43]:
# Attach the per-user row count to every *_active row ...
train_active, test_active = (
    active.merge(user_sizes, how='left', on='user_id')
    for active, user_sizes in (
        (train_active, train_active_user_size),
        (test_active, test_active_user_size),
    )
)
# ... and the per-item row count to every periods row.
train_periods, test_periods = (
    periods.merge(item_sizes, how='left', on='item_id')
    for periods, item_sizes in (
        (train_periods, train_periods_item_size),
        (test_periods, test_periods_item_size),
    )
)

In [49]:
# Frames to export, paired position-wise with their CSV file names.
dfs_to_save = [train_active, test_active, train_periods, test_periods]
dfs_save_path = [
    'train_active_ids_date.csv',
    'test_active_ids_date.csv',
    'train_periods.csv',
    'test_periods.csv',
]
for path, df in zip(dfs_save_path, dfs_to_save):
    # Downcast first, then write the frame without its row index.
    df = reduce_memory(df)
    df.to_csv(path, index=False)
    print(path, 'saved')