In [5]:
%matplotlib inline
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Folder holding the input CSV files. File names are appended to this string
# directly (DATA_DIR + fname), so the trailing slash is required.
DATA_DIR = '../input/'
# Show which files are available in the data folder.
os.listdir(DATA_DIR)

In [10]:
import time
from contextlib import contextmanager
from functools import lru_cache
# Ask OpenMP-based libraries to use at most 4 threads.
# NOTE(review): numpy/pandas are already imported by an earlier cell; if their
# OpenMP runtime reads this variable at import time the setting arrives too
# late to have any effect -- confirm, or move it above the imports.
os.environ['OMP_NUM_THREADS'] = '4'

@contextmanager
def timer(name):
    """Context manager that prints how long the enclosed block took.

    Parameters
    ----------
    name : str
        Label shown in square brackets at the start of the report line.

    Notes
    -----
    The report is printed from a ``finally`` clause, so the elapsed time is
    shown even when the enclosed block raises; the exception still propagates
    to the caller unchanged.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way a time.time() difference can.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print(f'[{name}] done in {time.perf_counter() - t0:.1f} s')
def reduce_memory(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    * NumPy integer columns are cast to the narrowest of int8/int16/int32
      (when the column contains a negative value) or uint8/uint16/uint32
      (when it does not) whose range covers the column's min and max.
      Columns that do not fit in 32 bits, and empty columns, are left alone.
    * float64 columns are cast to float32.
    * Every other column (object, bool, extension dtypes, ...) is untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    signed_targets = ('int8', 'int16', 'int32')
    unsigned_targets = ('uint8', 'uint16', 'uint32')
    for c in df.columns:
        col = df[c]
        dtype = col.dtype
        # Extension dtypes (nullable ints, categoricals, ...) may hold missing
        # values that a plain NumPy cast cannot represent, so skip them.
        if not isinstance(dtype, np.dtype):
            continue
        if dtype.kind in ('i', 'u'):
            if col.empty:
                continue
            # Compare the true min/max as Python ints. Going through abs()
            # would overflow on the int64 minimum and wrongly report that the
            # column fits into int8.
            lo, hi = int(col.min()), int(col.max())
            targets = signed_targets if lo < 0 else unsigned_targets
            for target in targets:
                bounds = np.iinfo(target)
                if bounds.min <= lo and hi <= bounds.max:
                    df[c] = col.astype(target)
                    break
        elif dtype == 'float64':
            df[c] = col.astype('float32')
    return df
def get_dow(df):
    """Add a ``dow`` (day-of-week) column derived from ``activation_date``.

    Each distinct date value is parsed once and the result is looked up for
    every row. Parsed dates yield pandas' ``dayofweek`` value; the
    placeholder string 'unknown' is encoded as 8. ``df`` is modified in
    place and also returned.
    """
    def _weekday(value):
        if value == 'unknown':
            return 8
        return pd.to_datetime(value).dayofweek

    distinct_dates = df['activation_date'].unique().tolist()
    weekday_of = {value: _weekday(value) for value in distinct_dates}
    df['dow'] = df['activation_date'].map(weekday_of.get)
    return df

In [11]:
# CSV files, relative to DATA_DIR, that get_group_feat loads by default.
df_names = [
    'train.csv', 'train_active.csv',
    'test.csv', 'test_active.csv',
]

def get_group_feat(by, y='once', df_names=df_names):
    """Build a group-sum feature for the rows of train.csv and test.csv.

    The requested columns are read from every file in ``df_names`` and
    stacked, ``y`` is summed within each group of ``by``, and the group
    totals are joined back onto the rows that came from ``train.csv`` or
    ``test.csv`` only. With the default ``y='once'`` every row contributes
    1, so the feature is the number of rows that share the same key.

    Parameters
    ----------
    by : str or list of str
        Column name(s) to group on. Missing key values are replaced by the
        string 'unknown' so they form a group of their own. The key 'image'
        is first reduced to a 0/1 "value present" flag.
    y : str, default 'once'
        Column to sum. 'once' is a synthetic column of ones; 'price' is
        summed as ``log1p(price)`` with missing prices counted as 0. Any
        other column is summed as read.
    df_names : list of str
        File names relative to ``DATA_DIR``. Replaced by
        ``['train.csv', 'test.csv']`` when 'image' or 'image_top_1' is a key.

    Returns
    -------
    (pandas.Series, str)
        The feature values, one per train/test row in load order with a
        fresh 0..n-1 index, and the feature name: ``<key>_cnt`` for a single
        key, ``P_<k1>X<k2>..._cnt`` for several.
    """
    print('Working on', by)
    if not isinstance(by, list):
        by = [by]
    usecols = by if y == 'once' else by + [y]
    if 'image_top_1' in by or 'image' in by:
        # Presumably the *_active files do not carry the image columns --
        # confirm against the data.
        df_names = ['train.csv', 'test.csv']

    parts = []
    for fname in df_names:
        print('loading', fname)
        part = pd.read_csv(DATA_DIR+fname, usecols=usecols)

        # Everything that needs to see real NaNs has to run BEFORE the keys
        # are filled. Filling the whole frame first would make the image flag
        # constantly 1 and turn a price column with gaps into strings.
        if 'image' in by:
            part['image'] = part['image'].notnull().astype('uint8')
        if y == 'price':
            part[y] = np.log1p(part[y].fillna(0)).astype('float32')
        elif y == 'once':
            part[y] = np.ones(len(part), dtype='int64')

        # Only the grouping keys receive the 'unknown' placeholder.
        part = part.fillna({key: 'unknown' for key in by})

        # eval == 1 marks the rows the returned feature is produced for.
        part['eval'] = int(fname in ('train.csv', 'test.csv'))
        part['eval'] = part['eval'].astype('uint8')

        if 'activation_date' in by:
            # NOTE(review): the 'dow' column added here is not part of `by`,
            # so it does not influence the result -- confirm whether grouping
            # on it was intended.
            part = get_dow(part)
        parts.append(part)

    with timer('concating'):
        stacked = pd.concat(parts, ignore_index=True)
    del parts
    gc.collect()

    if len(by) > 1:
        p_colname = 'P_' + 'X'.join(by)
    else:
        p_colname = by[0]
    featname = p_colname + '_cnt'

    with timer('summing'):
        totals = (stacked.groupby(by, sort=False)[y]
                  .sum()
                  .rename(featname)
                  .reset_index())

    # A left join keeps one output row per train/test row, in original order.
    eval_rows = stacked[stacked['eval'] == 1]
    feat = eval_rows.merge(totals, on=by, how='left')[featname]
    feat = feat.reset_index(drop=True)
    gc.collect()
    return feat, featname

In [12]:
# Columns to encode, one feature per entry.
# Currently disabled candidates: item_id, title, description, price,
# activation_date, deal_probability.
by_li = [
    'user_id',
    'region', 'city',
    'parent_category_name', 'category_name',
    'param_1', 'param_2', 'param_3',
    'item_seq_number',
    'user_type',
    'image', 'image_top_1',
]
# by_li = []

# by_li += [['region', 'parent_category_name']]
# by_li += [['region', 'category_name']]
# by_li += [['region', 'param_1']]
# by_li += [['region', 'param_2']]
# by_li += [['region', 'param_3']]
# by_li += [['region', 'image_top_1']]
# by_li += [['region', 'user_type', 'parent_category_name']]
# by_li += [['region', 'user_type', 'category_name']]
# by_li += [['region', 'user_type', 'param_1']]
# by_li += [['region', 'user_type', 'param_2']]
# by_li += [['region', 'user_type', 'param_3']]
# by_li += [['region', 'user_type', 'image_top_1']]

In [13]:
# Encode every entry of by_li and collect the resulting columns.
feat_cols = {}
name_li = []
for idx, by in enumerate(by_li):
    gc.collect()
    print('No.', idx+1, 'out of', len(by_li))
    feat, name = get_group_feat(by)
    if isinstance(by, str):
        # Single-column encodings are stored under the raw column name.
        name = by
    name_li.append(name)
    feat_cols[name] = feat.values
print('all feature names', name_li)

# Build the frame once, with the columns in the order they were computed.
df = pd.DataFrame(feat_cols, columns=name_li)
df = reduce_memory(df)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
df.info()
df.head().T

In [14]:
# (rows, columns) of the encoded feature frame.
df.shape

In [41]:
# Write the encoded features to disk; index=False leaves the row index out of the file.
df.to_csv('count_enc.csv', index=False)