In [8]:
%matplotlib inline
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Path prefix that file names are appended to when the CSVs are read;
# it is concatenated as a plain string, so the trailing slash is required.
DATA_DIR = '../input/'
# Last expression of the cell: shows which files are present in DATA_DIR.
os.listdir(DATA_DIR)

In [9]:
import time
from contextlib import contextmanager
from functools import lru_cache
# Request at most 4 OpenMP threads via the environment.
# NOTE(review): numpy/pandas were already imported in an earlier cell;
# threading runtimes typically read this variable when they initialise,
# so setting it here may come too late to take effect — confirm.
os.environ['OMP_NUM_THREADS'] = '4'

@contextmanager
def timer(name):
    """Context manager that reports the wall-clock duration of its body.

    Prints ``[<name>] done in <seconds> s`` (one decimal place) once the
    ``with`` body completes. If the body raises, the exception propagates
    and nothing is printed.

    Parameters
    ----------
    name : str
        Label shown between the square brackets of the message.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.1f} s')

In [10]:
# CSV files (relative to DATA_DIR) pooled when computing group statistics.
df_names = ['train.csv',
            'train_active.csv',
            'test.csv',
            'test_active.csv'
            ]

def get_group_feat(by, y='price', mode='ratio', df_names=df_names):
    """Encode ``y`` relative to the other rows that share the same ``by`` key.

    The requested columns are read from every file in ``df_names`` (under
    ``DATA_DIR``) and concatenated, so group statistics are computed over all
    loaded files. Only the rows that came from ``train.csv`` / ``test.csv``
    are returned, in load order, with a fresh 0..n-1 index.

    Parameters
    ----------
    by : str or list of str
        Column name(s) to group on. When ``'image'`` is one of them, that
        column is replaced by a 0/1 "value present" flag before grouping.
        When ``'image'`` or ``'image_top_1'`` is requested, ``df_names`` is
        overridden with ``['train.csv', 'test.csv']``.
    y : str, default 'price'
        Column to encode. When it is ``'price'``, missing values are filled
        with 0 and the column is transformed with ``log1p`` (float32).
    mode : {'ratio', 'rank', 'zscore'}, default 'ratio'
        * ``'ratio'``  - ``(y + 1e-12) / (group max + 1e-12)``
        * ``'rank'``   - percentile rank within the group
        * ``'zscore'`` - ``(y - group mean) / group std``
    df_names : list of str
        File names to load.

    Returns
    -------
    feat : pandas.DataFrame or pandas.Series
        Single-column DataFrame for ``mode='rank'``, Series for the other
        modes; named ``featname`` in both cases.
    featname : str
        ``'<by>_prk'`` for a single key, ``'P_<a>X<b>..._prk'`` for several.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values (checked before any
        file is read).
    """
    if mode not in ('rank', 'ratio', 'zscore'):
        raise ValueError(
            "mode must be 'rank', 'ratio' or 'zscore', got %r" % (mode,))

    print('Working on', by)
    if not isinstance(by, list):
        by = [by]
    usecols = by + [y]
    if 'image_top_1' in by or 'image' in by:
        df_names = ['train.csv', 'test.csv']

    frames = []
    for fname in df_names:
        print('loading', fname)
        part = pd.read_csv(DATA_DIR + fname, usecols=usecols)
        if y == 'price':
            part[y] = np.log1p(part[y].fillna(0)).astype('float32')
        # eval == 1 marks the rows that are returned to the caller; rows from
        # any other file only contribute to the group statistics.
        is_eval = fname in ('train.csv', 'test.csv')
        part['eval'] = np.full(len(part), is_eval, dtype='uint8')
        if 'image' in by:
            part['image'] = part['image'].notnull().astype('uint8')
        frames.append(part)

    with timer('concating'):
        df = pd.concat(frames, ignore_index=True)
    del frames
    gc.collect()

    p_colname = 'P_' + 'X'.join(by) if len(by) > 1 else by[0]
    featname = p_colname + '_prk'

    # The group-wise results below all keep df's own index, which is what the
    # boolean `eval` mask further down relies on. (GroupBy.apply does not
    # guarantee that: on recent pandas it prepends the group keys.)
    # NOTE(review): rows whose grouping key is missing belong to no group and
    # are expected to come out as NaN — confirm on the pandas version in use.
    grp = df.groupby(by, sort=False)[y]
    if mode == 'rank':
        with timer('ranking'):
            feat = grp.rank(na_option='top', ascending=True, pct=True).to_frame()
        feat.columns = [featname]
    elif mode == 'ratio':
        with timer('calculating ratio'):
            # The epsilon on both sides keeps an all-zero group at ratio 1
            # instead of 0/0.
            feat = (df[y] + 1e-12) / (grp.transform('max') + 1e-12)
        feat.name = featname
    else:  # mode == 'zscore'
        with timer('calculating zscore'):
            feat = (df[y] - grp.transform('mean')) / grp.transform('std')
        feat.name = featname

    feat = feat[df['eval'] == 1].reset_index(drop=True)
    gc.collect()
    return feat, featname

In [None]:
# Grouping keys to encode; each one becomes a column of the output.
# Not encoded (commented out in the original list): item_id, title,
# description, price, item_seq_number, deal_probability.
by_li = [
    'user_id',
    'region',
    'city',
    'parent_category_name',
    'category_name',
    'param_1',
    'param_2',
    'param_3',
    'activation_date',
    'user_type',
    'image',
    'image_top_1',
]

In [None]:
# Build the encoded table one column at a time. get_group_feat is called
# with its defaults (y='price', mode='ratio'); each result is stored under
# the raw grouping-key name rather than the returned feature name.
df = pd.DataFrame()
n_keys = len(by_li)
for position, by in enumerate(by_li, start=1):
    gc.collect()
    print('No.', position, 'out of', n_keys)
    encoded, _featname = get_group_feat(by)
    df[by] = encoded.values
# Enforce the column order of by_li, then preview the first rows.
df = df[by_li]
df.head()

In [None]:
# Sanity check: (number of rows, number of encoded columns) of the table.
df.shape

In [None]:
# Write the encoded columns to the working directory without the row index;
# the header uses the grouping-key names the columns were stored under.
df.to_csv('price_ratio_enc.csv', index=False)