In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Directory holding the train.csv / test.csv files read later in the notebook.
DATA_DIR = '../input/avito-demand-prediction/'
# CSV from which the per-row `eval_set` column is read later in the notebook.
textdata_path = '../input/adp-prepare-kfold-text/textdata.csv'
# Name of the target column loaded from train.csv.
target_col = 'deal_probability'
# Show what is available in the input directory.
os.listdir(DATA_DIR)

In [2]:
# External per-(city, region) geo features; the three id/helper columns are not needed
# downstream, so they are dropped right after loading.
geo_detail_path = '../input/region-and-city-details-with-lat-lon-and-clusters/avito_region_city_features.csv'
geo_detail = pd.read_csv(geo_detail_path).drop(
    ['city_region', 'region_id', 'city_region_id'], axis=1)
geo_detail.head(2).T

In [3]:
# Precomputed feature table that is later joined onto train/test by `user_id`.
active_period_feats = pd.read_csv('../input/adp-active-user-feats/active_period_feats.csv')
active_period_feats.head()

In [4]:
# List the columns available in the active-period feature table.
active_period_feats.columns.tolist()

In [5]:
# Active-period feature columns that are carried into the final exported table.
act_feat_cols = [
    'avg_days_from_act_user',
    'avg_days_up_user',
    'avg_days_up_sum_user',
    'avg_times_up_user',
    'n_user_items',
]

In [6]:
# Raw columns to load from train.csv / test.csv ('item_id' is intentionally left out).
usecols = ['user_id', #'item_id',
           'region', 'city', 'parent_category_name', 'category_name',
           'param_1', 'param_2', 'param_3',
           'activation_date',
           'title', 'description',
           'price', 'item_seq_number',
           'user_type',
           'image_top_1', 'image']

# Rows whose eval_set differs from 10 are counted as training rows.
# NOTE(review): presumably 10 marks the test rows in textdata.csv - confirm against
# the notebook that produced that file.
eval_sets = pd.read_csv(textdata_path, usecols=['eval_set'])['eval_set'].values
train_num = (eval_sets!=10).sum()
eval_sets = eval_sets[:train_num]

train = pd.read_csv(DATA_DIR+'train.csv', usecols=usecols+[target_col])
test = pd.read_csv(DATA_DIR+'test.csv', usecols=usecols)

# Attach the per-user features. validate='many_to_one' makes pandas raise a MergeError
# if the feature table contains a user_id more than once; without it such duplicates
# would silently multiply train/test rows and break the expected row counts.
train = train.merge(active_period_feats, on='user_id', how='left', validate='many_to_one')
test = test.merge(active_period_feats, on='user_id', how='left', validate='many_to_one')
del active_period_feats; gc.collect()

In [7]:
def get_dow(df):
    """Add a ``dow`` column with the day of week of ``activation_date``.

    Each distinct ``activation_date`` value is parsed only once and the result is
    looked up per row, instead of parsing every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activation_date`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``dow`` column
        (``pandas.Timestamp.dayofweek``, Monday == 0).
    """
    dow_by_date = {}
    for raw_date in df['activation_date'].unique().tolist():
        dow_by_date[raw_date] = pd.to_datetime(raw_date).dayofweek
    df['dow'] = df['activation_date'].map(dow_by_date.get)
    return df
# Derive the day-of-week feature for both splits; the raw date column is dropped afterwards.
train, test = get_dow(train), get_dow(test)
for frame in (train, test):
    del frame['activation_date']
gc.collect()

In [8]:
# (number of user_ids present in both train and test, number of distinct user_ids in test)
len(set(train['user_id'].values.tolist()) & set(test['user_id'].values.tolist())), len(set(test['user_id'].values.tolist()))

In [9]:
# Flag users that occur in both train and test: user_common is 1 for them, 0 otherwise.
# Series.isin tests membership directly on the column, so the frames no longer have to
# be indexed by user_id, assigned through label-based .loc, and reset afterwards.
common_indexes = set(train['user_id'].values.tolist()) & set(test['user_id'].values.tolist())
train['user_common'] = train['user_id'].isin(common_indexes).astype('int64')
test['user_common'] = test['user_id'].isin(common_indexes).astype('int64')
del common_indexes; gc.collect()

In [10]:
# user_id_common keeps the user_id for users flagged by user_common and collapses
# every other user into the single placeholder 'unknown'.
for frame in (train, test):
    frame['user_id_common'] = frame['user_id'].where(frame['user_common'] != 0, 'unknown')

In [11]:
# Transposed preview of the first two training rows.
train.head(2).T

In [12]:
# The training row count derived from textdata.csv must match the loaded train frame.
# Enforce it instead of only displaying it, so a mismatch stops the run here.
assert train_num == len(train), (
    'train has %d rows but textdata.csv marks %d training rows' % (len(train), train_num))
train_num == len(train)

In [13]:
# Split the target off the training frame, remember how many leading rows of the
# combined frame are training rows, then stack train on top of test.
y = train.pop(target_col).values
gc.collect()
train_num = len(train)
df = pd.concat([train, test], ignore_index=True)
del train, test; gc.collect()

In [14]:
# Look up the geo features for every row of df by (city, region).
# validate='many_to_one' makes pandas raise a MergeError if geo_detail holds the same
# (city, region) pair more than once; such duplicates would otherwise add rows and
# leave `geo` no longer row-aligned with df.
geo = df[['city', 'region']].merge(geo_detail, on=['city', 'region'], how='left',
                                   validate='many_to_one')
geo.head(1).T

In [15]:
# List the columns of the merged geo frame.
geo.columns.tolist()

In [16]:
# Keep only the coordinate and cluster columns and write them out as their own file.
geo_cols = [
    'latitude',
    'longitude',
    'lat_lon_hdbscan_cluster_05_03',
    'lat_lon_hdbscan_cluster_10_03',
    'lat_lon_hdbscan_cluster_20_03',
]
geo = geo[geo_cols]
geo.to_csv('geo_detail.csv', index=False)

In [17]:
# Number of rows with a missing `image` value.
df['image'].isnull().sum()

In [18]:
# Replace the raw `image` value with a 0/1 flag: 1 when a value is present, 0 when missing.
df['image'] = df['image'].notnull().astype('int8')

In [19]:
# Drop the user_common flag from df; the trailing ';' suppresses the gc.collect() output.
del df['user_common']; gc.collect();

In [20]:
# Number of rows with a missing `image_top_1` value.
df['image_top_1'].isnull().sum()

In [21]:
# (min, max) of `image_top_1`, taken before the column is label-encoded.
df['image_top_1'].min(), df['image_top_1'].max()

In [22]:
# Transposed preview of the first three rows of the combined frame.
df.head(3).T

In [23]:
# Columns that are replaced in place by integer codes.
enc_cols = [
    'user_id', 'user_id_common',
    'region', 'city', 'parent_category_name', 'category_name',
    'param_1', 'param_2', 'param_3',
    'user_type',
    'image_top_1',
]

# Disabled experiment, kept for reference: build concatenated pair columns and
# label-encode them together with the columns above.
# pair_cols = [('image_top_1', 'city'),
#              ('image_top_1', 'region'),
#              ('image_top_1', 'param_1'),
#              ('city', 'region'),
#              ('city', 'param_1'),
#              ('region', 'param_1')]

# for i, pair in enumerate(pair_cols):
#     print('column pairing', i, pair)
#     p_colname = 'P_'+'X'.join(pair)
#     df[p_colname] = ''
#     for p in pair:
#         df[p_colname] += df[p].fillna('unknown').astype(str)
#     enc_cols.append(p_colname)

# Only the disabled export at the end of the loop would fill this dict.
enc_dict = {}
for i, c in enumerate(enc_cols):
    print('label encoding', i, c)
    # Missing values become the category 'unknown' before factorizing, so they get
    # a regular code of their own; only the integer codes are kept.
    df[c] = pd.factorize(df[c].fillna('unknown'))[0]
    #enc_dict[c] = pd.DataFrame(names.values, columns=['lbe'])
    #enc_dict[c].to_csv(c+'_enc.csv', index=False)

In [24]:
def _log_bin(values, n_bins):
    """Cut log1p(values) into n_bins bins labelled 0..n_bins-1; unbinned entries become -1."""
    binned = pd.cut(np.log1p(values), n_bins, labels=np.arange(n_bins))
    # Go through float first so missing bins can be filled with -1 before the int cast.
    return binned.astype('float').fillna(-1).astype('int')


df['price_bin'] = _log_bin(df['price'], 256)
df['item_seq_bin'] = _log_bin(df['item_seq_number'], 512)

In [25]:
# Transposed preview of the first three rows of df.
df.head(3).T

In [26]:
# The raw text columns are not part of the exported table; drop them, then show
# the frame's dtypes and memory usage.
for text_col in ('title', 'description'):
    del df[text_col]
gc.collect();
df.info()

In [27]:
def reduce_memory(df):
    """Downcast default-width integer columns to the narrowest dtype that fits.

    Only columns whose dtype compares equal to ``'int'`` are touched. A column
    with a negative minimum is matched against the signed ladder using its largest
    absolute value; any other column is matched against the unsigned ladder using
    its maximum. Columns that fit none of the candidates are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # (exclusive upper bound, target dtype), narrowest first.
    signed_ladder = ((2**7, 'int8'), (2**15, 'int16'), (2**31, 'int32'))
    unsigned_ladder = ((2**8, 'uint8'), (2**16, 'uint16'), (2**32, 'uint32'))

    for name in df.columns:
        column = df[name]
        if not (column.dtype == 'int'):
            continue
        if column.min() < 0:
            magnitude, ladder = column.abs().max(), signed_ladder
        else:
            magnitude, ladder = column.max(), unsigned_ladder
        narrow = next((dtype for bound, dtype in ladder if magnitude < bound), None)
        if narrow is not None:
            df[name] = column.astype(narrow)
    return df
# Shrink the integer columns, then show the resulting dtypes and memory usage.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df = reduce_memory(df)
df.info()

In [29]:
# Display the active-period feature names.
act_feat_cols

In [30]:
# Column order of the exported table; the active-period features go last.
cols = [
    'user_id', 'user_id_common',
    'region', 'city',
    'parent_category_name', 'category_name',
    'param_1', 'param_2', 'param_3',
    'price', 'price_bin',
    'item_seq_number', 'item_seq_bin',
    'user_type',
    'image', 'image_top_1',
    'dow',
]
cols.extend(act_feat_cols)
df = df[cols]

In [31]:
# (rows, columns) of the final feature table.
df.shape

In [32]:
# Write the final feature table to disk; index=False leaves the row index out of the file.
df.to_csv('data_lbe.csv', index=False)