In [None]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
import time
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/numpy.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global random state.
np.random.seed(4950)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    For each column whose dtype is one of int16/int32/int64/float16/float32/
    float64, the column minimum and maximum are compared with the representable
    range of progressively wider dtypes and the column is cast to the first one
    that contains both. The bounds are inclusive, so a column whose extreme
    value equals a dtype limit (for example 127 for int8) still fits that dtype.

    Caveat: float columns are chosen by *range* only. A column that fits the
    float16 range is cast to float16 even though that reduces its precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): leave as is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN-only: use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
import os
# Show the entries of ../input as the cell output, to confirm which files are available.
os.listdir('../input')

In [None]:
# Merchant attribute table, loaded in full.
df_merchant = pd.read_csv('../input/merchants.csv')

# Historical transactions, restricted to the three columns used below:
# the grouping key, the join key and the purchase amount.
df_hist_trans = pd.read_csv(
    '../input/historical_transactions.csv',
    usecols=['card_id', 'merchant_id', 'purchase_amount'],
)
# Not loaded in this notebook:
# df_new_merchant_trans = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv')

In [None]:
# Shrink both frames before the merge. reduce_mem_usage modifies the frame in
# place and returns the same object, so the reassignment is only for readability.
# Float columns whose range fits are cast to float16, which keeps roughly three
# significant decimal digits; later aggregates inherit that rounding.
df_hist_trans = reduce_mem_usage(df_hist_trans)
df_merchant = reduce_mem_usage(df_merchant)

In [None]:
# Preview the first rows of the merchant table after downcasting.
df_merchant.head()

In [None]:
# Attach the merchant attributes to every transaction. A left join keeps all
# transaction rows; rows whose merchant_id has no match get NaN merchant columns.
# NOTE(review): if merchant_id is not unique in df_merchant, this join duplicates
# transaction rows and inflates the per-card sums/means below. Confirm uniqueness.
# This cell overwrites its own input, so re-running it merges a second time.
df_hist_trans = df_hist_trans.merge(df_merchant, on = 'merchant_id', how = 'left')

In [None]:
# Preview the merged transaction + merchant frame.
df_hist_trans.head()

In [None]:
def get_new_columns(name, aggs):
    """Return flat column names of the form ``<name>_<column>_<agg>``.

    Parameters
    ----------
    name : str
        Prefix placed in front of every generated name.
    aggs : dict
        Maps a column name to the list of aggregation names applied to it.

    Returns
    -------
    list of str
        One name per (column, aggregation) pair, in the dict's iteration order
        and, within a column, in the order of its aggregation list.
    """
    flat_names = []
    for column, funcs in aggs.items():
        for func in funcs:
            flat_names.append(name + '_' + column + '_' + func)
    return flat_names

In [None]:
# merchant_id was only needed as the join key and is not aggregated below, so
# remove the column in place. Re-running this cell raises KeyError because the
# column is already gone.
del df_hist_trans['merchant_id']

In [None]:
# Aggregation spec: merchant attribute column -> reducers applied per card_id.
aggs = {}

# Categorical / identifier attributes: count the distinct values seen per card.
for col in ['most_recent_sales_range','most_recent_purchases_range','category_1','category_2','category_4',
            'subsector_id','merchant_category_id', 'state_id','city_id','merchant_group_id']:
    aggs[col] = ['nunique']

# Anonymised numeric attributes: full set of summary statistics.
for col in ['numerical_1','numerical_2']:
    aggs[col] = ['sum','max','min','mean','var','median']

# Lagged sales / purchase / activity attributes: total and average only.
for col in ['avg_sales_lag3','avg_purchases_lag3','active_months_lag3','avg_sales_lag6','avg_purchases_lag6',
            'active_months_lag6','avg_sales_lag12','avg_purchases_lag12','active_months_lag12']:
    aggs[col] = ['sum', 'mean']

prefix = 'merchant_merge_hist'
# get_new_columns walks `aggs` in insertion order; the flattening below relies on
# agg() emitting its (column, reducer) pairs in that same order.
new_columns = get_new_columns(prefix, aggs)

df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
# Turn card_id from the group index back into an ordinary column.
df_hist_trans_group = df_hist_trans_group.reset_index(drop=False)

# df_hist_trans stays bound on purpose: deleting a loop alias would not free it,
# and cells outside this one may still need it. Only collect groupby temporaries.
gc.collect()

In [None]:
# Number of columns in the per-card feature frame (card_id plus every aggregate).
print(df_hist_trans_group.shape[1])

In [None]:
# Preview the first rows of the per-card aggregated features.
df_hist_trans_group.head()

In [None]:
# Persist the per-card features as a pickle in the current working directory.
# NOTE(review): pickle files should only be loaded back from trusted sources.
df_hist_trans_group.to_pickle('merchant_merge_hist_features.pickle')