In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
import time
warnings.filterwarnings('ignore')
np.random.seed(4950)

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
df_merchant = pd.read_csv('../../ELO/merchants.csv')
df_hist_trans = pd.read_csv('../../ELO/historical_transactions.csv',usecols = ['card_id','merchant_id'])
# df_new_merchant_trans = pd.read_csv('../input/elo-merchant-category-recommendation/new_merchant_transactions.csv')

In [4]:
df_hist_trans = reduce_mem_usage(df_hist_trans)
df_merchant = reduce_mem_usage(df_merchant)

Mem. usage decreased to 444.22 Mb (0.0% reduction)
Mem. usage decreased to 30.32 Mb (46.0% reduction)


In [22]:
df_new_merchant_trans = pd.read_csv('../../ELO/new_merchant_transactions.csv')
df_new_merchant_trans = reduce_mem_usage(df_new_merchant_trans)

Mem. usage decreased to 114.20 Mb (45.5% reduction)


In [41]:
df_train=pd.read_csv('train.csv',parse_dates=['first_active_month'])
df_test=pd.read_csv('test.csv',parse_dates=['first_active_month'])

In [5]:
df_merchant.head()

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057465,-0.057465,N,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057465,-0.057465,N,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057465,-0.057465,N,E,E,-82.129997,...,-82.129997,260.0,2,-82.129997,260.0,2,N,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057465,-0.057465,Y,E,E,,...,,4.666667,6,,3.833333,12,Y,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.057465,-0.057465,Y,E,E,,...,,0.361111,6,,0.347222,12,Y,-1,-1,


In [6]:
df_hist_trans = df_hist_trans.merge(df_merchant, on = 'merchant_id', how = 'left')

In [7]:
df_hist_trans.head()

Unnamed: 0,card_id,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,C_ID_4e6213e9bc,M_ID_e020e9b302,35.0,80.0,37.0,23.46875,23.015625,N,A,A,...,1.14,1.114135,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
1,C_ID_4e6213e9bc,M_ID_86ec983688,2084.0,367.0,16.0,1.092773,-0.057465,N,A,A,...,1.06,1.058605,6.0,1.05,1.062087,12.0,Y,88.0,16.0,1.0
2,C_ID_4e6213e9bc,M_ID_979ed661fc,27369.0,80.0,37.0,0.021851,0.021851,N,C,C,...,0.98,0.967058,6.0,0.97,0.956668,12.0,Y,88.0,16.0,1.0
3,C_ID_4e6213e9bc,M_ID_e6d5ae8ea6,24104.0,560.0,34.0,-0.057465,-0.057465,N,D,C,...,0.88,0.897406,6.0,0.86,0.864394,12.0,Y,88.0,16.0,1.0
4,C_ID_4e6213e9bc,M_ID_e020e9b302,35.0,80.0,37.0,23.46875,23.015625,N,A,A,...,1.14,1.114135,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0


In [8]:
def get_new_columns(name,aggs):
    #for for 写法 nice
    return [name + '_' + k + '_' + agg for k in aggs.keys() for agg in aggs[k]]

In [9]:
del df_hist_trans['merchant_id']

In [10]:
for df in [df_hist_trans]:
    aggs = {}
    for col in ['most_recent_sales_range','most_recent_purchases_range','category_1','category_2','category_4',
                'subsector_id','merchant_category_id', 'state_id','city_id','merchant_group_id']:
        aggs[col] = ['nunique']
    for col in ['numerical_1','numerical_2']:
        aggs[col] =  ['sum','max','min','mean','var','median']
    for col in ['avg_sales_lag3','avg_purchases_lag3','active_months_lag3','avg_sales_lag6','avg_purchases_lag6',
               'active_months_lag6','avg_sales_lag12','avg_purchases_lag12','active_months_lag12']:
        aggs[col] = ['sum', 'mean']
#     features = ['category_2','category_3','state_id','subsector_id','merchant_category_id','city_id'
#                ,'merchant_id']
#     for col in features:
#         df[col+'_mean'] = df.groupby([col])['purchase_amount'].transform('mean')
#         df[col+'_sum'] = df.groupby([col])['purchase_amount'].transform('sum') 
#         aggs[col+'_mean'] = ['mean']
#         aggs[col+'_sum'] = ['sum'] 
    prefix = 'merchant_merge_hist'
    new_columns = get_new_columns(prefix,aggs)
    df_hist_trans_group = df.groupby('card_id').agg(aggs)
    df_hist_trans_group.columns = new_columns
    df_hist_trans_group.reset_index(drop=False,inplace=True)
    del df
    gc.collect()
    time.sleep(5)

In [11]:
df_hist_trans_group.head()

Unnamed: 0,card_id,merchant_merge_hist_most_recent_sales_range_nunique,merchant_merge_hist_most_recent_purchases_range_nunique,merchant_merge_hist_category_1_nunique,merchant_merge_hist_category_2_nunique,merchant_merge_hist_category_4_nunique,merchant_merge_hist_subsector_id_nunique,merchant_merge_hist_merchant_category_id_nunique,merchant_merge_hist_state_id_nunique,merchant_merge_hist_city_id_nunique,...,merchant_merge_hist_avg_purchases_lag6_sum,merchant_merge_hist_avg_purchases_lag6_mean,merchant_merge_hist_active_months_lag6_sum,merchant_merge_hist_active_months_lag6_mean,merchant_merge_hist_avg_sales_lag12_sum,merchant_merge_hist_avg_sales_lag12_mean,merchant_merge_hist_avg_purchases_lag12_sum,merchant_merge_hist_avg_purchases_lag12_mean,merchant_merge_hist_active_months_lag12_sum,merchant_merge_hist_active_months_lag12_mean
0,C_ID_00007093c1,5,5,2,1,2,12,18,2,2,...,156.494324,1.050297,894.0,6.0,159.190002,1.068389,165.997952,1.11408,1776.0,11.919463
1,C_ID_0001238066,5,5,2,2,2,17,27,5,16,...,655.069438,5.325768,738.0,6.0,334.23999,2.717398,714.115425,5.805816,1476.0,12.0
2,C_ID_0001506ef0,5,5,2,1,2,12,17,2,3,...,103.265523,1.518611,408.0,6.0,87.479996,1.286471,105.664009,1.553882,806.0,11.852941
3,C_ID_0001793786,5,5,2,3,2,23,46,4,7,...,607.345252,2.735789,1332.0,6.0,409.51001,1.84464,603.030942,2.716356,2631.0,11.851351
4,C_ID_000183fdda,5,5,2,1,2,20,35,4,3,...,1250.224477,8.390768,894.0,6.0,559.219971,3.753154,1347.227115,9.041793,1760.0,11.812081


In [12]:
df_hist_trans_group.to_pickle('merchant_merge_hist_features.pickle')

In [17]:
df_hist_trans_group.shape

(325540, 41)

In [38]:
index=df_hist_trans[~(df_hist_trans.card_id.isin(df_new_merchant_trans['card_id']))].groupby('card_id').count().index

In [44]:
df_no_new=df_train[df_train.card_id.isin(index)]

In [47]:
df_no_new[df_no_new.target<-33].shape[0]/df_no_new.shape[0]

0.026172997127353975

In [48]:
df_no_new.shape[0]

21931

In [49]:
df_no_new_test=df_test[df_test.card_id.isin(index)]

In [50]:
df_no_new_test.shape[0]

13608

In [51]:
13608*0.026172997127353975

356.16214490903286

In [None]:
index2=df_hist_trans[~(df_hist_trans.card_id.isin(df_new_merchant_trans['card_id']))].groupby('card_id').count().index

In [None]:
strange_items = df_hist_trans[~(historical_df['card_id'].isin(new_merchant_df['card_id']))].groupby(['card_id'])['authorized_flag'].agg(['size'])\

                                .reset_index().rename(columns={'size': 'number_of_transactions'})

strange_items = strange_items[strange_items['number_of_transactions']<7]

strange_items = train[(train['card_id'].isin(strange_items['card_id']))&amp;((train['target']<-33)|(train['target']>0))]

In [25]:
strange_items = df_hist_trans[~(df_hist_trans['card_id'].isin(df_new_merchant_trans['card_id']))].groupby(['card_id'])['authorized_flag'].agg(['size']).reset_index().rename(columns={'size': 'number_of_transactions'})


KeyError: 'Column not found: authorized_flag'

In [53]:
df_hist_trans.columns

Index(['card_id', 'merchant_group_id', 'merchant_category_id', 'subsector_id',
       'numerical_1', 'numerical_2', 'category_1', 'most_recent_sales_range',
       'most_recent_purchases_range', 'avg_sales_lag3', 'avg_purchases_lag3',
       'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6',
       'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12',
       'active_months_lag12', 'category_4', 'city_id', 'state_id',
       'category_2'],
      dtype='object')

In [27]:
df_hist_trans[~(df_hist_trans['card_id'].isin(df_new_merchant_trans['card_id']))].groupby(['card_id'])['authorized_flag']

KeyError: 'Column not found: authorized_flag'

In [29]:
df_hist_trans.head()

Unnamed: 0,card_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,C_ID_4e6213e9bc,35.0,80.0,37.0,23.46875,23.015625,N,A,A,1.08,...,1.14,1.114135,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
1,C_ID_4e6213e9bc,2084.0,367.0,16.0,1.092773,-0.057465,N,A,A,1.06,...,1.06,1.058605,6.0,1.05,1.062087,12.0,Y,88.0,16.0,1.0
2,C_ID_4e6213e9bc,27369.0,80.0,37.0,0.021851,0.021851,N,C,C,0.98,...,0.98,0.967058,6.0,0.97,0.956668,12.0,Y,88.0,16.0,1.0
3,C_ID_4e6213e9bc,24104.0,560.0,34.0,-0.057465,-0.057465,N,D,C,1.0,...,0.88,0.897406,6.0,0.86,0.864394,12.0,Y,88.0,16.0,1.0
4,C_ID_4e6213e9bc,35.0,80.0,37.0,23.46875,23.015625,N,A,A,1.08,...,1.14,1.114135,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
