Changing feature engineering

In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
np.random.seed(4590)

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that covers their value range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten in place.
    verbose : bool, default True
        When true, print the final memory footprint in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float dtypes are chosen from the value range only. float16 and float32 keep
    fewer significant digits than float64, so stored values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits in int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Reached when the range check fails for both narrower types
                # (this includes a NaN min/max, for which every comparison is False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Folder that holds the competition CSV files; change this one value to run the notebook elsewhere.
DATA_DIR = 'C:/Users/user/Documents/Salamat/ELO'

df_train = pd.read_csv(DATA_DIR + '/train.csv')
df_test = pd.read_csv(DATA_DIR + '/test.csv')
df_hist_trans = pd.read_csv(DATA_DIR + '/historical_transactions.csv')
df_new_merchant_trans = pd.read_csv(DATA_DIR + '/new_merchant_transactions.csv')

In [6]:
# Downcast every loaded frame; the helper prints one memory report per frame, in this order.
df_train, df_test, df_hist_trans, df_new_merchant_trans = [
    reduce_mem_usage(frame)
    for frame in (df_train, df_test, df_hist_trans, df_new_merchant_trans)
]

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)


Count number of purchases made in each merchant. We will use it to fill NaN values by most frequent merchant id.

Check whether we have NaN values in the following columns

In [7]:
# One boolean per column: True when the column holds at least one missing value.
df_hist_trans.isnull().any()

authorized_flag         False
card_id                 False
city_id                 False
category_1              False
installments            False
category_3               True
merchant_category_id    False
merchant_id              True
month_lag               False
purchase_amount         False
purchase_date           False
category_2               True
state_id                False
subsector_id            False
dtype: bool

In [8]:
# One boolean per column: True when the column holds at least one missing value.
df_new_merchant_trans.isnull().any()

authorized_flag         False
card_id                 False
city_id                 False
category_1              False
installments            False
category_3               True
merchant_category_id    False
merchant_id              True
month_lag               False
purchase_amount         False
purchase_date           False
category_2               True
state_id                False
subsector_id            False
dtype: bool

It seems that 'category_2', 'category_3' and 'merchant_id' have NaN values in both the historical and the new merchant transactions. Now, let's count the values in each of these columns for the historical transactions. Let's start with 'category_2'.

In [9]:
# dropna=False keeps NaN as its own row in the tally.
df_hist_trans.category_2.value_counts(dropna=False)

 1.0    15177199
 3.0     3911795
 5.0     3725915
NaN      2652864
 4.0     2618053
 2.0     1026535
Name: category_2, dtype: int64

There are 2652864 NaN values in 'category_2'. Now, let's check whether some 'card_id's have NaN as their only 'category_2' value, and for those cards replace it with the most frequent value found in the whole historical transactions table. The most frequent value is 1 (see the cell above).

To check this we can use groupby together with the sum function. With min_count=1 the sum of an all-NaN group is NaN; without it the sum would be zero by default.

In [10]:
# Per-card sum of category_2. With min_count=1 a card whose values are all NaN sums to NaN
# instead of 0, which is what marks it as "NaN only" in the next step.
group_cat2=df_hist_trans.groupby(['card_id']).category_2.sum(min_count=1)

Now let's select the cards that have only NaN values.

In [11]:
# Keep only the cards whose per-card sum is NaN; the index of the result is their card_ids.
group_cat2_nan=group_cat2[group_cat2.isnull()]

In [12]:
group_cat2_nan.head()

card_id
C_ID_001b4c5151   NaN
C_ID_001c09a36b   NaN
C_ID_0028e15a78   NaN
C_ID_002b706ded   NaN
C_ID_0030e0945f   NaN
Name: category_2, dtype: float16

I am setting the index to 'card_id' in order to change the 'category_2' values. First I tried simply using
df_hist_trans.loc[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2=1
or
df_hist_trans[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2=1
and you can check the result with
df_hist_trans.loc[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2
or
df_hist_trans[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2
You will see that the values do not change. Both of these methods work on a copy of the dataframe: selecting rows with the mask (df_hist_trans.card_id.isin(group_cat2_nan.index)) returns a copy, and the assignment to .category_2 is then applied to that copy only. (Selecting the rows and the column in one step, df_hist_trans.loc[mask, 'category_2'] = 1, would assign into the original dataframe.)
Therefore, I decided to address the rows through the index, and the index will be 'card_id'.


In [13]:
# Demonstration of the pitfall: df_hist_trans[mask] returns a temporary copy, so this assignment
# is applied to that copy and df_hist_trans itself is left unchanged (the next cell still shows NaN).
df_hist_trans[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2=1

In [14]:
df_hist_trans[df_hist_trans.card_id.isin(group_cat2_nan.index)].category_2.head()

15207   NaN
15208   NaN
15209   NaN
15210   NaN
15211   NaN
Name: category_2, dtype: float16

You can see that it doesn't work. So let's change the index of df_hist_trans to 'card_id' and address the rows through their index labels.

In [15]:
# Move card_id into the index (in place) so rows can be selected by card label with .loc.
df_hist_trans.set_index('card_id',inplace=True)

In [16]:
# Label-based assignment: every row belonging to a card listed in group_cat2_nan gets category_2 = 1.
df_hist_trans.loc[group_cat2_nan.index,'category_2']=1

In [17]:
df_hist_trans.loc[group_cat2_nan.index,'category_2'].unique()

array([ 1.])

Now we have only changed the cards that have nothing but NaN values in 'category_2'. Let's check how many NaN values are still there. The number of NaN values is reduced by 150257 (from 2652864 to 2502607).

In [18]:
df_hist_trans.category_2.value_counts(dropna=False)

 1.0    15327456
 3.0     3911795
 5.0     3725915
 4.0     2618053
NaN      2502607
 2.0     1026535
Name: category_2, dtype: int64

Now let's reset the index and group by 'card_id' and 'category_2', so that we can look at the number of transactions in each 'category_2' value for every card.

In [19]:
# Turn card_id back into a regular column (in place).
df_hist_trans.reset_index(inplace=True)

In [20]:
# Non-null count of every remaining column for each (card_id, category_2) pair.
# Rows whose category_2 is NaN do not form a group, so they are absent from the result.
category_2_count=df_hist_trans.groupby(['card_id','category_2']).count()

In [21]:
category_2_count.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id
card_id,category_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C_ID_00007093c1,3.0,120,120,120,120,120,120,120,120,120,120,120,120
C_ID_00007093c1,5.0,1,1,1,1,1,1,1,1,1,1,1,1
C_ID_0001238066,1.0,95,95,95,95,94,95,95,95,95,95,95,95
C_ID_0001238066,5.0,20,20,20,20,19,20,20,20,20,20,20,20
C_ID_0001506ef0,1.0,2,2,2,2,2,2,2,2,2,2,2,2
C_ID_0001506ef0,3.0,64,64,64,64,64,64,64,64,64,64,64,64
C_ID_0001793786,1.0,11,11,11,11,11,11,11,11,11,11,11,11
C_ID_0001793786,2.0,76,76,76,76,76,76,76,76,76,76,76,76
C_ID_0001793786,3.0,15,15,15,15,15,15,15,15,15,15,15,15
C_ID_000183fdda,1.0,7,7,7,7,7,7,7,7,7,7,7,7


We need only one column. The counts agree for every column that has no missing values, so let's choose 'authorized_flag', which has none.

In [22]:
# authorized_flag has no missing values, so its count is the number of rows in each group.
category_2_count=category_2_count.authorized_flag

In [23]:
category_2_count.head(20)

card_id          category_2
C_ID_00007093c1  3.0           120
                 5.0             1
C_ID_0001238066  1.0            95
                 5.0            20
C_ID_0001506ef0  1.0             2
                 3.0            64
C_ID_0001793786  1.0            11
                 2.0            76
                 3.0            15
C_ID_000183fdda  1.0             7
                 2.0             1
                 3.0           131
                 5.0             1
C_ID_00024e244b  1.0             3
                 3.0            67
C_ID_0002709b5a  1.0             1
                 2.0            52
                 5.0            14
C_ID_00027503e2  1.0             3
                 3.0            39
Name: authorized_flag, dtype: int64

We need only the index label of the maximum count for each card. We can get it with groupby(level=0); level 0 is 'card_id' in our case.

In [24]:
# For each card (index level 0) idxmax returns the full (card_id, category_2) label of the largest count.
category_2_count_max=category_2_count.groupby(level=0).idxmax()



Now we need only the second element of each tuple. This gives a Series holding, for each 'card_id', the 'category_2' value with the highest count.

In [25]:
# Keep element 1 of each (card_id, category_2) label, i.e. the category_2 value.
# NOTE(review): not re-runnable on its own output, since the values are no longer tuples afterwards.
category_2_count_max=category_2_count_max.apply(lambda x: x[1])

In [26]:
category_2_count_max.head()

card_id
C_ID_00007093c1    3.0
C_ID_0001238066    1.0
C_ID_0001506ef0    3.0
C_ID_0001793786    2.0
C_ID_000183fdda    3.0
Name: authorized_flag, dtype: float64

Now we can fill in each card's most frequent 'category_2' value wherever that card has a NaN.

In [27]:
# Index by card_id again (in place) so that a Series keyed by card_id can be aligned against the rows.
df_hist_trans.set_index('card_id',inplace=True)
df_hist_trans.head()

Unnamed: 0_level_0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C_ID_4e6213e9bc,Y,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
C_ID_4e6213e9bc,Y,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
C_ID_4e6213e9bc,Y,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


Let's check what the value counts were before

In [28]:
df_hist_trans.category_2.value_counts(dropna=False)

 1.0    15327456
 3.0     3911795
 5.0     3725915
 4.0     2618053
NaN      2502607
 2.0     1026535
Name: category_2, dtype: int64

Fill nan values according to given series by using fillna function

In [29]:
# fillna with a Series aligns on the index (card_id on both sides here), so each NaN takes the
# most frequent category_2 value of its own card.
df_hist_trans.category_2=df_hist_trans.category_2.fillna(category_2_count_max)

Finally, we get rid of all NaN values in 'category_2'. Now we can try the same for 'category_3' and 'merchant_id'. We also need to do the same for 'df_new_merchant_trans'. First, let's check whether any NaN values are left in 'category_2'.

In [30]:
df_hist_trans.category_2.value_counts(dropna=False)

1.0    16804879
3.0     4289903
5.0     4050578
4.0     2793190
2.0     1173811
Name: category_2, dtype: int64

Let's do same for 'category_3'

In [31]:
df_hist_trans.category_3.value_counts(dropna=False)

A      15411747
B      11677522
C       1844933
NaN      178159
Name: category_3, dtype: int64

Let's change 'A', 'B', 'C' to numerical values so that we can use the sum(min_count=1) function

In [32]:
# Integer codes for the category_3 letters.
d_cat3={'A':1,'B':2,'C':3}

In [33]:
# Replace the letters by their codes; missing values are not keys of d_cat3 and stay NaN.
df_hist_trans.category_3=df_hist_trans.category_3.map(d_cat3)

In [34]:
df_hist_trans.head()

Unnamed: 0_level_0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [39]:
df_hist_trans.category_3.value_counts(dropna=False)

 1.0    15411747
 2.0    11677522
 3.0     1844933
NaN       178159
Name: category_3, dtype: int64

In [40]:
# Per-card sum of the encoded category_3; min_count=1 yields NaN for a card whose values are all NaN.
group_cat3=df_hist_trans.groupby(['card_id']).category_3.sum(min_count=1)

In [41]:
group_cat3.isnull().sum()

0

### This means that no card_id has only NaN values in category_3, so we can jump straight to replacing the NaN values within each card_id.

In [42]:
# Preview the first rows only; displaying the full transaction table is slow and bloats the notebook.
df_hist_trans.head(10)

Unnamed: 0_level_0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37
C_ID_4e6213e9bc,Y,333,N,0,1.0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,1.0,9,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37
C_ID_4e6213e9bc,Y,3,N,0,1.0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37
C_ID_4e6213e9bc,Y,88,N,0,1.0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37


In [None]:
# Cards whose category_3 is missing on every transaction (their per-card sum is NaN).
group_cat3_nan=group_cat3[group_cat3.isnull()]
# 'A' is most frequent which corresponds to 1
# df_hist_trans is indexed by card_id at this point, so build the row mask from the index and
# assign through a single .loc call. The previous form (frame-wide isin followed by attribute
# assignment) wrote to a temporary object and never modified df_hist_trans.
all_nan_cat3_rows = df_hist_trans.index.isin(group_cat3_nan.index)
df_hist_trans.loc[all_nan_cat3_rows, 'category_3'] = 1

In [41]:
# Most frequent category_2 per card, computed once for all cards instead of one .loc lookup per
# training row. idxmax yields (card_id, category_2) labels; element 1 is the category value.
most_frequent_cat2 = category_2_count.groupby(level=0).idxmax().map(lambda label: label[1])
# map() leaves NaN for a card that has no entry rather than raising KeyError.
df_train['category_2'] = df_train.card_id.map(most_frequent_cat2)

In [None]:
df_hist_trans.category_2.value_counts(dropna=False)

In [None]:
# Set category_2 to 1 on every transaction of a card listed in group_cat2_nan; all other rows keep
# their current value. The original lambda had no `else` branch and did not parse.
# card_id may currently be a column or the index, depending on which cells ran last.
card_ids = df_hist_trans['card_id'] if 'card_id' in df_hist_trans.columns else df_hist_trans.index
all_nan_cat2_rows = card_ids.isin(group_cat2_nan.index)
df_hist_trans.loc[all_nan_cat2_rows, 'category_2'] = 1

In [None]:
# `in` on a Series tests the index labels (card_ids here), not the stored values.
1 in group_cat2_nan

In [None]:
df_hist_trans.category_2.value_counts(dropna=False)

In [None]:
# Non-null count of every remaining column for each (card_id, merchant_id) pair.
merchant_na=df_hist_trans.groupby(['card_id','merchant_id']).count()

In [None]:
# Per-(card_id, merchant_id) counts taken from the authorized_flag column.
a=merchant_na.authorized_flag

In [None]:
a.loc['C_ID_00007093c1'].idxmax()

In [None]:
# Most frequent merchant per card, computed once for all cards instead of one .loc lookup per
# training row. idxmax yields (card_id, merchant_id) labels; element 1 is the merchant id.
most_frequent_merchant = a.groupby(level=0).idxmax().map(lambda label: label[1])
# map() leaves NaN for a card that has no entry rather than raising KeyError.
df_train['max_merchant_id'] = df_train.card_id.map(most_frequent_merchant)

In [None]:
df_hist_trans['category_2'].value_counts(dropna=False)

In [None]:
df_hist_trans['category_3'].value_counts(dropna=False)

In [None]:
# Non-null counts of every remaining column per (card_id, category_2) and per (card_id, category_3) pair.
category_2_na=df_hist_trans.groupby(['card_id','category_2']).count()
category_3_na=df_hist_trans.groupby(['card_id','category_3']).count()

For some card_ids the only value is NaN, which cannot be used as an index label. We can change those values to the most frequently seen value, and then fill the remaining NaN values with the same strategy that we use for the new merchant id.

In [None]:
# Keep a single count column: the per-(card_id, category_2) counts from authorized_flag.
category_2_na=category_2_na.authorized_flag

In [None]:
category_2_na.loc['C_ID_00007093c1'].max()

In [None]:
# Largest category_2 value per card; the result is NaN for a card with no non-null value.
category_2_group=df_hist_trans.groupby('card_id').category_2.max()

In [None]:
category_2_group.value_counts(dropna=False)

In [None]:
uncat2=category_2_group.unique()

In [None]:
# Replace NaN entries with 1 and leave every other value as it is.
category_2_group1=category_2_group.apply(lambda x: 1 if pd.isnull(x) else x)

In [None]:
category_2_group1.value_counts(dropna=False)

In [None]:
df_new_merchant_trans[df_new_merchant_trans.card_id=='C_ID_0030e0945f']

In [None]:
category_2_group[category_2_group.isnull()]

In [None]:
# Most frequent category_2 per card, computed once for all cards instead of one .loc lookup per
# training row. idxmax yields (card_id, category_2) labels; element 1 is the category value.
max_category_2_by_card = category_2_na.groupby(level=0).idxmax().map(lambda label: label[1])
# map() leaves NaN for a card that has no entry rather than raising KeyError.
df_train['max_category_2'] = df_train.card_id.map(max_category_2_by_card)

Salamat: changed NaN values in merchant ID to the most frequent merchant. This might not be a good idea since it generalizes across all cards. Instead, we could use each card id's own most frequent merchant, maybe ...
The same goes for category_3.

In [None]:
# Fill whatever NaN values are left with the globally most frequent value of each column.
for df in [df_hist_trans,df_new_merchant_trans]:
    # Assign the filled column back instead of fillna(inplace=True) on a column selection,
    # which is chained assignment and is not guaranteed to reach the frame.
    df['category_2'] = df['category_2'].fillna(1.0)
    # category_3 may already be encoded through d_cat3 ('A' -> 1). Fill with the matching
    # representation so the column does not end up mixing numbers and letters.
    cat3_fill = 1 if pd.api.types.is_numeric_dtype(df['category_3']) else 'A'
    df['category_3'] = df['category_3'].fillna(cat3_fill)
    df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

In [None]:
def get_new_columns(name,aggs):
    """Build flat column names of the form ``<name>_<column>_<aggregation>``.

    ``aggs`` maps a column name to a list of aggregation names. The result
    follows the iteration order of ``aggs`` and, within a column, the order
    of its aggregation list.
    """
    flat_names = []
    for column, functions in aggs.items():
        for function in functions:
            flat_names.append('_'.join([name, column, function]))
    return flat_names

We can also take weekofyear, day of week and weekend as extra parameters, which I didn't do before. I am not sure whether month_diff is correct; I am still confused about this issue.

In [None]:
# Parse the timestamps first so that one common reference date can be taken from the data.
for df in [df_hist_trans,df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])

# Latest purchase found in either table. Measuring month_diff from this date rather than from
# datetime.datetime.today() gives the same feature values no matter when the notebook is run.
reference_date = max(df_hist_trans['purchase_date'].max(),
                     df_new_merchant_trans['purchase_date'].max())

for df in [df_hist_trans,df_new_merchant_trans]:
    # Calendar parts of the purchase timestamp.
    df['year'] = df['purchase_date'].dt.year
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['month'] = df['purchase_date'].dt.month
    df['dayofweek'] = df['purchase_date'].dt.dayofweek
    # 1 for Saturday/Sunday (weekday 5 or 6), otherwise 0.
    df['weekend'] = (df.purchase_date.dt.weekday >=5).astype(int)
    df['hour'] = df['purchase_date'].dt.hour
    # Binary encodings of the Y/N flags.
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0})
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # Whole 30-day periods between the purchase and the reference date, shifted by month_lag.
    df['month_diff'] = ((reference_date - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

In [None]:
# Per-card aggregates of the historical transactions, merged into train and test as hist_* columns.

# aggs['card_id'] below needs card_id as a regular column. Earlier cells leave the frame indexed
# by card_id, so restore the column when it is missing.
if 'card_id' not in df_hist_trans.columns:
    df_hist_trans = df_hist_trans.reset_index()

aggs = {}
for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']:
    aggs[col] = ['nunique']

aggs['purchase_amount'] = ['sum','max','min','mean','var']
aggs['installments'] = ['sum','max','min','mean','var']
aggs['purchase_date'] = ['max','min']
aggs['month_lag'] = ['max','min','mean','var']
aggs['month_diff'] = ['mean']
aggs['authorized_flag'] = ['sum', 'mean']
aggs['weekend'] = ['sum', 'mean']
aggs['category_1'] = ['sum', 'mean']
aggs['card_id'] = ['size']

# Mean purchase amount of each category value, broadcast to every row and then averaged per card.
for col in ['category_2','category_3']:
    df_hist_trans[col+'_mean'] = df_hist_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

# Flat names are generated in the same key order that agg() uses for its output columns.
new_columns = get_new_columns('hist',aggs)
df_hist_trans_group = df_hist_trans.groupby('card_id').agg(aggs)
df_hist_trans_group.columns = new_columns
df_hist_trans_group.reset_index(drop=False,inplace=True)
# Days between a card's first and last purchase, and that span divided by its transaction count.
df_hist_trans_group['hist_purchase_date_diff'] = (df_hist_trans_group['hist_purchase_date_max'] - df_hist_trans_group['hist_purchase_date_min']).dt.days
df_hist_trans_group['hist_purchase_date_average'] = df_hist_trans_group['hist_purchase_date_diff']/df_hist_trans_group['hist_card_id_size']
# NOTE(review): measured from the current date, so the value shifts by a constant between runs.
df_hist_trans_group['hist_purchase_date_uptonow'] = (datetime.datetime.today() - df_hist_trans_group['hist_purchase_date_max']).dt.days
# Left merges keep every train/test card; cards without transactions get NaN features.
df_train = df_train.merge(df_hist_trans_group,on='card_id',how='left')
df_test = df_test.merge(df_hist_trans_group,on='card_id',how='left')
del df_hist_trans_group;gc.collect()

In [None]:
# Per-card aggregates of the new-merchant transactions, merged into train and test as new_hist_* columns.
aggs = {col: ['nunique'] for col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']}
aggs.update({
    'purchase_amount': ['sum','max','min','mean','var'],
    'installments': ['sum','max','min','mean','var'],
    'purchase_date': ['max','min'],
    'month_lag': ['max','min','mean','var'],
    'month_diff': ['mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

# Mean purchase amount of each category value, broadcast to every row and then averaged per card.
for col in ['category_2','category_3']:
    df_new_merchant_trans[col+'_mean'] = df_new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    aggs[col+'_mean'] = ['mean']

new_columns = get_new_columns('new_hist',aggs)
new_trans_group = df_new_merchant_trans.groupby('card_id').agg(aggs)
new_trans_group.columns = new_columns
new_trans_group = new_trans_group.reset_index()

# Days between a card's first and last purchase, and that span divided by its transaction count.
new_trans_group['new_hist_purchase_date_diff'] = (
    new_trans_group['new_hist_purchase_date_max'] - new_trans_group['new_hist_purchase_date_min']
).dt.days
new_trans_group['new_hist_purchase_date_average'] = (
    new_trans_group['new_hist_purchase_date_diff'] / new_trans_group['new_hist_card_id_size']
)
# Days from each card's last purchase to the current date.
new_trans_group['new_hist_purchase_date_uptonow'] = (
    datetime.datetime.today() - new_trans_group['new_hist_purchase_date_max']
).dt.days

# Left merges keep every train/test card.
df_train = df_train.merge(new_trans_group, on='card_id', how='left')
df_test = df_test.merge(new_trans_group, on='card_id', how='left')
del new_trans_group
gc.collect()

In [None]:
# The transaction tables are no longer referenced below; drop them and run the garbage collector.
del df_hist_trans;gc.collect()
del df_new_merchant_trans;gc.collect()
# Preview of the training frame after the merges.
df_train.head(5)

In [None]:
# Flag rows whose target is below -30 with 1 and every other row with 0, then show the tally.
df_train['outliers'] = (df_train['target'] < -30).astype('int64')
df_train['outliers'].value_counts()

In [None]:
df_train.columns

In [None]:
df_train.hist_purchase_date_min

In [None]:
# Card-level features derived from first_active_month and the merged transaction aggregates.
for df in [df_train,df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['dayofweek'] = df['first_active_month'].dt.dayofweek
    df['weekofyear'] = df['first_active_month'].dt.weekofyear
    df['month'] = df['first_active_month'].dt.month
    # NOTE(review): measured from the current date, so the value shifts by a constant between runs.
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days
    # Days from the activation month to the first purchase in each transaction table.
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    for f in ['hist_purchase_date_max','hist_purchase_date_min','new_hist_purchase_date_max',\
                     'new_hist_purchase_date_min']:
        # int64 nanoseconds -> float seconds. A card with no rows in a table has NaT here after
        # the left merge; keep it as NaN instead of the scaled int64 NaT sentinel, which would
        # look like a real (hugely negative) timestamp to the model.
        df[f] = (df[f].astype(np.int64) * 1e-9).where(df[f].notnull())
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']

# Replace each feature_N value by the training-set outlier rate of that value; the mapping learnt
# on train is applied unchanged to test.
for f in ['feature_1','feature_2','feature_3']:
    order_label = df_train.groupby([f])['outliers'].mean()
    df_train[f] = df_train[f].map(order_label)
    df_test[f] = df_test[f].map(order_label)


In [None]:
# Model inputs: every column except the identifier, the raw activation date and the two label columns.
df_train_columns = [
    c for c in df_train.columns
    if c not in ('card_id', 'first_active_month', 'target', 'outliers')
]
# pop() returns the target column and removes it from the frame in one step.
target = df_train.pop('target')

In [None]:
# 5-fold LightGBM regression. Folds are stratified on the outlier flag so every fold holds a
# similar share of outliers.
# "min_child_samples" was removed: it is an alias of 'min_data_in_leaf', and giving both with
# different values (20 vs 30) left the effective setting unclear.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))          # out-of-fold predictions for the training rows
predictions = np.zeros(len(df_test))   # test predictions averaged over the folds
fold_importances = []                  # one importance table per fold, concatenated after the loop

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

# A single concat instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE over the whole training set.
np.sqrt(mean_squared_error(oof, target))

In [None]:
# Names of (at most) the 1000 features with the highest importance averaged over folds.
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

# Per-fold importance rows restricted to those features.
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

# Horizontal bar chart of the per-fold importances, saved next to the notebook as a PNG.
plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="Feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
# Submission file: one row per test card with its fold-averaged prediction.
sub_df = pd.DataFrame({
    "card_id": df_test["card_id"].values,
    "target": predictions,
})
sub_df.to_csv("chau_feature_engineering.csv", index=False)

**To be continued ...**