In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
data = pd.read_csv('Data/train_rosbank.csv')
data['TRDATETIME'] = pd.to_datetime(data['TRDATETIME'], format='%d%b%y:%X')
data.head()

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,2017-10-21 00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,2017-10-12 12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,2017-12-05 00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,2017-10-21 00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,2017-10-24 13:14:24,36562.0,C2C_OUT,0,0.0


In [3]:
data['channel_type'] = data['channel_type'].fillna('0')

In [4]:
# Пересчитаем валютные транзакции в рубли по курсам цб
mapping = {810: 1, 978: 65.87, 504: 6.5, 704: 0.0025, 981: 23, 985: 15.48,
           840: 58.31, 949: 15.97, 51: 4.3, 826: 75.11, 214: 1.23, 764: 1.7,
           203: 2.5, 702: 42, 360: 0.0042, 756: 59.26, 933: 30.20, 975: 33.68,
           36: 13.5, 191: 8.5, 784: 15.9, 980: 2.2, 124: 44.97, 398: 0.18,
           376: 15.7, 944: 33.81, 352: 0.52, 417: 0.85, 156: 8.6, 752: 6.8,
           392: 0.52, 484: 3.3, 634: 15.8, 188: 0.11, 643: 1, 348: 0.21, 356: 0.90,
           458: 13.5, 986: 18, 498: 3.5, 578: 7.1, 208: 8.9, 344: 7.5, 32: 65.87,
           410: 0.05, 788: 24, 480: 1.6, 604: 17.5, 941: 0.55, 144: 0.37, 946: 14.4,
           710: 4.52, 690: 4, 44: 20, 170: 0.02, 901: 1.9, 608: 1.15, 554: 41.5, 462: 3.7}
data['amount'] = data['amount'] * data['currency'].map(mapping)

In [5]:
%%time

def build_features(data):
    aggregated = data.groupby('cl_id')[['channel_type']].first()
    ids = aggregated.index
    aggregated['cl_id'] = ids
    aggregated['total_number_transact_cl_id'] = data.groupby(
        'cl_id')['TRDATETIME'].apply(lambda x: x.count())
    aggregated['uniq_number_MCC_cl_id'] = data.groupby(
        'cl_id')['MCC'].apply(lambda x: x.nunique())
    data['uniq_number_MCC_cl_id_period'] = data.groupby(
        ['cl_id', 'PERIOD'])['MCC'].transform(lambda x: x.nunique())
    aggregated['uniq_number_MCC_cl_id_period'] = data.groupby(
        'cl_id')['uniq_number_MCC_cl_id_period'].apply(lambda x: x.mean())
    aggregated['trx_category_cl_id_mode'] = data.groupby(
        'cl_id')['trx_category'].apply(lambda x: x.value_counts().index[0])
    aggregated['diff_days_cl_id'] = data.groupby(
        'cl_id')['TRDATETIME'].apply(lambda x: x.max() - x.min()).dt.days
    data['diff_days_cl_id_period'] = data.groupby(
        ['cl_id', 'PERIOD'])['TRDATETIME'].transform(
        lambda x: x.max() - x.min()).dt.days
    aggregated['diff_days_cl_id_period'] = data.groupby(
        'cl_id')['diff_days_cl_id_period'].apply(lambda x: x.mean())
    data['days_from_first_transaction'] = data.groupby(
        'cl_id')['TRDATETIME'].transform(lambda x: x - x.min()).dt.days
    aggregated['days_from_first_transaction'] = data.groupby(
        'cl_id')['days_from_first_transaction'].apply(lambda x: x.mean())
    aggregated['mad_of_days_from_first_transaction'] = data.groupby(
        'cl_id')['days_from_first_transaction'].apply(lambda x: x.mad())
    data['trx_type'] = data.apply(
        lambda x: 1 if x['trx_category'] in ['DEPOSIT', 'C2C_IN', 'BACK_TRX'] else -1, axis=1)
    data['amount_signed'] = data['amount'] * data['trx_type']
    aggregated['amount_signed'] = data.groupby(
        'cl_id')['amount_signed'].apply(lambda x: x.mean()) 
    aggregated['amount_sum'] = data.groupby(
        'cl_id')['amount'].apply(lambda x: x.sum())
    aggregated['amount_signed_sum'] = data.groupby(
        'cl_id')['amount_signed'].apply(lambda x: x.sum())
    aggregated['amount_snyatie'] = (aggregated['amount_sum'] - aggregated['amount_signed_sum']) / 2
    aggregated['amount_popolnenie'] = aggregated['amount_sum'] - aggregated['amount_snyatie']
    aggregated['target_flag'] = data.groupby('cl_id')['target_flag'].first()           
    return aggregated
data_agg2 = build_features(data)

CPU times: user 35.7 s, sys: 673 ms, total: 36.4 s
Wall time: 28.5 s


In [6]:
data_agg2.head(10)

Unnamed: 0_level_0,channel_type,cl_id,total_number_transact_cl_id,uniq_number_MCC_cl_id,uniq_number_MCC_cl_id_period,trx_category_cl_id_mode,diff_days_cl_id,diff_days_cl_id_period,days_from_first_transaction,mad_of_days_from_first_transaction,amount_signed,amount_sum,amount_signed_sum,amount_snyatie,amount_popolnenie,target_flag
cl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,0,5,5,3.4,POS,53,9.6,16.2,14.72,-4876.6,64383.0,-24383.0,44383.0,20000.0,0
1,0,1,104,23,14.423,POS,92,24.596,59.298,17.724,-1297.905,324982.152,-134982.152,229982.152,95000.0,0
5,0,5,142,34,16.338,POS,92,23.697,47.951,23.597,-1899.197,557666.027,-269686.027,413676.027,143990.0,1
9,0,9,39,5,3.41,POS,89,26.128,43.667,25.744,-6292.695,849315.09,-245415.09,547365.09,301950.0,0
10,0,10,463,36,23.469,POS,89,28.758,44.423,19.295,-543.724,1124343.99,-251743.99,688043.99,436300.0,0
11,0,11,217,39,20.724,POS,91,26.332,39.203,20.694,-318.963,433215.032,-69215.032,251215.032,182000.0,0
14,0,14,136,30,14.574,POS,92,20.706,57.529,22.541,-1053.604,448690.17,-143290.17,295990.17,152700.0,1
20,0,20,77,17,12.156,POS,73,25.0,43.948,19.588,-94.91,437308.09,-7308.09,222308.09,215000.0,0
21,0,21,124,35,18.048,POS,88,27.387,44.363,21.257,-817.142,338881.65,-101325.65,220103.65,118778.0,0
22,0,22,59,12,6.034,POS,66,16.169,25.237,20.236,-77.407,249767.0,-4567.0,127167.0,122600.0,0


In [7]:
data_agg2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 10215
Data columns (total 16 columns):
channel_type                          5000 non-null object
cl_id                                 5000 non-null int64
total_number_transact_cl_id           5000 non-null int64
uniq_number_MCC_cl_id                 5000 non-null int64
uniq_number_MCC_cl_id_period          5000 non-null float64
trx_category_cl_id_mode               5000 non-null object
diff_days_cl_id                       5000 non-null int64
diff_days_cl_id_period                5000 non-null float64
days_from_first_transaction           5000 non-null float64
mad_of_days_from_first_transaction    5000 non-null float64
amount_signed                         5000 non-null float64
amount_sum                            5000 non-null float64
amount_signed_sum                     5000 non-null float64
amount_snyatie                        5000 non-null float64
amount_popolnenie                     5000 non-null float64
ta

In [8]:
y = data_agg2.pop('target_flag')
data_agg2.drop('channel_type', axis=1, inplace=True)
X = pd.get_dummies(data_agg2)

In [9]:
import lightgbm as lgb
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgboost = lgb.LGBMClassifier(n_estimators=500, max_depth=1, learning_rate=0.1,
                             random_state=42)
cross_val_score(lgboost, X, y, cv=kf, scoring='roc_auc').mean()

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


0.8501735333341586