In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from memory_fix import reduce_mem_usage
%pylab inline

import warnings
warnings.filterwarnings("ignore")

Populating the interactive namespace from numpy and matplotlib


In [21]:
transactions_train = reduce_mem_usage( pd.read_csv('data/transactions_train.csv') )
X_test = reduce_mem_usage(pd.read_csv('data/transactions_test.csv'))

y_train = pd.read_csv('data/train_target.csv')

Memory usage of dataframe is 668.09 MB
Memory usage after optimization is: 250.53 MB
Decreased by 62.5%
Memory usage of dataframe is 668.38 MB
Memory usage after optimization is: 250.64 MB
Decreased by 62.5%


In [210]:
from datetime import date
from dateutil.relativedelta import relativedelta

In [214]:
def get_date(day):
    first_jn = 61    
    return date(2018, 1, 1) + relativedelta(days = day - first_jn)

In [217]:
transactions_train['trans_date_times'] = transactions_train['trans_date'].apply(lambda x: get_date(x))

In [219]:
transactions_train.to_csv('transactions_train_date.csv')

In [223]:
def get_diff(array):
    new_array = list()
    for i in range(1, len(array)):
        new_array.append(array[i] - array[i-1])
    
    return np.mean(new_array), np.max(new_array), np.min(new_array), np.std(new_array)

In [224]:
def poly_trend(timeseries, degree=1):
    """
    Поиск прямой, которая максимально соответствует тренду TS
    Возвращает коэффициенты линейной функции вида f(x) = ax + b
    """
    X = np.arange(0, len(timeseries))
    Y = np.array(timeseries)
    z = np.polyfit(X, Y, degree)

    return z[0],z[1]

In [242]:
small = transactions_train[transactions_train.small_group == target]
group = small.groupby(['client_dk','trans_date_times'])["amount"].agg(['count', 'sum'])

In [261]:
import holidays

In [283]:
ru_hol = holidays.RU()

In [282]:
holidays.RU().get(date(2018, 2, 23))

'День защитника отечества'

In [377]:
group_2 = small.groupby(['client_dk', 'trans_date'])["amount"].agg(['count', 'sum'])

In [383]:
get_diff(list(group_2.loc[2].index))

(32.888888888888886, 61, 14, 17.025761163891513)

In [269]:
transactions_train[transactions_train.client_dk == 2]['trans_date_times']

47024       2017-11-05
68655       2017-11-07
71360       2017-11-12
73288       2017-11-26
76516       2017-11-14
               ...    
21783351    2018-11-15
21820190    2018-11-23
21845186    2018-11-19
21878918    2018-02-27
21890615    2018-02-28
Name: trans_date_times, Length: 1514, dtype: object

In [338]:
group.loc[2].index[-1].month

10

In [336]:
small = transactions_train[transactions_train.small_group == 27]
group = small.groupby(['client_dk', 'trans_date_times'])["amount"].agg(['count', 'sum'])

In [416]:
def get_target_stat(transaction, target, is_train=True):
    stat = {'mean_date_diff':[], 'max_date_diff':[],'min_date_diff':[],'std_date_diff':[],
            'mean_sum_trans':[], 'mean_count_trans':[],'data_trend':[],
            'count_trans':[],'target':[], 'client_dk':[],
            'last_trans_year':[],'last_trans_month':[], 'last_trans_day':[],
            'first_trans_year':[],'first_trans_month':[], 'first_trans_day':[],}
#             'user_last_trans_year':[], 'user_last_trans_month':[], 'user_last_trans_day':[]}
    
    small = transaction[transaction.small_group == target]
    group = small.groupby(['client_dk', 'trans_date_times'])["amount"].agg(['count', 'sum'])
    
    group_2 = small.groupby(['client_dk', 'trans_date'])["amount"].agg(['count', 'sum'])
    
    for ind in tqdm(transaction['client_dk'].unique()):
        if is_train:
            stat['target'].append(y_train[y_train.client_dk == ind][str(target)].values[0])
            stat['client_dk'].append(ind)
        else:
            stat['target'].append(-1)
            stat['client_dk'].append(ind)
            
        try:

            smb = group.loc[ind]
            smb_2 = group_2.loc[ind]
            mean, mx, mn, std = get_diff(list(smb_2.index))
#             print(mean, mx, mn, std)

            stat['mean_date_diff'].append(mean)
            stat['max_date_diff'].append(mx)
            stat['min_date_diff'].append(mn)
            stat['std_date_diff'].append(std)


            stat['mean_sum_trans'].append(np.mean(smb_2['sum']))
            stat['mean_count_trans'].append(np.mean(smb_2['count']))
            stat['count_trans'].append(len(smb))


            stat['last_trans_year'].append(smb.index[-1].year)
            stat['last_trans_month'].append(smb.index[-1].month)
            stat['last_trans_day'].append(smb.index[-1].day)
            
            
            stat['first_trans_year'].append(smb.index[0].year)
            stat['first_trans_month'].append(smb.index[0].month)
            stat['first_trans_day'].append(smb.index[0].day)
            
            
#             last_trans = transactions_train[transactions_train.client_dk == ind]['trans_date_times'].max()
#             stat['user_last_trans_year'].append(last_trans.year)
#             stat['user_last_trans_month'].append(last_trans.month)
#             stat['user_last_trans_day'].append(last_trans.day)
            
            
            
            
            try:
                date_line = poly_trend(list(smb.index))[0]
                stat['data_trend'].append(date_line)
            except Exception:
                stat['data_trend'].append(0)
            
            
        except Exception:
            stat['mean_date_diff'].append(-10000)
            stat['max_date_diff'].append(-10000)
            stat['min_date_diff'].append(-10000)
            stat['std_date_diff'].append(-10000)

            stat['mean_sum_trans'].append(-10000)
            stat['mean_count_trans'].append(-10000)
            stat['count_trans'].append(-10000)
                        
            stat['last_trans_year'].append(-10000)
            stat['last_trans_month'].append(-10000)
            stat['last_trans_day'].append(-10000)
            
            stat['first_trans_year'].append(-10000)
            stat['first_trans_month'].append(-10000)
            stat['first_trans_day'].append(-10000)
            
#             last_trans = transactions_train[transactions_train.client_dk == ind]['trans_date_times'].max()


#             stat['user_last_trans_year'].append(last_trans.year)
#             stat['user_last_trans_month'].append(last_trans.month)
#             stat['user_last_trans_day'].append(last_trans.day)
            
            
            stat['data_trend'].append(-10000)

        
    return stat

In [417]:
TO_PREDICT = ['27', '32', '41', '45', '67', '73', '81', '88']

In [418]:
def create_files(transaction, is_train=True):
    for target in TO_PREDICT:
        stats = get_target_stat(transaction, target, is_train)
        df = pd.DataFrame(stats)
        pd.to_csv(df, f'cat_{target}_train={is_train}')

In [419]:
def predict(model):
    pred = []
    for ind, col in enumerate(TO_PREDICT):
        print(f"{ind+1} / {len(TO_PREDICT)}")
        
        data = get_target_stat(transactions_train, target=int(col))
        TRAIN = pd.DataFrame(data)
        model.fit(TRAIN.drop(['target', 'client_dk'], axis=1), TRAIN['target'])
        
        TEST = pd.DataFrame(get_target_stat(X_test, target=int(col), is_train=False))
        test_ids = TEST['client_dk']
        proba = model.predict_proba(TEST.drop(['target','client_dk'], axis=1))[:,1]
        pred.append(proba)
    
    return dict(zip(TO_PREDICT, pred)), test_ids

## Models

In [420]:
from sklearn.model_selection import cross_val_score
import catboost as cb
import eli5
import lightgbm

In [None]:
lgb = lightgbm.LGBMClassifier(learning_rate=0.025, n_estimators=1000,num_laeves=69,reg_lambda=3, random_seed=1007)


In [422]:
transactions_train.iloc[:10]

Unnamed: 0,client_dk,trans_date,amount,small_group,trans_date_times
0,43976,0,4.563,2,2017-11-01
1,8417,0,48.341999,0,2017-11-01
2,17309,0,12.32,0,2017-11-01
3,33523,0,29.004999,6,2017-11-01
4,24228,0,10.266,6,2017-11-01
5,11611,0,1.95,5,2017-11-01
6,22922,0,6.673,5,2017-11-01
7,29053,1,0.967,0,2017-11-02
8,39287,1,6.478,3,2017-11-02
9,48992,0,10.266,0,2017-11-01


In [423]:
def val(model):
    total_score = 0
    pred = []
    for ind, col in enumerate(TO_PREDICT):
        print(f"{ind+1}/{len(TO_PREDICT)}")
        
        data = get_target_stat(transactions_train, target=int(col))
        TRAIN = pd.DataFrame(data)
        score = cross_val_score(
            model,
            TRAIN.drop(['target', 'client_dk'], axis=1),
            TRAIN['target'],
            cv = 4,
            scoring = "roc_auc",
            verbose = 1
                      )
        total_score += np.mean(score)
        print(col, score, np.mean(score))
    print(total_score / 8)
val(lgb)

1/8


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.3s finished


27 [0.81837053 0.81750468 0.80874978 0.844945  ] 0.8223924966998762
2/8


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.8s finished


32 [0.71689856 0.70717101 0.72044639 0.74616316] 0.7226697784592309
3/8


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.4s finished


41 [0.8218126  0.80966097 0.80813606 0.82638733] 0.8164992412600912
4/8


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

KeyboardInterrupt: 

In [290]:
def val(model):
    pred = []
    for ind, col in enumerate(TO_PREDICT):
        print(f"{ind+1}/{len(TO_PREDICT)}")
        
        data = get_target_stat(transactions_train, target=int(col))
        TRAIN = pd.DataFrame(data)

        score = cross_val_score(
            model,
            TRAIN.drop(['target', 'client_dk'], axis=1),
            TRAIN['target'],
            cv = 4,
            scoring = "roc_auc",
            verbose = 1
                      )
        print(col, score, np.mean(score))
val(lgb)

1/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.2s finished


27 [0.81430593 0.81520816 0.80319944 0.83443049] 0.8167860048753762
2/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.6s finished


32 [0.72608816 0.72506347 0.73746608 0.76185866] 0.7376190909233798
3/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.8s finished


41 [0.81274142 0.80812033 0.80621751 0.82301697] 0.8125240585724562
4/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.8s finished


45 [0.72722416 0.73157426 0.71995596 0.74566656] 0.7311052348963402
5/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.0s finished


67 [0.78427413 0.77468425 0.78967282 0.86332425] 0.8029888601589151
6/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.4s finished


73 [0.65212699 0.62405095 0.66019247 0.64355374] 0.6449810377175916
7/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.3s finished


81 [0.87107025 0.85859205 0.83040356 0.90019619] 0.8650655121562897
8/8


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.1s finished


88 [0.5772935  0.58753269 0.54179228 0.56265334] 0.5673179547145741


In [281]:
lgb

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.025, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_laeves=69, num_leaves=31,
               objective=None, random_seed=1007, random_state=None,
               reg_alpha=0.0, reg_lambda=3, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [258]:
prediction = predict(lgb)

1/8


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:44<00:00, 557.79it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:13<00:00, 1801.68it/s]


2/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:44<00:00, 561.42it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:17<00:00, 1466.95it/s]


3/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:46<00:00, 533.80it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:18<00:00, 1336.03it/s]


4/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:52<00:00, 479.01it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:22<00:00, 1124.05it/s]


5/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:29<00:00, 833.84it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:04<00:00, 5528.80it/s]


6/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:38<00:00, 655.10it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:10<00:00, 2466.64it/s]


7/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:26<00:00, 953.74it/s]
100%|█████████████████████████████████████████████████████████████████████████| 25000/25000 [00:02<00:00, 10607.61it/s]


8/8


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:35<00:00, 698.52it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:07<00:00, 3539.05it/s]


## Submit

In [12]:
def submit(prediction, filename):
    pred, idx = prediction
    
    pred = pd.DataFrame(pred)
    pred['client_dk'] = idx
    submission = pd.DataFrame(data=np.zeros((25000,8)),columns=y_train.columns[1:],index=test_id['client_dk'].values)

    submission.index.name = 'client_dk'
    submission = submission.reset_index()
    
    sub = pd.DataFrame(submission['client_dk']).merge(pred, on='client_dk')
    
    sub = sub.set_index('client_dk')
    sub = sub.add_prefix("cat_")
    sub.to_csv(f'{filename}.csv')
    
    

In [272]:
def average_score(model, columns):
    for col in columns:
        score = cross_val_score(
        model,
        train_data,
        y_train[col],
        cv = 4,
        scoring = "roc_auc",
        verbose = 1,
        )
    print(col, score, np.mean(score))