# AZS

In [1]:
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

## Args

In [2]:
# Directory holding the downloaded competition files; both CSV paths hang off it.
_COMPETITION_DIR = '~/.kaggle/competitions/ai-hack-2018-msk-gpn'
TRAIN_PATH = _COMPETITION_DIR + '/train_data.csv'
TEST_PATH = _COMPETITION_DIR + '/test_data.csv'
# Output file written by the last cell of the notebook (relative to the working directory).
SUBMISSION_PATH = 'submission.csv'

## Read

### Train [`df`]

In [3]:
def initial_pre(df):
    """Normalise a raw competition frame into the layout used by the notebook.

    Steps, in order:
      1. drop the ``ID`` column;
      2. rename the upper-case source columns to short lower-case names;
      3. replace the ``time`` column (renamed from ``DATA_TRANS``) with
         ``hour``, ``day``, ``month`` and ``year`` columns;
      4. move ``litres`` to the last position when it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``ID`` and ``DATA_TRANS``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns, the four date-part columns
        and, if present, ``litres`` as the final column.

    Raises
    ------
    KeyError
        If ``ID`` or ``DATA_TRANS`` is missing from ``df``.
    """
    rename_map = {
        'CLIENT': 'client',
        'DATA_TRANS': 'time',
        'AZS_NUMBER': 'azs',
        'REGION_AZS': 'region',
        'VID_NP': 'type',
        'COL_LITR': 'litres',
        'CENA_CLIENT': 'price'
    }
    # drop() and rename() both return new frames, so the caller's frame is untouched.
    df = df.drop('ID', axis=1).rename(columns=rename_map)

    # Split times: parse once and pull the parts out with the vectorised
    # `.dt` accessor instead of looping over every Timestamp in Python.
    time_parts = ['hour', 'day', 'month', 'year']
    stamps = pd.to_datetime(df['time'])
    times_df = pd.DataFrame(
        # `.values` strips the index so no alignment happens here;
        # the frame's own index is attached explicitly below.
        {part: getattr(stamps.dt, part).values for part in time_parts},
        columns=time_parts,
        index=df.index,
    )
    if stamps.notnull().all():
        # Pin the integer width so the columns match what building them from
        # plain Python ints produces. Skipped when a timestamp is missing,
        # because the parts then contain NaN and must stay floating point.
        times_df = times_df.astype('int64')
    df = pd.concat([df.drop('time', axis=1), times_df], axis=1)

    # Reordering: keep the target as the last column when it exists.
    if 'litres' in df.columns:
        ordered = [name for name in df.columns if name != 'litres'] + ['litres']
        df = df[ordered]

    return df

In [4]:
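# One-off step: uncomment this cell and the next one to (re)build `train_data.h5`,
# which the training loop further down reads with `pd.read_hdf`.
# %time df = initial_pre(pd.read_csv(TRAIN_PATH))
# df.shape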

In [5]:
# %time df.to_hdf('train_data.h5', 'df', mode='w', format='t', complevel=9)

### Test [`tdf`]

In [6]:
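# One-off step: uncomment this cell and the next one to (re)build `stest_data.h5`,
# which is read back with `pd.read_hdf` before prediction.
# %time tdf = initial_pre(pd.read_csv(TEST_PATH))
# tdf.shape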

In [7]:
# %time tdf.to_hdf('stest_data.h5', 'tdf', mode='w', format='t', complevel=9)

## Preprocessing, train/validation split and training

In [8]:
class FFTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer: drops identifier columns and splits off the target.

    ``transform`` removes the ``client`` and ``azs`` columns. When the frame
    also has a ``litres`` column, that column is removed from the features
    and returned separately, so the return value is a ``(features, target)``
    tuple for such frames and a single frame otherwise.
    """

    def __init__(self):
        pass

    def fit(self, df, y=None):
        """No-op fit; nothing is learned from the data.

        ``y`` is accepted and ignored so that ``fit_transform(X, y)`` and
        other callers that pass a target do not fail with a ``TypeError``.

        Returns
        -------
        FFTransformer
            ``self``.
        """
        return self

    def transform(self, df):
        """Return the feature frame, plus the target when it is present.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing ``client`` and ``azs``; ``litres`` is optional.
            The frame is left unmodified, so calling ``transform`` twice on
            the same frame gives the same result instead of a ``KeyError``.

        Returns
        -------
        pandas.DataFrame or (pandas.DataFrame, pandas.Series)
            ``(features, litres)`` when ``df`` has a ``litres`` column,
            otherwise just ``features``.

        Raises
        ------
        KeyError
            If ``client`` or ``azs`` is missing from ``df``.
        """
        # drop() builds a new frame rather than deleting columns in place.
        features = df.drop(['client', 'azs'], axis=1)

        if 'litres' in features.columns:
            y = features['litres']
            return features.drop('litres', axis=1), y
        return features

In [None]:
# Load the cached, pre-processed test frame.
tdf = pd.read_hdf('stest_data.h5')

# One transformer instance is shared by this cell and the training loop.
preprocess = FFTransformer()
X_test = preprocess.transform(tdf)

# Only X_test is used from here on; remove the extra name for the loaded frame.
del tdf

In [None]:
# One test-set prediction vector per run; the submission cell averages them.
scores = []
for i in range(3):

    # Data: a fresh 10% subsample of the training set for every run.
    # Seeding with the run index keeps the three subsamples different from
    # each other while making the whole loop repeatable across re-runs.
    df = pd.read_hdf('train_data.h5').sample(frac=0.1, random_state=i)
    X, y = preprocess.fit_transform(df)
    del df
    # Every non-float column is handed to CatBoost as a categorical feature.
    # Builtin `float` is used because the `np.float` alias for it was removed
    # in NumPy 1.24 and raises AttributeError there.
    cat_features = np.where(X.dtypes != float)[0]
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.1, random_state=i)

    # Train: the held-out 10% is the eval set used to pick the best iteration.
    model = cb.CatBoostRegressor()
    model.fit(X_train, y_train, cat_features=cat_features,
              use_best_model=True, eval_set=(X_val, y_val),
              plot=False);

    # Predict
    scores.append(model.predict(X_test))

0:	learn: 141.8962426	test: 143.6069014	best: 143.6069014 (0)	total: 2.8s	remaining: 46m 33s
1:	learn: 139.8959298	test: 141.6172803	best: 141.6172803 (1)	total: 4.33s	remaining: 35m 59s
2:	learn: 137.9879720	test: 139.7177313	best: 139.7177313 (2)	total: 7s	remaining: 38m 46s
3:	learn: 136.1675174	test: 137.9078085	best: 137.9078085 (3)	total: 9.05s	remaining: 37m 34s
4:	learn: 134.4328555	test: 136.1813112	best: 136.1813112 (4)	total: 10.6s	remaining: 35m 6s
5:	learn: 132.7792410	test: 134.5377748	best: 134.5377748 (5)	total: 13.2s	remaining: 36m 30s
6:	learn: 131.2010370	test: 132.9688130	best: 132.9688130 (6)	total: 15.5s	remaining: 36m 41s
7:	learn: 129.6990760	test: 131.4741907	best: 131.4741907 (7)	total: 17.8s	remaining: 36m 48s
8:	learn: 128.2720669	test: 130.0562473	best: 130.0562473 (8)	total: 19.8s	remaining: 36m 15s
9:	learn: 126.9144282	test: 128.7073132	best: 128.7073132 (9)	total: 22.3s	remaining: 36m 48s
10:	learn: 125.6211665	test: 127.4210438	best: 127.4210438 (10)	t

86:	learn: 99.5687160	test: 101.4321006	best: 101.4321006 (86)	total: 2m 46s	remaining: 29m 7s
87:	learn: 99.5361325	test: 101.4003485	best: 101.4003485 (87)	total: 2m 48s	remaining: 29m 9s
88:	learn: 99.5060621	test: 101.3693250	best: 101.3693250 (88)	total: 2m 50s	remaining: 29m 3s
89:	learn: 99.4807625	test: 101.3441459	best: 101.3441459 (89)	total: 2m 51s	remaining: 28m 58s
90:	learn: 99.4575911	test: 101.3216957	best: 101.3216957 (90)	total: 2m 54s	remaining: 29m
91:	learn: 99.4314005	test: 101.2949116	best: 101.2949116 (91)	total: 2m 55s	remaining: 28m 56s
92:	learn: 99.4060764	test: 101.2684894	best: 101.2684894 (92)	total: 2m 57s	remaining: 28m 51s
93:	learn: 99.3852208	test: 101.2477150	best: 101.2477150 (93)	total: 2m 59s	remaining: 28m 47s
94:	learn: 99.3655939	test: 101.2281179	best: 101.2281179 (94)	total: 3m	remaining: 28m 44s
95:	learn: 99.3414049	test: 101.2040584	best: 101.2040584 (95)	total: 3m 3s	remaining: 28m 45s
96:	learn: 99.3168324	test: 101.1773917	best: 101.17

171:	learn: 98.2546990	test: 100.0940766	best: 100.0940766 (171)	total: 5m 36s	remaining: 26m 58s
172:	learn: 98.2492908	test: 100.0889697	best: 100.0889697 (172)	total: 5m 38s	remaining: 26m 55s
173:	learn: 98.2257640	test: 100.0639471	best: 100.0639471 (173)	total: 5m 39s	remaining: 26m 53s
174:	learn: 98.2156939	test: 100.0537408	best: 100.0537408 (174)	total: 5m 42s	remaining: 26m 52s
175:	learn: 98.2079970	test: 100.0470951	best: 100.0470951 (175)	total: 5m 44s	remaining: 26m 52s
176:	learn: 98.1981714	test: 100.0379941	best: 100.0379941 (176)	total: 5m 46s	remaining: 26m 52s
177:	learn: 98.1884150	test: 100.0292303	best: 100.0292303 (177)	total: 5m 48s	remaining: 26m 51s
178:	learn: 98.1814494	test: 100.0228212	best: 100.0228212 (178)	total: 5m 51s	remaining: 26m 51s
179:	learn: 98.1748223	test: 100.0156132	best: 100.0156132 (179)	total: 5m 53s	remaining: 26m 50s
180:	learn: 98.1678965	test: 100.0079756	best: 100.0079756 (180)	total: 5m 55s	remaining: 26m 48s
181:	learn: 98.15864

257:	learn: 97.4813311	test: 99.3006145	best: 99.3006145 (257)	total: 8m 43s	remaining: 25m 4s
258:	learn: 97.4594158	test: 99.2765713	best: 99.2765713 (258)	total: 8m 45s	remaining: 25m 2s
259:	learn: 97.4504510	test: 99.2676684	best: 99.2676684 (259)	total: 8m 47s	remaining: 25m
260:	learn: 97.4465185	test: 99.2635853	best: 99.2635853 (260)	total: 8m 49s	remaining: 24m 59s
261:	learn: 97.4424757	test: 99.2593973	best: 99.2593973 (261)	total: 8m 51s	remaining: 24m 58s
262:	learn: 97.4391646	test: 99.2560980	best: 99.2560980 (262)	total: 8m 53s	remaining: 24m 55s
263:	learn: 97.4353828	test: 99.2521905	best: 99.2521905 (263)	total: 8m 55s	remaining: 24m 54s
264:	learn: 97.4316359	test: 99.2484173	best: 99.2484173 (264)	total: 8m 58s	remaining: 24m 53s
265:	learn: 97.4287970	test: 99.2459903	best: 99.2459903 (265)	total: 9m	remaining: 24m 50s
266:	learn: 97.4259140	test: 99.2432945	best: 99.2432945 (266)	total: 9m 1s	remaining: 24m 47s
267:	learn: 97.4187917	test: 99.2358423	best: 99.23

343:	learn: 96.9593526	test: 98.7606557	best: 98.7606557 (343)	total: 11m 54s	remaining: 22m 43s
344:	learn: 96.9553079	test: 98.7570053	best: 98.7570053 (344)	total: 11m 56s	remaining: 22m 40s
345:	learn: 96.9511895	test: 98.7532673	best: 98.7532673 (345)	total: 11m 59s	remaining: 22m 39s
346:	learn: 96.9443437	test: 98.7459416	best: 98.7459416 (346)	total: 12m 1s	remaining: 22m 38s
347:	learn: 96.9428713	test: 98.7443995	best: 98.7443995 (347)	total: 12m 3s	remaining: 22m 36s
348:	learn: 96.9386437	test: 98.7401116	best: 98.7401116 (348)	total: 12m 5s	remaining: 22m 34s
349:	learn: 96.9360394	test: 98.7375125	best: 98.7375125 (349)	total: 12m 7s	remaining: 22m 31s
350:	learn: 96.9275504	test: 98.7275321	best: 98.7275321 (350)	total: 12m 9s	remaining: 22m 29s
351:	learn: 96.9260048	test: 98.7255712	best: 98.7255712 (351)	total: 12m 12s	remaining: 22m 27s
352:	learn: 96.9237345	test: 98.7235197	best: 98.7235197 (352)	total: 12m 14s	remaining: 22m 26s
353:	learn: 96.9192262	test: 98.719

428:	learn: 96.6593338	test: 98.4561078	best: 98.4561078 (428)	total: 15m 9s	remaining: 20m 9s
429:	learn: 96.6550836	test: 98.4518393	best: 98.4518393 (429)	total: 15m 11s	remaining: 20m 8s
430:	learn: 96.6517450	test: 98.4494669	best: 98.4494669 (430)	total: 15m 14s	remaining: 20m 6s
431:	learn: 96.6488423	test: 98.4463196	best: 98.4463196 (431)	total: 15m 16s	remaining: 20m 5s
432:	learn: 96.6465647	test: 98.4442483	best: 98.4442483 (432)	total: 15m 19s	remaining: 20m 3s
433:	learn: 96.6429159	test: 98.4401867	best: 98.4401867 (433)	total: 15m 21s	remaining: 20m 2s
434:	learn: 96.6394421	test: 98.4372256	best: 98.4372256 (434)	total: 15m 24s	remaining: 20m
435:	learn: 96.6375989	test: 98.4354241	best: 98.4354241 (435)	total: 15m 26s	remaining: 19m 58s
436:	learn: 96.6362597	test: 98.4337973	best: 98.4337973 (436)	total: 15m 29s	remaining: 19m 56s
437:	learn: 96.6270074	test: 98.4240622	best: 98.4240622 (437)	total: 15m 31s	remaining: 19m 54s
438:	learn: 96.6247935	test: 98.4220329	b

513:	learn: 96.4271091	test: 98.2285009	best: 98.2285009 (513)	total: 18m 28s	remaining: 17m 28s
514:	learn: 96.4232802	test: 98.2243940	best: 98.2243940 (514)	total: 18m 31s	remaining: 17m 26s
515:	learn: 96.4221193	test: 98.2231639	best: 98.2231639 (515)	total: 18m 33s	remaining: 17m 23s
516:	learn: 96.4208388	test: 98.2221411	best: 98.2221411 (516)	total: 18m 35s	remaining: 17m 21s
517:	learn: 96.4191778	test: 98.2206254	best: 98.2206254 (517)	total: 18m 37s	remaining: 17m 19s
518:	learn: 96.4183539	test: 98.2199011	best: 98.2199011 (518)	total: 18m 39s	remaining: 17m 17s
519:	learn: 96.4160132	test: 98.2176109	best: 98.2176109 (519)	total: 18m 41s	remaining: 17m 15s
520:	learn: 96.4096182	test: 98.2100171	best: 98.2100171 (520)	total: 18m 43s	remaining: 17m 13s
521:	learn: 96.4078851	test: 98.2083174	best: 98.2083174 (521)	total: 18m 46s	remaining: 17m 11s
522:	learn: 96.4071359	test: 98.2072440	best: 98.2072440 (522)	total: 18m 48s	remaining: 17m 8s
523:	learn: 96.4053364	test: 98

598:	learn: 96.2331031	test: 98.0381892	best: 98.0381892 (598)	total: 21m 55s	remaining: 14m 40s
599:	learn: 96.2291911	test: 98.0335143	best: 98.0335143 (599)	total: 21m 58s	remaining: 14m 38s
600:	learn: 96.2276415	test: 98.0319660	best: 98.0319660 (600)	total: 22m	remaining: 14m 36s
601:	learn: 96.2263940	test: 98.0310166	best: 98.0310166 (601)	total: 22m 2s	remaining: 14m 34s
602:	learn: 96.2256877	test: 98.0301795	best: 98.0301795 (602)	total: 22m 3s	remaining: 14m 31s
603:	learn: 96.2134304	test: 98.0190974	best: 98.0190974 (603)	total: 22m 6s	remaining: 14m 29s
604:	learn: 96.2123625	test: 98.0181781	best: 98.0181781 (604)	total: 22m 8s	remaining: 14m 27s
605:	learn: 96.2113099	test: 98.0174248	best: 98.0174248 (605)	total: 22m 11s	remaining: 14m 25s
606:	learn: 96.2107120	test: 98.0170007	best: 98.0170007 (606)	total: 22m 13s	remaining: 14m 23s
607:	learn: 96.2085505	test: 98.0153531	best: 98.0153531 (607)	total: 22m 15s	remaining: 14m 21s
608:	learn: 96.2055292	test: 98.012448

683:	learn: 96.0888319	test: 97.8981730	best: 97.8981730 (683)	total: 25m 16s	remaining: 11m 40s
684:	learn: 96.0866211	test: 97.8960000	best: 97.8960000 (684)	total: 25m 18s	remaining: 11m 38s
685:	learn: 96.0858173	test: 97.8950362	best: 97.8950362 (685)	total: 25m 21s	remaining: 11m 36s
686:	learn: 96.0845430	test: 97.8935946	best: 97.8935946 (686)	total: 25m 23s	remaining: 11m 34s
687:	learn: 96.0831581	test: 97.8920909	best: 97.8920909 (687)	total: 25m 26s	remaining: 11m 32s
688:	learn: 96.0814869	test: 97.8905391	best: 97.8905391 (688)	total: 25m 28s	remaining: 11m 30s
689:	learn: 96.0786847	test: 97.8873652	best: 97.8873652 (689)	total: 25m 31s	remaining: 11m 27s
690:	learn: 96.0779530	test: 97.8865540	best: 97.8865540 (690)	total: 25m 33s	remaining: 11m 25s
691:	learn: 96.0772613	test: 97.8857903	best: 97.8857903 (691)	total: 25m 36s	remaining: 11m 23s
692:	learn: 96.0665174	test: 97.8761616	best: 97.8761616 (692)	total: 25m 38s	remaining: 11m 21s
693:	learn: 96.0645732	test: 9

769:	learn: 95.9367837	test: 97.7494919	best: 97.7494919 (769)	total: 28m 37s	remaining: 8m 33s
770:	learn: 95.9349817	test: 97.7475103	best: 97.7475103 (770)	total: 28m 40s	remaining: 8m 30s
771:	learn: 95.9342311	test: 97.7465872	best: 97.7465872 (771)	total: 28m 41s	remaining: 8m 28s
772:	learn: 95.9305466	test: 97.7423522	best: 97.7423522 (772)	total: 28m 43s	remaining: 8m 26s
773:	learn: 95.9288373	test: 97.7408779	best: 97.7408779 (773)	total: 28m 46s	remaining: 8m 24s
774:	learn: 95.9259720	test: 97.7375774	best: 97.7375774 (774)	total: 28m 49s	remaining: 8m 22s
775:	learn: 95.9167049	test: 97.7295090	best: 97.7295090 (775)	total: 28m 51s	remaining: 8m 19s
776:	learn: 95.9164131	test: 97.7291979	best: 97.7291979 (776)	total: 28m 53s	remaining: 8m 17s
777:	learn: 95.9157453	test: 97.7286392	best: 97.7286392 (777)	total: 28m 56s	remaining: 8m 15s
778:	learn: 95.9142534	test: 97.7272761	best: 97.7272761 (778)	total: 28m 59s	remaining: 8m 13s
779:	learn: 95.9123370	test: 97.7258072	

855:	learn: 95.8247789	test: 97.6421637	best: 97.6421637 (855)	total: 31m 52s	remaining: 5m 21s
856:	learn: 95.8233900	test: 97.6411340	best: 97.6411340 (856)	total: 31m 54s	remaining: 5m 19s
857:	learn: 95.8222343	test: 97.6401094	best: 97.6401094 (857)	total: 31m 56s	remaining: 5m 17s
858:	learn: 95.8216646	test: 97.6397693	best: 97.6397693 (858)	total: 31m 59s	remaining: 5m 14s
859:	learn: 95.8197155	test: 97.6379989	best: 97.6379989 (859)	total: 32m 1s	remaining: 5m 12s
860:	learn: 95.8194498	test: 97.6378143	best: 97.6378143 (860)	total: 32m 3s	remaining: 5m 10s
861:	learn: 95.8190210	test: 97.6372485	best: 97.6372485 (861)	total: 32m 5s	remaining: 5m 8s
862:	learn: 95.8184630	test: 97.6365730	best: 97.6365730 (862)	total: 32m 7s	remaining: 5m 5s
863:	learn: 95.8154989	test: 97.6331113	best: 97.6331113 (863)	total: 32m 9s	remaining: 5m 3s
864:	learn: 95.8145865	test: 97.6324353	best: 97.6324353 (864)	total: 32m 11s	remaining: 5m 1s
865:	learn: 95.8129872	test: 97.6310097	best: 97.

941:	learn: 95.7175243	test: 97.5413208	best: 97.5413208 (941)	total: 35m 18s	remaining: 2m 10s
942:	learn: 95.7155965	test: 97.5394934	best: 97.5394934 (942)	total: 35m 20s	remaining: 2m 8s
943:	learn: 95.7146921	test: 97.5385727	best: 97.5385727 (943)	total: 35m 22s	remaining: 2m 5s
944:	learn: 95.7145731	test: 97.5384453	best: 97.5384453 (944)	total: 35m 24s	remaining: 2m 3s
945:	learn: 95.7143206	test: 97.5382310	best: 97.5382310 (945)	total: 35m 27s	remaining: 2m 1s
946:	learn: 95.7138458	test: 97.5380332	best: 97.5380332 (946)	total: 35m 30s	remaining: 1m 59s
947:	learn: 95.7123356	test: 97.5367210	best: 97.5367210 (947)	total: 35m 33s	remaining: 1m 57s
948:	learn: 95.7113765	test: 97.5359757	best: 97.5359757 (948)	total: 35m 35s	remaining: 1m 54s
949:	learn: 95.7109544	test: 97.5355686	best: 97.5355686 (949)	total: 35m 37s	remaining: 1m 52s
950:	learn: 95.7104262	test: 97.5348450	best: 97.5348450 (950)	total: 35m 39s	remaining: 1m 50s
951:	learn: 95.7102790	test: 97.5347376	best

## Train

In [None]:
# Placeholder cell: holds only the no-op Ellipsis literal, no training code.
...

## Submission

In [None]:
# Submission frame: one row per test row, with the prediction averaged
# element-wise over all runs collected in `scores`.
# NOTE(review): IDs are taken from the test frame's index because
# `initial_pre` drops the original `ID` column — confirm the index really
# matches the competition's ID values.
sdf = (
    pd.DataFrame(X_test.index, columns=['ID'])
    .assign(COL_LITR=np.mean(np.vstack(scores), axis=0))
)
sdf.head()

In [None]:
# Write the submission without the positional index column.
sdf.to_csv(path_or_buf=SUBMISSION_PATH, index=False)