In [1]:
#!/usr/bin/env python
# coding: utf-8


import numpy as np
import pandas as pd

import _pickle as cPickle
import argparse
from copy import deepcopy
import japanize_matplotlib
import lightgbm as lgb
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import mean_squared_error
import time
from tqdm import tqdm

In [2]:
# Run configuration, declared as a command-line interface so the same cell
# can be lifted into a script. Positional arguments come first, then the
# optional hyper-parameter flags with their defaults.
parser = argparse.ArgumentParser()
arg = parser.add_argument
for _positional, _caster in (
    ('seed', int),
    ('iteration_mul', float),
    ('train_file', str),
    ('test_file', str),
):
    arg(_positional, type=_caster)
for _flag, _caster, _fallback in (
    ('--learning_rate', float, 0.05),
    ('--num_leaves', int, 31),
):
    arg(_flag, type=_caster, default=_fallback)

# The argument vector is fixed here instead of being read from sys.argv:
# seed=1, iteration_mul=0.5, and the two feature files.
args = parser.parse_args(args=['1', '0.5', 'train_fe.ftr', 'test_fe.ftr'])

print(args)

Namespace(iteration_mul=0.5, learning_rate=0.05, num_leaves=31, seed=1, test_file='test_fe.ftr', train_file='train_fe.ftr')


In [3]:
# Read the engineered train/test feature tables named in `args`.
train_fe, test_fe = (
    pd.read_feather(feature_path)
    for feature_path in (
        f'../prepare_data/{args.train_file}',
        f'../prepare_data/{args.test_file}',
    )
)

# Separate the target column from the training features.
target_fe = train_fe['meter_reading']
train_fe = train_fe.drop(columns='meter_reading')

In [4]:
# Time-based split; rows with site_id == 0 are excluded from both parts.
#   train: 20160115 <= timestamp < 20160601
#   valid: 20160901 <= timestamp < 20170101
# NOTE(review): timestamp is compared against integers such as 20160115 --
# presumably a yyyymmdd-style encoding; confirm against the feature pipeline.
X_train, X_valid = (
    train_fe.query(window)
    for window in (
        '20160115 <= timestamp < 20160601 & site_id != 0',
        '20160901 <= timestamp < 20170101 & site_id != 0',
    )
)

# Targets are picked by the row labels that survived each filter.
# NOTE(review): no log1p is applied here although predictions are passed
# through np.expm1 when the submission is built -- presumably meter_reading
# is already log-transformed upstream; confirm.
y_train, y_valid = target_fe.loc[X_train.index], target_fe.loc[X_valid.index]

# timestamp is only needed for the split, so it is removed from every matrix.
X_train, X_valid, X_test = (
    frame.drop(columns='timestamp') for frame in (X_train, X_valid, test_fe)
)

# print(X_train.shape)

In [5]:
def meter_predict(meter, model, X_test, best_iteration, iteration_mul=1.5):
    """Predict one meter type building by building with a per-building tree count.

    Parameters
    ----------
    meter
        Meter code. Selects the rows of ``X_test`` where ``meter == <meter>``
        and the per-building table ``best_iteration[meter]``.
    model
        Fitted estimator for this meter. Must expose ``n_estimators`` and
        ``predict(X, n_jobs=..., num_iteration=...)``.
    X_test
        DataFrame containing at least the ``meter`` and ``building_id``
        columns. ``meter`` is dropped before predicting; ``building_id`` is
        kept as a feature.
    best_iteration
        Mapping ``meter -> {building_id -> iteration count}``.
    iteration_mul
        Multiplier applied to the per-building iteration count.

    Returns
    -------
    pandas.Series
        Predictions indexed by the original row labels of the selected rows,
        sorted by index. An empty float Series when no row matches ``meter``.

    Raises
    ------
    KeyError
        If a building present in ``X_test`` has no entry in
        ``best_iteration[meter]``.
    """
    X_test_m = X_test.query('meter == {}'.format(meter)).drop('meter', axis=1)
    if X_test_m.empty:
        # pd.concat([]) raises; an empty result still concatenates cleanly
        # with the other meters' predictions in the caller.
        return pd.Series(dtype='float64')

    g = X_test_m.groupby('building_id')
    iterations_for = best_iteration[meter]
    # Cap taken from the model that was passed in, not from a notebook global,
    # so the function works for any model object handed to it.
    max_trees = model.n_estimators

    y_pred = []
    for building_id in tqdm(sorted(X_test_m['building_id'].unique())):
        X_building = g.get_group(building_id)
        n_trees = min(max_trees, int(iterations_for[building_id] * iteration_mul))
        # LightGBM documents num_iteration <= 0 as "no limit", so a scaled
        # count that truncates to 0 would silently use every tree.
        n_trees = max(1, n_trees)
        y_pred.append(
            pd.Series(
                model.predict(X_building, n_jobs=4, num_iteration=n_trees),
                index=X_building.index,
            )
        )

    return pd.concat(y_pred).sort_index()

In [6]:
# Load the models whose per-building evaluation history is used below to
# choose iteration counts. The file name encodes the training file stem, the
# seed, num_leaves and the learning rate with its dot removed
# (e.g. 0.05 -> '005').
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files produced by this project.
load_name = '../model/model_use_{}_seed{}_leave{}_lr{}.pkl'.format(
    args.train_file.replace('.ftr', ''),
    args.seed,
    args.num_leaves,
    str(args.learning_rate).replace('.', ''),
)
with open(load_name, 'rb') as f:
    models = pickle.load(f)

In [7]:
# with open('../model/model_5_95_hokan_cleaning_50000tree_seed{}.pkl'.format(args.seed), 'wb') as f:
#     pickle.dump(models, f)

# Best iteration count for each (meter, building) pair seen in the validation
# split: the position of the minimum validation RMSE, converted from a 0-based
# index to a count (+1) and floored at 20.
# NOTE(review): evals_result_ is indexed by building id here -- presumably one
# evaluation set per building was registered at training time; confirm.
# Alternatives left disabled in the original cell: a constant 200 for all
# buildings, and the raw argmin + 1 without the floor of 20.
best_iteration = {}
for meter in range(4):
    valid_buildings = sorted(
        X_valid.query('meter == {}'.format(meter))['building_id'].unique()
    )
    rmse_history = models[meter].evals_result_
    best_iteration[meter] = {
        i: max(20, np.argmin(np.array(rmse_history[i]['rmse'])) + 1)
        for i in tqdm(valid_buildings)
    }

100%|██████████| 1302/1302 [00:00<00:00, 21041.58it/s]
100%|██████████| 474/474 [00:00<00:00, 16805.86it/s]
100%|██████████| 324/324 [00:00<00:00, 19022.32it/s]
100%|██████████| 142/142 [00:00<00:00, 9510.29it/s]


In [8]:
# del_list[meter] collects the buildings whose best iteration sits at the
# floor (<= 20); every count of 100 or less is then raised to 100.
# Both conditions are evaluated on the values as they were before this cell
# modified them. Re-running the cell therefore yields empty del_list entries,
# because no value is <= 20 any more.
# A further rule was left disabled in the original cell: snapping counts at or
# above 98% of models[0].n_estimators up to the full tree count.
del_list = [[] for _ in range(4)]
for meter in range(4):
    per_building = best_iteration[meter]
    del_list[meter].extend(
        buildingID for buildingID, itr in per_building.items() if itr <= 20
    )
    per_building.update(
        {buildingID: 100 for buildingID, itr in per_building.items() if itr <= 100}
    )

In [9]:
# Give every building that has no validation-derived entry a default of 200
# iterations, for each meter.
# The original fill only covered ids 0..1447 (range(1448)). meter_predict
# indexes best_iteration[meter][building_id] directly, so a test building
# outside that range would raise KeyError. The ids actually present in X_test
# are therefore filled as well; the 0..1447 range is kept so existing
# behaviour is a subset of the new one.
building_ids_to_fill = set(range(1448)) | set(X_test['building_id'].unique())
for meter in [0, 1, 2, 3]:
    for i in building_ids_to_fill:
        if i not in best_iteration[meter]:
            best_iteration[meter][i] = 200

In [10]:
# Load the "model_all" variant of the models, which is the one used for the
# test-set predictions. Same naming scheme as the other model file: training
# file stem, seed, num_leaves, learning rate with its dot removed.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files produced by this project.
load_name = '../model/model_all_use_{}_seed{}_leave{}_lr{}.pkl'.format(
    args.train_file.replace('.ftr', ''),
    args.seed,
    args.num_leaves,
    str(args.learning_rate).replace('.', ''),
)
with open(load_name, 'rb') as f:
    models_all = pickle.load(f)

In [11]:
# Test-set predictions per meter type, processed in the order 3, 2, 1, 0.
preds = [
    meter_predict(
        meter_type,
        models_all[meter_type],
        X_test,
        best_iteration,
        iteration_mul=args.iteration_mul,
    )
    for meter_type in tqdm([3, 2, 1, 0])
]

# Stitch the per-meter pieces back together in row-label order.
y_preds = pd.concat(preds).sort_index()

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/145 [00:00<?, ?it/s][A
  1%|          | 1/145 [00:00<00:14,  9.90it/s][A
  2%|▏         | 3/145 [00:00<00:14,  9.82it/s][A
  3%|▎         | 5/145 [00:00<00:13, 10.38it/s][A
  5%|▍         | 7/145 [00:00<00:12, 11.22it/s][A
  6%|▌         | 9/145 [00:00<00:10, 12.71it/s][A
  8%|▊         | 11/145 [00:00<00:09, 13.44it/s][A
  9%|▉         | 13/145 [00:00<00:09, 13.65it/s][A
 10%|█         | 15/145 [00:01<00:09, 13.78it/s][A
 12%|█▏        | 17/145 [00:01<00:08, 14.88it/s][A
 13%|█▎        | 19/145 [00:01<00:08, 14.93it/s][A
 14%|█▍        | 21/145 [00:01<00:07, 15.59it/s][A
 16%|█▌        | 23/145 [00:01<00:08, 14.07it/s][A
 17%|█▋        | 25/145 [00:01<00:08, 14.08it/s][A
 19%|█▊        | 27/145 [00:01<00:08, 13.64it/s][A
 20%|██        | 29/145 [00:02<00:08, 13.78it/s][A
 21%|██▏       | 31/145 [00:02<00:07, 14.47it/s][A
 23%|██▎       | 33/145 [00:02<00:07, 14.85it/s][A
 24%|██▍       | 35/145 [00:02<00:07, 14

 31%|███       | 155/498 [00:10<00:20, 16.74it/s][A
 32%|███▏      | 157/498 [00:10<00:22, 15.00it/s][A
 32%|███▏      | 159/498 [00:11<00:22, 15.18it/s][A
 32%|███▏      | 161/498 [00:11<00:20, 16.26it/s][A
 33%|███▎      | 163/498 [00:11<00:19, 17.04it/s][A
 33%|███▎      | 165/498 [00:11<00:19, 17.48it/s][A
 34%|███▎      | 167/498 [00:11<00:18, 17.95it/s][A
 34%|███▍      | 169/498 [00:11<00:18, 17.57it/s][A
 34%|███▍      | 171/498 [00:11<00:19, 17.07it/s][A
 35%|███▍      | 173/498 [00:11<00:19, 16.38it/s][A
 35%|███▌      | 175/498 [00:11<00:18, 17.24it/s][A
 36%|███▌      | 177/498 [00:12<00:18, 17.80it/s][A
 36%|███▌      | 179/498 [00:12<00:19, 16.06it/s][A
 36%|███▋      | 181/498 [00:12<00:20, 15.36it/s][A
 37%|███▋      | 183/498 [00:12<00:19, 16.01it/s][A
 37%|███▋      | 185/498 [00:12<00:18, 16.57it/s][A
 38%|███▊      | 187/498 [00:12<00:18, 16.52it/s][A
 38%|███▊      | 189/498 [00:12<00:19, 16.08it/s][A
 38%|███▊      | 191/498 [00:12<00:18, 16.42it

 20%|█▉        | 277/1413 [00:20<01:42, 11.13it/s][A
 20%|█▉        | 279/1413 [00:20<01:45, 10.79it/s][A
 20%|█▉        | 281/1413 [00:20<01:47, 10.54it/s][A
 20%|██        | 283/1413 [00:20<01:49, 10.33it/s][A
 20%|██        | 285/1413 [00:20<01:40, 11.20it/s][A
 20%|██        | 287/1413 [00:20<01:38, 11.45it/s][A
 20%|██        | 289/1413 [00:21<01:29, 12.60it/s][A
 21%|██        | 291/1413 [00:21<01:35, 11.74it/s][A
 21%|██        | 293/1413 [00:21<01:29, 12.53it/s][A
 21%|██        | 295/1413 [00:21<01:21, 13.76it/s][A
 21%|██        | 297/1413 [00:21<01:35, 11.74it/s][A
 21%|██        | 299/1413 [00:21<01:39, 11.15it/s][A
 21%|██▏       | 301/1413 [00:22<01:43, 10.71it/s][A
 21%|██▏       | 303/1413 [00:22<01:31, 12.20it/s][A
 22%|██▏       | 305/1413 [00:22<01:34, 11.68it/s][A
 22%|██▏       | 307/1413 [00:22<01:41, 10.90it/s][A
 22%|██▏       | 309/1413 [00:22<01:40, 11.01it/s][A
 22%|██▏       | 311/1413 [00:22<01:34, 11.62it/s][A
 22%|██▏       | 313/1413 [0

 62%|██████▏   | 882/1413 [01:07<00:44, 12.04it/s][A
 63%|██████▎   | 884/1413 [01:07<00:43, 12.17it/s][A
 63%|██████▎   | 886/1413 [01:07<00:44, 11.85it/s][A
 63%|██████▎   | 888/1413 [01:07<00:45, 11.43it/s][A
 63%|██████▎   | 890/1413 [01:08<00:44, 11.74it/s][A
 63%|██████▎   | 892/1413 [01:08<00:39, 13.24it/s][A
 63%|██████▎   | 894/1413 [01:08<00:38, 13.49it/s][A
 63%|██████▎   | 896/1413 [01:08<00:35, 14.39it/s][A
 64%|██████▎   | 898/1413 [01:08<00:34, 15.09it/s][A
 64%|██████▎   | 900/1413 [01:08<00:35, 14.29it/s][A
 64%|██████▍   | 902/1413 [01:08<00:35, 14.34it/s][A
 64%|██████▍   | 904/1413 [01:08<00:35, 14.28it/s][A
 64%|██████▍   | 906/1413 [01:09<00:38, 13.07it/s][A
 64%|██████▍   | 908/1413 [01:09<00:38, 13.05it/s][A
 64%|██████▍   | 910/1413 [01:09<00:36, 13.68it/s][A
 65%|██████▍   | 912/1413 [01:09<00:39, 12.85it/s][A
 65%|██████▍   | 914/1413 [01:09<00:38, 13.13it/s][A
 65%|██████▍   | 916/1413 [01:09<00:41, 11.99it/s][A
 65%|██████▍   | 918/1413 [0

In [12]:
# lgb.plot_importance(models_all[0], importance_type='gain', figsize=(10,20))
# lgb.plot_importance(models_all[0], importance_type='split', figsize=(10,20))

In [13]:
# Fill the sample submission with the predictions mapped back through expm1,
# then replace any negative reading with 0.
# NOTE(review): the assignment aligns y_preds to the submission by index
# label -- presumably both share the same row labels; rows without a matching
# prediction would become NaN. Confirm.
submission = pd.read_csv('../input/sample_submission.csv')
submission['meter_reading'] = np.expm1(y_preds)
below_zero = submission['meter_reading'] < 0
submission.loc[below_zero, 'meter_reading'] = 0

In [14]:
# Write the submission. The file name records the training file stem, seed,
# num_leaves, and the learning rate and iteration multiplier with their dots
# removed.
save_name = '../output/use_{}_seed{}_leave{}_lr{}_mul{}.csv'.format(
    args.train_file.replace('.ftr', ''),
    args.seed,
    args.num_leaves,
    str(args.learning_rate).replace('.', ''),
    str(args.iteration_mul).replace('.', ''),
)
submission.to_csv(save_name, index=False)

In [16]:
# Preview the first five rows of the written submission.
submission.head(n=5)

Unnamed: 0,row_id,meter_reading
0,0,185.95355
1,1,91.564835
2,2,10.186876
3,3,290.553136
4,4,1379.153921
