In [29]:
import numpy as np, pandas as pd, os, copy, time, joblib, datetime
# Print the full path of every file found under the Kaggle input mount, so the
# dataset paths configured in later cells can be checked against what is attached.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)
# Any results you write to the current directory are saved as output.

/kaggle/input/ashraeb/submission_b.csv
/kaggle/input/ashrae-train-test-1-0/__results__.html
/kaggle/input/ashrae-train-test-1-0/custom.css
/kaggle/input/ashrae-train-test-1-0/__notebook__.ipynb
/kaggle/input/ashrae-train-test-1-0/df_test_label_enc.csv.gz
/kaggle/input/ashrae-train-test-1-0/__output__.json
/kaggle/input/ashrae-train-test-1-0/df_label_enc.csv.gz
/kaggle/input/ashraesitemodels/lgbm_7_.pkl
/kaggle/input/ashraesitemodels/lgbm_14_.pkl
/kaggle/input/ashraesitemodels/lgbm_15_.pkl
/kaggle/input/ashraesitemodels/lgbm_13_.pkl
/kaggle/input/ashraesitemodels/lgbm_2_.pkl
/kaggle/input/ashraesitemodels/lgbm_5_.pkl
/kaggle/input/ashraesitemodels/lgbm_0_.pkl
/kaggle/input/ashraesitemodels/lgbm_4_.pkl
/kaggle/input/ashraesitemodels/lgbm_10_.pkl
/kaggle/input/ashraesitemodels/lgbm_6_.pkl
/kaggle/input/ashraesitemodels/lgbm_9_.pkl
/kaggle/input/ashraesitemodels/lgbm_1_.pkl
/kaggle/input/ashraesitemodels/lgbm_3_.pkl
/kaggle/input/ashraesitemodels/lgbm_11_.pkl
/kaggle/input/ashraesitemodels

In [30]:
# --- Compression ---------------------------------------------------------
# Compression passed to DataFrame.to_csv when the submission files are written.
output_compression_type = 'gzip'
# Not referenced by any cell visible in this notebook; kept for compatibility.
compression_type = None

# --- Folders -------------------------------------------------------------
data_folder = '../input/'   # root of the attached datasets (trailing slash included)
output_folder = ''          # empty string: the current working directory

# --- Input locations (all relative to data_folder) -----------------------
# Label-encoded training table (only its target column is read later).
df_path = os.path.join(data_folder, 'ashrae-train-test-1-0/df_label_enc.csv.gz')
# Label-encoded test table that the per-site models score.
df_test_path = os.path.join(data_folder, 'ashrae-train-test-1-0/df_test_label_enc.csv.gz')
# Folder prefix of the per-site model pickles (lgbm_<site>_.pkl).
model_path = os.path.join(data_folder, 'ashraesitemodels/')

In [31]:
# --- Run switches ----------------------------------------------------------
debug = True             # True caps every read_csv at a small number of rows (see `rows`)
generate_output = True   # True writes the submission CSV files in the last cell
set_a = True             # True scores sites 0-7, False scores sites 8-15

# Each run covers one half of the sites; the other half's submission is the
# "existing" file.
if set_a:
    sites = list(range(0, 8))
    output_file_name, existing_file_name = 'a_submission.csv', 'b_submission.csv'
else:
    sites = list(range(8, 16))
    output_file_name, existing_file_name = 'b_submission.csv', 'a_submission.csv'

# NOTE(review): the input listing printed by the first cell shows
# 'ashraeb/submission_b.csv', which does not match the '<set>_submission.csv'
# pattern built here -- confirm the file name before re-enabling the read of
# existing_file_path.
existing_file_path = data_folder + 'ashraeb/' + existing_file_name

# Written files get a '.gz' suffix only when gzip compression is selected.
gz_suffix = '.gz' if output_compression_type == 'gzip' else ''
output_file_name += gz_suffix
output_existing_file_name = existing_file_name + gz_suffix

# Row cap handed to pd.read_csv(nrows=...); None reads the complete file.
rows = 1000 if debug else None

In [32]:
# Column schema, one tuple per column:
#   (column name, dtype string used when reading the CSVs, 1 if the column is a model feature else 0)
# NOTE(review): 'meter_reading' is read as float16, which cannot represent
# values above 65504 -- presumably why +inf rows are dropped after loading; confirm.
_column_specs = [
    ('building_id',        'int16',   1),
    ('meter',              'int8',    1),
    ('site_id',            'int8',    1),
    ('primary_use',        'int8',    1),
    ('year_built',         'int16',   1),
    ('floor_count',        'int8',    1),
    ('air_temperature',    'float16', 1),
    ('cloud_coverage',     'float16', 1),
    ('dew_temperature',    'float16', 1),
    ('precip_depth_1_hr',  'float16', 1),
    ('sea_level_pressure', 'float16', 1),
    ('wind_direction',     'float16', 1),
    ('wind_speed',         'float16', 1),
    ('meter_reading',      'float16', 0),   # the target, not a feature
    ('square_feet',        'float64', 1),
    ('month',              'int8',    1),
    ('day',                'int8',    1),
    ('hour',               'int8',    1),
    ('day_of_week',        'int8',    1),
    ('weekend',            'int8',    1),
    ('night',              'int8',    1),
]

# Same list-of-dicts layout the rest of the notebook reads from.
dtypes = [
    {'col_name': name, 'data_type': data_type, 'feature_col': is_feature}
    for name, data_type, is_feature in _column_specs
]

# Column name -> dtype mapping in the form accepted by pd.read_csv(dtype=...).
dtype = dict((spec['col_name'], spec['data_type']) for spec in dtypes)

In [33]:
# Load only the target column of the training table; it is used later to get
# the observed min/max of meter_reading. `rows` limits the read in debug mode.
df = pd.read_csv(df_path, usecols=['meter_reading'], dtype=dtype, nrows=rows)

# Remove rows whose target is +inf so they cannot become the maximum.
# (-inf and NaN rows are left in place, exactly as before.)
is_pos_inf = df['meter_reading'] == np.inf
df.drop(index=df.index[is_pos_inf], inplace=True)

In [34]:
# Load the test table with the shared dtype mapping; 'timestamp' is parsed
# to datetime and `rows` limits the read in debug mode.
df_test = pd.read_csv(
    df_test_path,
    nrows=rows,
    parse_dates=['timestamp'],
    dtype=dtype,
)

In [35]:
# df_existing_file = pd.read_csv(existing_file_path, compression='gzip')

In [36]:
# Name of the column being predicted.
target_col = 'meter_reading'
# Every schema entry flagged feature_col == 1, in schema order.
feature_cols = [spec['col_name'] for spec in dtypes if spec['feature_col'] == 1]

# Smallest and largest target values seen in the loaded training rows;
# predictions are clipped to this range further down.
target_values = df[target_col]
min_val, max_val = target_values.min(), target_values.max()

In [37]:
# Score the test set one site at a time with that site's saved model.
#
# Produces:
#   outputs -- DataFrame with columns 'row_id' and 'meter_reading', holding the
#              clipped predictions of every site in `sites` that has test rows
#              (an empty frame with those two columns when no site has rows).
#   y_pred  -- the 'meter_reading' column of `outputs`.
#
# Per-site timings are printed in minutes.

# Model inputs: every feature column except site_id (each model covers one site).
cols = [col for col in feature_cols if col != 'site_id']

# Per-site prediction frames are collected here and concatenated once after the
# loop. DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
site_outputs = []

for site in sites:
    t_start = time.perf_counter()
    model_name = model_path + 'lgbm_' + str(site) + '_.pkl'
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only point model_path at model files from a trusted source.
    model = joblib.load(model_name)
    print('Site [{site_no}] start...'.format(site_no=site))
    print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))

    t_df_prep = time.perf_counter()
    # Compute the row mask once and reuse it for the features and the row ids.
    site_mask = df_test['site_id'] == site
    df_test_site = df_test.loc[site_mask, cols]
    row_id_site = df_test.loc[site_mask, 'row_id']
    print('{label:<30s}: {value:,}'.format(
        label='Number of rows',
        value=len(df_test_site.index)))

    print('{label:<30s}: {value:.2f}'.format(
        label='Test set prep time',
        value=(time.perf_counter()-t_df_prep)/60))

    t_predict = time.perf_counter()
    # Sites without test rows (common when debug caps the row count) are skipped.
    if len(df_test_site.index) > 0:
        pred_site = model.predict(df_test_site)
        # Keep predictions inside the target range observed in the training data.
        pred_site = np.clip(pred_site, min_val, max_val)
        output_site = pd.DataFrame({'row_id': row_id_site, 'meter_reading': pred_site})
        site_outputs.append(output_site)
    print('{label:<30s}: {value:.2f}'.format(
        label='Predict time',
        value=(time.perf_counter()-t_predict)/60))

    t_stop = time.perf_counter()
    print('{label:<30s}: {value:.2f}'.format(
        label='Elapsed time in minutes',
        value=(t_stop-t_start)/60))
    print('{dt:%Y-%m-%d %H-%M-%S}'.format(dt=datetime.datetime.now()))
    print('Site [{site_no}] end!\n'.format(site_no=site))

if site_outputs:
    outputs = pd.concat(site_outputs, ignore_index=True)
else:
    # No site had any test rows: still expose the expected columns so the
    # target-column lookup below and the CSV export keep working.
    outputs = pd.DataFrame(columns=['row_id', 'meter_reading'])
y_pred = outputs[target_col]

Site [0] start...
2019-12-18 02-19-58
Number of rows                : 1,000
Test set prep time            : 0.00
Predict time                  : 0.02
Elapsed time in minutes       : 0.04
2019-12-18 02-19-59
Site [0] end!

Site [1] start...
2019-12-18 02-20-00
Number of rows                : 0
Test set prep time            : 0.00
Predict time                  : 0.00
Elapsed time in minutes       : 0.03
2019-12-18 02-20-00
Site [1] end!

Site [2] start...
2019-12-18 02-20-02
Number of rows                : 0
Test set prep time            : 0.00
Predict time                  : 0.00
Elapsed time in minutes       : 0.03
2019-12-18 02-20-02
Site [2] end!

Site [3] start...
2019-12-18 02-20-03
Number of rows                : 0
Test set prep time            : 0.00
Predict time                  : 0.00
Elapsed time in minutes       : 0.03
2019-12-18 02-20-03
Site [3] end!

Site [4] start...
2019-12-18 02-20-05
Number of rows                : 0
Test set prep time            : 0.00
Predict time   

In [38]:
if generate_output:
    # Write the prediction frame under this run's own file name and also under
    # the "existing" (other set's) file name.
    # NOTE(review): both files receive the same `outputs` frame. The disabled
    # original wrote df_existing_file to output_existing_file_name instead --
    # confirm whether the duplicate is a temporary stand-in.
    for csv_name in (output_file_name, output_existing_file_name):
        outputs.to_csv(csv_name, compression=output_compression_type, index=False)

NotADirectoryError: [Errno 20] Not a directory: 'submission_a.csv.gz/submission_a.csv.gz'