In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, gc, warnings
import random
import datetime

from tqdm.notebook import tqdm
# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# from plotly.offline import init_notebook_mode, iplot
# init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.offline as offline
# offline.init_notebook_mode()

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn

import lightgbm as lgb

import pickle

warnings.filterwarnings('ignore')

In [None]:
# Folder that holds every input table used by this notebook.
path = '../input/work-df/'
# Walk the input folder recursively and print the full path of each file found.
for root, _subdirs, files in os.walk(path):
    for name in files:
        print(os.path.join(root, name))

# Load data and display samples

In [None]:
# unimportant features (see importance below); load_data() skips these columns
# when reading the weather table
unimportant_cols = ['wind_direction', 'wind_speed', 'sea_level_pressure']
# name of the column the models are trained to predict
target = 'meter_reading'

def load_data(source='train', path=path):
    '''Read the building, weather and meter-reading tables of one split and join them.

    source: 'train' or 'test'; selects `weather_{source}_filled.csv` and `{source}.csv`.
    path:   folder containing the CSV files.

    Returns the readings table left-joined with the building metadata (on
    building_id) and then with the weather table (on site_id and timestamp).
    Weather columns listed in `unimportant_cols` are not loaded.
    '''
    assert source in ['train', 'test']

    # Explicit small dtypes for the id and weather columns.
    building_dtypes = {'building_id':np.uint16, 'site_id':np.uint8}
    weather_dtypes = {'site_id':np.uint8, 'air_temperature':np.float16,
                      'cloud_coverage':np.float16, 'dew_temperature':np.float16,
                      'precip_depth_1_hr':np.float16}
    reading_dtypes = {'building_id':np.uint16, 'meter':np.uint8}

    def keep_column(name):
        # Column filter for the weather table.
        return name not in unimportant_cols

    building = pd.read_csv(f'{path}/building_metadata.csv', dtype=building_dtypes)
    weather = pd.read_csv(f'{path}/weather_{source}_filled.csv',
                          parse_dates=['timestamp'],
                          dtype=weather_dtypes,
                          usecols=keep_column)
    merged = pd.read_csv(f'{path}/{source}.csv', dtype=reading_dtypes, parse_dates=['timestamp'])

    # Left joins keep every reading row even when metadata or weather is missing.
    merged = merged.merge(building, on='building_id', how='left')
    merged = merged.merge(weather, on=['site_id', 'timestamp'], how='left')
    return merged

In [None]:
%%time
# Load and merge the training tables, then show 7 random rows as a sanity check.
train = load_data('train')
train.sample(7)

In [None]:
%%time
# Load and merge the test tables the same way, then show 7 random rows.
test = load_data('test')
test.sample(7)

In [None]:
# Report the first and last timestamp present in the training readings.
print(f'Training from {train.timestamp.min()} to {train.timestamp.max()}')

# Remove visual outliers.

In [None]:
# One scatter plot per meter type: meter_reading (x) against air_temperature (y).
meter_arr = train["meter"].unique()
for meter in meter_arr:
    mask = train["meter"] == meter
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.scatterplot(data=train[mask], x="meter_reading", y="air_temperature", ax=ax)
    # The x label is replaced by the meter id so each figure can be told apart.
    ax.set_xlabel("Meter: {}".format(meter))
    plt.show()

# Removing meter_reading outliers for the different meters

* meter 0 (electricity): readings above 8000 are outliers.
* meter 3 (hot water): readings above 140000 are outliers.

In [None]:
# Replace implausibly large electricity (meter 0) readings with the mean of
# all meter-0 readings.
# NOTE(review): the markdown above states a threshold of 8000 for meter 0 while
# this cell uses 40000 — confirm which value is intended.
mask1 = train["meter"] == 0
mask2 = train["meter_reading"] > 40000
mask = mask1 & mask2
print(train.shape)
print(f"meter 0 readings replaced: {int(mask.sum())}")
# .loc writes into `train` itself. The previous chained form
# train[mask]["meter_reading"] = ... assigned to a temporary copy, so the
# outliers were never actually replaced.
train.loc[mask, "meter_reading"] = train.loc[mask1, "meter_reading"].mean()
print(train.shape)

In [None]:
# Replace implausibly large meter-3 readings (> 140000) with the mean of all
# meter-3 readings.
mask1 = train["meter"] == 3
mask2 = train["meter_reading"] > 140000
mask = mask1 & mask2
print(train.shape)
print(f"meter 3 readings replaced: {int(mask.sum())}")
# .loc writes into `train` itself. The previous chained form
# train[mask]["meter_reading"] = ... assigned to a temporary copy, so the
# outliers were never actually replaced.
train.loc[mask, "meter_reading"] = train.loc[mask1, "meter_reading"].mean()
print(train.shape)

### Remove building_id = [1099, 1088]. They are outliers.

In [None]:
# Overwrite every reading of building 778 with the constant 1.
# NOTE(review): the markdown heading above lists building_id 1099 and 1088,
# while this cell targets 778 — confirm which building is intended.
mask = train["building_id"] == 778
# .loc is required here: train[mask]["meter_reading"] = 1 only modified a
# temporary copy and left `train` unchanged.
train.loc[mask, "meter_reading"] = 1

In [None]:
# Overwrite every reading of building 1088 with the constant 1.
mask = train["building_id"] == 1088
# .loc is required here: train[mask]["meter_reading"] = 1 only modified a
# temporary copy and left `train` unchanged.
train.loc[mask, "meter_reading"] = 1

In [None]:
# Used on our train and validation data: rescales the meter 0 readings.
def correct_error_meter0_model_use(df):
    """Multiply the meter_reading of every meter-0 row by 0.2931.

    The frame is modified in place and the same object is returned; rows of
    other meters are left untouched.
    NOTE(review): 0.2931 looks like a kBTU -> kWh conversion factor — confirm.
    """
    is_meter0 = df.meter == 0
    rescaled = df["meter_reading"] * 0.2931
    # Assignment aligns on the index, so only the meter-0 rows pick up the rescaled values.
    df.loc[is_meter0, "meter_reading"] = rescaled
    return df

In [None]:
# train = correct_error_meter0_model_use(train)

In [None]:
# histogram of the target; only the count (y) axis is switched to a log scale
ax = train.meter_reading.hist()
ax.set_yscale('log')

# summary statistics of the raw (untransformed) target values
train.meter_reading.describe()

In [None]:
# count how many distinct meter types each building has and plot the
# distribution of that count across buildings
meters = train.groupby('building_id').meter.nunique()
plt.title('Distribution of types of meters\n{0:electricity, 1:water, 2:steam, 3:hotwater}') # from the official starter kernel
_ = meters.hist()
# from the graphs it looks like steam and hotwater are reversed (e.g.: 3:steam, 2:hotwater) but that shouldn't make any difference to the model

## display a single time series (notice measurement errors and discontinuities)

In [None]:
building_id = 1258  # a building with all 4 meters

# Select the building's rows once; the mask does not change inside the loop.
building_rows = train.loc[train['building_id'] == building_id]
# Iterate over the meter ids that actually exist for this building instead of
# range(nunique), which silently assumes the ids are exactly 0..n-1.
meter_ids = sorted(int(m) for m in building_rows['meter'].unique())
meters = len(meter_ids)

building_dates = matplotlib.dates.date2num(building_rows['timestamp'])

for meter in meter_ids:
    fig, ax = plt.subplots()
    plt.title(f'Building {building_id} Meter {meter}')
    ax2 = ax.twinx()  # second y axis so readings and temperature keep their own scales
    # plot meter_reading
    meter_rows = building_rows[building_rows['meter'] == meter]
    dates = matplotlib.dates.date2num(meter_rows['timestamp'])
    ax2.plot_date(dates, meter_rows['meter_reading'], '-', label='meter_reading', alpha=0.8)
    # plot air_temperature (all rows of the building, as before)
    ax.plot_date(building_dates, building_rows['air_temperature'], '.', color='tab:cyan', label='air_temperature')
    ax.set_ylabel('air_temperature'); ax2.set_ylabel('meter_reading')
    ax.legend(loc='upper left'); ax2.legend(loc='upper right')

## now let's see what's the expected prediction in the test set for the same building

In [None]:
# Plot one meter of the building chosen in the previous cell (`building_id`)
# over the training period and over the test period on a shared time axis.
meter = 1 # pick a meter

train_sample = train[(train['building_id'] == building_id) & (train['meter'] == meter)]  # same train sample as above

# The test frame gets a placeholder target column (all zeros) so it can be
# plotted and later preprocessed like the training frame.
test['meter_reading'] = 0.0
test_sample = test[(test['building_id'] == building_id) & (test['meter'] == meter)]  # and the same meter in the test set

fig, ax = plt.subplots(figsize=(16,4))
plt.title(f'Meter {meter}')
ax.xaxis.set_tick_params(rotation=30, labelsize=10)
# second y axis: temperature on `ax`, meter readings on `ax2`
ax2 = ax.twinx()

# plot training sample
dates = matplotlib.dates.date2num(train_sample['timestamp'])
ax2.plot_date(dates, train_sample['meter_reading'], '-', label='train', alpha=0.8)
ax.plot_date(dates, train_sample['air_temperature'], '.', color='tab:cyan', label='air_temperature_train')

# plot test sample
dates = matplotlib.dates.date2num(test_sample['timestamp'])
ax2.plot_date(dates, test_sample['meter_reading'], '*', label='test', alpha=0.8)
ax.plot_date(dates, test_sample['air_temperature'], '.', color='tab:cyan', label='air_temperature_test')

ax.set_ylabel('air_temperature'); ax2.set_ylabel('meter_reading')
ax.legend(loc='upper left'); ax2.legend(loc='upper right')

# drop the temporary objects created only for this plot
del train_sample; del test_sample; del dates

In [None]:
# the counts above expose the missing data (Should we drop or refill the missing data?)
print("Ratio of available data (not NAN's):")
# per-column fraction of non-null values in train; passed to ASHRAE3Preprocessor
# below, which fills the columns whose ratio is below 1.0
data_ratios = train.count()/len(train)
data_ratios

In [None]:
# Is the same happening in the test set? Yes
print("Ratio of available data (not NAN's):")
# per-column fraction of non-null values in test (displayed only, not stored)
test.count()/len(test)

# Preprocess data
* The missing values in the weather datasets have already been filled in by other scripts.

In [None]:
def ASHRAE3Preprocessor(df, data_ratios):
    """Turn a merged readings frame into the numeric frame used for modelling.

    df:          frame with at least primary_use, timestamp, square_feet,
                 year_built and floor_count columns.
    data_ratios: per-column ratio of non-null values; columns whose ratio is
                 below 1.0 have their NaNs replaced by the column mean.

    Returns a new frame (the NaN fill creates it; `df` itself is not modified) with:
      * primary_use label-encoded as uint8,
      * hour / day / weekday / month / year (year minus 2000) derived from timestamp,
      * square_feet renamed to log_square_feet and replaced by its natural log,
      * year_built stored as an offset from 1900, floor_count as uint8,
      * timestamp and row_id removed,
      * meter_reading, when present, replaced by log1p(meter_reading) as float32.

    NOTE(review): the label encoder is fitted on the frame passed in, so the
    train and test encodings only agree if both frames contain the same set of
    primary_use values — confirm.
    """
    needs_fill = data_ratios < 1.0
    fill_values = df.loc[:, needs_fill].mean()
    encoder = LabelEncoder()  # maps each category to an integer code
    encoder.fit(df["primary_use"])

    out = df.fillna(fill_values)  # NaNs -> column averages
    out['primary_use'] = np.uint8(encoder.transform(out['primary_use']))

    # split the timestamp into small integer components
    stamp = out['timestamp'].dt
    for part in ('hour', 'day', 'weekday', 'month'):
        out[part] = np.uint8(getattr(stamp, part))
    out['year'] = np.uint8(stamp.year-2000)

    # compress the building metadata into smaller numeric types
    out.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
    out['log_square_feet'] = np.float16(np.log(out['log_square_feet']))
    out['year_built'] = np.uint8(out['year_built']-1900)
    out['floor_count'] = np.uint8(out['floor_count'])

    # columns that are no longer needed once the date parts exist
    for redundant in ('timestamp', 'row_id'):
        if redundant in out.columns:
            del out[redundant]

    # the target is modelled in log1p space (the competition metric uses log errors)
    if 'meter_reading' in out.columns:
        out['meter_reading'] = np.log1p(out['meter_reading']).astype(np.float32)

    return out

In [None]:
# Preprocess the training frame, using the train-based data_ratios computed above.
train_transform = ASHRAE3Preprocessor(train, data_ratios)

In [None]:
# Show 7 random rows of the preprocessed training frame.
train_transform.sample(7)

In [None]:
%%time
fig, ax = plt.subplots(figsize=(16,8))
# use a ranked correlation to catch nonlinearities
# Spearman correlation between all columns except 'year', computed on 100100 randomly sampled rows
# (the sample is drawn without a fixed seed, so the heatmap varies slightly between runs)
corr = train_transform[[col for col in train_transform.columns if col != 'year']].sample(100100).corr(method='spearman')
_ = sns.heatmap(corr, annot=True,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)

# Train k folds

In [None]:
# target = meter_reading
# force the model to use the weather data instead of dates, to avoid overfitting to the past history
# (hour and weekday stay in; year, month and day are excluded together with the target)
features = [col for col in train_transform.columns if col not in [target, 'year', 'month', 'day']]

In [None]:
def fit_regressor(tr_idx, val_idx, features_arr, target_str):
    """Train one LightGBM regressor on a single train/validation split.

    Rows are taken from the module-level `train_transform` frame.

    tr_idx, val_idx: positional row indices of the training / validation rows
                     (as produced by a scikit-learn splitter).
    features_arr:    names of the feature columns.
    target_str:      name of the target column.

    Returns the fitted `lgb.LGBMRegressor`; training stops early once the
    validation RMSE has not improved for 50 rounds.
    """
    # Select features AND target by position. The target used to be indexed by
    # label (`series[tr_idx]`), which only matches the positional feature rows
    # while the frame carries a default 0..n-1 index.
    tr_x = train_transform[features_arr].iloc[tr_idx]
    tr_y = train_transform[target_str].iloc[tr_idx]
    # evaluating ("test")
    vl_x = train_transform[features_arr].iloc[val_idx]
    vl_y = train_transform[target_str].iloc[val_idx]
    print({'train_transform size':len(tr_x), 'eval size':len(vl_x)})

    # (the unused lgb.Dataset wrappers were dropped; fit() takes the frames directly)
    clf = lgb.LGBMRegressor(n_estimators=6000,
                            learning_rate=0.28,
                            feature_fraction=0.9,
                            subsample=0.2,  # batches of 20% of the data
                            subsample_freq=1,
                            num_leaves=20,
                            metric='rmse')
    # Metric: Root Mean Square Error (RMSE) on the validation split.
    # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead — confirm against the installed version.
    clf.fit(tr_x, tr_y,
            eval_set=[(vl_x, vl_y)],
            early_stopping_rounds=50,
            verbose=200)
    return clf

In [None]:
folds = 4
seed = 42
# Shuffled, seeded splitter; it is stratified on building_id in the split call below.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed) # Provides train/test indices to split data in train/test sets.
# oof_pred = np.zeros(train_transform.shape[0])  # out of fold predictions
models = []

## generate the train/validation index pairs and train one model per fold; every model is kept in `models`
for tr_idx, val_idx in tqdm(kf.split(train_transform, train_transform['building_id']), total=folds): # train/test indices
    clf = fit_regressor(tr_idx, val_idx, features, target)
    models.append(clf)

gc.collect() # trigger a manual garbage collection pass

# Algorithm ID3 regression
1. Calculate the initial system entropy based on the **objective** variable to predict.
    * Entropy: determines which parameters are more important than others, so that the tree is ordered better.

## Feature importance

In [None]:
# Gain-based feature importance of one of the fold models (models[1]).
_ = lgb.plot_importance(models[1], importance_type='gain', figsize=(16,8))

In [None]:
# calculate the mean feature importance, so that we can update 'unimportant_cols' above
feature_importance = np.mean([m.booster_.feature_importance(importance_type='gain') for m in models], axis=0)
# Pair each importance with the feature name stored in the model itself.
# Zipping with train_transform.columns mislabelled the values, because that
# frame also holds the target and the date columns excluded from training.
sorted(zip(feature_importance, models[0].booster_.feature_name()), reverse=True)

# Checking the RMSE on the training data.

In [None]:
features = [col for col in train_transform.columns if col not in [target, 'year', 'month', 'day']]
# Draw 1,000,000 row positions (with replacement, as before) from the whole
# frame. The previous hard-coded upper bound of 1552000 restricted the sample
# to the first rows of a longer frame and would raise on a shorter one; the
# seeded generator makes the reported RMSE reproducible.
rng = np.random.default_rng(42)
tr_idx = rng.integers(0, len(train_transform), size=1000000)
# Features and target are both selected by position so they stay aligned.
tr_x = train_transform[features].iloc[tr_idx]
tr_y = train_transform[target].iloc[tr_idx]

In [None]:
# Put the true (log1p-transformed) targets next to the predictions of models[1]
# for the sampled training rows.
dic = {"real": tr_y, "prediction": models[1].predict(tr_x)}
p_df = pd.DataFrame(data = dic)

In [None]:
# RMSE between the true and predicted values of the sampled training rows.
actual = p_df["real"].values
predicted = p_df["prediction"].values
# NOTE(review): the import cell only has `import sklearn`; `sklearn.metrics`
# presumably resolves because another sklearn import already loaded it —
# consider importing it explicitly.
mse = sklearn.metrics.mean_squared_error(actual, predicted)
rmse = np.sqrt(mse)
print(rmse)

# Check prediction in the same test sample

In [None]:
# per-column fraction of non-null values in the test set; this overwrites the
# train-based `data_ratios` and is passed to the preprocessor in the next cell
print("Ratio of available data (not NAN's):")
data_ratios = test.count()/len(test)
data_ratios

In [None]:
# pre-process the already loaded test data with the test-based data_ratios,
# then show 7 random rows
test_transform = ASHRAE3Preprocessor(test, data_ratios)
test_transform.sample(7)

In [None]:
# Settings for the prediction plot below: which building/meter to show and
# which columns the models take as input.
folds = 4
meter = 1 # pick a meter
building_id = 1258  # a building with all 4 meters
features = [col for col in train_transform.columns if col not in [target, 'year', 'month', 'day']]

def recover_timestamp(x):
    '''Rebuild a datetime from the year / month / day / hour fields of a row.

    `x.year` is parsed with %y, i.e. as a two-digit year (16 -> 2016).
    '''
    stamp_text = '{}-{}-{} {}'.format(x.year, x.month, x.day, x.hour)
    return datetime.datetime.strptime(stamp_text, '%y-%m-%d %H')

# Plot the training readings of the selected building/meter followed by the
# predictions of every fold model over the test period.
fig, ax = plt.subplots(figsize=(16,4))
plt.title(f'Building {building_id} Meter {meter} on all {folds} prediction folds')
ax.xaxis.set_tick_params(rotation=30, labelsize=10)
# second y axis: temperature on `ax`, readings and predictions on `ax2`
ax2 = ax.twinx()

train_sample = train_transform[(train_transform['building_id'] == building_id) & (train_transform['meter'] == meter)]  # same training sample as before
test_sample = test_transform[(test_transform['building_id'] == building_id) & (test_transform['meter'] == meter)]   # and the same meter in the test set

# plot training sample
# (the timestamp column was removed during preprocessing, so it is rebuilt from the date parts)
dates = matplotlib.dates.date2num(train_sample[['year', 'month', 'day', 'hour']].apply(recover_timestamp, axis=1))
ax2.plot_date(dates, train_sample['meter_reading'], '-', label='train', alpha=0.8)
ax.plot_date(dates, train_sample['air_temperature'], '.', color='tab:cyan', label='air_temperature_train')


# plot prediction sample
dates = matplotlib.dates.date2num(test_sample[['year', 'month', 'day', 'hour']].apply(recover_timestamp, axis=1))
ax.plot_date(dates, test_sample['air_temperature'], '.', color='tab:cyan', label='air_temperature')
for i,model in enumerate(models):
    # predictions are left in log1p space so they share the scale of the transformed training target (np.expm1 would undo it)
    ax2.plot_date(dates, model.predict(test_sample[features]), '-', label=f'prediction{i}', alpha=0.4) #np.expm1()

ax.set_ylabel('air_temperature'); ax2.set_ylabel('meter_reading (+prediction)')
ax.legend(loc='upper left'); ax2.legend(loc='upper right')
_ = plt.show()

# Save/load the models to/from a file

In [None]:
# # save the model to disk
# filename = './models_saved/ASHRAE3_ID3_16082021_3.sav'
# pickle.dump(models, open(filename, 'wb'))

In [None]:
# # load the model from disk
# filename = './models_saved/ASHRAE3_ID3_16082021_3.sav'
# models = pickle.load(open(filename, 'rb'))
# models

# Test inference and submission

In [None]:
# split test data into batches
set_size = len(test_transform)
iterations = 50
# integer division: the assert below stops the notebook if the test set does
# not divide evenly into `iterations` batches
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
assert set_size == iterations * batch_size

In [None]:
# Predict the test set batch by batch and average the fold models' outputs
# (still in log1p space).
meter_reading = []
# Stepping through the row positions also covers a final, shorter batch, so the
# loop no longer depends on set_size being an exact multiple of batch_size.
for pos in tqdm(range(0, set_size, max(batch_size, 1))):
    # Slice the rows first and select the feature columns once per batch; the
    # previous form rebuilt test_transform[features] for every model and batch.
    batch = test_transform.iloc[pos : pos+batch_size][features]
    fold_preds = [model.predict(batch) for model in models]
    meter_reading.extend(np.mean(fold_preds, axis=0)) # mean of all the models' predictions

print(len(meter_reading))
assert len(meter_reading) == set_size

## Save object data

In [None]:
# filename = './models_saved/meter_reading_1.csv'
# out_df = pd.DataFrame(meter_reading)
# out_df.to_csv(filename, index=False)

# Prepare data for submission.

In [None]:
# Used on our validation outcome: rescales the meter 0 readings.
def correct_error_meter0_scoring_use(df):
    """Multiply the meter_reading of every meter-0 row by 3.4118.

    The frame is modified in place and the same object is returned; rows of
    other meters are left untouched.
    NOTE(review): 3.4118 looks like the inverse of the 0.2931 factor used by
    correct_error_meter0_model_use — confirm.
    """
    is_meter0 = df.meter == 0
    rescaled = df["meter_reading"] * 3.4118
    # Assignment aligns on the index, so only the meter-0 rows pick up the rescaled values.
    df.loc[is_meter0, "meter_reading"] = rescaled
    return df

In [None]:
# train = correct_error_meter0_scoring_use(train)

In [None]:
# Undo the log1p applied to the target in ASHRAE3Preprocessor, bringing the
# predictions back to the raw meter_reading scale.
meter_reading_exp = np.expm1(meter_reading) # Calculate exp(x) - 1 for all elements in the array.

In [None]:
# meter_reading_exp = meter_reading_exp * test_transform['square_feet']

In [None]:
# Wrap the back-transformed predictions in a one-column frame for inspection.
dic = {"pred": meter_reading_exp}
df_pred_out = pd.DataFrame(data= dic)

In [None]:
# histogram of the predictions; only the count (y) axis is switched to a log scale
ax = df_pred_out.pred.hist()
ax.set_yscale('log')

# summary statistics of the predicted values
df_pred_out.pred.describe()

# Save submission

In [None]:
# Small demonstration of np.clip with only a lower bound: values below 0
# become 0, everything else is kept (used on the predictions in the next cell).
np.clip([-1, 1,2,3,4], a_min=0, a_max=None)

In [None]:
# Fill the sample submission with the predictions.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as the test rows the predictions were generated from — confirm.
submission = pd.read_csv(f'{path}/sample_submission.csv')
submission['meter_reading'] = np.clip(meter_reading_exp, a_min=0, a_max=None) # clip min at zero

In [None]:
# Write the submission file (without the index column) and preview its first 9 rows.
submission.to_csv('submission.csv', index=False)
submission.head(9)