In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_log_error
import os


In [None]:
# List every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Directory holding the competition CSVs; the trailing slash lets file names
# be appended directly.
dirc = "/kaggle/input/ashrae-energy-prediction/"

# Meter readings, hourly weather observations and per-building metadata.
train = pd.read_csv(f"{dirc}train.csv")
weather_train = pd.read_csv(f"{dirc}weather_train.csv")
df_building = pd.read_csv(f"{dirc}building_metadata.csv")


In [None]:
# Report (rows, columns) for each of the three loaded tables.
print(
    'Train data shape:', train.shape,
    '\nTrain weather data shape:', weather_train.shape,
    '\nBuilding data shape:', df_building.shape,
)

### train.csv
- `building_id` - Foreign key for the building metadata.
- `meter` - The meter id code. Read as <span style='color:blue;background:yellow'>`{0: electricity, 1: chilledwater, 2: steam, 3: hotwater}`</span>. Not every building has all meter types.
- `timestamp` - When the measurement was taken
- `meter_reading` - The target variable. Energy consumption in kWh (or equivalent). Note that this is real data with measurement error, which we expect will impose a baseline level of modeling error.
### building_metadata.csv
- `site_id` - Foreign key for the weather files.
- `building_id` - Foreign key for train.csv
- `primary_use` - Indicator of the primary category of activities for the building based on EnergyStar property type definitions
- `square_feet` - Gross floor area of the building
- `year_built` - Year building was opened
- `floor_count` - Number of floors of the building
### weather_[train/test].csv
- Weather data from a meteorological station as close as possible to the site.

- `site_id`
- `air_temperature` - Degrees Celsius
- `cloud_coverage` - Portion of the sky covered in clouds, in oktas
- `dew_temperature` - Degrees Celsius
- `precip_depth_1_hr` - Millimeters
- `sea_level_pressure` - Millibar/hectopascals
- `wind_direction` - Compass direction (0-360)
- `wind_speed` - Meters per second

### test.csv
- The submission files use row numbers for ID codes in order to save space on the file uploads. test.csv has no feature data; it exists so you can get your predictions into the correct order.

- `row_id` - Row id for your submission file
- `building_id` - Building id code
- `meter` - The meter id code
- `timestamp` - Timestamps for the test data period
### sample_submission.csv
- A valid sample submission.

- All floats in the solution file were truncated to four decimal places; we recommend you do the same to save space on your file upload.

There are gaps in some of the meter readings for both the train and test sets. Gaps in the test set are not revealed or scored.

## Train Data

In [None]:
# Preview the first rows of the meter-reading table.
train.head()

In [None]:
# Distinct meter type codes present in the training data.
train['meter'].unique()

In [None]:
# Share of training rows per meter type, in percent.
# `normalize=True` is passed by keyword: the previous positional string
# 'percent' only worked because any non-empty string is truthy.
meter_share = train['meter'].value_counts(normalize=True) * 100

# Compute once, then both plot and print the same series.
meter_share.plot(kind='bar')
print(meter_share)

- 59.66% of the rows in the training data come from type 0 (electricity) meters.
- Type 3 (hotwater) meters have the fewest rows.

In [None]:
# Distribution of meter readings for each meter type.
fig, ax = plt.subplots(figsize=(12,9))
sns.boxplot(data=train, x='meter', y='meter_reading', ax=ax)
ax.set_xlabel('Meter type', fontsize=19, color='blue')
ax.set_ylabel('Meter Reading', fontsize=19, color='blue')

- Meter type 2 (steam) has the highest meter readings.

## Building data

In [None]:
# Preview the first rows of the building metadata table.
df_building.head()

In [None]:
# Percentage of missing values in each building-metadata column.
n_buildings = len(df_building)
null_counts = df_building.isna().sum(axis=0)
null_counts / n_buildings * 100

- 53.41% of the values in the `year_built` column and 75.50% of the values in the `floor_count` column are missing.

In [None]:
# Percentage of buildings per primary use, smallest share first so the largest
# bar ends up at the top of the horizontal chart.
# `normalize=True` is passed by keyword (the old positional 'percent' string
# only worked because it is truthy), and the sort returns a new series rather
# than mutating in place.
data = (
    df_building['primary_use'].value_counts(normalize=True) * 100
).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(12,9))
# Draw on the axes created above instead of relying on pyplot's current axes.
data.plot(kind='barh', ax=ax)

# Annotate each bar with its percentage; the small vertical offset roughly
# centres the label on the bar.
for i, j in enumerate(data.values):
    ax.text(j, i-0.25, str(round(j,2))+ '%', fontsize=16, color='k')

plt.yticks(fontsize=17, color='b')
fig.set_facecolor('yellow')
ax.set_facecolor('pink')

- About 57% of the buildings are used for education or office purposes.

In [None]:
# Summary statistics for the numeric building-metadata columns.
df_building.describe()

In [None]:
# Floor-area distribution for each primary-use category.
fig, ax = plt.subplots(figsize=(12,9))
sns.boxplot(data=df_building, x='primary_use', y='square_feet', ax=ax)
# Category names are long, so turn the x tick labels vertical.
plt.xticks(rotation=90)

- Parking has the highest median floor area.

## Train weather data

In [None]:
# Preview the first rows of the training weather table.
weather_train.head()

In [None]:
# Percentage of missing values in each weather column.
n_weather_rows = len(weather_train)
weather_null_counts = weather_train.isna().sum(axis=0)
weather_null_counts / n_weather_rows * 100

- A significant share of the values in the `cloud_coverage` and `precip_depth_1_hr` columns is missing.


# Merge data
### 1. Merge train and building data on `building_id`
### 2. Merge the result of step 1 with train weather data on `site_id` and `timestamp`



In [None]:
# Attach building metadata to every meter reading; a left join keeps all
# readings even if a building has no metadata row.
train = train.merge(df_building, how='left', on='building_id')

In [None]:
# Parse the timestamp strings once into a helper datetime column.
weather_train['timestamp1'] = pd.to_datetime(weather_train.timestamp)

# Derive calendar features with the vectorised `.dt` accessor instead of a
# Python-level lambda per row; the values are the same, stored as uint8.
weather_train['month'] = weather_train['timestamp1'].dt.month.astype(np.uint8)
weather_train['dom'] = weather_train['timestamp1'].dt.day.astype(np.uint8)      # day of month
weather_train['dow'] = weather_train['timestamp1'].dt.weekday.astype(np.uint8)  # day of week, Monday == 0
weather_train['hour'] = weather_train['timestamp1'].dt.hour.astype(np.uint8)

In [None]:
# The parsed helper column is no longer needed once the calendar features exist.
weather_train = weather_train.drop(columns='timestamp1')

In [None]:
# Attach the weather observation for the reading's site and timestamp;
# readings without a matching observation keep NaN weather values.
train = train.merge(weather_train, how='left', on=['site_id', 'timestamp'])

# Free the standalone weather frame now that it lives inside `train`.
del weather_train

In [None]:
# Bokeh provides the interactive, tabbed charts used in the cells below.
# NOTE(review): `Panel` appears to have been renamed `TabPanel` in Bokeh 3.x -
# confirm against the installed Bokeh version.
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# Route Bokeh output to the notebook.
output_notebook()

# Bar colour used for each weather column.
color_map = dict(
    air_temperature="yellow",
    dew_temperature="brown",
    sea_level_pressure="green",
    wind_speed="red",
    cloud_coverage="blue",
)

# Human-readable tab title for each weather column.
col_map = dict(
    air_temperature="Air Temperature",
    dew_temperature="Dew Temperature",
    sea_level_pressure="Sea Level Pressure",
    wind_speed="Wind Speed",
    cloud_coverage="Cloud Coverage",
)


In [None]:
def get_bar_plot_by_site(df, By):
    """Show tabbed Bokeh bar charts of mean weather values per site.

    One outer tab is built per weather column. Inside each outer tab there is
    one inner tab per site id (0-15) holding a bar chart of that column's mean
    grouped by ``By``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``site_id``, the ``By`` column and the five weather
        columns listed in ``cols`` below.
    By : str
        Name of the column to group by on the x axis (e.g. ``'month'``).

    Returns
    -------
    None
        The tab layout is rendered with ``show``.

    Notes
    -----
    Reads the module-level ``color_map`` and ``col_map`` dictionaries and the
    Bokeh names ``figure``, ``Panel``, ``Tabs`` and ``show``.
    """
    def get_plots(data, col, color, By):
        """Build one bar chart of ``data[col]`` against ``data[By]``."""
        # NOTE(review): Bokeh 3.x appears to have replaced
        # `plot_width`/`plot_height` with `width`/`height` - confirm against
        # the installed version before upgrading.
        p = figure(plot_width=1000, plot_height=350, title=f"Mean of {col} by {By}")
        p.vbar(data[By], top=data[col], color=color, width=0.5)
        return p

    main_tabs_list = []
    cols = ["air_temperature","dew_temperature", "sea_level_pressure", "wind_speed", "cloud_coverage"]

    # Aggregate once for all sites and columns instead of filtering and
    # grouping the full frame again for every (column, site) pair.
    # `.mean()` replaces `.agg({col: 'mean'})`: that dict form on a single
    # selected column is a "nested renamer", which pandas >= 1.0 rejects with
    # SpecificationError.
    means = df.groupby(['site_id', By])[cols].mean().reset_index()

    for col in cols:
        tab_list = []
        for site in range(16):
            temp = means.loc[means["site_id"] == site, ['site_id', By, col]]
            p = get_plots(temp, col, color_map[col], By)
            tab = Panel(child=p, title=f"Site:{site}")
            tab_list.append(tab)
        # Inner tabs (one per site) are nested in an outer tab for the column.
        tabs = Tabs(tabs=tab_list)
        panel = Panel(child=tabs, title=col_map[col])
        main_tabs_list.append(panel)

    tabs = Tabs(tabs=main_tabs_list)
    show(tabs)

In [None]:
# Mean weather values per site, grouped by calendar month.
get_bar_plot_by_site(df=train, By='month')

In [None]:
# Mean weather values per site, grouped by day of month.
get_bar_plot_by_site(df=train, By='dom')

# <center><span style='color:red;background:yellow'>  Stay tuned! More coming up </span></center>
#  <center><span style='color:red;background:yellow'>  ||-------------------------------If you like it, please upvote -------------------------------||</span></center>