# ASHRAE ENERGY PREDICTION III:

# From Previous Notebook ...

## - Importing Necessary Libraries

In [None]:
# Importing needed libraries to be used throughout the project

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import lightgbm as lgb
import gc

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## - Loading and Reducing Memory Usage of Data

In [None]:
# Root folder of the competition data; every file path below is derived from it
data_path = "/kaggle/input/ashrae-energy-prediction/"

train_path = f"{data_path}train.csv"
building_path = f"{data_path}building_metadata.csv"
weather_train_path = f"{data_path}weather_train.csv"

In [None]:
# Read the three raw CSV files in the same order as before:
# meter readings, building metadata, weather observations
train_data, building_data, weather_train_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, building_path, weather_train_path)
)

In [None]:
# Converting data into feather format since some dataframes are too large and take a long time to load
# This method is inspired from the kaggle notebook titled: ASHRAE: feather format for fast loading
# Which is found at: https://www.kaggle.com/corochann/ashrae-feather-format-for-fast-loading

# Persist each dataframe next to the notebook in feather format
for frame, feather_file in (
    (train_data, 'train_data.feather'),
    (building_data, 'building_data.feather'),
    (weather_train_data, 'weather_train_data.feather'),
):
    frame.to_feather(feather_file)

In [None]:
# Reload the dataframes from the feather files written above
train_data, building_data, weather_train_data = map(
    pd.read_feather,
    ('train_data.feather', 'building_data.feather', 'weather_train_data.feather'),
)

In [None]:
# This function helps in optimizing the memory used by the dataframes by modifying/altering the datatype of each column.
# This method is inspired from the kaggle notebook titled: load data (reduce memory usage)
# Which is found at: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df, df_name, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    Column handling:
      * object columns are converted to ``category``;
      * signed integer columns are cast to the smallest of int8/16/32/64 whose
        range contains the column's min and max (bounds inclusive);
      * unsigned integer columns are cast to the smallest of uint8/16/32/64 that
        fits, so they stay integers instead of being turned into floats;
      * float columns are cast to float16 (only if ``allow_float16``) or float32
        when their range fits strictly inside that type, otherwise float64;
      * every other column (bool, datetime, category, pandas extension dtypes)
        is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    df_name : str
        Label used in the printed summary.
    allow_float16 : bool, default True
        When False, float columns are never stored below float32. float16 keeps
        only about three significant decimal digits, so disable it when that
        precision loss matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:

        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns have a meaningful numeric
        # range to downcast; min()/max() comparisons are not valid for the rest.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when nothing smaller fits, which includes
            # all-NaN columns because comparisons against NaN are False.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
        else:
            # target stays None (column unchanged) for an empty column, whose
            # min/max are NaN.
            target = None
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    # Guard against a frame that reports zero bytes of memory
    reduction_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0

    print('Memory usage of {} is reduced by {:.2f} %. Usage dropped from {:.2f} MB to {:.2f} MB.'.format(df_name, reduction_pct, start_mem, end_mem))

    return df


In [None]:
# Reducing Memory Usage of Data

# Shrink each raw dataframe in turn; the label is only used in the printed report
train_data, building_data, weather_train_data = (
    reduce_mem_usage(frame, label)
    for frame, label in (
        (train_data, 'Train Data'),
        (building_data, 'Building Data'),
        (weather_train_data, 'Weather Train Data'),
    )
)

In [None]:
# Merging Data

# Left-join building metadata onto the meter readings, then the weather
# observations for the matching site and timestamp
train = (
    train_data
    .merge(building_data, how='left', on='building_id')
    .merge(weather_train_data, how='left', on=['site_id', 'timestamp'])
)

In [None]:
# Breaking Timestamp into Hour, Day, Month, Year

# This function firstly converts timestamp to date and then breaks down date into 6 new columns: hour, day, dayOfWeek, dayOfYear, month and year

def breakdown_timestamp(dataframe):
    """Parse ``timestamp`` and append calendar columns derived from it.

    The ``timestamp`` column is converted with ``pd.to_datetime`` and the
    columns hour, day, dayofweek, dayofyear, month and year are added, in that
    order. The frame is modified in place and also returned.
    """
    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'])

    # (new column / datetime attribute, NumPy type used to store it)
    calendar_parts = (
        ('hour', np.uint8),
        ('day', np.uint16),
        ('dayofweek', np.uint8),
        ('dayofyear', np.uint16),
        ('month', np.uint8),
        ('year', np.uint16),
    )

    for part, caster in calendar_parts:
        dataframe[part] = caster(getattr(dataframe['timestamp'].dt, part))

    return dataframe

In [None]:
# Parse the timestamp column and append the calendar feature columns
train = breakdown_timestamp(dataframe=train)

## - Applying Log Transformation to 'Meter Reading' and 'Square Feet'

In [None]:
# Replace the raw meter reading with log(1 + reading)
train['meter_reading'] = train['meter_reading'].pipe(np.log1p)

In [None]:
# Replace the raw floor area with log(1 + area)
train['square_feet'] = train['square_feet'].pipe(np.log1p)

# 5. Feature Engineering

**Preprocessing and Data Preparation Steps:**
1. Removing rows with zero meter readings as they may indicate an inaccurate or unavailable reading.
2. Setting a threshold limit to drop columns with more than 50 % missing values.
3. Filling in missing values using column median.
4. Adding new features.
5. Encoding categorical data.

## 5.1 Removing Zero Meter Readings

In [None]:
# Collect the index labels of rows whose meter reading is exactly zero,
# then remove those rows from the training frame
zero_meter_readings = train.index[train['meter_reading'] == 0].tolist()
train.drop(index=zero_meter_readings, inplace=True)

In [None]:
# Report (rows, columns) after removing the zero readings
print(f"New Shape of Train Data: {train.shape}")

## 5.2 Dropping Columns with More than 50 % Missing Values

In [None]:
# A column is kept only if at least half of its rows are non-null
threshold = 0.5 * len(train)
train.dropna(axis='columns', thresh=threshold, inplace=True)

In [None]:
# Report (rows, columns) after dropping the mostly-empty columns
print(f"New Shape of Train Data: {train.shape}")

## 5.3 Filling Missing Values

In [None]:
# Fill the gaps in each weather column with that column's median.
#
# The filled column is assigned back to `train` instead of calling
# fillna(..., inplace=True) on `train[column]`: that form mutates a temporary
# Series, and under pandas Copy-on-Write it leaves `train` unchanged. The
# warning pandas emits about it would be hidden by the notebook-wide
# warnings.filterwarnings('ignore').
weather_columns_to_fill = [
    'cloud_coverage',
    'sea_level_pressure',
    'precip_depth_1_hr',
    'wind_direction',
    'wind_speed',
    'dew_temperature',
    'air_temperature',
]

for column in weather_columns_to_fill:
    train[column] = train[column].fillna(train[column].median())

In [None]:
# Fraction of missing values per column (expected to be 0 after imputation)
train.isnull().sum().div(len(train))

## 5.4 Adding New Features

In [None]:
# adding new feature from existing ones to get better results

# Both features are computed with vectorised datetime accessors instead of a
# Python-level lambda applied row by row, which is very slow on a frame with
# one row per meter reading.
timestamp_month = train['timestamp'].dt.month

# Months 3-5 -> Spring, 6-8 -> Summer, 9-11 -> Autumn, everything else -> Winter
train['season'] = np.select(
    [
        timestamp_month.isin([3, 4, 5]),
        timestamp_month.isin([6, 7, 8]),
        timestamp_month.isin([9, 10, 11]),
    ],
    ['Spring', 'Summer', 'Autumn'],
    default='Winter',
)

# 1 when the hour is between 06:00 and 18:00 inclusive, otherwise 0
train['isDayTime'] = train['timestamp'].dt.hour.between(6, 18).astype(np.int64)

In [None]:
# Preview the first five rows, including the new season / isDayTime columns
train.head(n=5)

## 5.5 Encoding categorical data

In [None]:
# Columns to be label-encoded in the next step
categorical_features = [
    'primary_use',
    'season',
]

In [None]:
# Fit a separate LabelEncoder per categorical column and keep each one, so the
# integer codes can be mapped back to the original labels later, e.g.
# label_encoders['primary_use'].inverse_transform(codes). Refitting one shared
# encoder would overwrite the mapping of every column except the last.
label_encoders = {}

for feature in categorical_features:
    # After the loop `encoder` is the encoder fitted on the last feature
    encoder = preprocessing.LabelEncoder()
    train[feature] = encoder.fit_transform(train[feature])
    label_encoders[feature] = encoder

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print(), which would add a stray "None" line.
train.info()

In [None]:
# Preview the first five rows after label encoding
train.head(n=5)

# 6. Building Transformation Pipeline

Creating a pipeline with all the transformations applied to the data.

**Note**
- Unfortunately, the session always crashes when running the pipeline, so we did not get an opportunity to fully test it; that is why it is commented out.

In [None]:
#numerical_features = train.select_dtypes(include=[np.number])
#numerical_features = list(numerical_features)

In [None]:
#meter_reading_ix = train.columns.get_loc('meter_reading')

#def drop_zero_readings(X):

#        zero_meter_readings = list(X[X.iloc[:, meter_reading_ix] == 0.0].index)
#        X.drop(zero_meter_readings, axis = 0, inplace = True)
        
#        return X

In [None]:
#def drop_mostly_null_features(X, missing_factor = 0.5):

#        threshold = len(X) * missing_factor
#        X.dropna(axis=1, thresh = threshold, inplace = True)
        
#        return np.c_[X]

In [None]:
#timestamp_ix = train.columns.get_loc("timestamp")

#def add_extra_features(X):
    
#    season = X[:, timestamp_ix].apply(lambda x: 'Spring' if x.month==3 or x.month==4 or x.month==5 else 
#                                                    'Summer' if x.month==6 or x.month==7 or x.month==8 else 
#                                                    'Autumn' if x.month==9 or x.month==10 or x.month==11 else 
#                                                    'Winter')
#    
#    isDayTime = X[:, timestamp_ix].apply(lambda x: 1 if x.hour >=6 and x.hour <=18 else 0)

#    return np.c_[X, season, isDayTime]

In [None]:
#num_pipeline = Pipeline([
  
#        ('drop_zero_readings', FunctionTransformer(drop_zero_readings, validate=False)),
#        ('drop_mostly_null_features', FunctionTransformer(drop_mostly_null_features, validate=False)),
#        ('imputer', SimpleImputer(strategy="median")),
#        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
#    ])

In [None]:
#categorical_features = ['primary_use', 'season']

In [None]:
#full_pipeline = ColumnTransformer([
#        ("num", num_pipeline, numerical_features),
#        ("cat", OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features),
#   ])

In [None]:
#prepared_train_data = full_pipeline.fit_transform(train)

# 7. Features Selection


**Using LGBM to Select the Best Features that Actually Affect the Model Results**

In [None]:
# Remove the timestamp column from the training frame
train = train.drop(columns=['timestamp'])

**Reducing the Memory Usage Again**

In [None]:
# Downcast the fully prepared training frame once more
reduced_train_data = reduce_mem_usage(df=train, df_name='Train Data')

**Creating Training and Testing Data for the Model to Work With**

In [None]:
# Feature matrix: every column except the target
x = reduced_train_data.drop(columns=['meter_reading'])

In [None]:
# Target vector: the meter_reading column
y = reduced_train_data.loc[:, 'meter_reading']

**Running The Feature Selection Model**

In [None]:
# LightGBM hyper-parameters for the feature-selection run.
#
# Differences from the previous version of this dict:
#   * 'subsample': 0.2 was removed. 'subsample' is an alias of
#     'bagging_fraction', so the dict carried two contradicting values for the
#     same setting; only the canonical key is kept.
#   * 'alpha' was replaced by 'lambda_l1'. In LightGBM 'alpha' is the parameter
#     of the huber / quantile objectives; L1 regularisation is 'lambda_l1'.
#   * 'lambda' is written with its canonical name 'lambda_l2'.
#
# NOTE(review): bagging_fraction only takes effect when bagging_freq > 0, and
# bagging_freq is left at its default here, so no row subsampling happens.
# Set bagging_freq explicitly if bagging is wanted.
parameters = {

    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'learning_rate': 0.9,  # 0.3 #0.5 #0.6
    'feature_fraction': 0.9, #0.5 #0.6 #0.8
    'bagging_fraction': 0.9, #0.8
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 110, #110 #100 #150 large, but over-fitting
    'max_bin': 66,  #60 #50 # large,but slower,over-fitting
    'max_depth': 10, # deal with over-fitting
    'min_data_in_leaf': 30, # deal with over-fitting

}


kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

# One trained booster per fold
models = []

# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
# lgb.train only in LightGBM < 4.0; newer releases expect the equivalent
# callbacks instead. Also, with num_boost_round equal to early_stopping_rounds
# the early-stopping patience can never run out before training ends.
for train_index, val_index in kf.split(x):
    train_x = x.iloc[train_index]
    val_x = x.iloc[val_index]
    train_y = y.iloc[train_index]
    val_y = y.iloc[val_index]
    lgb_train = lgb.Dataset(train_x, train_y)
    # Tie the validation set to the training set so both use the same binning
    lgb_eval = lgb.Dataset(val_x, val_y, reference = lgb_train)
    gbm = lgb.train(parameters,
                    lgb_train,
                    num_boost_round = 10,
                    valid_sets = (lgb_train, lgb_eval),
                    early_stopping_rounds = 10,
                    verbose_eval = 10)

    models.append(gbm)

gc.collect()

**Visualizing the Feature Importance Highlighted by the Model**

In [None]:
# Average the importances over all fold models collected in `models`, rather
# than reading them from `gbm` alone, which is only the model of the last fold.
fold_importances = pd.DataFrame(
    [model.feature_importance() for model in models],
    columns=models[0].feature_name(),
)

# Same layout as before: a 'Value' column and a 'Feature' column, sorted once
# from most to least important
feature_imp = (
    fold_importances
    .mean(axis=0)
    .rename('Value')
    .rename_axis('Feature')
    .reset_index()
    .sort_values(by='Value', ascending=False)
    [['Value', 'Feature']]
)

fig, ax = plt.subplots(figsize=(16, 8))

sns.barplot(x="Value", y="Feature", data=feature_imp, ax=ax)

ax.set_title('LightGBM Features Importance Plot')

fig.tight_layout()

plt.show()

**Removing the Least Important features as Indicated by the Feature Selection Method**

In [None]:
# Keep only the features ranked as important, plus the target.
# .copy() makes new_data an independent frame, so changing it later does not
# act on a slice of reduced_train_data (pandas' SettingWithCopy situation).
selected_columns = [
    'building_id', 'square_feet', 'primary_use', 'meter', 'site_id',
    'air_temperature', 'dayofyear', 'hour', 'isDayTime', 'dew_temperature',
    'dayofweek', 'meter_reading',
]
new_data = reduced_train_data[selected_columns].copy()

**Creating a Heatmap to See the Correlated Features in the New Data**

In [None]:
# Annotated correlation matrix of the selected features
fig, ax = plt.subplots(figsize=(20, 10))

correlation_axes = sns.heatmap(
    new_data.corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    linewidths=1,
    annot_kws={"size": 5},
    fmt="g",
    ax=ax,
    cmap="YlGnBu",
)
correlation_axes.set_title('Correlation Matrix')

**Removing Highly Correlated Features**

In [None]:
# Rebind new_data to the result instead of dropping in place: new_data was
# created by selecting columns from reduced_train_data, and mutating such a
# selection in place is what pandas flags with SettingWithCopyWarning.
new_data = new_data.drop(columns=['site_id', 'dew_temperature'])

**Creating another Heatmap to Verify that there are no Further Improvements to be Done**

In [None]:
# Correlation matrix of the features that remain after the drop above
fig, ax = plt.subplots(figsize=(20, 10))

remaining_corr_axes = sns.heatmap(
    new_data.corr(),
    annot=True,
    vmin=-1,
    vmax=1,
    linewidths=1,
    annot_kws={"size": 5},
    fmt="g",
    ax=ax,
    cmap="YlGnBu",
)
remaining_corr_axes.set_title('Correlation Matrix')