In [4]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,ElasticNet
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import lightgbm as lgb
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn. linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout, Activation
from tensorflow.keras.optimizers import *

from prettytable import PrettyTable

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/test.feather
/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/weather_train.feather
/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/building_metadata.feather
/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/weather_test.feather
/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/sample_submission.feather
/kaggle/input/ashrae-great-energy-predictor-iii-featherdataset/train.feather
/kaggle/input/ashrae-energy-prediction/sample_submission.csv
/kaggle/input/ashrae-energy-prediction/building_metadata.csv
/kaggle/input/ashrae-energy-prediction/weather_train.csv
/kaggle/input/ashrae-energy-prediction/weather_test.csv
/kaggle/input/ashrae-energy-prediction/train.csv
/kaggle/input/ashrae-energy-prediction/test.csv
/kaggle/input/ashrae-great-energy-predictor-iii-dataset/sample_submission.csv
/kaggle/input/ashrae-great-energy-predictor-iii-dataset/building_metadata.csv
/kaggle/inpu

In [5]:
data_path = "/kaggle/input/ashrae-energy-prediction/"

train_path = data_path + "train.csv"

building_path = data_path + "building_metadata.csv"

weather_train_path = data_path + "weather_train.csv"

In [6]:
train_data = pd.read_csv(train_path)

building_data = pd.read_csv(building_path)

weather_train_data = pd.read_csv(weather_train_path)

In [7]:
train_data.to_feather('train_data.feather')

building_data.to_feather('building_data.feather')

weather_train_data.to_feather('weather_train_data.feather')

In [8]:
train_data = pd.read_feather('train_data.feather')

building_data = pd.read_feather('building_data.feather')

weather_train_data = pd.read_feather('weather_train_data.feather')

In [9]:
def reduce_mem_usage(df, df_name):

    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:

        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage of {} is reduced by {:.2f} %. Usage dropped from {:.2f} MB to {:.2f} MB.'.format(df_name, (100 * (start_mem - end_mem) / start_mem), start_mem, end_mem))
    
    return df

In [10]:
train_data = reduce_mem_usage(train_data, 'Train Data')

building_data = reduce_mem_usage(building_data, 'Building Data')

weather_train_data = reduce_mem_usage(weather_train_data, 'Weather Train Data')

Memory usage of Train Data is reduced by 71.82 %. Usage dropped from 616.95 MB to 173.84 MB.
Memory usage of Building Data is reduced by 73.88 %. Usage dropped from 0.07 MB to 0.02 MB.
Memory usage of Weather Train Data is reduced by 73.06 %. Usage dropped from 9.60 MB to 2.59 MB.


In [11]:
train = train_data.merge(building_data, on='building_id', how='left')
train = train.merge(weather_train_data, on=['site_id', 'timestamp'], how='left')

In [12]:
def breakdown_timestamp(dataframe):
    
    dataframe['timestamp']= pd.to_datetime(dataframe['timestamp'])

    dataframe['hour']= np.uint8(dataframe['timestamp'].dt.hour)
    
    dataframe['day']= np.uint16(dataframe['timestamp'].dt.day)
    dataframe['dayofweek']= np.uint8(dataframe['timestamp'].dt.dayofweek)
    dataframe['dayofyear']= np.uint16(dataframe['timestamp'].dt.dayofyear)

    dataframe['month']= np.uint8(dataframe['timestamp'].dt.month)

    dataframe['year']= np.uint16(dataframe['timestamp'].dt.year)
    
    return dataframe

In [13]:
train = breakdown_timestamp(train)

In [14]:
train['meter_reading'] = np.log1p(train['meter_reading'])

In [15]:
train['square_feet'] = np.log1p(train['square_feet'])

In [16]:
zero_meter_readings = list(train[train['meter_reading'] == 0].index)
train.drop(zero_meter_readings, axis = 0, inplace = True)

In [17]:
threshold = len(train) * 0.5
train.dropna(axis=1, thresh = threshold, inplace = True)


In [18]:
train['cloud_coverage'].fillna(train['cloud_coverage'].median(), inplace=True)
train['sea_level_pressure'].fillna(train['sea_level_pressure'].median(), inplace=True)
train['precip_depth_1_hr'].fillna(train['precip_depth_1_hr'].median(), inplace=True)
train['wind_direction'].fillna(train['wind_direction'].median(), inplace=True)
train['wind_speed'].fillna(train['wind_speed'].median(), inplace=True)
train['dew_temperature'].fillna(train['dew_temperature'].median(), inplace=True)
train['air_temperature'].fillna(train['air_temperature'].median(), inplace=True)

In [19]:
train['season'] = train['timestamp'].apply(lambda x: 'Spring' if x.month==3 or x.month==4 or x.month==5 else 
                                                  'Summer' if x.month==6 or x.month==7 or x.month==8 else 
                                                  'Autumn' if x.month==9 or x.month==10 or x.month==11 else 
                                                  'Winter')

train['isDayTime'] = train['timestamp'].apply(lambda x: 1 if x.hour >=6 and x.hour <=18 else 0)

In [20]:
# Encoding categorical data

categorical_features = ['primary_use', 'season']

encoder = preprocessing.LabelEncoder()

for i in categorical_features:
    
    train[i] = encoder.fit_transform(train[i])

In [21]:
train = train.drop(['timestamp'],axis=1)

In [22]:
reduced_train_data = reduce_mem_usage(train, 'Train Data')

Memory usage of Train Data is reduced by 36.62 %. Usage dropped from 1241.96 MB to 787.16 MB.


In [23]:
new_data = reduced_train_data[['building_id','square_feet','primary_use','meter','site_id','air_temperature','dayofyear','hour','isDayTime','dew_temperature','dayofweek', 'meter_reading']]

In [24]:
new_data.drop(['site_id','dew_temperature'],axis=1, inplace = True)

In [25]:
X_train = new_data.drop(['meter_reading'],axis = 1)

Y_train = new_data['meter_reading'].values

In [26]:
def baselineModel(y_actual,y_pred):

    rmsle_score = np.sqrt(np.mean((y_actual - y_pred) * (y_actual-y_pred)))
    
    print("The RMSLE Score of the Baseline Model is :",rmsle_score)

baselineModel(Y_train, np.median(Y_train))

The RMSLE Score of the Baseline Model is : 1.772


In [27]:
train_x, test_x, train_y, test_y = train_test_split(X_train,Y_train, test_size=0.3, random_state=42)

In [28]:
def RMSLE(y_actual, y_pred):
    
    return np.sqrt(mean_squared_error(y_actual, y_pred))

In [29]:
linear_Regression = LinearRegression()
linear_Regression.fit(train_x, train_y)

LinearRegression()

In [30]:
print('Linear Regression Traininig RMSLE = ', RMSLE((train_y) , (linear_Regression.predict(train_x))))
print('Linear Regression Testing RMSLE = ',RMSLE((test_y) ,(linear_Regression.predict(test_x))))

Linear Regression Traininig RMSLE =  1.3809857
Linear Regression Testing RMSLE =  1.3814555


In [31]:
lin_scores = cross_val_score(linear_Regression, train_x, train_y,scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

print("Linear Regression CV Scores:", lin_rmse_scores)
print("==========================================================================================")
print("Mean CV Score:", lin_rmse_scores.mean())

Linear Regression CV Scores: [1.38071032 1.38093543 1.38030438 1.38139296 1.38185845 1.38161115
 1.38206013 1.37990234 1.38196516 1.37913127]
Mean CV Score: 1.3809871585828213


In [32]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [False],
              'l1_ratio':[0.5]}

elasticnet = GridSearchCV(estimator = ElasticNet(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

elasticnet.fit(train_x, train_y)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


GridSearchCV(cv=3, estimator=ElasticNet(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [False], 'l1_ratio': [0.5]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [33]:
print("Best ElasticNet Estimator is : ", elasticnet.best_estimator_)
print("Best ElasticNet Paramteres are : ", elasticnet.best_params_)

Best ElasticNet Estimator is :  ElasticNet(alpha=0.001, fit_intercept=False)
Best ElasticNet Paramteres are :  {'alpha': 0.001, 'fit_intercept': False, 'l1_ratio': 0.5}


In [34]:
elastic_model = ElasticNet(alpha = 0.001, fit_intercept = False, l1_ratio = 0.5)
elastic_model.fit(train_x, train_y)

ElasticNet(alpha=0.001, fit_intercept=False)

In [35]:
print('ElasticNet Training RMSLE = ',RMSLE((train_y) , (elastic_model.predict(train_x))))
print('ElasticNet Testing RMSLE = ',RMSLE((test_y) , (elastic_model.predict(test_x))))

ElasticNet Training RMSLE =  1.4734745
ElasticNet Testing RMSLE =  1.4735804


In [36]:
parameters = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [True],
              'solver' : ['lsqr']}

ridge = GridSearchCV(estimator = Ridge(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

ridge.fit(train_x, train_y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


GridSearchCV(cv=3, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [True], 'solver': ['lsqr']},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [37]:
 print("Best Ridge Estimator is :",ridge.best_estimator_)
print("Best Ridge Paramteres are : ", ridge.best_params_)

Best Ridge Estimator is : Ridge(alpha=1000, solver='lsqr')
Best Ridge Paramteres are :  {'alpha': 1000, 'fit_intercept': True, 'solver': 'lsqr'}


In [38]:
Ridge = Ridge(alpha = 0.0001, fit_intercept = True, solver = "lsqr")
Ridge.fit(train_x, train_y)

Ridge(alpha=0.0001, solver='lsqr')

In [39]:
print('Ridge Training RMSLE = ',RMSLE((train_y) , (Ridge.predict(train_x))))
print('Ridge Testing RMSLE = ',RMSLE(test_y , Ridge.predict(test_x)))

Ridge Training RMSLE =  1.3840854
Ridge Testing RMSLE =  1.3844967


In [40]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10]}

lasso = GridSearchCV(estimator = Lasso(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

lasso.fit(train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


GridSearchCV(cv=3, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [41]:
print("Best Lasso Estimator ",lasso.best_estimator_)
print("Best Lasso Paramteres are : ", lasso.best_params_)


Best Lasso Estimator  Lasso(alpha=0.001)
Best Lasso Paramteres are :  {'alpha': 0.001}


In [42]:
Lasso = Lasso(alpha = 0.001)
Lasso.fit(train_x, train_y)

Lasso(alpha=0.001)

In [43]:
print('Lasso Traininig RMSLE = ',RMSLE((train_y) , (Lasso.predict(train_x))))
print('Lasso Testing RMSLE = ',RMSLE(test_y , Lasso.predict(test_x)))

Lasso Traininig RMSLE =  1.380988
Lasso Testing RMSLE =  1.3814559


In [44]:
parameters = {'max_depth': [3,5,7,9,11,15]}


decission_tree = GridSearchCV(estimator = DecisionTreeRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

decission_tree.fit(train_x, train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7, 9, 11, 15]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [45]:
print("Best Decision Tree Estimator ",decission_tree.best_estimator_)
print("Best Decision Tree Paramteres are : ", decission_tree.best_params_)

Best Decision Tree Estimator  DecisionTreeRegressor(max_depth=15)
Best Decision Tree Paramteres are :  {'max_depth': 15}


In [46]:
DecissionTree = DecisionTreeRegressor(max_depth=15)
DecissionTree.fit(train_x, train_y)

DecisionTreeRegressor(max_depth=15)

In [47]:
print('Decision Tree Training RMSLE = ',RMSLE((train_y) , (DecissionTree.predict(train_x))))
print('Decision Tree Testing RMSLE = ',RMSLE(test_y , DecissionTree.predict(test_x)))

Decision Tree Training RMSLE =  0.6065745680939864
Decision Tree Testing RMSLE =  0.6093457095734802


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
