In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
DATA_PATH = "../input/ashrae-energy-prediction/"
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
import datetime
from sklearn import metrics
import seaborn as sns
import sklearn.ensemble as ske
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

* **Import data**

In [None]:
# Read the three CSV files used by this notebook from DATA_PATH, in the same
# order as before: weather observations, meter readings, building metadata.
Weather_Train, Train, Building = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('weather_train.csv', 'train.csv', 'building_metadata.csv')
)

* **Memory usage reduction**

In [None]:
#Function That reduces the used memory
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the narrowest dtype that fits.

    Rules applied per column:

    * object / string columns are converted to ``category``;
    * signed and unsigned integer columns are cast to the smallest integer
      type of the same signedness whose range contains the column's min and
      max (bounds are inclusive);
    * float columns are cast to float16 or float32 when the min/max fit,
      otherwise to float64;
    * every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    use_float16 : bool, default True
        When True (the historical behaviour) floats whose range fits are
        stored as float16. The choice is made on range only, so precision can
        be lost; pass False to use float32 as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Bool, datetime,
        # category and extension dtypes must not go through the numeric
        # range checks below (they would be mis-cast or raise).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Fall back to float64 when no smaller type fits (also the case
            # for all-NaN columns, where the comparisons below are False).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # Keep the current dtype when nothing fits (e.g. an empty column
            # whose min/max are NaN).
            target = col_type
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if np.dtype(target) != col_type:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        # Guarded so a frame reporting zero memory does not divide by zero.
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Shrink the three loaded frames with reduce_mem_usage, in the same order as
# before (Train, Weather_Train, Building), and rebind each name to the result.
Train, Weather_Train, Building = map(
    reduce_mem_usage, (Train, Weather_Train, Building)
)

* **Features Engineering**

In [None]:
# Join building metadata with the meter readings on building_id, then attach
# the weather observations on (site_id, timestamp). Both are left joins.
results = pd.merge(Building, Train, on='building_id', how='left')
data = pd.merge(results, Weather_Train, on=['site_id', 'timestamp'], how='left')

# Show the first 5 rows of the merged dataset
data.head()

In [None]:
# Remove the columns that are not used further in this notebook.
unused_columns = ['year_built', 'floor_count', 'wind_direction', 'dew_temperature']
data = data.drop(unused_columns, axis=1)

# Ask the garbage collector to reclaim memory held by the dropped data
gc.collect()

In [None]:
# Parse the timestamp column, keep only day-of-month and month as features,
# and drop the timestamp column itself.
_timestamps = pd.to_datetime(data["timestamp"])
data["day"] = _timestamps.dt.day
data["month"] = _timestamps.dt.month
data = data.drop(columns="timestamp")

In [None]:
# Cast the weather measurements to float32 before their missing values are
# filled (presumably because reduce_mem_usage may have stored them as float16).
for weather_col in ('wind_speed', 'air_temperature', 'precip_depth_1_hr', 'cloud_coverage'):
    data[weather_col] = data[weather_col].astype('float32')

In [None]:
# Fill missing weather values with the column mean.
# Assign the result back to the column instead of calling
# data[col].fillna(..., inplace=True): that form operates on a column
# selection, which pandas warns about and which leaves `data` unchanged
# when Copy-on-Write is enabled.
for null_col in ('precip_depth_1_hr', 'cloud_coverage', 'wind_speed', 'air_temperature'):
    data[null_col] = data[null_col].fillna(data[null_col].mean())

# Show the number of nulls remaining in each column
data.isnull().sum()

* **DecisionTree model**

In [None]:
# One-hot encode the 'primary_use' column with pd.get_dummies: the column is
# replaced by one 'primary_use_<value>' indicator column per distinct value.
data_linearR = pd.get_dummies(data,columns = ['primary_use'])

In [None]:
# Display the column names of "data_linearR" (including the generated
# 'primary_use_*' dummy columns) so they can be referenced when selecting features.
data_linearR.columns

In [None]:
# Feature columns, grouped by origin; the concatenation order below is the
# column order of the feature matrix.
id_and_weather_features = [
    'building_id', 'meter', 'air_temperature', 'wind_speed',
    'precip_depth_1_hr', 'cloud_coverage', 'square_feet',
]
primary_use_features = [
    'primary_use_Education',
    'primary_use_Entertainment/public assembly',
    'primary_use_Food sales and service',
    'primary_use_Healthcare',
    'primary_use_Lodging/residential',
    'primary_use_Manufacturing/industrial',
    'primary_use_Office',
    'primary_use_Other',
    'primary_use_Parking',
    'primary_use_Public services',
    'primary_use_Religious worship',
    'primary_use_Retail',
    'primary_use_Services',
    'primary_use_Technology/science',
    'primary_use_Utility',
    'primary_use_Warehouse/storage',
]
calendar_features = ['month', 'day']

# Feature matrix
XD = data_linearR[id_and_weather_features + primary_use_features + calendar_features]

# Target variable
YD = data_linearR['meter_reading']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
XD_train, XD_test, YD_train, YD_test = train_test_split(
    XD, YD, test_size=.20, random_state=0
)

In [None]:
# Build and fit two decision-tree regressors on the training split:
# a shallow one (max_depth=2) and a deeper one (max_depth=5).
regr_depth2 = DecisionTreeRegressor(max_depth=2).fit(XD_train, YD_train)
regr_depth5 = DecisionTreeRegressor(max_depth=5).fit(XD_train, YD_train)
regr_depth5

In [None]:
# Predict on the test split with each tree: y_1 is depth 2, y_2 is depth 5.
y_1, y_2 = (tree.predict(XD_test) for tree in (regr_depth2, regr_depth5))

In [None]:
# Side-by-side table of actual test targets and the depth-2 predictions
df = YD_test.to_frame(name='Actual').assign(Predicted=y_1)
# Show the first 5 rows of the comparison
df.head()

In [None]:
# Error metrics for the depth-2 tree on the held-out test split.
# - R^2 is computed with r2_score; explained variance is a different metric
#   (it ignores any constant offset in the residuals) and is reported
#   separately under its own name.
# - The training-set score is labelled as such: DecisionTreeRegressor.score
#   returns R^2, not a classification accuracy.
mse_depth2 = metrics.mean_squared_error(YD_test, y_1)
print('Mean Absolute Error:', metrics.mean_absolute_error(YD_test, y_1))
print('Mean Squared Error:', mse_depth2)
print('Root Mean Squared Error:', np.sqrt(mse_depth2))
print('R^2 =', metrics.r2_score(YD_test, y_1))
print('Explained variance =', metrics.explained_variance_score(YD_test, y_1))
print('R^2 on training data for depth2: {:.3f}'.format(regr_depth2.score(XD_train, YD_train)))
# For the depth-2 decision tree, the author's run recorded R2 = 0.147 (re-run to refresh).

In [None]:
# Side-by-side table of actual test targets and the depth-5 predictions
df = YD_test.to_frame(name='Actual').assign(Predicted=y_2)
df.head()

In [None]:
# Error metrics for the depth-5 tree on the held-out test split.
# - R^2 is computed with r2_score; explained variance is a different metric
#   (it ignores any constant offset in the residuals) and is reported
#   separately under its own name.
# - The training-set score is labelled as such: DecisionTreeRegressor.score
#   returns R^2, not a classification accuracy.
mse_depth5 = metrics.mean_squared_error(YD_test, y_2)
print('Mean Absolute Error:', metrics.mean_absolute_error(YD_test, y_2))
print('Mean Squared Error:', mse_depth5)
print('Root Mean Squared Error:', np.sqrt(mse_depth5))
print('R^2 =', metrics.r2_score(YD_test, y_2))
print('Explained variance =', metrics.explained_variance_score(YD_test, y_2))
print('R^2 on training data for depth5: {:.3f}'.format(regr_depth5.score(XD_train, YD_train)))
# For the depth-5 decision tree, the author's run recorded R2 = 0.723 (re-run to refresh).

In [None]:
# Compare the depth-2 and depth-5 predictions against the actual targets.
# The x-axis is the position of the sample within the test split: the feature
# matrix XD_test has many columns, so it cannot serve as a single x variable.
# Only the first few samples are drawn to keep the figure readable and fast.
n_plot_points = min(200, len(y_1))
sample_positions = np.arange(n_plot_points)

fig, ax = plt.subplots()
ax.plot(sample_positions, np.asarray(YD_test)[:n_plot_points],
        color="black", label="actual", linewidth=1)
ax.plot(sample_positions, y_1[:n_plot_points],
        color="blue", label="max_depth=2", linewidth=2)
ax.plot(sample_positions, y_2[:n_plot_points],
        color="green", label="max_depth=5", linewidth=2)
ax.set_xlabel("test sample (position in test split)")
ax.set_ylabel("target")
ax.set_title("Decision Tree Regression")
ax.legend()  # without this call the labels above are never displayed
plt.show()

In [None]:
## Predicting test set results
# Predict on the test split with the depth-5 tree and display the resulting
# array. This is the same call that produced y_2 earlier in the notebook.
yd_pred = regr_depth5.predict(XD_test)
yd_pred