In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the competition CSV files. The trailing slash is
# required: later cells build file paths by plain string concatenation.
DATASET_PATH  = '../input/ashrae-energy-prediction/'

In [None]:
# Load the training readings. The path is built from DATASET_PATH, like every
# other load in this notebook, so the data directory is configured in one place.
dataset = pd.read_csv(DATASET_PATH + 'train.csv')


In [None]:
# Alias only: `trainData` and `dataset` are two names for the same DataFrame
# object (no copy is made here).
trainData = dataset
# Bare last expression -> the notebook renders the frame.
trainData

In [None]:
# Number of missing values in each column of the training readings.
dataset.isna().sum()

In [None]:
# Load the per-building metadata table.
buildingData = pd.read_csv(f'{DATASET_PATH}building_metadata.csv')

In [None]:
# Number of missing values in each building-metadata column.
buildingData.isna().sum()

# Merging building data with training data

In [None]:
# Left join: every reading row is kept and gains the metadata columns of its
# building_id (rows with no matching building get NaN in those columns).
data = pd.merge(dataset, buildingData, how='left', on='building_id')

# Removing features that have a lot of nulls

In [None]:
# Drop both building columns in a single call.
data = data.drop(columns=['year_built', 'floor_count'])

In [None]:
# Load weather_train.csv and weather_test.csv from the dataset directory.
weather_train = pd.read_csv(f'{DATASET_PATH}weather_train.csv')
weather_test = pd.read_csv(f'{DATASET_PATH}weather_test.csv')
# `weather_df` is a second name for the very same frame as `weather_test`.
weather_df = weather_test

# Concatenating weather train and weather test data

In [None]:
import datetime as dt

# Stack the two weather tables row-wise, then left-join them onto the readings
# so each row picks up the weather recorded for its (site_id, timestamp).
frames = [weather_train, weather_test]
weather_data = pd.concat(frames)
data = pd.merge(data, weather_data, how='left', on=['site_id', 'timestamp'])


# Converting the timestamp feature to datetime to extract further features such as day, hour, weekday, month, and year

In [None]:
# Parse the timestamp strings, derive calendar features through the `.dt`
# accessor, then discard the raw timestamp column.
data["timestamp"] = pd.to_datetime(data["timestamp"])
stamp = data["timestamp"].dt
data = data.assign(
    day=stamp.day,
    hour=stamp.hour,
    week=stamp.weekday,  # note: holds the day of the week, not a week number
    month=stamp.month,
    year=stamp.year,
).drop(columns="timestamp")
# The concatenated weather table is no longer needed once it has been merged.
del weather_data

In [None]:
# Render the merged frame (pandas truncates the display of large frames).
data

In [None]:
# Missing values per column after both merges.
data.isna().sum()

# Removing useless features

In [None]:
# Missing values per column, used to choose which features to remove.
data.isna().sum()

In [None]:
# Remove the cloud_coverage column, then render the resulting frame.
data = data.drop(columns='cloud_coverage')
data

In [None]:
# Remove the wind_direction and sea_level_pressure columns, then render the frame.
data = data.drop(columns=['wind_direction', 'sea_level_pressure'])
data

# Filling missing data

In [None]:
# Replace the missing values of each remaining weather column with that
# column's own mean (computed over the non-missing values).
for col in ('air_temperature', 'wind_speed', 'dew_temperature', 'precip_depth_1_hr'):
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Unbind the name `dataset`. The underlying frame is only released once no
# other name refers to it; `trainData` was bound to the same object earlier.
del dataset

In [None]:
# Summary statistics of the numeric columns.
data.describe()

# Fixing site_id 0 error

In [None]:
# Multiply every reading that belongs to site_id 0 by 0.2931, leaving all other
# rows untouched. This is done as one vectorised operation instead of a
# Python-level loop over every row, which is very slow on a frame this size.
# NOTE(review): 0.2931 looks like a kBTU -> kWh conversion factor; confirm that
# it should apply to all meters of site 0 and not only the electricity meter.
# NOTE: not idempotent; re-running this cell scales the site 0 rows again.
is_site0 = data['site_id'] == 0
data['meter_reading'] = data['meter_reading'].mask(is_site0, data['meter_reading'] * 0.2931)
data

In [None]:
# Distinct meter codes present in the data, in order of first appearance.
meter_values = pd.unique(data['meter'])
meter_values

# Filtering buildings that use electricity meter

In [None]:
# Rows recorded by meter code 0 (the electricity meter).
electricMeter = data.loc[data['meter'].eq(0)]


# Graph shows the correlation between electricity meter and air temperature

In [None]:
# Electricity meter readings on the x-axis against air temperature on the y-axis.
plt.scatter(
    x=electricMeter['meter_reading'],
    y=electricMeter['air_temperature'],
)
plt.show()

# Filtering buildings by meter

In [None]:
# One subset of rows per remaining meter code (1, 2 and 3).
chilledWaterMeter = data.loc[data['meter'].eq(1)]
stemMeter = data.loc[data['meter'].eq(2)]
hotWaterMeter = data.loc[data['meter'].eq(3)]

# Graph shows the correlation between chilled water meter and air temperature

In [None]:
# Chilled water meter readings on the x-axis against air temperature on the y-axis.
plt.scatter(
    x=chilledWaterMeter['meter_reading'],
    y=chilledWaterMeter['air_temperature'],
)
plt.show()

# Graph shows the correlation between steam meter and air temperature

In [None]:
# Meter code 2 readings on the x-axis against air temperature on the y-axis.
plt.scatter(
    x=stemMeter['meter_reading'],
    y=stemMeter['air_temperature'],
)
plt.show()

# Graph shows the correlation between hot water meter and air temperature

In [None]:
# Hot water meter readings on the x-axis against air temperature on the y-axis.
plt.scatter(
    x=hotWaterMeter['meter_reading'],
    y=hotWaterMeter['air_temperature'],
)
plt.show()

# Removing outliers

**There are outlier values in the readings of buildings that use the electricity and hot water meters.**

***Electricity Meter Outliers***

In [None]:
# Replace electricity readings (meter == 0) above 40000 with the mean of all
# electricity readings.
mask1 = data["meter"] == 0
mask2 = data["meter_reading"] > 40000
mask = mask1 & mask2
print(data.shape)
# A single .loc call writes into `data` itself. The earlier chained form
# `data[mask]["meter_reading"] = ...` assigned into a temporary copy, so the
# outliers were never actually replaced.
# NOTE(review): the mean is taken over all meter-0 rows, outliers included.
data.loc[mask, "meter_reading"] = data.loc[mask1, "meter_reading"].mean()

***Hot water Meter Outliers***

In [None]:
# Replace hot water readings (meter == 3) above 140000 with the mean of all
# hot water readings.
mask1 = data["meter"] == 3
mask2 = data["meter_reading"] > 140000
mask = mask1 & mask2
print(data.shape)
# A single .loc call writes into `data` itself. The earlier chained form
# `data[mask]["meter_reading"] = ...` assigned into a temporary copy, so the
# outliers were never actually replaced.
# NOTE(review): the mean is taken over all meter-3 rows, outliers included.
data.loc[mask, "meter_reading"] = data.loc[mask1, "meter_reading"].mean()
# Values are replaced, not removed, so the shape is the same as printed above.
print(data.shape)

# Mean of using electricity meter per month 

In [None]:
# Mean electricity (meter == 0) reading for each month.
# Groups on the 'month' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 0]
data_to_plot = newData.groupby('month')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('month')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Chilled Water meter per month 

In [None]:
# Mean chilled water (meter == 1) reading for each month.
# Groups on the 'month' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 1]
data_to_plot = newData.groupby('month')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('month')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Steam meter per month

In [None]:
# Mean steam (meter == 2) reading for each month.
# Groups on the 'month' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 2]
data_to_plot = newData.groupby('month')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('month')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Hot Water meter per month 

In [None]:
# Mean hot water (meter == 3) reading for each month.
# Groups on the 'month' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 3]
data_to_plot = newData.groupby('month')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('month')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using electricity meter per hour 

In [None]:
# Mean electricity (meter == 0) reading for each hour of the day.
# Groups on the 'hour' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 0]
data_to_plot = newData.groupby('hour')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('hour')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Chilled Water meter per hour

In [None]:
# Mean chilled water (meter == 1) reading for each hour of the day.
# Groups on the 'hour' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 1]
data_to_plot = newData.groupby('hour')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('hour')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Steam meter per hour

In [None]:
# Mean steam (meter == 2) reading for each hour of the day.
# Groups on the 'hour' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 2]
data_to_plot = newData.groupby('hour')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('hour')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Mean of using Hot Water meter per hour

In [None]:
# Mean hot water (meter == 3) reading for each hour of the day.
# Groups on the 'hour' column directly. The previous version re-indexed and
# sorted a filtered slice in place, discarded the result of drop(), and picked
# the group columns with a bare tuple, which pandas 2.x rejects (a list is
# required). groupby already returns its keys in sorted order.
newData = data[data['meter'] == 3]
data_to_plot = newData.groupby('hour')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('hour')
plt.ylabel('mean meter_reading')
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Usage of different types of buildings that use Electricity meter

In [None]:
# Mean electricity (meter == 0) reading for each primary_use category.
# Groups on the 'primary_use' column and averages only 'meter_reading'. The
# previous version also asked for the mean of 'primary_use' itself (not a
# meaningful average for category labels), selected the columns with a bare
# tuple, which pandas 2.x rejects, discarded the result of drop(), and
# re-indexed a filtered slice in place.
newData = data[data['meter'] == 0]
data_to_plot = newData.groupby('primary_use')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(15, 7))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('primary_use')
plt.ylabel('mean meter_reading')
plt.xticks(rotation=45, ha='right')  # keep neighbouring category labels from overlapping
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Usage of different types of buildings that use Chilled Water meter

In [None]:
# Mean chilled water (meter == 1) reading for each primary_use category.
# Groups on the 'primary_use' column and averages only 'meter_reading'. The
# previous version also asked for the mean of 'primary_use' itself (not a
# meaningful average for category labels), selected the columns with a bare
# tuple, which pandas 2.x rejects, discarded the result of drop(), and
# re-indexed a filtered slice in place.
newData = data[data['meter'] == 1]
data_to_plot = newData.groupby('primary_use')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(24, 11))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('primary_use')
plt.ylabel('mean meter_reading')
plt.xticks(rotation=45, ha='right')  # keep neighbouring category labels from overlapping
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Usage of different types of buildings that use Steam meter

In [None]:
# Mean steam (meter == 2) reading for each primary_use category.
# Groups on the 'primary_use' column and averages only 'meter_reading'. The
# previous version also asked for the mean of 'primary_use' itself (not a
# meaningful average for category labels), selected the columns with a bare
# tuple, which pandas 2.x rejects, discarded the result of drop(), and
# re-indexed a filtered slice in place.
newData = data[data['meter'] == 2]
data_to_plot = newData.groupby('primary_use')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(25, 17))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('primary_use')
plt.ylabel('mean meter_reading')
plt.xticks(rotation=45, ha='right')  # keep neighbouring category labels from overlapping
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

# Usage of different types of buildings that use Hot Water meter

In [None]:
# Mean hot water (meter == 3) reading for each primary_use category.
# Groups on the 'primary_use' column and averages only 'meter_reading'. The
# previous version also asked for the mean of 'primary_use' itself (not a
# meaningful average for category labels), selected the columns with a bare
# tuple, which pandas 2.x rejects, discarded the result of drop(), and
# re-indexed a filtered slice in place.
newData = data[data['meter'] == 3]
data_to_plot = newData.groupby('primary_use')[['meter_reading']].mean()
idx = data_to_plot.index
meter = data_to_plot['meter_reading']
plt.style.use('ggplot')
plt.figure(figsize=(25, 17))
plt.plot(idx, meter, '-bo', label='meter_reading')
plt.xlabel('primary_use')
plt.ylabel('mean meter_reading')
plt.xticks(rotation=45, ha='right')  # keep neighbouring category labels from overlapping
plt.legend()  # the label passed to plot() is only shown once a legend is drawn
plt.show()

In [None]:
# Remaining missing values per column before encoding.
data.isna().sum()

# Converting primary use to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit the encoder on the primary_use labels, then overwrite the column with the
# integer codes. `le` keeps the fitted mapping for later use.
le = LabelEncoder().fit(data['primary_use'])
data['primary_use'] = le.transform(data['primary_use'])

In [None]:
# Render the frame after primary_use has been label-encoded.
data