In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory holding the competition CSV files. File names are appended to this
# string directly, so the trailing slash is required.
DATASET_PATH  = '../input/ashrae-energy-prediction/'

In [None]:
# Load the training table from the dataset directory.
data = pd.read_csv(f"{DATASET_PATH}train.csv")


In [None]:
# Load the per-building metadata table.
buildingData = pd.read_csv(f"{DATASET_PATH}building_metadata.csv")

In [None]:
# Number of missing values in each building-metadata column.
buildingData.isna().sum()

# Merging building data with training data

In [None]:
# Left join: every training row is kept and gains the columns of its building.
data = pd.merge(data, buildingData, how='left', on='building_id')

**Removing features having a lot of nulls**

In [None]:
# Discard the two building columns that are dropped for having many nulls.
data = data.drop(columns=['year_built', 'floor_count'])

In [None]:
# Load the weather observations that accompany the train and test periods.
weather_train = pd.read_csv(f"{DATASET_PATH}weather_train.csv")
weather_test = pd.read_csv(f"{DATASET_PATH}weather_test.csv")

# Concatenating weather train and test data

In [None]:
import datetime as dt
# Stack both weather tables into a single frame so that one table can serve the
# train and the test merge, then release the two source frames.
frames = [weather_train, weather_test]
weather_data = pd.concat(frames)
del weather_train, weather_test

# Merging data with weather data

In [None]:
# Left join on site and timestamp: rows without a matching weather record keep
# NaN in the weather columns.
data = pd.merge(data, weather_data, how='left', on=['site_id', 'timestamp'])


# Converting the timestamp feature to datetime to extract further features: day, hour, weekday, month, and year

In [None]:
# Parse the timestamp strings once, then derive the calendar features from them.
data["timestamp"] = pd.to_datetime(data["timestamp"])
# (new column, datetime accessor attribute); note that "week" holds the weekday.
for column, attribute in (
    ("day", "day"),
    ("hour", "hour"),
    ("week", "weekday"),
    ("month", "month"),
    ("year", "year"),
):
    data[column] = getattr(data["timestamp"].dt, attribute)
# "timestamp" and weather_data are intentionally kept here: the column is dropped
# in a later cell and weather_data is still needed to build the test set.

# Deleting useless variables to reduce memory usage

In [None]:
# `frames` was only the temporary list passed to pd.concat; drop the reference.
del frames

In [None]:
# Number of missing values in each column of the merged training frame.
data.isna().sum()

# Removing unimportant features

In [None]:
# Discard the weather columns that are not used as model features.
data = data.drop(columns=['cloud_coverage', 'wind_direction', 'sea_level_pressure'])
data

# Filling nulls

In [None]:
# Impute missing weather readings with the mean of their own column.
for column in ('air_temperature', 'wind_speed', 'dew_temperature', 'precip_depth_1_hr'):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Summary statistics of the training frame after the null filling above.
data.describe()

# Fixing the site_id 0 error

In [None]:
# Scale every meter reading recorded at site 0 by 0.2931 and leave all other
# rows untouched. np.where evaluates the rule column-wise instead of looping
# over the rows in Python, which gives the same values far faster.
# NOTE(review): 0.2931 looks like a kBTU -> kWh conversion factor, and it is
# applied to every meter type at site 0 -- confirm whether only meter 0 should
# be rescaled.
# Caution: this cell is not idempotent; re-running it scales site 0 again.
data['meter_reading'] = np.where(
    data['site_id'] == 0,
    data['meter_reading'] * 0.2931,
    data['meter_reading'],
)
data.head()

# Removing Outliers

In [None]:
# Overwrite implausibly large readings of meter type 0 (above 40000) with the
# mean of all meter-0 readings.
mask1 = data["meter"] == 0
mask2 = data["meter_reading"] > 40000
mask = mask1 & mask2
print(data.shape)
# Assign through one .loc call. The chained form data[mask]["meter_reading"] = ...
# writes into a temporary copy and leaves `data` unchanged.
data.loc[mask, "meter_reading"] = data.loc[mask1, "meter_reading"].mean()

In [None]:
# Overwrite implausibly large readings of meter type 0 (above 40000) with the
# mean of all meter-0 readings.
mask1 = data["meter"] == 0
mask2 = data["meter_reading"] > 40000
mask = mask1 & mask2
print(data.shape)
# Assign through one .loc call. The chained form data[mask]["meter_reading"] = ...
# writes into a temporary copy and leaves `data` unchanged.
data.loc[mask, "meter_reading"] = data.loc[mask1, "meter_reading"].mean()

In [None]:
# Overwrite implausibly large readings of meter type 3 (above 140000) with the
# mean of all meter-3 readings.
mask1 = data["meter"] == 3
mask2 = data["meter_reading"] > 140000
mask = mask1 & mask2
print(data.shape)
# Assign through one .loc call. The chained form data[mask]["meter_reading"] = ...
# writes into a temporary copy and leaves `data` unchanged.
data.loc[mask, "meter_reading"] = data.loc[mask1, "meter_reading"].mean()
# Rows are overwritten, not dropped, so the shape is the same as before.
print(data.shape)
# Release all three boolean masks; none of them is used after this cell.
del mask1, mask2, mask

# Converting primary use to numeric

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the category -> integer mapping from the training data and keep the
# fitted encoder in `le`, then replace the text labels with their codes.
le = LabelEncoder().fit(data['primary_use'])
data['primary_use'] = le.transform(data['primary_use'])

# Removing timestamp to reduce features size

In [None]:
# The calendar features have been extracted, so the raw timestamp can go.
data = data.drop(columns='timestamp')

# Test data section

> **We will apply the same operations that we applied to the training data**

In [None]:
# Load the test table from the dataset directory.
testData = pd.read_csv(f"{DATASET_PATH}test.csv")

In [None]:

# Attach building metadata and then weather, using the same left joins as for
# the training data. The joins stay as two separate assignments so the previous
# frame can be released before the next merge runs.
testData = pd.merge(testData, buildingData, how='left', on='building_id')
testData = pd.merge(testData, weather_data, how='left', on=['site_id', 'timestamp'])
testData

In [None]:
# Drop the row identifier together with the building and weather columns that
# were also removed from the training frame.
testData = testData.drop(columns=[
    'row_id',
    'year_built',
    'floor_count',
    'cloud_coverage',
    'sea_level_pressure',
    'wind_direction',
])

In [None]:
# Both lookup tables have been merged into the train and test frames; free them.
del weather_data, buildingData

In [None]:
# Parse the timestamp strings once, then derive the calendar features from them.
testData["timestamp"] = pd.to_datetime(testData["timestamp"])
# (new column, datetime accessor attribute); note that "week" holds the weekday.
for column, attribute in (
    ("day", "day"),
    ("hour", "hour"),
    ("week", "weekday"),
    ("month", "month"),
    ("year", "year"),
):
    testData[column] = getattr(testData["timestamp"].dt, attribute)
# The raw timestamp is no longer needed once the features exist.
testData = testData.drop(columns="timestamp")


In [None]:
# Impute missing weather readings with the mean of their own test-set column.
for column in ('air_temperature', 'wind_speed', 'dew_temperature', 'precip_depth_1_hr'):
    testData[column] = testData[column].fillna(testData[column].mean())

In [None]:
# Display the test frame after merging, feature extraction and null filling.
testData

In [None]:
# Encode with the LabelEncoder that was already fitted on the training data, so
# each primary_use category maps to the same integer in train and test.
# Refitting on the test frame (fit_transform) would rebuild the mapping from the
# test labels and could assign different codes if the category sets differ.
# transform() raises on a label that was not seen during fitting.
testData['primary_use'] = le.transform(testData['primary_use'])

# Algorithms Section

In [None]:
from sklearn.linear_model import LinearRegression

# Preparing X train and Y train for prediction

# **We could not use the test data because of memory problems**

In [None]:
# Model inputs: building and meter identifiers, building size, weather readings
# and the calendar features derived from the timestamp.
x_train = data.loc[:, [
    'building_id', 'meter', 'primary_use', 'square_feet',
    'air_temperature', 'dew_temperature', 'precip_depth_1_hr', 'wind_speed',
    'day', 'hour', 'week', 'month', 'year',
]]
# Regression target.
y_train = data['meter_reading']
x_train

In [None]:
# Display the target series (the meter_reading column of the training frame).
y_train

In [None]:
# Linear regression with default settings; it is fitted in the next cell.
model = LinearRegression()

In [None]:
# Fit on plain NumPy arrays taken from the training features and target.
model.fit(x_train.to_numpy(), y_train.to_numpy())

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regression with regularisation strength C=5, fitted on the
# full training arrays.
# NOTE(review): kernel SVR is documented to scale worse than quadratically with
# the number of samples -- confirm that fitting on every training row is
# feasible, or fit on a subsample instead.
svrModel = SVR(C=5)
svrModel.fit(x_train.to_numpy(), y_train.to_numpy())

In [None]:
from sklearn.linear_model import SGDRegressor


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Standardise the features, then fit a linear model with stochastic gradient
# descent. random_state is fixed so that repeated runs of the notebook produce
# the same fitted coefficients and the same score.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
)
reg.fit(x_train.values, y_train.values)

In [None]:
# Score of the SGD pipeline on the rows it was trained on; this is not a
# held-out estimate.
reg.score(x_train.to_numpy(), y_train.to_numpy())

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with penalty strength alpha=1.0, fitted on the training arrays.
clf = Ridge(alpha=1.0)
clf.fit(x_train.to_numpy(), y_train.to_numpy())


In [None]:
# Score of the ridge model on the rows it was trained on; this is not a
# held-out estimate.
clf.score(x_train.to_numpy(), y_train.to_numpy())