In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the 2016 benchmarking CSV with all of its columns. The commented-out
# list below is a narrower column selection that is currently not applied.
#usefulcols=['OSEBuildingID','BuildingType','PrimaryPropertyType','PropertyName','YearBuilt','ENERGYSTARScore','SiteEnergyUse(kBtu)','TotalGHGEmissions']
data2016= pd.read_csv('/kaggle/input/sea-building-energy-benchmarking/2016-building-energy-benchmarking.csv')
data2016

In [None]:
# Load the 2015 benchmarking CSV with all of its columns. The commented-out
# list below is a narrower column selection that is currently not applied.
#usefulcols_=['OSEBuildingID','BuildingType','PrimaryPropertyType','PropertyName','YearBuilt','ENERGYSTARScore','SiteEnergyUse(kBtu)','GHGEmissions(MetricTonsCO2e)']
data2015= pd.read_csv('/kaggle/input/sea-building-energy-benchmarking/2015-building-energy-benchmarking.csv')
data2015

**Cleaning Data**

First let us find unique columns. Then we will rename columns in both 2015 and 2016 datasets, so all columns match. Unnecessary or empty columns will be removed.

In [None]:
data2015.columns

In [None]:
data2016.columns

In [None]:
# Split the 2015 header into names shared with 2016 and names found only in 2015.
column_2015 = set(data2015.columns)
column_2016 = set(data2016.columns)
common_columns = column_2015 & column_2016
unique_2015 = column_2015 - common_columns
unique_2015

In [None]:
# Column names that appear only in the 2016 file.
unique_2016 = column_2016 - common_columns
unique_2016

In [None]:
# View only the columns that exist in 2015 but not in 2016.
unique2015df = data2015[[*unique_2015]]
unique2015df

In [None]:
# View only the columns that exist in 2016 but not in 2015.
unique2016df = data2016[[*unique_2016]]
unique2016df

In [None]:
unique2015df.info()

In [None]:
unique2016df.info()

In [None]:
unique2016df['Comments'].unique()

In [None]:
unique2015df['Comment'].unique()

Drop the comment columns from both datasets, as they are not useful for our analysis.

In [None]:
# Build working frames for both years without the free-text comment column;
# drop() returns a new frame, so the raw data2015/data2016 stay untouched.
data_2015_updated = data2015.drop(columns='Comment')
data_2016_updated = data2016.drop(columns='Comments')

In [None]:
# Give the 2015 zip column the 2016 name and store it as object dtype in both years.
# NOTE(review): assumes 2015 'Zip Codes' holds the same kind of value as
# 2016 'ZipCode' — confirm against the data.
data_2015_updated = data_2015_updated.rename(columns={'Zip Codes': 'ZipCode'})
for yearly_frame in (data_2015_updated, data_2016_updated):
    yearly_frame['ZipCode'] = yearly_frame['ZipCode'].astype('object')

In [None]:
import ast

# Each 'Location' cell is a string containing a dict literal. Parse every row
# once (the previous version parsed each row twice) and pull both coordinates
# from the parsed dict; a missing key still raises KeyError.
parsed_locations = data_2015_updated['Location'].map(ast.literal_eval)
data_2015_updated['Latitude'] = parsed_locations.map(lambda loc: loc['latitude'])
data_2015_updated['Longitude'] = parsed_locations.map(lambda loc: loc['longitude'])

# The raw string column is no longer needed once the coordinates are extracted.
data_2015_updated = data_2015_updated.drop(columns='Location')


* Extract latitude, longitude, address.
Remove other columns which are not useful for our case.

In [None]:
# Discard the 2015 columns that are not needed for this analysis.
columns_to_drop_2015 = [
    'OtherFuelUse(kBtu)',
    '2010 Census Tracts',
    'SPD Beats',
    'City Council Districts',
    'Seattle Police Department Micro Community Policing Plan Areas',
]
data_2015_updated = data_2015_updated.drop(columns=columns_to_drop_2015)

In [None]:
# Discard the 2016 address-related columns that are not needed for this analysis.
data_2016_updated = data_2016_updated.drop(columns=['City', 'Address', 'State'])

In [None]:
# Rename the 2015 GHG emission columns so their names are the same in both dataframes.
ghg_column_renames = {
    'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
    'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity',
}
data_2015_updated = data_2015_updated.rename(columns=ghg_column_renames)



In [None]:
# Re-run the column comparison on the cleaned frames: names left only in 2015.
column_2015_updated = set(data_2015_updated.columns)
column_2016_updated = set(data_2016_updated.columns)
common_columns = column_2015_updated & column_2016_updated
unique_2015_updated = column_2015_updated - common_columns
unique_2015_updated

In [None]:
# Names left only in the cleaned 2016 frame.
unique_2016_updated = column_2016_updated - common_columns
unique_2016_updated

No unique columns remain in the datasets. Now we can combine them

In [None]:
data_2015_updated.info()

In [None]:
data_2016_updated.info()

Making datatypes same in both datasets for correct combination

In [None]:
# astype() returns a new object instead of converting in place, so the result
# has to be assigned back; previously the conversions were computed and discarded.
float_columns_2015 = ['Latitude', 'Longitude', 'NumberofBuildings']
data_2015_updated[float_columns_2015] = data_2015_updated[float_columns_2015].astype(float)
data_2016_updated['NumberofFloors'] = data_2016_updated['NumberofFloors'].astype(float)

# 'DefaultData' is removed from both years before the frames are combined.
data_2015_updated = data_2015_updated.drop(columns='DefaultData')
data_2016_updated = data_2016_updated.drop(columns='DefaultData')

In [None]:
# Stack the 2015 rows on top of the 2016 rows; ignore_index=True renumbers the
# combined index instead of keeping each year's original row labels.
bldg_data=pd.concat([data_2015_updated, data_2016_updated], ignore_index=True)
bldg_data

Now we can work with one dataset bldg_data

In [None]:
bldg_data.info()

In [None]:
# Count the NaN values in each column and print the full, untruncated series.
missing_per_column = bldg_data.isna().sum()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(missing_per_column)

In [None]:
# Columns dominated by missing values are removed: a column is kept only if it
# has at least 5000 non-NaN entries.
bldg_data = bldg_data.dropna(axis=1, thresh=5000)


In [None]:
# Re-check the per-column NaN counts, printed in full.
remaining_missing = bldg_data.isna().sum()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(remaining_missing)

**Univariate Analysis**
Simple univariate analysis will help us fill missing items in the dataset and also give us basic information about various columns.

In [None]:
sns.__version__

In [None]:
# Switch on seaborn's default theme, then draw the ENERGYSTARScore histogram
# using bins that are 5 units wide.
sns.set_theme()
sns.displot(data=bldg_data, x='ENERGYSTARScore', binwidth=5)

In [None]:

# Overlay the ENERGYSTARScore distribution of each building type as step histograms.
sns.displot(
    data=bldg_data,
    x='ENERGYSTARScore',
    hue='BuildingType',
    element="step",
    palette='bright',
    height=6,
    aspect=1.5,
)


In [None]:
# Number of rows per building type, drawn on a wide figure.
sns.displot(data=bldg_data, x='BuildingType', aspect=3.5)

We can observe that distribution of EnergyStar Score varies according to building type. So we can fill NaN values using building type as a parameter. 

In [None]:
bldg_data['BuildingType'].value_counts()

In [None]:
# Fill NaN entries of each column with the mean of that column within the
# row's BuildingType.
group_mean_fill_columns = ['ENERGYSTARScore', 'TotalGHGEmissions', 'Electricity(kWh)']
for column in group_mean_fill_columns:
    group_means = bldg_data.groupby('BuildingType')[column].transform('mean')
    bldg_data[column] = bldg_data[column].fillna(group_means)

# Any ENERGYSTARScore still missing after the group-mean fill is taken from the
# previous row. The result is assigned back explicitly: an in-place fillna on a
# column selection is not guaranteed to write through to bldg_data, and the
# `method=` argument of fillna is deprecated in favour of ffill().
bldg_data['ENERGYSTARScore'] = bldg_data['ENERGYSTARScore'].ffill()

Observing distribution of other important factors-

In [None]:
# Histogram of Electricity(kWh); the x-axis is clipped to 0..10,000,000.
sns.displot(data=bldg_data, x='Electricity(kWh)', aspect=4)
plt.xlim(0, 10_000_000)

We can see that most buildings have electricity consumption under $2 \times 10^6$ kWh.

In [None]:
# Histogram of GHGEmissionsIntensity; the x-axis is clipped to 0..10.
sns.displot(data=bldg_data, x='GHGEmissionsIntensity')
plt.xlim(0, 10)

In [None]:
# Scatter (green dots) of ENERGYSTARScore against Electricity(kWh), with the
# y-axis clipped to 0..50,000,000.
ax = plt.gca()
ax.plot(bldg_data['ENERGYSTARScore'], bldg_data['Electricity(kWh)'], 'go')
ax.set_ylim(0, 50_000_000)
ax.set_title('Energystar vs Electricity consumption')
ax.set_xlabel('Energystar score')
ax.set_ylabel('Electricity consumption in kWh')
plt.show()

In [None]:
# Regression plot of Electricity(kBtu) on ENERGYSTARScore with a red fit line;
# the y-axis is clipped to 0..50,000,000.
sns.regplot(
    data=bldg_data,
    x='ENERGYSTARScore',
    y='Electricity(kBtu)',
    line_kws=dict(color='red'),
)
plt.ylim(0, 50_000_000)

In [None]:
# Keep only rows whose Electricity(kWh) is below 50,000,000.
below_electricity_cap = bldg_data['Electricity(kWh)'] < 50_000_000
bldg_data = bldg_data[below_electricity_cap]


In [None]:
# Regression plot of TotalGHGEmissions on ENERGYSTARScore with a red fit line;
# the y-axis is clipped to 0..2500.
sns.regplot(
    data=bldg_data,
    x='ENERGYSTARScore',
    y='TotalGHGEmissions',
    line_kws=dict(color='red'),
)
plt.ylim(0, 2500)



As electricity consumption or GHG emissions increase, the ENERGY STAR score decreases.


In [None]:
# Regression plot of Electricity(kWh) on TotalGHGEmissions with a red fit line;
# the x-axis is clipped to 0..4000.
sns.regplot(
    data=bldg_data,
    x='TotalGHGEmissions',
    y='Electricity(kWh)',
    line_kws=dict(color='red'),
)
plt.xlim(0, 4000)

Power consumption clearly increases with increasing GHG emissions.

In [None]:
# Regression plot of Electricity(kWh) on NumberofFloors; regplot returns the
# axes it drew on, so the title is set on that object.
floors_electricity_ax = sns.regplot(data=bldg_data, x='NumberofFloors', y='Electricity(kWh)')
floors_electricity_ax.set_title('No. of floors vs Electricity consumption')

In [None]:
# Regression plot of TotalGHGEmissions on NumberofFloors; regplot returns the
# axes it drew on, so the title is set on that object.
floors_ghg_ax = sns.regplot(data=bldg_data, x='NumberofFloors', y='TotalGHGEmissions')
floors_ghg_ax.set_title('No. of floors vs GHG Emissions')

Both emissions and electricity consumption increase with an increase in number of floors in buildings. Thus this factor can be used in prediction of electricity and emission.

In [None]:
# Neighborhood names appear in both upper case and mixed case across the two
# source years (e.g. 'Central' vs 'CENTRAL'). Upper-case the whole column so
# every spelling of a name collapses into one category. The earlier frame-wide
# replace() listed only four names, so other mixed-case spellings such as
# 'Ballard' were left behind, and it also scanned columns other than Neighborhood.
# assign() builds a new frame, which avoids writing into the filtered frame.
bldg_data = bldg_data.assign(Neighborhood=bldg_data['Neighborhood'].str.upper())

In [None]:
# Bar chart of row counts per neighborhood on a 10x5 figure, with the category
# labels rotated so they do not overlap.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=bldg_data, x='Neighborhood', ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
fig.tight_layout()

plt.show()

In [None]:
# Box plot of Electricity(kBtu) per neighborhood; the y-axis is clipped to 0..20,000,000.
sns.catplot(
    data=bldg_data,
    x='Neighborhood',
    y='Electricity(kBtu)',
    kind='box',
    height=10,
    aspect=3,
)

plt.ylim(0, 20_000_000)

In [None]:
# Box plot of TotalGHGEmissions per neighborhood; the y-axis is clipped to 0..500.
sns.catplot(
    data=bldg_data,
    x='Neighborhood',
    y='TotalGHGEmissions',
    kind='box',
    height=10,
    aspect=3,
)
plt.ylim(0, 500)

From the above graph, we can observe that- the neighborhood of Downtown has the highest median electricity consumption, followed by Lake Union. Such areas have a higher probability of high electricity consumption as compared to other areas. 
This parameter can be suitable for prediction of electricity consumption.

Similarly, the Downtown, East and Lake Union neighborhoods have the highest median GHG emissions. Thus, neighborhood can be useful in the prediction of GHG emissions as well. 

In [None]:
# Box plot of TotalGHGEmissions per building type; the y-axis is clipped to 0..1000.
sns.catplot(
    data=bldg_data,
    x='BuildingType',
    y='TotalGHGEmissions',
    kind='box',
    height=10,
    aspect=3,
)
plt.ylim(0, 1000)

In [None]:
# Box plot of Electricity(kBtu) per building type; the y-axis is clipped to 0..50,000,000.
sns.catplot(
    data=bldg_data,
    x='BuildingType',
    y='Electricity(kBtu)',
    kind='box',
    height=10,
    aspect=3,
)
plt.ylim(0, 50_000_000)

Above boxplots depict that Campuses have much higher electricity consumption and GHG emission out of all building types, followed by multifamily HR.
Building type can be a helpful parameter in our predictions.

In [None]:
# Regression plot of GHGEmissionsIntensity on YearBuilt with a red fit line;
# the y-axis is clipped to 0..10.
sns.regplot(
    data=bldg_data,
    x='YearBuilt',
    y='GHGEmissionsIntensity',
    line_kws=dict(color='red'),
)
plt.ylim(0, 10)

We can see that Emissions are negatively correlated to year built, with older buildings having higher GHG emissions. 
Thus this variable can be used in emission prediction.

In [None]:
# Regression plot of GHGEmissionsIntensity on PropertyGFATotal with a red fit
# line; the x-axis is clipped to 0..2,000,000.
sns.regplot(
    data=bldg_data,
    x='PropertyGFATotal',
    y='GHGEmissionsIntensity',
    line_kws=dict(color='red'),
)
plt.xlim(0, 2_000_000)

In [None]:
# Regression plot of Electricity(kBtu) on PropertyGFATotal with a red fit line;
# the x-axis is clipped to 0..2,000,000.
sns.regplot(
    data=bldg_data,
    x='PropertyGFATotal',
    y='Electricity(kBtu)',
    line_kws=dict(color='red'),
)
plt.xlim(0, 2_000_000)

And as we can already guess, electricity consumption is proportional to the Gross Floor Area (GFA) of the property.

In [None]:
# Subset of bldg_data columns that can help predict energy consumption
# (bldg_x), plus the prediction target (bldg_y).
# Explicit copies are taken because new columns are added to bldg_x later;
# writing into a frame derived from bldg_data without copy() triggers
# SettingWithCopyWarning.
feature_columns = [
    'BuildingType',
    'Neighborhood',
    'ENERGYSTARScore',
    'TotalGHGEmissions',
    'YearBuilt',
    'PropertyGFATotal',
]
bldg_x = bldg_data[feature_columns].copy()
bldg_y = bldg_data[['Electricity(kWh)']].copy()

In [None]:
bldg_x.head(10)

In [None]:
bldg_x.info()

In [None]:
# Integer code for each building type, stored in the new column 'BuildingTypeRep'.
# map() is used for the label -> code lookup instead of replace(), so the
# result does not depend on replace() converting a string column to numbers;
# a label absent from the mapping becomes NaN instead of staying a string.
# assign() returns a new frame, so nothing is written into a frame derived
# from bldg_data.
building_type_codes = {
    'NonResidential': 0,
    'Nonresidential COS': 1,
    'Multifamily MR (5-9)': 2,
    'SPS-District K-12': 3,
    'Multifamily LR (1-4)': 4,
    'Campus': 5,
    'Multifamily HR (10+)': 6,
    'Nonresidential WA': 7,
}
bldg_x = bldg_x.assign(BuildingTypeRep=bldg_x['BuildingType'].map(building_type_codes))

In [None]:
bldg_x['Neighborhood'].unique()

In [None]:
# Integer code for each neighborhood label, stored in the new column 'NeighborhoodRep'.
# map() is used for the label -> code lookup instead of replace(), so the
# result does not depend on replace() converting a string column to numbers;
# a label absent from the mapping becomes NaN instead of staying a string.
# assign() returns a new frame, so nothing is written into a frame derived
# from bldg_data.
neighborhood_codes = {
    'DOWNTOWN': 0,
    'SOUTHEAST': 1,
    'NORTHEAST': 2,
    'EAST': 3,
    'CENTRAL': 4,
    'NORTH': 5,
    'MAGNOLIA / QUEEN ANNE': 6,
    'LAKE UNION': 7,
    'GREATER DUWAMISH': 8,
    'BALLARD': 9,
    'NORTHWEST': 10,
    'SOUTHWEST': 11,
    'DELRIDGE': 12,
    'Ballard': 13,
    'DELRIDGE NEIGHBORHOODS': 14,
}
bldg_x = bldg_x.assign(NeighborhoodRep=bldg_x['Neighborhood'].map(neighborhood_codes))

In [None]:
# Model input: the feature frame without the raw text category columns
# (their integer-coded versions remain).
raw_category_columns = ['BuildingType', 'Neighborhood']
bldg_pred_x = bldg_x.drop(columns=raw_category_columns)
bldg_pred_x

Let us begin with simple linear regression

In [None]:
#train test split 
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    bldg_pred_x,
    bldg_y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn import linear_model

In [None]:
# Single-feature linear regression: PropertyGFATotal is the only predictor.
reg1 = linear_model.LinearRegression()
reg1train_x = np.asanyarray(x_train[['PropertyGFATotal']])
reg1train_y = np.asanyarray(y_train)

# Show the shapes of the training arrays (features first, then target).
for training_array in (reg1train_x, reg1train_y):
    print(training_array.shape)

In [None]:
# Fit the model on the training arrays, then report the fitted parameters.
reg1.fit(reg1train_x, reg1train_y)
for label, fitted_value in (('Coefficients: ', reg1.coef_), ('Intercept: ', reg1.intercept_)):
    print(label, fitted_value)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Evaluate the single-feature model on the held-out rows.
reg1test_x = np.asanyarray(x_test[['PropertyGFATotal']])
reg1test_y = np.asanyarray(y_test)

# Predictions must come from the test feature (PropertyGFATotal), the same
# input the model was fitted on; previously the target values y_test were
# passed to predict(), so the score did not measure the model at all.
reg1_yhat = reg1.predict(reg1test_x)

print('r^2 score is ', r2_score(reg1test_y, reg1_yhat))

In [None]:
print(reg1test_y)

In [None]:
reg1_yhat