In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder 
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,LabelEncoder

import gc
import psutil

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Any results you write to the current directory are saved as output.

In [None]:
# All competition CSVs live in the same Kaggle input folder.
INPUT_DIR = "../input/ashrae-energy-prediction/"

building_metadata = pd.read_csv(INPUT_DIR + "building_metadata.csv")
sample_submission = pd.read_csv(INPUT_DIR + "sample_submission.csv")
test = pd.read_csv(INPUT_DIR + "test.csv")
train = pd.read_csv(INPUT_DIR + "train.csv")
weather_test = pd.read_csv(INPUT_DIR + "weather_test.csv")
weather_train = pd.read_csv(INPUT_DIR + "weather_train.csv")

# Apply seaborn's default styling to every figure drawn below.
sns.set()


# Basic information

### summary of results:
The csv file **building metadata** (**count = 1449**) contains each building's structural information, such as its primary use, square feet, year_built and floor count, as well as identifiers such as building and site ids.
It is worth noting that floor count and year built have many missing values and that site id has several categories with many repetitions. 

Also, there is a high amount of variance in the square_feet features which could be a problem. The most populated categories in primary_use are:

Education                        549

Office                           279

Entertainment/public assembly    184

Public services                  156

Lodging/residential              147

The rest have fewer than 50 occurrences.

On the other hand, **training.csv (count=20216100)** has no missing values. The file provides the meter_reading and the type of meter used to get it, as well as the timestamp corresponding to the time of the reading. Meter reading has a standard deviation of the order of 10^5, but 75% of the samples are below a number close to 2 * 10^2, which implies the presence of outliers. The timestamp feature shows readings from all of 2016.

Finally, **weather_training.csv (count=139773)** deals with features related to the weather conditions at the sites of the buildings. Precipitation depth and cloud coverage have a significant number of missing values. It also includes a timestamp indicating when each reading was made during 2016.



* ### building metadata

In [None]:
# Missing-value matrix for the building metadata, followed by its first rows.
msno.matrix(building_metadata)
plt.show()
building_metadata.head(n=5)

In [None]:
# Row count first, then summary statistics for two numeric features.
print("lenght: ", building_metadata.shape[0])
building_metadata.loc[:, ['square_feet', 'floor_count']].describe()

In [None]:
# Categorical features: frequency table of each one.
for cat_col in ('primary_use', 'site_id'):
    print(building_metadata[cat_col].value_counts())

# Dates: the known construction years in ascending order.
building_metadata['year_built'].dropna().sort_values()

### Training set

In [None]:
# Number of rows (individual meter readings) in the training set.
print(train.shape[0])

In [None]:
# Missing-value matrix for the training set, followed by its first rows.
msno.matrix(train)
plt.show()
train.head(n=5)

In [None]:
# Summary statistics of the only numeric variable in train: the meter reading.
train.loc[:, ['meter_reading']].describe()


In [None]:
# Categorical: number of readings per meter type.
print(train['meter'].value_counts())
# Dates: every distinct timestamp, in ascending order.
train['timestamp'].drop_duplicates().sort_values()

### weather train

In [None]:
# Number of rows in the weather training set.
print('length: ', weather_train.shape[0])

In [None]:
# Missing-value matrix for the weather data, followed by its first rows.
msno.matrix(weather_train)
plt.show()
weather_train.head(n=5)

In [None]:
# Summary statistics for each numeric weather variable.
weather_train.loc[:, [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]].describe()

In [None]:
# Dates: sorted timestamps with repeats (one per site) removed.
weather_train['timestamp'].sort_values().drop_duplicates()

 # Building the dataset
The data set is built by merging the building metadata and the weather data into the training readings in such a way that the timestamp measurements are preserved.


In [None]:
# Left joins keep every meter reading (and its timestamp): first attach the
# building attributes, then the weather recorded for that site and timestamp.
pd_data = (
    train
    .merge(building_metadata, how='left', on='building_id')
    .merge(weather_train, how='left', on=['site_id', 'timestamp'])
)

In [None]:
# Row counts of the three inputs followed by the merged frame.
for frame in (train, building_metadata, weather_train, pd_data):
    print(len(frame))

# Missing values per column after the merge.
pd_data.isna().sum()

In [None]:
# Column groups used by the cells below.
target = ['meter_reading']
cat_features = ['primary_use', 'site_id', 'meter']
num_features = [
    'floor_count', 'year_built', 'square_feet',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure',
    'wind_direction', 'wind_speed',
]

In [None]:
print(pd_data[num_features].dtypes)

# Halve the width of every 64-bit numeric column to reduce memory use.
narrower = {np.dtype('float64'): np.float32, np.dtype('int64'): np.int32}
for name in pd_data.columns:
    new_dtype = narrower.get(pd_data[name].dtype)
    if new_dtype is not None:
        pd_data[name] = pd_data[name].astype(new_dtype)

# Memory report before and after a forced garbage collection.
ram_label = "available RAM:"
print(ram_label, psutil.virtual_memory())

gc.collect()

print(ram_label, psutil.virtual_memory())

In [None]:
# Confirm the dtypes of the numeric features after the downcast.
print(pd_data.loc[:, num_features].dtypes)


# Basic Statistical description

**1. meter_reading:** There is a wide range of values for meter readings, some of them even reaching 10^7.

**2. site_id :** The most populated sites are 3 and 13 but otherwise it is pretty evenly distributed inbetween.

**3. meter:** The meter with the most readings (by a large margin) is electricity (0). Hot water (3) seems to be rare. 

**4. primary_use.** Education is the most common use. Followed by office, entertainment and public services.

**5. year built:**  There are two main spikes: the  70's and early 2000's

**6. meter reading:** By taking the mean reading for each site and each time unit, one can see that the behaviour of the reading across the year is different for each site. Some sites present an increase in readings between June and August, while others are mostly uniform during the year. After repeating the same procedure but separating each meter type, it is found that chilled water tends to have the highest reading across the year, but no defined pattern is obvious for all sites.



In [None]:
meter_reading = pd_data['meter_reading']
print(meter_reading.describe())
plt.figure(figsize=(15,7))

# left panel: full range, outliers drawn
plt.subplot(1,2,1)
sns.boxplot(y = meter_reading)
plt.xlabel('meter_reading')

# right panel: same data with the outlier markers hidden
plt.subplot(1,2,2)
sns.boxplot(y = meter_reading, showfliers=False)
plt.xlabel('meter_reading')


In [None]:
# site population: number of buildings per site
# The index is named before reset_index so the columns are always
# ('site', 'count'). Renaming the default 'index' column only works on
# pandas < 2.0; newer versions already name the columns ('site_id', 'count'),
# so the old rename produced two 'count' columns and no 'site' column.
dd = building_metadata['site_id'].value_counts().rename_axis('site').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'site',y='count',data = dd)
plt.title('population for each site')

In [None]:
# meter population: number of readings per meter type
# The index is named before reset_index so the columns are always
# ('meter_type', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
dd = pd_data['meter'].value_counts().rename_axis('meter_type').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'meter_type',y='count',data = dd)
plt.title('population for each type of meter')

In [None]:
#primary use: number of buildings per declared use
# The index is named before reset_index so the columns are always
# ('primary_use', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
dd = building_metadata['primary_use'].value_counts().rename_axis('primary_use').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'primary_use',y='count',data = dd)
plt.xticks(rotation=90);
plt.title('population for use')

In [None]:
# Only buildings with a reported construction year take part in this plot.
year_built_known = building_metadata['year_built'].dropna()
print('year built mode: ', year_built_known.mode())

# year built: histogram with a KDE overlay
plt.figure(figsize=(11,7))
sns.distplot(year_built_known, kde=True)
plt.title('distribution of year built')
plt.xlabel('year built')
plt.ylabel('occurences')
plt.show()

## Meter reading across the year

First, we take the mean meter reading for each site on each day and plot the resulting time series. Then, the procedure is repeated, separating each type of meter.


In [None]:
#we can look at how the meter reading changes during the year
pd_stamp = pd_data.copy()
# truncate every timestamp to its calendar day so readings can be averaged per day
pd_stamp['timestamp'] = pd.to_datetime(pd_stamp.timestamp)
pd_stamp['timestamp'] = pd_stamp.timestamp.dt.date
# mean() skips missing values by itself. The former whole-frame fillna(0)
# copied every column of the merged frame a second time and would have
# counted any missing reading as zero consumption.
pd_meter = pd_stamp.groupby(['site_id','timestamp'])['meter_reading'].mean().reset_index()
# index by day so plt.plot uses the date as the x axis
pd_meter = pd_meter.set_index('timestamp')

In [None]:
#meter reading: one stacked panel per site
plt.figure(figsize=(12,35))
for i, s in enumerate(pd_meter.site_id.unique(), start=1):
    plt.subplot(16,1,i)
    site_series = pd_meter.loc[pd_meter['site_id']==s, 'meter_reading']
    plt.plot(site_series, alpha=0.5, color='navy', label=str(s));
    plt.legend()
plt.tight_layout()

In [None]:
# Daily mean reading per site AND meter type.
# The stray `pd_weather.isna().sum()` line was removed: pd_weather is only
# created further down the notebook, so it raised NameError on a fresh
# Restart & Run All. The whole-frame fillna(0) is also dropped because
# mean() already skips missing values.
pd_meter = pd_stamp.groupby(['site_id','meter','timestamp'])['meter_reading'].mean().reset_index()
# index by day so plt.plot uses the date as the x axis
pd_meter = pd_meter.set_index('timestamp')


In [None]:
#meter reading: one panel per site, one line per meter type
plt.figure(figsize=(12,35))
meter_types = pd_meter.meter.unique()
for i, s in enumerate(pd_meter.site_id.unique(), start=1):
    plt.subplot(16,1,i)
    for me in meter_types:
        selected = (pd_meter['site_id']==s) & (pd_meter['meter']==me)
        plt.plot(pd_meter.loc[selected, 'meter_reading'], alpha=0.5, label='site_'+str(s)+'_meter_'+str(me));
    plt.legend()
plt.tight_layout()

# Outliers

As will be shown, the distribution of meter reading has heavy tails, which could be problematic for the algorithm. It is necessary to correctly identify these outliers.

It is worth noting that almost all of the outliers have missing values for year_built and floor_count. In fact, only 10 buildings have a reported year_built. 

On the other hand, sites 9 and 13 have the highest number of outliers. With respect to meter type, steam (type 2) holds the overwhelming majority of outlier values, which could be due to the inefficiency of this energy source. Education also holds the majority of outliers. 

Going more in depth with sites 9 and 13, they hold meters of types 0, 1 and 2 (electricity, chilled water and steam). The weather variables during the year for sites 9 and 13 are not too far from the norm. The only one worth noting is dew temperature, whose min and max values tend to be on the higher end of the spectrum.



In [None]:
meter_reading = pd_data['meter_reading']
print(meter_reading.describe())

# histogram (no KDE) of every reading in the merged data
plt.figure(figsize=(11,7))
sns.distplot(meter_reading, kde=False)
plt.title('all meter readings')
plt.xlabel('meter reading')
plt.ylabel('occurences')
plt.show()



As can be seen, there are some outliers with more than 300 in meter reading. Otherwise, the distribution is concentrated around values close to zero.

In [None]:

# Histogram of meter_reading in three magnitude bands, one figure per band.
readings = pd_data['meter_reading']
# (row mask, figure title) per band. The first title used to read
# 'meters below Q3', but the filter is `<= 10**3`, not the third quartile,
# so it is now labelled with the threshold that is actually applied.
bands = [
    (readings <= (10**3), 'At or below 10^3'),
    ((readings > (10**3)) & (readings <= (10**5)), 'Above 10^3 and below in 10^5'),
    (readings > (10**5), 'Above 10^5'),
]
for band_mask, band_title in bands:
    plt.figure(figsize=(11,7))
    sns.distplot(readings[band_mask], kde=False)
    plt.ylabel('occurences')
    plt.xlabel('meter reading')
    plt.title(band_title)
    plt.show()

> As can be seen from the graphs above, the tails of the distribution of meter reading are very heavy. We can try to identify those buildings with more than 10^4 in meter reading

In [None]:
# Outliers: every reading above 10^4, then one representative row per building.
outlier_mask = pd_data['meter_reading'] > (10**4)
pd_outliers = pd_data[outlier_mask]
pd_outlier_buildings = pd_outliers.drop_duplicates(subset='building_id', keep='first')

building_ids = pd_data['building_id'].unique()
print("total number of buildings: ", len(building_ids))
print("number of buildings with more than 10^4 in meter reading: ", len(pd_outlier_buildings))
len(building_ids)

In [None]:

# Missing-value matrix restricted to the outlier readings (pd_outliers).
msno.matrix(pd_outliers)
plt.show()

In [None]:
# outlier population by site: number of outlier buildings per site
# The index is named before reset_index so the columns are always
# ('site', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
dd = pd_outlier_buildings['site_id'].value_counts().rename_axis('site').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'site',y='count',data = dd)
plt.title('outleir population for each site')

We can try and take a look at the weather patterns of site 9 and 13 and try to identify a difference between them and the rest

In [None]:
# meter population: number of outlier readings per meter type
# The index is named before reset_index so the columns are always
# ('meter_type', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
dd = pd_outliers['meter'].value_counts().rename_axis('meter_type').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'meter_type',y='count',data = dd)
plt.title('outlier population for each type of meter')

## Looking at the meter types in site 9 and 13

In [None]:
# Number of readings per meter type, restricted to sites 9 and 13.
# The index is named before reset_index so the columns are always
# ('meter_type', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
in_outlier_sites = pd_data['site_id'].isin([9,13])
b_plot = pd_data.loc[in_outlier_sites, 'meter'].value_counts().rename_axis('meter_type').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'meter_type',y='count',data = b_plot)
plt.title('meter types in sites 9 and 13')

## Looking at the weather in site 9 and 13

Then at meter type and site at the same time

We can start by taking the mean, max and min of each weather condition for each site and point in time, and comparing the time series of each site.

In [None]:
pd_stamp = pd_data.copy()
# truncate every timestamp to its calendar day
pd_stamp['timestamp'] = pd.to_datetime(pd_stamp.timestamp)
pd_stamp['timestamp'] = pd_stamp.timestamp.dt.date

# Daily mean / max / min of each weather variable per site.
# No sentinel fill: the aggregations skip NaN on their own, whereas the former
# fillna(-1) fed -1 into them as if it were a real observation, dragging down
# means and turning minima into -1 wherever a value was missing. Days with no
# observation at all now stay NaN and show up as gaps in the plots.
weather_stats = ['mean', 'max', 'min']
weather_vars = ['air_temperature', 'cloud_coverage', 'dew_temperature',
                'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction']
pd_weather = (pd_stamp.groupby(['site_id','timestamp'])
              .agg({var: weather_stats for var in weather_vars})
              .reset_index())
# index by day so plt.plot uses the date as the x axis
pd_weather = pd_weather.set_index('timestamp')

In [None]:
# Rebuild the per-day view of the merged data: timestamps truncated to dates.
pd_stamp = pd_data.copy()
pd_stamp['timestamp'] = pd.to_datetime(pd_stamp['timestamp']).dt.date

### air temperature

In [None]:
#for air temperature: daily mean per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['air_temperature']['mean']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('mean air temperature, outlier sites in red')


In [None]:
# daily max air temperature per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['air_temperature']['max']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('max air temperature, outlier sites in red')


In [None]:
# daily min air temperature per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['air_temperature']['min']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('min air temperature, outlier sites in red')


> ### Cloud coverage

In [None]:
#for cloud cover: daily mean per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['cloud_coverage']['mean']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('mean cloud coverage, outlier sites in red')

In [None]:
# daily max cloud coverage per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['cloud_coverage']['max']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('max cloud coverage, outlier sites in red')

In [None]:
# daily min cloud coverage per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['cloud_coverage']['min']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

# the title previously read 'min coverage'; it now names the variable like the
# mean/max cloud coverage figures do
plt.title('min cloud coverage, outlier sites in red')

### Dew Temperature

In [None]:
#for dew: daily mean dew temperature per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['dew_temperature']['mean']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('mean dew_temperature, outlier sites in red')

In [None]:

#for dew: daily max dew temperature per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['dew_temperature']['max']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('max dew_temperature, outlier sites in red')

In [None]:
# daily min dew temperature per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['dew_temperature']['min']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('min dew_temperature, outlier sites in red')

### Precipitation depth 1 hours

In [None]:
#precipitation depth: daily mean per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['precip_depth_1_hr']['mean']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('mean precip_depth_1_hr, outlier sites in red')

In [None]:
#precipitation depth: daily min per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['precip_depth_1_hr']['min']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('min precip_depth_1_hr, outlier sites in red')

In [None]:
#precipitation depth: daily max per site
plt.figure(figsize=(15,7))
for s in pd_weather.site_id.unique():
    site_series = pd_weather[pd_weather.site_id==s]['precip_depth_1_hr']['max']
    # sites 9 and 13 are drawn in solid red, every other site in faint navy
    line_style = {'color': 'red'} if s in (9, 13) else {'alpha': 0.2, 'color': 'navy'}
    plt.plot(site_series, **line_style);

plt.title('max precip_depth_1_hr, outlier sites in red')

## Year built and outliers

In [None]:
#primary use: number of outlier buildings per declared use
# The index is named before reset_index so the columns are always
# ('primary_use', 'count'), independent of how the installed pandas version
# labels the output of value_counts().reset_index().
dd = pd_outlier_buildings['primary_use'].value_counts().rename_axis('primary_use').reset_index(name='count')
plt.figure(figsize=(10,6))
sns.barplot(x = 'primary_use',y='count',data = dd)
plt.xticks(rotation=90);
plt.title('population for use')

In [None]:
# Construction years of the outlier buildings that report one.
year_built_known = pd_outlier_buildings['year_built'].dropna()
print('year built mode: ', year_built_known.mode())
# Count rows, not distinct values: the previous len(...unique()) reported the
# number of different years, which under-counts whenever two buildings share
# a construction year.
print('buildings with the data available ', len(year_built_known))

# year built
plt.figure(figsize=(11,7))
sns.distplot(year_built_known,kde=False)
plt.ylabel('occurences')
plt.xlabel('year built')
plt.title('distribution of year built')
plt.show()

# Correlations to Meter
**correlations numeric**: Square feet has a significant correlation with floor count, and wind direction with wind speed.
    A pair plot did not yield any significant correlations. Dew temperature and air temperature have a correlation above 0.8.

**categorical correlations** Education has many outliers with high meter values. The distributions are pretty much the same except for health care and utility, which tend to have a higher meter reading overall.

**correlation to year built** : The highest readings are between 2006 and 2012, old buildings tend to have very small readings 


**Meter reading and weather variables** : There are complicated relationships between meter reading and weather variables, with meter reading reaching a peak at a certain value of the weather variable and then descending (similar to a Poisson-like distribution). Meter reading is maximized for precipitation depth values close to zero and for very high sea level pressure values. On the other hand, cloud coverage has a reading peak at small values.

**Weather and Weather variables :** Dew temperature and air temperature are strongly linearly correlated. Wind speed, cloud coverage and precipitation depth have peaks for a certain air temperature. Finally, wind direction and air temperature are completely independent.



## numerical variables and meter reading

In [None]:
# Pairwise correlations between the target and every numeric feature,
# drawn as a heatmap whose colour scale is capped at 0.8.
corr_cols = target + num_features
corrmat = pd_data[corr_cols].corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, square=True, vmax=.8);

## Categorical variables and meter reading

In [None]:
# meter_reading distribution per primary use, outliers included
plt.figure(figsize=(15,8))
ax = sns.boxplot(data=pd_data, x='primary_use', y='meter_reading');
# rotate the category labels so the long names do not overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)


In [None]:
# meter_reading distribution per primary use, outlier markers hidden
plt.figure(figsize=(15,8))
ax = sns.boxplot(data=pd_data, x='primary_use', y='meter_reading', showfliers=False);
# rotate the category labels so the long names do not overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)

In [None]:
# Readings of buildings with a known construction year, ordered by that year.
# Only year_built is required here. The previous bare dropna() discarded every
# row with a gap in ANY column (floor_count, cloud_coverage, ...), leaving the
# year_built box plots with only the rows that had no missing value at all.
pd_data_in_built = (
    pd_data
    .dropna(subset=['year_built'])
    .sort_values(by='year_built')
)


In [None]:
# meter_reading distribution per construction year, outliers included
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=pd_data_in_built, x='year_built', y="meter_reading")
plt.xticks(rotation=90);

In [None]:
# meter_reading distribution per construction year, outlier markers hidden
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=pd_data_in_built, x='year_built', y="meter_reading", showfliers=False)
plt.xticks(rotation=90);

## Weather and meter

In [None]:
# Columns shown in the pair plot: the target plus every weather variable.
weather_cols = ['meter_reading','air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr',
               'sea_level_pressure','wind_direction','wind_speed']

pd_stamp= pd_data.copy()
# truncate every timestamp to its calendar day
pd_stamp['timestamp'] = pd.to_datetime(pd_stamp.timestamp)
pd_stamp['timestamp'] = pd_stamp.timestamp.dt.date
# Daily mean per building. mean() skips NaN on its own; the former fillna(0)
# averaged missing observations in as literal zeros (e.g. a sea level pressure
# of 0), which distorts the daily means and the relationships drawn from them.
pd_w= pd_stamp.groupby(['building_id','timestamp'])[weather_cols].mean().reset_index()


In [None]:
# sns.pairplot is figure-level: it always creates its own figure, so the
# plt.figure(figsize=(30,30)) that preceded it only produced an extra empty
# canvas. The intended 30x30 inch size is requested through `height` instead
# (per-panel height = 30 inches / number of variables).
g = sns.pairplot(pd_w[weather_cols], height=30 / len(weather_cols))
plt.show() 

In [None]:
# Memory report before and after a forced garbage collection.
ram_label = "available RAM:"
print(ram_label, psutil.virtual_memory())

gc.collect()

print(ram_label, psutil.virtual_memory())

# Correlations to year built

Grouped by building

**year built to mean meter reading**: Not a lot to see here

**year built to square feet**: There is no obvious correlation between square feet and year built

**year built to site**: Sites 8 to 14 have all  values missing for year built

**year built to meter type**: Nothing



In [None]:
m_cols = ['building_id','year_built','floor_count','square_feet','meter','primary_use','site_id']

# Mean meter reading per building (one row per building_id).
pd_building = pd_data.groupby('building_id')['meter_reading'].mean().reset_index()

# Distinct attribute rows (one per building/meter combination). Deduplicating
# BEFORE the merge produces the same rows as before, but avoids joining and
# sorting one row per meter reading only to throw almost all of them away.
building_attrs = pd_data[m_cols].drop_duplicates()
pd_building = pd_building.merge(building_attrs, on='building_id',how='left').sort_values(by='year_built')

In [None]:
#year built to meter reading: per-building mean reading by construction year
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=pd_building, x='year_built', y="meter_reading", showfliers=True)
plt.xticks(rotation=90);
plt.show()

In [None]:
#year built to size: drawn twice, first with outlier markers and then without
for draw_fliers in (True, False):
    f, ax = plt.subplots(figsize=(16, 8))
    fig = sns.boxplot(data=pd_building, x='year_built', y="square_feet", showfliers=draw_fliers)
    plt.xticks(rotation=90);
    plt.show()


In [None]:
#year built to size: one point per building/meter row, coloured by meter type
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.scatterplot(data=pd_building, x='year_built', y="square_feet", hue='meter')
plt.xticks(rotation=90);
plt.show()


In [None]:
#year built to site, then year built to site and meter (split by hue)
for extra_kwargs in ({}, {'hue': 'meter'}):
    f, ax = plt.subplots(figsize=(16, 8))
    fig = sns.violinplot(data=pd_building, x='site_id', y="year_built", **extra_kwargs)
    plt.xticks(rotation=90);
    plt.show()

In [None]:

# Memory report before and after a forced garbage collection.
ram_label = "available RAM:"
print(ram_label, psutil.virtual_memory())

gc.collect()

print(ram_label, psutil.virtual_memory())

# Conclusions
    The results from this notebook show that no single variable will be able to predict the meter reading. 
    It is possible that variables made from interactions between other variables will greatly improve
    the performance of the model, so a great deal of feature engineering might be helpful. 
    I expect that these interactions will be important with weather related variables (as seen from the pairplots) 
    and time related variables (as seen from the meter reading time series). 
    
    
    Another important consideration is that outliers seem to have a great deal of missing values,
    so it has to be decided whether one drops these values or tries to fill the missing numbers with some method.