### Постановка задачи

Построить модель линейной регрессии энергопотребления здания, используя температуру воздуха (air_temperature) и температуру точки росы (dew_temperature), которая характеризует влажность воздуха.


Рассчитать качество построенной модели по проверочным данным.


Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz

Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

### Подключение библиотек

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 12, 6

### Загрузка данных

In [2]:
# Download the three gzip-compressed CSV tables used by the notebook.
# The weather and meter tables get their `timestamp` column parsed as datetimes.
buildings = pd.read_csv(
    "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz"
)
weather = pd.read_csv(
    "http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz",
    parse_dates=["timestamp"],
)
energy_0 = pd.read_csv(
    "http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz",
    parse_dates=["timestamp"],
)
# Print the column / dtype / non-null summary of the meter readings.
energy_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   building_id    8784 non-null   int64         
 1   meter          8784 non-null   int64         
 2   timestamp      8784 non-null   datetime64[ns]
 3   meter_reading  8784 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 274.6 KB


### Объединение данных и фильтрация

In [3]:
# Attach the building metadata columns to every meter reading; the left join
# keeps all readings even if a building were missing from the metadata.
energy_0 = pd.merge(left=energy_0, right=buildings, how='left', on='building_id')

# Both tables are joined on the (timestamp, site_id) pair, so move that pair
# into the index of each frame.
energy_0.set_index(['timestamp', 'site_id'], inplace=True)
weather.set_index(['timestamp', 'site_id'], inplace=True)

# pd.merge defaults to an inner join: readings without a matching weather row
# are dropped here.
energy_0 = pd.merge(left=energy_0, right=weather, left_index=True, right_index=True)
energy_0.reset_index(inplace=True)

# Keep only strictly positive readings. The explicit copy makes the filtered
# frame independent of the original, so adding the `hour` column below is a
# plain assignment and does not raise SettingWithCopyWarning.
energy_0 = energy_0[energy_0['meter_reading'] > 0].copy()
energy_0['hour'] = energy_0['timestamp'].dt.hour
energy_0.head()

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
704,2016-01-30 08:00:00,0,0,0,43.6839,Education,7432,2008.0,,8.3,,6.1,0.0,1019.0,220.0,2.1,8
725,2016-01-31 05:00:00,0,0,0,37.5408,Education,7432,2008.0,,12.8,,10.0,0.0,1021.9,0.0,0.0,5
737,2016-01-31 17:00:00,0,0,0,52.5571,Education,7432,2008.0,,20.6,,11.7,0.0,1020.9,110.0,1.5,17
2366,2016-04-08 14:00:00,0,0,0,59.3827,Education,7432,2008.0,,21.7,2.0,14.4,0.0,1015.1,250.0,3.1,14
2923,2016-05-01 19:00:00,0,0,0,448.0,Education,7432,2008.0,,31.1,,17.2,0.0,1016.1,100.0,4.1,19


### Разделение данных на обучение и проверку

In [4]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible between runs.
energy_0_train, energy_0_test = train_test_split(
    energy_0,
    random_state=11,
    test_size=0.2,
)
energy_0_train.head()

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
7536,2016-11-10 00:00:00,0,0,0,212.276,Education,7432,2008.0,,21.7,8.0,16.1,-1.0,1017.5,360.0,4.1,0
5979,2016-09-06 03:00:00,0,0,0,260.055,Education,7432,2008.0,,25.6,,20.0,0.0,1021.1,20.0,3.6,3
4834,2016-07-20 10:00:00,0,0,0,302.374,Education,7432,2008.0,,25.0,2.0,23.3,0.0,1019.8,0.0,0.0,10
5952,2016-09-05 00:00:00,0,0,0,245.722,Education,7432,2008.0,,23.9,8.0,22.8,221.0,1018.9,0.0,0.0,0
7272,2016-10-30 00:00:00,0,0,0,208.863,Education,7432,2008.0,,23.9,4.0,18.3,0.0,1019.5,50.0,3.6,0


### Модель линейной регрессии и среднее
meter_reading = A * air_temperature + B * dew_temperature + C

Дополнительно вычислим среднее по часам, чтобы сравнить линейную регрессию с более простой моделью

In [5]:
# Baseline model: mean meter reading for each value of `hour`, computed on the
# training rows only.
energy_0_train_averages = (
    energy_0_train['meter_reading'].groupby(energy_0_train['hour']).mean()
)

# Target column plus the two weather regressors used by the linear model.
energy_0_train_lr = energy_0_train[["meter_reading", "air_temperature", "dew_temperature"]]
x = energy_0_train_lr[["air_temperature", "dew_temperature"]]
y = energy_0_train_lr["meter_reading"]

# Ordinary least squares: coef_ holds [A, B] in the column order of `x`,
# intercept_ holds C.
model = LinearRegression()
model.fit(x, y)
print(model.coef_, model.intercept_)

[2.22171764 4.13647871] 101.75314077273657


### Оценка модели

In [6]:
def calculate_model(row):
    """Return a copy of one validation row with both squared log errors added.

    Reads two module-level objects: ``model`` (uses its ``coef_`` and
    ``intercept_``) and ``energy_0_train_averages`` (indexed by ``row.hour``).

    Args:
        row: A pandas Series exposing ``meter_reading``, ``hour``,
            ``air_temperature`` and ``dew_temperature``.

    Returns:
        A copy of ``row`` extended with ``meter_reading_lr_q`` (squared log
        error of the linear regression) and ``meter_reading_mean_q`` (squared
        log error of the hourly mean).
    """
    # log1p(x) == log(1 + x), but stays accurate for x close to zero.
    actual_log = np.log1p(row.meter_reading)
    mean_log = np.log1p(energy_0_train_averages[row.hour])

    predicted = (
        row.air_temperature * model.coef_[0]
        + row.dew_temperature * model.coef_[1]
        + model.intercept_
    )
    # A linear model can predict a negative reading; the logarithm would then
    # be NaN or -inf, and NaN terms are skipped by Series.sum(), which would
    # understate the RMSLE. Clip at zero (a missing temperature still yields
    # NaN, because np.maximum propagates it).
    lr_log = np.log1p(np.maximum(predicted, 0))

    # DataFrame.apply does not support functions that mutate the object they
    # are given, so write the new fields into a copy.
    result = row.copy()
    result['meter_reading_lr_q'] = (actual_log - lr_log) ** 2
    result['meter_reading_mean_q'] = (actual_log - mean_log) ** 2
    return result


# Score every validation row with both models (adds the two *_q columns).
energy_0_test = energy_0_test.apply(calculate_model, axis=1)

# RMSLE = sqrt(sum of squared log errors / number of validation rows).
energy_0_test_mean_rmsle = np.sqrt(
    energy_0_test['meter_reading_mean_q'].sum() / energy_0_test.shape[0]
)
energy_0_test_lr_rmsle = np.sqrt(
    energy_0_test['meter_reading_lr_q'].sum() / energy_0_test.shape[0]
)
print("rmsle среднего:", energy_0_test_mean_rmsle)
print("rmsle линейной регрессии:", energy_0_test_lr_rmsle)

rmsle среднего: 0.2605825449416217
rmsle линейной регрессии: 0.2252610271843787
