# Задание

Разделите набор данных на обучающие/проверочные в пропорции 80/20.

Загрузите данные и очистите значения (заполнив пропуски нулями и средними). Постройте модель линейной регрессии **для каждого часа в отдельности**, используя температуру воздуха (air_temperature), точку росы как показатель влажности (dew_temperature), атмосферное давление (sea_level_pressure), скорость ветра (wind_speed) и облачность (cloud_coverage).

Рассчитайте качество построенной модели по проверочным данным. Используйте данные:

http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz

http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz

http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
rcParams['figure.figsize'] = 12, 6

In [2]:
# Source tables: building metadata, hourly weather, and the meter readings to model.
buildings = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz')
weather = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz', parse_dates=['timestamp'])
energy_0 = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz', parse_dates=['timestamp'])

# Attach the building attributes first; the weather join below is keyed on
# (timestamp, site_id), and site_id presumably comes from the building table.
energy_0 = pd.merge(left=energy_0, right=buildings, on='building_id', how='left')

# Join on the index (not on columns) so that timestamp and site_id end up as
# the leading columns after reset_index().
energy_0 = energy_0.set_index(['timestamp', 'site_id'])
weather = weather.set_index(['timestamp', 'site_id'])
energy_0 = pd.merge(left=energy_0, right=weather, how='left', left_index=True, right_index=True)
energy_0 = energy_0.reset_index()

# Keep only positive readings. The explicit copy makes the filtered frame
# independent, so adding the 'hour' column does not raise SettingWithCopyWarning.
energy_0 = energy_0[energy_0['meter_reading'] > 0].copy()
energy_0['hour'] = energy_0['timestamp'].dt.hour

print(energy_0.shape)
energy_0.head()

(5411, 17)


Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
704,2016-01-30 08:00:00,0,0,0,43.6839,Education,7432,2008.0,,8.3,,6.1,0.0,1019.0,220.0,2.1,8
725,2016-01-31 05:00:00,0,0,0,37.5408,Education,7432,2008.0,,12.8,,10.0,0.0,1021.9,0.0,0.0,5
737,2016-01-31 17:00:00,0,0,0,52.5571,Education,7432,2008.0,,20.6,,11.7,0.0,1020.9,110.0,1.5,17
2366,2016-04-08 14:00:00,0,0,0,59.3827,Education,7432,2008.0,,21.7,2.0,14.4,0.0,1015.1,250.0,3.1,14
2923,2016-05-01 19:00:00,0,0,0,448.0,Education,7432,2008.0,,31.1,,17.2,0.0,1016.1,100.0,4.1,19


In [3]:
def show_nulls(df: pd.DataFrame):
    """Print the number of null values for every column that has any.

    A header line is always printed. Each column containing nulls gets one
    line with its count; if no column has nulls, a single "no nulls" line is
    printed instead.

    Args:
        df: frame to inspect.
    """
    print('Columns with null values:')
    # Count nulls for all columns at once, then keep only the non-zero counts.
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    for column, amount_of_nulls in columns_with_nulls.items():
        print('\t', column, 'has', amount_of_nulls, 'nulls')
    if columns_with_nulls.empty:
        print('\tthere are no nulls.')

            
def fillna_to_mean(df: pd.DataFrame, columns: list, inplace=False):
    """Replace nulls in the given columns with each column's own mean.

    Only the listed columns are touched; nulls in any other column are left
    as they are.

    Args:
        df: frame to process.
        columns: names of the columns whose nulls should be filled.
        inplace: if True, modify ``df`` itself; otherwise work on a copy and
            leave ``df`` unchanged.

    Returns:
        The processed frame (``df`` itself when ``inplace`` is True).
    """
    # df.copy() gives an independent frame; pd.DataFrame(df) would not copy the data.
    result_df = df if inplace else df.copy()

    for column in columns:
        # Fill this column only. A frame-wide fillna would write this column's
        # mean into the nulls of every other column as well.
        mean_value = result_df[column].mean()
        result_df[column] = result_df[column].fillna(mean_value)

    return result_df


# Report which columns of the merged frame still contain nulls before cleaning.
show_nulls(energy_0)

Columns with null values:
	 floor_count has 5411 nulls
	 cloud_coverage has 2261 nulls
	 sea_level_pressure has 28 nulls
	 wind_direction has 175 nulls


### Заполнение пропущенных данных
* floor_count: весь столбец удаляется
* air_temperature: NaN -> 0
* cloud_coverage: NaN -> 0
* dew_temperature: NaN -> 0
* precip_depth_1_hr: NaN -> 0, -1 -> 0
* sea_level_pressure: NaN -> среднее
* wind_direction: NaN -> среднее (роза ветров)

In [4]:
# floor_count is null in every row of this frame, so the column is dropped.
energy_0 = energy_0.drop(columns='floor_count')

# Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
# on a column selection is chained assignment, which is deprecated in
# pandas 2.x and has no effect on the frame under Copy-on-Write.
zero_filled_columns = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr']
energy_0[zero_filled_columns] = energy_0[zero_filled_columns].fillna(0)

# Per the cleaning plan, a precipitation depth of -1 is mapped to 0 as well.
energy_0['precip_depth_1_hr'] = energy_0['precip_depth_1_hr'].replace(-1, 0)

fillna_to_mean(energy_0, ['sea_level_pressure', 'wind_direction'], inplace=True)

show_nulls(energy_0)

Columns with null values:
	there are no nulls.


### Разделение данных на обучающую и отложенную выборки в пропорции 80/20

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
energy_0_train, energy_0_test = train_test_split(
    energy_0,
    test_size=0.2,
    random_state=11,
)

### Модели линейной регрессии для каждого часа в отдельности
$\text{meter\_reading} = a_0 \cdot \text{air\_temperature} + a_1 \cdot \text{dew\_temperature} + a_2 \cdot \text{sea\_level\_pressure} + a_3 \cdot \text{wind\_speed} + a_4 \cdot \text{cloud\_coverage} + b$

In [6]:
# Weather predictors used by every hourly model.
FEATURES = ['air_temperature', 'dew_temperature', 'sea_level_pressure', 'wind_speed', 'cloud_coverage']
# hour -> [coefficient per feature (in FEATURES order)..., intercept]
lr_model_params_by_hours = {}
model = LinearRegression()

for hour in range(24):
    # Fit on the training rows of this hour only. The same estimator object is
    # refitted each time, so its parameters are copied out after every fit.
    train_rows_for_hour = energy_0_train[energy_0_train['hour'] == hour]
    model.fit(train_rows_for_hour[FEATURES], train_rows_for_hour['meter_reading'])
    lr_model_params_by_hours[hour] = [*model.coef_, model.intercept_]

### Какое получилось качество модели линейной регрессии по часам с точностью до десятых?

In [7]:
def calculate_lgsq_error(row, params_by_hour=None, features=None):
    """Return the squared log error of the hourly linear model for one row.

    The prediction is the linear combination of the row's feature values with
    the coefficients stored for the row's hour, plus the intercept. Negative
    predictions are clamped to 0 before taking the logarithm: for a prediction
    below -1 the logarithm is NaN, and for exactly -1 it is -inf, and either
    value would make the aggregated RMSLE unusable.

    Args:
        row: a record with an ``hour`` field, a ``meter_reading`` field and
            one field per feature name.
        params_by_hour: mapping hour -> [coefficient per feature..., intercept].
            Defaults to the notebook-level ``lr_model_params_by_hours``.
        features: feature names, in the same order as the coefficients.
            Defaults to the notebook-level ``FEATURES``.

    Returns:
        ``(log(1 + meter_reading) - log(1 + prediction)) ** 2``.
    """
    if params_by_hour is None:
        params_by_hour = lr_model_params_by_hours
    if features is None:
        features = FEATURES

    params = params_by_hour[row.hour]
    # zip stops after the last feature; the remaining last element is the intercept.
    prediction = sum(coef * row[name] for coef, name in zip(params, features)) + params[-1]
    prediction = max(prediction, 0)
    return (np.log1p(row.meter_reading) - np.log1p(prediction)) ** 2


# Squared log error for every hold-out row, then the root of their average (RMSLE).
meter_reading_prediction_lgsq_error = energy_0_test.apply(calculate_lgsq_error, axis=1)
meter_reading_rmsle = np.sqrt(
    meter_reading_prediction_lgsq_error.sum() / meter_reading_prediction_lgsq_error.size
)

print('Качество модели линейной регрессии:', round(meter_reading_rmsle, 1))

Качество модели линейной регрессии: 0.2
