### Постановка задачи
Посчитаем модель линейной регрессии по первым 100 зданиям и найдем ее точность, используя в качестве параметров только дни недели и праздники, применяя fit_intercept=False и логарифмируя целевой показатель.

Для вычисления отсутствующих или некорректных данных построим модели по всем зданиям одного типа в одном городе и во всех городах.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz

Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


def reduce_mem_usage(df: pd.DataFrame):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    
    end_mem = df.memory_usage().sum() / 1024**2
    print(
        'Потребление памяти меньше на ',
        round(start_mem - end_mem, 2),
        ' Мб (-',
        round(100 * (start_mem - end_mem) / start_mem, 1),
        '%)',
        sep=''
    )
    return df

### Загрузка данных, отсечение 100 зданий, объединение и оптимизация

In [2]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

energy = energy[energy['building_id'] < 100]

energy = pd.merge(left=energy, right=buildings, how='left', left_on='building_id', right_on='building_id')
energy = pd.merge(
    left=energy.set_index(['timestamp', 'site_id']), 
    right=weather.set_index(['timestamp', 'site_id']),
    how='left', left_index=True, right_index=True
)
energy.reset_index(inplace=True)
energy = energy.drop(columns=['meter', 'year_built', 'square_feet', 'floor_count'], axis='columns')
del buildings
del weather
energy = reduce_mem_usage(energy)
energy.info()

Потребление памяти меньше на 56.89 Мб (-71.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864557 entries, 0 to 864556
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           864557 non-null  datetime64[ns]
 1   site_id             864557 non-null  int8          
 2   building_id         864557 non-null  int8          
 3   meter_reading       864557 non-null  float16       
 4   primary_use         864557 non-null  category      
 5   air_temperature     864263 non-null  float16       
 6   cloud_coverage      487693 non-null  float16       
 7   dew_temperature     864263 non-null  float16       
 8   precip_depth_1_hr   864459 non-null  float16       
 9   sea_level_pressure  856210 non-null  float16       
 10  wind_direction      839970 non-null  float16       
 11  wind_speed          864557 non-null  float16       
dtypes: category(1), datetime64[ns](1), floa

### Обогащение данных: час, дни недели, праздники, логарифм

In [3]:
energy['hour'] = energy['timestamp'].dt.hour.astype('int8')
energy['weekday'] = energy['timestamp'].dt.weekday.astype('int8')
for weekday in range(0,7):
    energy['is_wday' + str(weekday)] = energy['weekday'].isin([weekday]).astype('int8')
energy['date'] = pd.to_datetime(energy['timestamp'].dt.date)

dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype('int8')
energy['meter_reading_log'] = np.log(energy['meter_reading'] + 1)

### Разделение данных

In [4]:
energy_train, energy_test = train_test_split(energy[(energy['meter_reading'] > 0)], test_size=0.2)
energy_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 427872 entries, 368772 to 402457
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           427872 non-null  datetime64[ns]
 1   site_id             427872 non-null  int8          
 2   building_id         427872 non-null  int8          
 3   meter_reading       427872 non-null  float16       
 4   primary_use         427872 non-null  category      
 5   air_temperature     427868 non-null  float16       
 6   cloud_coverage      249207 non-null  float16       
 7   dew_temperature     427868 non-null  float16       
 8   precip_depth_1_hr   427869 non-null  float16       
 9   sea_level_pressure  425624 non-null  float16       
 10  wind_direction      414121 non-null  float16       
 11  wind_speed          427872 non-null  float16       
 12  hour                427872 non-null  int8          
 13  weekday             4278

### Линейная регрессия: по часам

In [7]:
hours = range(0, 24)
buildings = range(0, energy_train['building_id'].max() + 1)
lr_columns = ['meter_reading_log', 'hour', 'building_id', 'is_holiday']
for wday in range(0,7):
    lr_columns.append('is_wday' + str(wday))
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
energy_lr = [[]]*len(buildings)
for building in buildings:
    energy_lr[building] = [[]]*len(hours)
    energy_train_b = energy_train_lr[energy_train_lr['building_id']==building]
    for hour in hours:
        energy_lr[building].append([0]*(len(lr_columns)-3))
        energy_train_bh = pd.DataFrame(energy_train_b[energy_train_b['hour']==hour])
        y = energy_train_bh['meter_reading_log']
        if len(y) > 0:
            x = energy_train_bh.drop(labels=['meter_reading_log', 'hour', 'building_id'], axis=1)
            model = LinearRegression(fit_intercept=False).fit(x, y)
            energy_lr[building][hour] = model.coef_
            energy_lr[building][hour] = np.append(energy_lr[building][hour], model.intercept_)
energy_lr[0]

[array([-0.11766605,  5.4910592 ,  5.45123922,  5.48958333,  5.44323696,
         5.48602254,  5.41601563,  5.42563101,  0.        ]),
 array([-0.01997522,  5.42699457,  5.45870536,  5.52701823,  5.4930198 ,
         5.43447825,  5.47251157,  5.43709591,  0.        ]),
 array([-0.03857146,  5.41226518,  5.43815104,  5.51883371,  5.46567159,
         5.4575    ,  5.43460648,  5.44596354,  0.        ]),
 array([-0.11844328,  5.45594555,  5.45457176,  5.48995536,  5.45916873,
         5.45348773,  5.49725116,  5.484375  ,  0.        ]),
 array([-0.16508853,  5.44459645,  5.43113426,  5.50984375,  5.49847854,
         5.47929185,  5.4561942 ,  5.39609375,  0.        ]),
 array([-0.19883573,  5.54257965,  5.45453125,  5.49038462,  5.46137349,
         5.47148466,  5.40880409,  5.37769397,  0.        ]),
 array([-0.09658498,  5.5200467 ,  5.41493056,  5.48060826,  5.4496131 ,
         5.46063702,  5.47511574,  5.45529514,  0.        ]),
 array([-0.01462932,  5.42357004,  5.45796875,  5.47890

### Линейная регрессия: по типам зданий

In [10]:
sites = range(0, energy['site_id'].max() + 1)
primary_uses = energy['primary_use'].unique()
lr_columns_use = ['meter_reading_log', 'hour', 'building_id', 'is_holiday', 'primary_use', 'site_id']
lr_columns_use.extend(['is_wday' + str(wday) for wday in range(0, 7)])
energy_lr_use = {}
energy_lr_use_site = {}
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns_use)
for primary_use in primary_uses:
    energy_train_u = energy_train_lr[energy_train_lr['primary_use'] == primary_use]
    if len(energy_train_u) > 0:
        energy_lr_use_site[primary_use] = [[]]*len(sites)
        for site in sites:
            energy_lr_use_site[primary_use][site] = [[]]*len(hours)
            energy_train_us = energy_train_u[energy_train_u['site_id']==site]
            if len(energy_train_us) > 0:
                for hour in hours:
                    energy_train_uth = energy_train_us[energy_train_us['hour']==hour]
                    y = energy_train_uth['meter_reading_log']
                    if len(y) > 0:
                        x = energy_train_uth.drop(labels=['meter_reading_log', 'hour', 'building_id', 'site_id', 'primary_use'], axis=1)
                        model = LinearRegression(fit_intercept=False).fit(x, y)
                        energy_lr_use_site[primary_use][site][hour] = model.coef_
                        energy_lr_use_site[primary_use][site][hour] = np.append(energy_lr_use_site[primary_use][site][hour], model.intercept_)
        energy_lr_use[primary_use] = [[]]*len(hours)
        for hour in hours:
            energy_train_th = energy_train_u[energy_train_u['hour']==hour]
            y = energy_train_th['meter_reading_log']
            if len(y) > 0:
                x = energy_train_th.drop(labels=['meter_reading_log',
                    'hour', 'building_id', 'site_id', 'primary_use'], axis=1)
                model = LinearRegression(fit_intercept=False).fit(x, y)
                energy_lr_use[primary_use][hour] = model.coef_
                energy_lr_use[primary_use][hour] = np.append(energy_lr_use[primary_use][hour], model.intercept_)
energy_lr_use_site

{'Education': [[array([-0.01826136,  5.61119987,  5.66523021,  5.6681673 ,  5.68615154,
           5.66356715,  5.63581342,  5.62852515,  0.        ]),
   array([-0.0680797 ,  5.59780226,  5.61785183,  5.64797443,  5.64312437,
           5.70559393,  5.6291554 ,  5.57958778,  0.        ]),
   array([5.76976078e-04, 5.62162097e+00, 5.59952309e+00, 5.64931441e+00,
          5.66268399e+00, 5.62465016e+00, 5.59354554e+00, 5.54213828e+00,
          0.00000000e+00]),
   array([-0.14166113,  5.59284175,  5.6204064 ,  5.59444948,  5.62516929,
           5.62995259,  5.5817979 ,  5.55365548,  0.        ]),
   array([-0.10800793,  5.60499108,  5.60917969,  5.66754375,  5.61547129,
           5.63815875,  5.56479308,  5.57254748,  0.        ]),
   array([-0.08261908,  5.61885547,  5.66828157,  5.64613334,  5.70501458,
           5.65384714,  5.63031069,  5.61803937,  0.        ]),
   array([0.01146315, 5.71537138, 5.72601418, 5.72416713, 5.74376147,
          5.72207518, 5.69281939, 5.6951439 , 

### Расчет качества
Используем индивидуальные модели здания, иначе общую модель по всем зданиям данного типа в городе, иначе общую модель по всем зданиям такого типа (по всем городам)

In [11]:
def calculate_model (x):
    lr = -1
    model = energy_lr[x.building_id][x.hour]
    if len(model) == 0:
        model = energy_lr_use_site[x.primary_use][x.site_id][x.hour]
    if len(model) == 0:
        model = energy_lr_use[x.primary_use][x.hour]
    if len(model) > 0:
        lr = np.sum([x[col] * model[i] for i,col in enumerate(lr_columns[3:])])
        lr += model[len(lr_columns)-3]
        lr = np.exp(lr)
    if lr < 0:
        lr = 0
    x['meter_reading_lr_q'] = (np.log(x.meter_reading + 1) -
                               np.log(1 + lr))**2
    return x

energy_test = energy_test.apply(calculate_model,
                                    axis=1, result_type='expand')
energy_test_lr_rmsle = np.sqrt(energy_test['meter_reading_lr_q'].sum() / len(energy_test))
print ('Качество линейной регрессии, 100 зданий:', energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

Качество линейной регрессии, 100 зданий: 0.3394756782487827 0.3
