### Постановка задачи
Посчитаем модели линейной регрессии для 20 зданий по оптимальному набору параметров: метеорологические данные, дни недели, недели года, месяцы и праздники по всему набору данных.

Загрузим данные решения, посчитаем значение энергопотребления для требуемых дат для тех зданий, которые посчитаны в модели, и выгрузим результат в виде файла.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/test.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_test.csv.gz

Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

### Подключение библиотек

In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.linear_model import LinearRegression

### Загрузка данных 20 зданий из HDF5

In [2]:
# Load the prepared training frame for the first buildings from the local
# HDF5 store and show its column summary.
# NOTE: DataFrame.info() prints by itself and returns None, which is why the
# recorded output ends with a literal "None".
energy = pd.read_hdf('energy.0-20.ready.h5', key="energy")
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175680 entries, 0 to 175679
Data columns (total 97 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   timestamp              175680 non-null  datetime64[ns]
 1   site_id                175680 non-null  int8          
 2   building_id            175680 non-null  int8          
 3   meter                  175680 non-null  int8          
 4   meter_reading          175680 non-null  float16       
 5   primary_use            175680 non-null  category      
 6   square_feet            175680 non-null  int32         
 7   year_built             175680 non-null  float16       
 8   floor_count            0 non-null       float64       
 9   air_temperature        175680 non-null  float16       
 10  cloud_coverage         175680 non-null  float16       
 11  dew_temperature        175680 non-null  float16       
 12  precip_depth_1_hr      175680 non-null  floa

### Загрузка данных для расчета, оптимизация памяти

In [3]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and print the memory saved.

    Per column, decided from the dtype name:

    * ``float*``   -> the first of float16 / float32 whose finite range
      strictly contains the column's min and max, otherwise float64;
    * ``int*``     -> the first of int8 / int16 / int32 / int64 whose range
      strictly contains the column's min and max (left unchanged if none);
    * a column named ``"timestamp"`` -> parsed with ``pd.to_datetime``;
    * anything else that is not already ``datetime*`` -> ``category``.

    Returns the same (mutated) DataFrame object.
    """
    bytes_per_mb = 1024**2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    float_candidates = (np.float16, np.float32)
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for name in df.columns:
        dtype_name = str(df[name].dtypes)

        if dtype_name.startswith("float"):
            low, high = df[name].min(), df[name].max()
            # Fall back to float64 when no narrower type fits (this is also
            # where an all-NaN column ends up, since NaN comparisons fail).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if low > limits.min and high < limits.max:
                    target = candidate
                    break
            df[name] = df[name].astype(target)

        elif dtype_name.startswith("int"):
            low, high = df[name].min(), df[name].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Strict comparison: a value equal to a type's limit is
                # pushed to the next wider type.
                if low > limits.min and high < limits.max:
                    df[name] = df[name].astype(candidate)
                    break

        elif name == "timestamp":
            df[name] = pd.to_datetime(df[name])

        elif not dtype_name.startswith("datetime"):
            df[name] = df[name].astype("category")

    mem_after = df.memory_usage().sum() / bytes_per_mb
    print('Потребление памяти меньше на', round(mem_before - mem_after, 2), 'Мб (минус', round(100 * (mem_before - mem_after) / mem_before, 1), '%)')
    return df

Все результаты в оперативной памяти занимают порядка 8 Гб. Для оптимизации потребления памяти сначала рассчитаем результаты только для первых 20 зданий, а затем присоединим к ним остальные, заполненные нулями.

In [4]:
# Building -> site lookup; only the two id columns are read.
buildings = pd.read_csv(
    "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
    usecols=["site_id", "building_id"],
)

# Test-period weather, restricted to site 0 and without the wind direction.
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_test.csv.gz")
weather = weather.loc[weather["site_id"] == 0].drop(columns=["wind_direction"])

# Rows to predict: buildings with id below 20 and meter == 0, each tagged
# with the site id of its building.
results = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/test.csv.gz")
results = (
    results.loc[lambda rows: (rows["building_id"] < 20) & (rows["meter"] == 0)]
    .merge(buildings, how="left", on="building_id")
)
del buildings

# The meter column is constant after the filter above, so it is dropped.
results = results.drop(columns=["meter"])
print(results.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350400 entries, 0 to 350399
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   row_id       350400 non-null  int64 
 1   building_id  350400 non-null  int64 
 2   timestamp    350400 non-null  object
 3   site_id      350400 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 13.4+ MB
None


### Интерполяция значений и обогащение погодных данных: только для 1 города

In [5]:
# Fill gaps in the weather columns used as regressors and derive the first
# and second differences of the air temperature.
interpolate_columns = ["air_temperature", "dew_temperature",
                       "cloud_coverage", "wind_speed",
                       "sea_level_pressure"]
for col in interpolate_columns:
    # Series.interpolate selects the algorithm through `method`. The earlier
    # `kind='cubic'` keyword is not one of its parameters, so the default
    # linear method was what actually ran; it is now spelled out. Do not
    # switch to a cubic method here without re-preparing the training data
    # the same way (NOTE(review): confirm how energy.0-20.ready.h5 was built).
    weather[col] = weather[col].interpolate(method="linear",
                                            limit_direction="both")

# diff() leaves NaN in the first row; back-fill it with the next value.
# This gives the same result as copying row 1 into row 0, but does not
# depend on the index labels 0 and 1 being present.
weather["air_temperature_diff1"] = weather["air_temperature"].diff().bfill()
weather["air_temperature_diff2"] = weather["air_temperature_diff1"].diff().bfill()

### Объединение данных по погоде

In [6]:
# Attach the weather columns to every row to predict, matching on the
# (timestamp, site_id) pair, then shrink dtypes to save memory.
merge_keys = ["timestamp", "site_id"]
weather = weather.set_index(merge_keys)
results = (
    results.set_index(merge_keys)
    .merge(weather, how="left", left_index=True, right_index=True)
    .reset_index()
    # site_id was only needed as a join key.
    .drop(columns=["site_id"])
)
del weather, merge_keys
results = reduce_mem_usage(results)
print(results.info())

Потребление памяти меньше на 19.72 Мб (минус 67.0 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350400 entries, 0 to 350399
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   timestamp              350400 non-null  datetime64[ns]
 1   row_id                 350400 non-null  int32         
 2   building_id            350400 non-null  int8          
 3   air_temperature        350400 non-null  float16       
 4   cloud_coverage         350400 non-null  float16       
 5   dew_temperature        350400 non-null  float16       
 6   precip_depth_1_hr      349800 non-null  float16       
 7   sea_level_pressure     350400 non-null  float16       
 8   wind_speed             350400 non-null  float16       
 9   air_temperature_diff1  350400 non-null  float16       
 10  air_temperature_diff2  350400 non-null  float16       
dtypes: datetime64[ns](1), float16(8), int32(1), int8(1)

### Обогащение данных по дате

In [7]:
# Calendar features for the rows to predict: hour, weekday, ISO week, month,
# a US federal holiday flag and one-hot indicator columns for weekday / week /
# month (named is_wdayN, is_wN, is_mN, matching the names in lr_columns).
results["hour"] = results["timestamp"].dt.hour.astype("int8")
results["weekday"] = results["timestamp"].dt.weekday.astype("int8")
results["week"] = results["timestamp"].dt.isocalendar().week.astype("int8")
results["month"] = results["timestamp"].dt.month.astype("int8")

# The calendar date must come from `results` itself. It was previously taken
# from the training frame `energy`, whose rows are unrelated to these ones,
# which made the holiday flag meaningless.
results["date"] = results["timestamp"].dt.normalize()

# Cover exactly the period present in the data instead of a hard-coded range.
# The earlier range ended on 2018-06-01, while the recorded row count
# (350400 rows / 20 buildings = 17520 hourly rows) implies two full years,
# so later holidays would have been missed.
dates_range = pd.date_range(start=results["date"].min(),
                            end=results["date"].max())
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())
results["is_holiday"] = results["date"].isin(us_holidays).astype("int8")

for weekday in range(0, 7):
    results["is_wday" + str(weekday)] = (results["weekday"] == weekday).astype("int8")
for week in range(1, 54):
    results["is_w" + str(week)] = (results["week"] == week).astype("int8")
for month in range(1, 13):
    results["is_m" + str(month)] = (results["month"] == month).astype("int8")

### Линейная регрессия

In [8]:
# Fit one linear regression per (building, hour) on the training frame and
# keep, for each pair, a flat array: the coefficients in lr_columns[3:] order
# followed by the intercept.
hours = range(0, 24)
# int(): building_id is stored as int8 (see the info() output above), so
# adding 1 to its max in that dtype could overflow.
buildings = range(0, int(energy["building_id"].max()) + 1)

# Target first, then the two grouping keys, then the regressors.
lr_columns = ["meter_reading_log", "hour", "building_id",
              "air_temperature", "dew_temperature",
              "sea_level_pressure", "wind_speed", "cloud_coverage",
              "air_temperature_diff1", "air_temperature_diff2",
              "is_holiday"]
lr_columns += ["is_wday" + str(wday) for wday in range(0, 7)]
lr_columns += ["is_w" + str(week) for week in range(1, 54)]
lr_columns += ["is_m" + str(month) for month in range(1, 13)]

energy_train_lr = pd.DataFrame(energy, columns=lr_columns)

# Independent empty slot per (building, hour). `[[]] * n` would repeat the
# same list object n times.
energy_lr = [[[] for _ in hours] for _ in buildings]
for building in buildings:
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
        if energy_train_bh.empty:
            # Nothing to fit: keep the empty slot, which calculate_model
            # detects through `len(model) > 0`.
            continue
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(labels=["meter_reading_log",
                                         "hour", "building_id"], axis=1)
        # fit_intercept=False: intercept_ is 0.0 here; it is still appended
        # so the array layout stays "coefficients + intercept".
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[building][hour] = np.append(model.coef_, model.intercept_)
print(energy_lr[0])

[array([ 1.88613161e-02, -4.26037377e-03, -2.50945985e-03, -2.24281996e-02,
       -1.52427554e-02,  2.63583660e-03,  8.23050737e-03,  2.55594254e-02,
        3.28055239e+00,  3.29540801e+00,  3.31469488e+00,  3.27908397e+00,
        3.28840876e+00,  3.39234781e+00,  3.37532806e+00, -9.89563465e-02,
       -1.40835077e-01, -1.17597818e-01, -1.26219511e-01, -1.83717370e+00,
       -1.79578996e+00, -1.84611821e+00, -1.81608105e+00, -1.84135866e+00,
       -1.82323098e+00, -1.88986254e+00, -1.79563451e+00, -1.80946922e+00,
       -1.76640701e+00, -1.69228792e+00, -1.71959615e+00, -1.73955023e+00,
       -1.86711621e+00, -1.86529493e+00, -2.97872782e-01,  3.41836691e+00,
        3.25611258e+00,  3.36631250e+00,  3.40030098e+00,  3.27770662e+00,
        3.29736257e+00,  3.18111157e+00,  3.37601256e+00,  3.40007353e+00,
        3.33457565e+00,  9.16525900e-01,  1.03262055e+00,  1.02102685e+00,
        8.56441140e-01,  8.40755820e-01,  8.01853657e-01,  7.81660080e-01,
        7.53522217e-01, 

### Расчет финальных показателей, только энергопотребление, только 20 первых зданий

In [9]:
def calculate_model (x):
    """Predict ``meter_reading`` for one row and store it on the row.

    Looks up the coefficient array for the row's building and hour in the
    module-level ``energy_lr``, evaluates the linear model over the feature
    columns ``lr_columns[3:]`` plus the trailing intercept, and exponentiates
    the result. If no model is stored for the pair, or the prediction is
    negative, NaN or infinite, 0 is written instead.

    Parameters
    ----------
    x : a row (e.g. a pandas Series as passed by ``DataFrame.apply(axis=1)``)
        holding ``building_id``, ``hour``, ``row_id`` and the feature columns.

    Returns
    -------
    The same row with ``meter_reading`` set.
    """
    prediction = 0
    # int(): row values can arrive as NumPy integers or floats depending on
    # how the row was built; list indexing needs a real integer.
    model = energy_lr[int(x["building_id"])][int(x["hour"])]
    if len(model) > 0:
        feature_names = lr_columns[3:]
        linear = np.sum([x[col] * model[i] for i, col in enumerate(feature_names)])
        linear += model[len(feature_names)]  # intercept is the last element
        # NOTE(review): the transform behind "meter_reading_log" is not shown
        # in this notebook; if it was log(meter_reading + 1), the inverse
        # should be np.expm1 - confirm against the data preparation step.
        with np.errstate(over="ignore"):
            prediction = np.exp(linear)
    # The earlier `lr*lr == lr` test also discarded a valid prediction of
    # exactly 1.0; only non-finite or negative values are rejected here.
    if not np.isfinite(prediction) or prediction < 0:
        prediction = 0
    x["meter_reading"] = prediction
    if x["row_id"] % 1000000 == 0:
        print ("Готово", x["row_id"])
    return x

# Run calculate_model on every row (axis=1) and rebuild the frame from the
# returned rows, which now carry the extra "meter_reading" column.
results = results.apply(calculate_model, axis=1, result_type="expand")

Готово 0


### Усечение данных до требуемого формата: row_id, meter_reading

In [10]:
# Keep only the two columns of the required output format.
results_ready = results.reindex(columns=["row_id", "meter_reading"])

### Загрузка всех данных для заполнения их нулями

In [11]:
# Build the full output table: every row_id of the test file, with the
# computed meter_reading where available and 0 everywhere else.
results = pd.read_csv(
    "http://video.ittensive.com/machine-learning/ashrae/test.csv.gz",
    usecols=["row_id"],
)
results = results.merge(results_ready, how="left", on="row_id")
# Rows without a computed prediction come out of the left join as NaN.
results = results.fillna(value=0)
print(results.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
 #   Column         Dtype  
---  ------         -----  
 0   row_id         int64  
 1   meter_reading  float64
dtypes: float64(1), int64(1)
memory usage: 954.4 MB
None


### Выгрузка результатов в CSV файл
Итоговый файл занимает около 1 Гб

In [12]:
# Write the final table to submission.csv without the index column.
results.to_csv("submission.csv",index=False)

### Освобождение памяти

In [13]:
# Drop the references to the large frames so their memory can be reclaimed.
del energy, results, results_ready