### Постановка задачи
Заполним отсутствующие значения по погоде интерполяционными данными.

Посчитаем модель линейной регрессии по первому зданию и найдем ее точность.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz

Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Download the three gzip-compressed CSV inputs in a fixed order:
# building metadata, weather observations and the meter readings.
buildings, weather, energy = [
    pd.read_csv(url)
    for url in (
        "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz",
    )
]

### Отбор данных по зданию 0 (работаем только с первым зданием)

In [4]:
# Keep only the meter readings that belong to building 0.
energy = energy.loc[energy['building_id'].eq(0)]

### Объединение данных

In [5]:
# Attach building metadata by building_id, then attach the weather rows that
# share the same (timestamp, site_id) pair. Both joins are left joins, so every
# meter reading is kept even when no matching weather row exists.
weather_key = ["timestamp", "site_id"]
energy = (
    energy
    .merge(buildings, how="left", on="building_id")
    .set_index(weather_key)
    .merge(weather.set_index(weather_key), how="left", left_index=True, right_index=True)
    .reset_index()
    .drop(columns=["meter", "site_id", "floor_count"])
)
# The source frames are no longer needed once they are merged into `energy`.
del buildings, weather
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           8784 non-null   object 
 1   building_id         8784 non-null   int64  
 2   meter_reading       8784 non-null   float64
 3   primary_use         8784 non-null   object 
 4   square_feet         8784 non-null   int64  
 5   year_built          8784 non-null   float64
 6   air_temperature     8781 non-null   float64
 7   cloud_coverage      4954 non-null   float64
 8   dew_temperature     8781 non-null   float64
 9   precip_depth_1_hr   8783 non-null   float64
 10  sea_level_pressure  8699 non-null   float64
 11  wind_direction      8534 non-null   float64
 12  wind_speed          8784 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 892.2+ KB


### Оптимизация памяти

In [6]:
def reduce_mem_usage(df: pd.DataFrame):
    """Downcast the columns of ``df`` in place and report the memory saved.

    Per column:
      * float columns are cast to the first of float16/float32 whose finite
        range strictly contains the column's min and max, else to float64;
      * int columns are cast to the first of int8/int16/int32/int64 whose
        range strictly contains the column's min and max (left untouched
        when none qualifies);
      * a column named ``timestamp`` is parsed with ``pd.to_datetime``;
      * any other non-datetime column becomes ``category``.

    Note that narrowing floats to float16/float32 reduces their precision.

    Args:
        df: frame to optimise; it is modified in place.

    Returns:
        The same ``df`` object, after conversion.
    """

    def _megabytes(frame):
        # Shallow memory footprint of the frame, in MiB.
        return frame.memory_usage().sum() / 1024**2

    def _strictly_inside(low, high, limits):
        # Both ends must lie strictly inside the dtype's limits; a NaN bound
        # makes every comparison False, so such a column never narrows.
        return low > limits.min and high < limits.max

    mem_before = _megabytes(df)
    for name in df.columns:
        kind = str(df[name].dtypes)
        if kind.startswith('float'):
            low = df[name].min()
            high = df[name].max()
            target = np.float64
            for candidate in (np.float16, np.float32):
                if _strictly_inside(low, high, np.finfo(candidate)):
                    target = candidate
                    break
            df[name] = df[name].astype(target)
        elif kind.startswith('int'):
            low = df[name].min()
            high = df[name].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                if _strictly_inside(low, high, np.iinfo(candidate)):
                    df[name] = df[name].astype(candidate)
                    break
        elif name == 'timestamp':
            df[name] = pd.to_datetime(df[name])
        elif not kind.startswith('datetime'):
            df[name] = df[name].astype('category')

    saved = mem_before - _megabytes(df)
    print(
        'Потребление памяти меньше на ',
        round(saved, 2),
        ' Мб (-',
        round(100 * saved / mem_before, 1),
        '%)',
        sep=''
    )
    return df

In [7]:
# Downcast the merged frame; reduce_mem_usage converts the columns of the
# frame it is given and returns that same frame, and prints the memory saved.
energy = reduce_mem_usage(energy)

Потребление памяти меньше на 0.62 Мб (-71.1%)


### Интерполяция данных

In [14]:
# Precipitation: anything that is not a positive reading (negative values and
# missing values alike) is replaced by 0 before interpolation.
energy['precip_depth_1_hr'] = energy['precip_depth_1_hr'].where(
    energy['precip_depth_1_hr'] > 0, 0
)
interpolate_columns = [
    "air_temperature", "dew_temperature", "cloud_coverage", "wind_speed", "precip_depth_1_hr", "sea_level_pressure"
]
for col in interpolate_columns:
    # Series.interpolate chooses its algorithm through `method` (there is no
    # `kind` option), so the linear method is spelled out explicitly here.
    # limit_direction='both' also fills gaps at the very start and end.
    energy[col] = energy[col].interpolate(method='linear', limit_direction='both')

### Проверка качества интерполяции

In [10]:
# Count the values that are still NaN or +/-Inf after interpolation. The check
# is explicit (np.isfinite) rather than relying on the deprecated, session-wide
# `use_inf_as_na` pandas option.
for col in interpolate_columns:
    print(col, 'Inf+NaN:', (~np.isfinite(energy[col])).sum())

air_temperature Inf+NaN: 0
dew_temperature Inf+NaN: 0
cloud_coverage Inf+NaN: 0
wind_speed Inf+NaN: 0
precip_depth_1_hr Inf+NaN: 0
sea_level_pressure Inf+NaN: 0


### Разделение данных

In [11]:
# Fixed seed so that the split, and therefore the fitted coefficients and the
# reported score, are reproducible after "Restart & Run All".
RANDOM_STATE = 42

# Only rows with a positive meter reading are used; 20% of them are held out
# for testing.
energy_train, energy_test = train_test_split(
    energy[energy['meter_reading'] > 0],
    test_size=0.2,
    random_state=RANDOM_STATE,
)
energy_train.head()

Unnamed: 0,timestamp,building_id,meter_reading,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
4191,2016-06-23 15:00:00,0,300.25,Education,7432,2008.0,30.59375,4.0,23.296875,0.0,1021.0,,1.5
8252,2016-12-09 20:00:00,0,98.3125,Education,7432,2008.0,16.09375,6.0,6.699219,0.0,1025.0,20.0,6.699219
4445,2016-07-04 05:00:00,0,242.25,Education,7432,2008.0,26.703125,4.0,24.40625,0.0,1020.5,0.0,0.0
7092,2016-10-22 12:00:00,0,238.25,Education,7432,2008.0,14.398438,0.0,7.199219,0.0,1015.0,320.0,3.599609
4556,2016-07-08 20:00:00,0,298.25,Education,7432,2008.0,36.09375,4.0,20.0,0.0,1017.5,,2.599609


### Линейная регрессия

In [12]:
# The first entry is the regression target; the remaining entries are the
# weather features used as predictors.
regression_columns = [
    "meter_reading",
    "air_temperature",
    "dew_temperature",
    "cloud_coverage",
    "wind_speed",
    "precip_depth_1_hr",
    "sea_level_pressure",
]

energy_train_lr = energy_train.loc[:, regression_columns]
y = energy_train_lr[regression_columns[0]]
x = energy_train_lr[regression_columns[1:]]
model = LinearRegression()
model.fit(x, y)
# One coefficient per predictor (in the order listed above), then the intercept.
print(model.coef_, model.intercept_)

[ 2.5887434   3.92030364 -2.6663704  -1.65959858  0.18471739 -0.93998661] 1069.5495971600421


### Предсказание и оценка модели

In [13]:
def calculate_model(x):
    """Add the squared log error of the fitted linear model to ``x``.

    The prediction is the linear combination of the predictor columns
    (``regression_columns[1:]``) with ``model.coef_`` plus ``model.intercept_``.
    The new column ``meter_reading_lr_q`` holds
    ``(log(1 + meter_reading) - log(1 + prediction)) ** 2``.

    Args:
        x: either a single row (Series) or a whole DataFrame that contains
            ``meter_reading`` and the predictor columns.

    Returns:
        A copy of ``x`` with ``meter_reading_lr_q`` added.
    """
    x = x.copy()
    # Cast to float64 so the arithmetic does not run in a narrower float type.
    features = np.asarray(x[regression_columns[1:]], dtype=np.float64)
    # Works for a 1-D row (scalar result) and a 2-D frame (one value per row).
    lr = features @ model.coef_ + model.intercept_
    actual = np.asarray(x['meter_reading'], dtype=np.float64)
    x['meter_reading_lr_q'] = (np.log1p(actual) - np.log1p(lr))**2
    return x


# Score the whole test set in one vectorised call instead of row by row.
energy_test = calculate_model(energy_test)
# RMSLE: square root of the mean squared log error over the test rows.
energy_test_lr_rmsle = np.sqrt(energy_test['meter_reading_lr_q'].sum() / len(energy_test))
print('Качество линейной регрессии:', energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

Качество линейной регрессии: 0.20095632113681797 0.2
