# Пункт 1 - EDA

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [10]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


Колонка	Описание   
**datetime** -	Дата и время в формате **yyyy-mm-dd hh:mm:ss**. Это временная метка для каждой записи.   
**season** -	Сезон (1 = весна, 2 = лето, 3 = осень, 4 = зима). Да, это на первый взгляд странно, но это так.   
**holiday**	 -  Флаг, показывающий, является ли день праздничным (1 = да, 0 = нет).   
**workingday** - Рабочий день (1 = рабочий день, 0 = выходной или праздник).   
**weather**	 -Погодные условия:   
**1** = Ясно, малооблачно, переменная облачность   
**2** = Туманно, облачно   
**3** = Лёгкий дождь или снег + облачно   
**4** = Сильный дождь, снег, шторм (практически не встречается в датасете)   
**temp**	- Температура в градусах Цельсия (по ощущениям ближе к реальной).   
**atemp**	- Ощущаемая температура (индекс, показывающий, как температура ощущается на улице).   
**humidity**  - Влажность (в процентах).   
**windspeed**	-  Скорость ветра (в м/с).   
**casual**	 -  Количество аренд для незарегистрированных пользователей **(только в train-части)**.   
**registered**	-  Количество аренд для зарегистрированных пользователей **(только в train-части)**.   
**count**	-  Общее количество аренд **(целевая переменная для задачи прогнозирования в train-части)**.   

In [12]:
df.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


In [13]:
plt.figure(figsize=(12, 10))
corr = df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
plt.title("Корреляция")
plt.show()

NameError: name 'plt' is not defined

In [14]:
def percent_miss(column : pd.Series):
    return f'{round((column.isna().sum()/column.shape[0])*100, 2)}%' # Функция учителя 


In [15]:
for i in df.columns:
    print(f'{i} - {percent_miss(df[i])}')

datetime - 0.0%
season - 0.0%
holiday - 0.0%
workingday - 0.0%
weather - 0.0%
temp - 0.0%
atemp - 0.0%
humidity - 0.0%
windspeed - 0.0%
casual - 0.0%
registered - 0.0%
count - 0.0%


# Пункт 2 - Подготовка к обучению

In [16]:
df = pd.read_csv('train.csv', header=None, names=[
    'datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'dew_point', 'humidity', 
    'wind_speed', 'target1', 'target2', 'target3'
], skiprows=[0])

In [17]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['day_of_week'] = df['datetime'].dt.dayofweek

In [18]:
X = df[['season', 'holiday', 'workingday', 'temp', 'dew_point', 'humidity', 'wind_speed',
        'hour', 'day', 'month', 'year', 'day_of_week']]
y = df[['target1', 'target2', 'target3']]

In [19]:
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]


In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return f'MAE: {mae} \nMSE: {mse} \nRMSE: {rmse} \nR2: {r2}'

# 3.1 - Линейная регрессия

In [268]:
model = MultiOutputRegressor(LinearRegression())

In [269]:
model.fit(X_train, y_train)

In [270]:
print(evaluate_model(model, X_train, y_train))

MAE: 66.89005001125399 
MSE: 10136.407292415737 
RMSE: 100.67972632270977 
R2: 0.39829856274767916


In [271]:
print(evaluate_model(model, X_test, y_test))

MAE: 96.55359478055449 
MSE: 20742.934622307872 
RMSE: 144.0240765369036 
R2: 0.3240389068085094


# 3.2 - Дерево решений

In [272]:
model = DecisionTreeRegressor()

In [273]:
model.fit(X_train, y_train)

In [274]:
print(evaluate_model(model, X_train, y_train))

MAE: 0.0 
MSE: 0.0 
RMSE: 0.0 
R2: 1.0


In [275]:
print(evaluate_model(model, X_test, y_test))

MAE: 43.637588001224366 
MSE: 5542.908172635445 
RMSE: 74.45070968523702 
R2: 0.7624769623099507


# 3.3 - RandomForestRegressor()

In [22]:
model = RandomForestRegressor()

In [23]:
model.fit(X_train, y_train)

In [24]:
print(evaluate_model(model, X_train, y_train))

MAE: 6.169139488592869 
MSE: 126.98538368549987 
RMSE: 11.268779156834155 
R2: 0.991140794645487


In [25]:
print(evaluate_model(model, X_test, y_test))

MAE: 35.38049892868082 
MSE: 3503.3156546678906 
RMSE: 59.18881359402206 
R2: 0.8591581554592791


# 3.4 - XGBoost

In [284]:
model = MultiOutputRegressor(XGBRegressor())
model.fit(X_train, y_train)

In [285]:
print(evaluate_model(model, X_train, y_train))

MAE: 9.77837085723877 
MSE: 257.4658508300781 
RMSE: 16.045742451818118 
R2: 0.9832821488380432


In [286]:
print(evaluate_model(model, X_test, y_test))

MAE: 34.57893371582031 
MSE: 3175.567626953125 
RMSE: 56.35217499753781 
R2: 0.8774496912956238


# 3.5 - LightGBM

In [288]:
model = MultiOutputRegressor(LGBMRegressor())
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 293
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 12
[LightGBM] [Info] Start training from score 34.025609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 293
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 12
[LightGBM] [Info] Start training from score 140.365641
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough

In [289]:
print(evaluate_model(model, X_train, y_train))

MAE: 13.965742969615713 
MSE: 542.2334917844732 
RMSE: 23.2859075791448 
R2: 0.965127278946852


In [292]:
print(evaluate_model(model, X_test, y_test))

MAE: 31.84171425881568 
MSE: 2821.611075611638 
RMSE: 53.11883917793797 
R2: 0.8866340976546575
