In [None]:
import os
import sys
# Walk up the directory tree until the working directory path ends with 'ml',
# then put that directory first on sys.path.
_start_dir = os.getcwd()
while not os.getcwd().endswith('ml'):
    _previous_dir = os.getcwd()
    os.chdir('..')
    # At the filesystem root chdir('..') is a no-op; without this guard the
    # loop would spin forever when no ancestor directory matches.
    if os.getcwd() == _previous_dir:
        os.chdir(_start_dir)
        raise RuntimeError(
            "No ancestor directory ending with 'ml' found above " + _start_dir
        )
sys.path.insert(0, os.getcwd())

In [None]:
import pandas as pd
import numpy as np
import copy

from datetime import datetime, timedelta

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer

#### Считывание данных

In [None]:
# Directory (relative to the current working directory) that holds the
# competition CSV files; kept in one place so it only has to change once.
DATA_DIR = "kaggle/walmart-recruiting-sales-in-stormy-weather"

key = pd.read_csv(f"{DATA_DIR}/key.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
weather = pd.read_csv(f"{DATA_DIR}/weather.csv")

#### Описание данных

#### Предобработка данных

In [None]:
def minutes_from_midnight(time_str):
    """Convert an ``HHMM`` clock string into minutes elapsed since midnight.

    Parameters
    ----------
    time_str : str, None or float
        Clock value whose first two characters are the hours and whose
        remaining characters are the minutes (e.g. ``"0631"``). ``None``,
        ``NaN``, an empty string or a one-character placeholder such as
        ``"-"`` are treated as missing.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` for a valid value, ``numpy.nan`` for a
        missing one.

    Raises
    ------
    ValueError
        If a longer value contains non-numeric hour or minute parts.
    """
    # Rows without a matching record after a join carry None / float NaN
    # rather than a string, so len() cannot be called on them directly.
    if time_str is None or (isinstance(time_str, float) and np.isnan(time_str)):
        return np.nan

    text = str(time_str).strip()
    # Empty strings and single-character placeholders mean "no value".
    if len(text) <= 1:
        return np.nan
    return int(text[:2]) * 60 + int(text[2:])

In [None]:
def preprocessing(data, key, weather, target_field='units'):
    """Join sales rows with weather observations and build the model matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Sales rows with at least ``store_nbr`` and ``item_nbr`` columns and,
        for training data, the ``target_field`` column.
    key : pandas.DataFrame
        Table with a ``store_nbr`` column; presumably it also supplies the
        ``station_nbr`` used for the weather join (verify against the data).
    weather : pandas.DataFrame
        Observations with ``station_nbr``, ``date``, ``sunrise``, ``sunset``
        and ``codesum`` columns.
    target_field : str, default 'units'
        Name of the target column. When present it is kept and replaced by
        ``log(1 + value)``.

    Returns
    -------
    pandas.DataFrame
        One-hot encoded ``store_nbr`` / ``item_nbr`` columns plus, when
        available, the log-transformed target column.
    """
    data = data.join(key.set_index("store_nbr"), on="store_nbr")
    data = data.join(weather.set_index(["station_nbr", "date"]), on=["station_nbr", "date"])

    # Convert each clock column from its own values (sunset must not
    # overwrite sunrise).
    for clock_col in ("sunrise", "sunset"):
        data[clock_col] = data[clock_col].apply(minutes_from_midnight)

    # Encode the weather codes of the *joined* rows and reuse data's index so
    # the indicator rows line up with the sales rows in the join below.
    # Missing codes become an empty code list instead of crashing the encoder.
    mlb = MultiLabelBinarizer()
    code_lists = data["codesum"].fillna("").str.split()
    codesum_data = pd.DataFrame(
        mlb.fit_transform(code_lists), columns=mlb.classes_, index=data.index
    )

    # Fold compound codes of four or more characters into their two parts
    # (first two characters / remainder), then drop the compound column.
    for code in list(codesum_data.columns):
        if len(code) < 4:
            continue
        for part in (code[:2], code[2:]):
            if part in codesum_data.columns:
                codesum_data[part] = codesum_data[part] + codesum_data[code]
            else:
                codesum_data[part] = codesum_data[code]
        codesum_data = codesum_data.drop(columns=code)

    data = data.drop(columns=["codesum"]).join(codesum_data)
    data = data.fillna(-1)

    # NOTE(review): only the store/item identifiers (and the target) survive
    # this selection, so the weather-derived columns built above are not part
    # of the returned frame — confirm this is intended for the baseline.
    needed_columns = ["store_nbr", "item_nbr"]
    has_target = target_field in data.columns
    if has_target:
        needed_columns.append(target_field)

    data = pd.get_dummies(data[needed_columns], columns=["store_nbr", "item_nbr"])
    if has_target:
        # Transform whichever column was requested, not a hard-coded 'units'.
        data[target_field] = np.log1p(data[target_field])

    return data

In [None]:
# Run the same preprocessing over the train and test tables, in that order.
preprocessed_train, preprocessed_test = (
    preprocessing(raw_frame, key, weather) for raw_frame in (train, test)
)

#### Обучение базовой модели

In [None]:
# Target vector: the 'units' column produced by preprocessing.
y = preprocessed_train['units']
# Feature matrix: every remaining column.
X = preprocessed_train.drop(columns='units')

In [None]:
# The default criterion is mean squared error in every scikit-learn release
# ('mse' was renamed to 'squared_error' and the old spelling later removed),
# so relying on the default keeps this cell working across versions.
# A fixed random_state makes the fitted forest reproducible.
model = RandomForestRegressor(random_state=42)

In [None]:
# 70/30 hold-out split; the fixed random_state keeps the split (and therefore
# every metric computed below) identical across notebook re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [None]:
# Fit the random forest on the training part of the split.
model.fit(X_train, y_train)

In [None]:
# Predict the (log-transformed) target for the held-out rows.
y_predict = model.predict(X_test)

In [None]:
def get_score(y_true, y_predict):
    """Return the mean squared error between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_predict : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    float
        Value computed by ``sklearn.metrics.mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_predict)
    return mse

In [None]:
# Mean squared error on the hold-out set (targets are on the log scale).
get_score(y_test, y_predict)

In [None]:
# Show predictions and true values wherever the absolute error exceeds 0.1.
large_error_mask = np.abs(y_predict - y_test) > 0.1
print(y_predict[large_error_mask])
print(y_test[large_error_mask])

In [None]:
# Regressor .score() reports the coefficient of determination (R^2) on the hold-out set.
model.score(X_test, y_test)