In [None]:
import os
import sys
# Walk up the directory tree until the working directory's path ends with
# 'ml', then make that directory importable.
while not os.getcwd().endswith('ml'):
    current_dir = os.getcwd()
    # At the filesystem root the parent equals the directory itself and
    # os.chdir('..') no longer moves; fail loudly instead of looping forever.
    if os.path.dirname(current_dir) == current_dir:
        raise RuntimeError("Could not find a parent directory ending with 'ml'")
    os.chdir('..')
# Avoid stacking duplicate entries when the cell is re-run.
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())

In [None]:
import pandas as pd
import numpy as np
import copy

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate

In [None]:
# Name of the target column; used as the default `target_field` of
# preprocessing() and excluded from the train/test column comparison.
TARGET_NAME = 'units'

In [None]:
def get_score(y_true, y_predict):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_predict: Predicted target values, same length as ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_predict)`` as a
        Python float.
    """
    # np.sqrt is used because `math` is never imported in this notebook;
    # float() keeps the return type a plain Python float.
    return float(np.sqrt(mean_squared_error(y_true, y_predict)))

#### Считывание данных

In [None]:
# Read the four competition tables; the unpacking order matches the path order.
key, train, test, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        "kaggle/walmart-recruiting-sales-in-stormy-weather/key.csv",
        "kaggle/walmart-recruiting-sales-in-stormy-weather/train.csv",
        "kaggle/walmart-recruiting-sales-in-stormy-weather/test.csv",
        "kaggle/walmart-recruiting-sales-in-stormy-weather/weather.csv",
    )
)

#### Описание данных

#### Предобработка данных

In [None]:
def convert_to_minutes(x):
    """Convert a Series of HHMM clock values to minutes, in place.

    Entries equal to ``"-"`` are replaced with NaN. Every remaining non-null
    entry is cast to int and rewritten as ``hours * 60 + minutes``, where
    hours are the value divided by 100 and minutes the remainder.

    Args:
        x: pandas Series to convert. It is modified in place.

    Returns:
        The same Series ``x`` after conversion, so callers can assign the
        result back to a DataFrame column instead of relying on the in-place
        mutation reaching the parent frame.
    """
    x.loc[x == "-"] = np.nan
    # Compute the mask and the integer cast once instead of per use.
    present = x.notna()
    hhmm = x[present].astype(int)
    x.loc[present] = (hhmm % 100) + (hhmm // 100 * 60)
    return x

#### Предобработка данных

In [None]:
def preprocessing(data, key, weather, target_field=TARGET_NAME):
    """Join weather information onto sales rows and build the model matrix.

    Steps:
      1. Join ``key`` on ``store_nbr`` and ``weather`` on
         ``(station_nbr, date)`` (presumably ``key`` supplies ``station_nbr``
         for each store; confirm against key.csv).
      2. Convert ``sunrise`` / ``sunset`` to minutes via convert_to_minutes.
      3. Expand ``codesum`` into one indicator column per weather code,
         folding compound codes (4+ characters) into their two sub-codes.
      4. Fill missing values with -1.
      5. Keep only ``store_nbr``, ``item_nbr`` and, when present, the target;
         one-hot encode the two id columns and log-transform the target.

    NOTE(review): step 5 discards every weather-derived column built in
    steps 2-3, so they currently do not reach the model.

    Args:
        data: DataFrame with ``store_nbr``, ``item_nbr``, ``date`` and
            optionally ``target_field``.
        key: DataFrame with a ``store_nbr`` column.
        weather: DataFrame with ``station_nbr``, ``date``, ``sunrise``,
            ``sunset`` and ``codesum`` columns.
        target_field: Name of the target column; transformed with
            ``log(1 + x)`` when it is present in ``data``.

    Returns:
        DataFrame of one-hot encoded ``store_nbr`` / ``item_nbr`` columns
        plus the log-transformed target when available.
    """
    data = data.join(key.set_index("store_nbr"), on="store_nbr")
    data = data.join(weather.set_index(["station_nbr", "date"]), on=["station_nbr", "date"])

    # Convert on an explicit copy and assign back: mutating the result of
    # data[col] is not guaranteed to reach `data` (pandas Copy-on-Write).
    for time_column in ("sunrise", "sunset"):
        minutes = data[time_column].copy()
        convert_to_minutes(minutes)
        data[time_column] = minutes

    # Fit on the whole weather table so the set of code columns is the same
    # for every call, but transform the rows of `data` so the indicator frame
    # shares data's index and joins row-for-row.
    mlb = MultiLabelBinarizer()
    mlb.fit(weather["codesum"].fillna("").str.split())
    codesum_data = pd.DataFrame(
        mlb.transform(data["codesum"].fillna("").str.split()),
        columns=mlb.classes_,
        index=data.index,
    )
    # Fold compound codes into their two halves (first two characters and the
    # rest). Iterate over a snapshot because columns are added and dropped.
    for c in list(codesum_data.columns):
        if len(c) < 4:
            continue
        for sub_c in (c[:2], c[2:]):
            if sub_c in codesum_data.columns:
                codesum_data[sub_c] = codesum_data[sub_c] + codesum_data[c]
            else:
                codesum_data[sub_c] = codesum_data[c]
        codesum_data = codesum_data.drop(columns=c)

    data = data.drop(columns=["codesum"])
    data = data.join(codesum_data.sort_index(axis=1))

    data = data.fillna(-1)

    needed_columns = ["store_nbr", "item_nbr"]
    if target_field in data.columns:
        needed_columns.append(target_field)
    data = data[needed_columns]
    data = pd.get_dummies(data, columns=["store_nbr", "item_nbr"])
    if target_field in data.columns:
        # Replace the column rather than writing floats into it through .loc,
        # which is an incompatible-dtype set when the target is integer.
        data[target_field] = np.log1p(data[target_field])

    return data

In [None]:
preprocessed_train = preprocessing(train, key, weather)
preprocessed_test = preprocessing(test, key, weather)

train_columns = preprocessed_train.columns
test_columns = preprocessed_test.columns
# Feature columns are every training column except the target.
feature_columns = [column for column in train_columns if column != TARGET_NAME]
# One-hot columns present in train but absent from test.
diff_columns = [column for column in feature_columns if column not in set(test_columns)]
print("Diff of columns {}".format(diff_columns))
# Add the missing one-hot columns as zeros and put the test columns in the
# training order. (`frame.loc[column] = 0` would append a row, not a column.)
preprocessed_test = preprocessed_test.reindex(columns=feature_columns, fill_value=0)

#### Кросс-валидация

In [None]:
y = preprocessed_train['units']
X = preprocessed_train.loc[:, preprocessed_train.columns != 'units']

# Cross-validate on the first 10000 rows only.
X = X[0:10000]
y = y[0:10000]
# `criterion` is left at its default (squared error): the explicit 'mse'
# spelling was removed in scikit-learn 1.2 and raises there.
model = RandomForestRegressor(n_estimators=100, max_depth=20)

cv_results = cross_validate(model, X, y, cv=3)


In [None]:
# Show the result of the cross_validate call from the previous cell.
print(cv_results)

In [None]:
# Hold-out evaluation on the current X / y. The split is created here because
# X_train / X_test / y_train / y_test are not defined anywhere else.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Default criterion is squared error; the 'mse' spelling was removed in
# scikit-learn 1.2.
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# The target was log-transformed in preprocessing, so this is an RMSE in log space.
print("score: {}".format(get_score(y_test, y_predict)))

#### Обучение базовой модели

In [None]:
# Rebuild X / y from the full preprocessed training frame: every column except
# 'units' is a feature, 'units' is the target.
X = preprocessed_train.loc[:, [name != 'units' for name in preprocessed_train.columns]]
y = preprocessed_train['units']

In [None]:
# Default criterion is squared error; passing criterion='mse' raises on
# scikit-learn >= 1.2, where that spelling was removed.
model = RandomForestRegressor()

In [None]:
# Fit the baseline model on the current X / y.
model.fit(X, y)

#### Предсказание

In [None]:
# Align the test features with the columns the model was fitted on (same set,
# same order); one-hot columns missing from test are filled with 0.
y_predict = model.predict(preprocessed_test.reindex(columns=X.columns, fill_value=0))
# Undo the log(units + 1) transform applied to the target in preprocessing.
y_predict = np.expm1(y_predict)

In [None]:
# Submission id: "<store_nbr>_<item_nbr>_<date>" for every test row.
_id = (
    test["store_nbr"].astype("str")
    .str.cat(
        [test["item_nbr"].astype("str"), test["date"].astype('str')],
        sep="_",
    )
    .to_numpy()
)
# Predicted unit counts rounded to the nearest integer.
_units = np.rint(y_predict).astype(int)

In [None]:
# One row per id, with the ids as the index named "id" and a single "units" column.
prediction = pd.DataFrame({"units": _units}, index=pd.Index(_id, name="id"))
prediction.to_csv("kaggle/walmart-recruiting-sales-in-stormy-weather/submissions/benchmark1.csv")