In [4]:
import os
import sys
while not os.getcwd().endswith('ml'):
    os.chdir('..')
sys.path.insert(0, os.getcwd())

In [41]:
import pandas as pd
import numpy as np
import copy
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [48]:
TARGET_NAME = 'units'

#### Считывание данных

In [6]:
key = pd.read_csv("kaggle/walmart-recruiting-sales-in-stormy-weather/key.csv")
train = pd.read_csv("kaggle/walmart-recruiting-sales-in-stormy-weather/train.csv")
test = pd.read_csv("kaggle/walmart-recruiting-sales-in-stormy-weather/test.csv")
weather = pd.read_csv("kaggle/walmart-recruiting-sales-in-stormy-weather/weather.csv")

#### Описание данных

In [7]:
key.sample()

Unnamed: 0,store_nbr,station_nbr
10,11,10


In [8]:
train.sample()

Unnamed: 0,date,store_nbr,item_nbr,units
3018450,2013-10-15,15,28,0


In [9]:
weather.sample()

Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
10081,14,2013-05-26,82,67,75,3,68,70,0,10,519,1935,,0.0,0.0,28.57,29.94,14.9,16,15.1


In [10]:
weather.wetbulb.value_counts()

M      1252
74      777
75      684
73      663
72      609
71      526
70      481
67      474
65      459
64      437
62      433
68      428
66      425
63      423
60      419
61      409
59      408
58      406
69      399
56      383
76      362
55      356
57      344
52      337
54      330
53      325
44      323
45      307
50      305
51      304
       ... 
17       54
15       53
16       48
12       43
10       40
13       38
11       33
8        31
79       21
4        20
7        16
9        14
0        14
5        13
3        11
6         9
2         7
-3        7
1         5
-4        4
-2        3
-5        3
-6        2
-7        2
-10       2
-15       1
-8        1
-9        1
-1        1
80        1
Name: wetbulb, Length: 93, dtype: int64

#### Предобработка данных

In [11]:
def preprocessing(data, key, weather, target_field=TARGET_NAME):
    data = data.join(key.set_index("store_nbr"), on="store_nbr")
    data = data.join(weather.set_index(["station_nbr", "date"]), on=["station_nbr", "date"])
    data.drop(columns=["codesum", "sunrise", "sunset"])
    data.fillna(-1, inplace=True)
    
    needed_columns = ["store_nbr", "item_nbr"]
    if target_field in data.columns:
        needed_columns.append(target_field)
    
    data = data[needed_columns]
    data = pd.get_dummies(data, columns=["store_nbr", "item_nbr"])
    if target_field in data.columns:
        data[target_field] = np.log(data[target_field] + 1)
        
    return data

In [62]:
preprocessed_train = preprocessing(train, key, weather)
preprocessed_test = preprocessing(test, key, weather)

train_columns = preprocessed_train.columns
test_columns = preprocessed_test.columns
diff_columns = list(set(train_columns) - set(test_columns))
diff_columns.remove(TARGET_NAME)
print("Diff of columns {}". format(diff_columns))
for column in diff_columns:
    preprocessed_test[column] = 0 # Add missing field after onehot encoding

Diff of columns ['store_nbr_35']


# Обучение базовой модели

In [63]:
y = preprocessed_train['units']
X = preprocessed_train.loc[:, preprocessed_train.columns != 'units']

In [64]:
print(set(X.columns) - set(preprocessed_test.columns))
print(len(list(preprocessed_test.columns)))

set()
156


In [65]:
model = RandomForestRegressor(criterion='mse')

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)



In [67]:
X_train = X_train[0:100000]
y_train = y_train[0:100000]

In [68]:
model.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [69]:
y_predict = model.predict(X_test)

In [70]:
def get_score(y_true, y_predict):
    return math.sqrt(mean_squared_error(y_true, y_predict))

In [71]:
get_score(y_test, y_predict)

0.1371793813719086

#### Предсказание

In [None]:
y_predict = model.predict(preprocessed_test)