In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

%matplotlib inline

In [None]:
# Read the train, test and store CSV files (in that order) into DataFrames.
df_train, df_test, df_store = map(
    pd.read_csv,
    ('../input/train.csv', '../input/test.csv', '../input/store.csv'),
)

In [None]:
# Regression target: the "Sales" column of the training table as a NumPy array.
y = df_train.loc[:, "Sales"].values

In [None]:
# Split the textual "Date" column into integer Year / Month / Day columns
# by slicing fixed character positions, for both the train and test tables.
for frame in (df_train, df_test):
    for column, span in (('Year', slice(0, 4)), ('Month', slice(5, 7)), ('Day', slice(8, 10))):
        frame[column] = frame['Date'].map(lambda text, span=span: int(text[span]))

In [None]:
# Replace missing values with 0. Assign the result back to the column instead of
# calling fillna(inplace=True) on an attribute access: that form works on a
# temporary Series and leaves the frame unchanged under pandas Copy-on-Write.
df_store["CompetitionDistance"] = df_store["CompetitionDistance"].fillna(0)
df_test["Open"] = df_test["Open"].fillna(0)

# Normalise integer 0 entries of StateHoliday to the string "0" so the column
# holds a single representation of that level before one-hot encoding.
# A single .loc assignment replaces the chained `df.col[mask] = ...` form,
# which may write to a copy rather than to df_train.
df_train.loc[df_train["StateHoliday"] == 0, "StateHoliday"] = "0"

In [None]:
# Report (rows, columns) of each table: train, test, store.
for frame in (df_train, df_test, df_store):
    print(frame.shape)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Preview the first five test rows.
df_test.head(n=5)

In [None]:
# Preview the first five store rows.
df_store.head(n=5)

In [None]:
# Side-by-side bar charts of Sales and Customers per Year.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

sns.barplot(data=df_train, x='Year', y='Sales', ax=axis1)
sns.barplot(data=df_train, x='Year', y='Customers', ax=axis2)

In [None]:
# Histograms of Sales and Customers restricted to rows where Open == 1.
open_rows = df_train.query('Open == 1')
open_rows[['Sales', 'Customers']].hist(bins=100, figsize=(13, 7));

In [None]:
# Side-by-side bar charts of Sales and Customers per Month.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
sns.barplot(data=df_train, x='Month', y='Sales', ax=axis1)
sns.barplot(data=df_train, x='Month', y='Customers', ax=axis2)

In [None]:
# Side-by-side bar charts of Sales and Customers per DayOfWeek.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
sns.barplot(data=df_train, x='DayOfWeek', y='Sales', ax=axis1)
sns.barplot(data=df_train, x='DayOfWeek', y='Customers', ax=axis2)

In [None]:
# Pairwise correlation matrix between Sales and Customers.
df_train.loc[:, ['Sales', 'Customers']].corr()

Значения Sales и Customers сильно скоррелированы.

Далее преобразуем категориальные признаки в набор бинарных (dummy) столбцов.

In [None]:
# One-hot encode DayOfWeek and StateHoliday, append the indicator columns,
# then drop the raw columns that are not used as model features
# (including the target "Sales" and "Customers").
df_DayOfWeek = pd.get_dummies(df_train['DayOfWeek'], prefix='DayOfWeek')
df_StateHoliday = pd.get_dummies(df_train['StateHoliday'], prefix="StateHoliday_")
df_train = pd.concat([df_train, df_DayOfWeek, df_StateHoliday], axis=1).drop(
    columns=["Date", "Day", "Customers", "DayOfWeek", "Sales", "StateHoliday"]
)

In [None]:
# One-hot encode StoreType and Assortment on the store table, then drop the
# raw categorical columns together with PromoInterval.
df_StoreType = pd.get_dummies(df_store['StoreType'], prefix='StoreType_')
df_Assortment = pd.get_dummies(df_store['Assortment'], prefix='Assortment_')
df_store = pd.concat([df_store, df_StoreType, df_Assortment], axis=1).drop(
    columns=["StoreType", "Assortment", "PromoInterval"]
)

Присоединяем к обучающим данным сведения о магазинах (merge по столбцу Store).

In [None]:
# Left-join the store attributes onto every training row via the Store key.
df = df_train.merge(df_store, how='left', on=['Store'])

In [None]:
# Replace every remaining missing value in the merged training frame with 0.
df = df.fillna(0)

In [None]:
# Feature matrix: every column of the merged frame except the first one.
X = df.to_numpy()[:, 1:]

In [None]:
# Candidate values to scan: 40, 80, ..., 240.
parametrs = range(40, 240 + 1, 40)

In [None]:
# Scan n_estimators with 5-fold CV on the first 20000 rows only.
# param_name / param_range are passed by keyword: they are keyword-only in
# current scikit-learn, so the positional form raises TypeError there.
# random_state is fixed so the curve is reproducible across runs.
scores, tst_scr = validation_curve(
    RandomForestRegressor(n_jobs=4, random_state=42),
    X[:20000],
    y[:20000],
    param_name='n_estimators',
    param_range=parametrs,
    cv=5,
    scoring='r2',
    verbose=2,
)

In [None]:
# validation_curve returns (train_scores, test_scores); here `scores` holds the
# training scores and `tst_scr` the validation scores. Each array has one row
# per parameter value and one column per CV fold, so aggregate across folds.
scores_mean = scores.mean(axis=1)
scores_std = scores.std(axis=1)
tst_scr_mean = tst_scr.mean(axis=1)
tst_scr_std = tst_scr.std(axis=1)

# Mean score with a +/- one standard deviation band; labelled so the two
# curves can be told apart.
fig, ax = plt.subplots()
ax.plot(parametrs, tst_scr_mean, label='validation')
ax.fill_between(parametrs, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3)
ax.plot(parametrs, scores_mean, label='train')
ax.fill_between(parametrs, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)
ax.set(xlabel='n_estimators', ylabel='R^2 score', title='Validation curve: n_estimators')
ax.legend();

In [None]:
# Show the (rows, columns) size of the merged training frame.
df.shape

In [None]:
# Candidate values to scan: every integer from 3 to 23.
parametrs = range(3, 23 + 1)

In [None]:
# Scan max_features with 3-fold CV on the first 20000 rows only, using
# 120 trees per forest.
# param_name / param_range are passed by keyword: they are keyword-only in
# current scikit-learn, so the positional form raises TypeError there.
# random_state is fixed so the curve is reproducible across runs.
scores, tst_scr = validation_curve(
    RandomForestRegressor(n_estimators=120, n_jobs=4, random_state=42),
    X[:20000],
    y[:20000],
    param_name='max_features',
    param_range=parametrs,
    cv=3,
    scoring='r2',
    verbose=2,
)

In [None]:
# validation_curve returns (train_scores, test_scores); here `scores` holds the
# training scores and `tst_scr` the validation scores. Each array has one row
# per parameter value and one column per CV fold, so aggregate across folds.
scores_mean = scores.mean(axis=1)
scores_std = scores.std(axis=1)
tst_scr_mean = tst_scr.mean(axis=1)
tst_scr_std = tst_scr.std(axis=1)

# Mean score with a +/- one standard deviation band; labelled so the two
# curves can be told apart.
fig, ax = plt.subplots()
ax.plot(parametrs, tst_scr_mean, label='validation')
ax.fill_between(parametrs, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3)
ax.plot(parametrs, scores_mean, label='train')
ax.fill_between(parametrs, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)
ax.set(xlabel='max_features', ylabel='R^2 score', title='Validation curve: max_features')
ax.legend();

In [None]:
# Candidate values to scan: 4, 8, ..., 60.
parametrs = range(4, 60 + 1, 4)

In [None]:
# Scan max_depth with 3-fold CV on the first 20000 rows only, using
# 120 trees and max_features=16 per forest.
# param_name / param_range are passed by keyword: they are keyword-only in
# current scikit-learn, so the positional form raises TypeError there.
# random_state is fixed so the curve is reproducible across runs.
scores, tst_scr = validation_curve(
    RandomForestRegressor(n_estimators=120, n_jobs=4, max_features=16, random_state=42),
    X[:20000],
    y[:20000],
    param_name='max_depth',
    param_range=parametrs,
    cv=3,
    scoring='r2',
    verbose=2,
)

In [None]:
# validation_curve returns (train_scores, test_scores); here `scores` holds the
# training scores and `tst_scr` the validation scores. Each array has one row
# per parameter value and one column per CV fold, so aggregate across folds.
scores_mean = scores.mean(axis=1)
scores_std = scores.std(axis=1)
tst_scr_mean = tst_scr.mean(axis=1)
tst_scr_std = tst_scr.std(axis=1)

# Mean score with a +/- one standard deviation band; labelled so the two
# curves can be told apart.
fig, ax = plt.subplots()
ax.plot(parametrs, tst_scr_mean, label='validation')
ax.fill_between(parametrs, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3)
ax.plot(parametrs, scores_mean, label='train')
ax.fill_between(parametrs, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)
ax.set(xlabel='max_depth', ylabel='R^2 score', title='Validation curve: max_depth')
ax.legend();

In [None]:
# Final forest configuration (120 trees, depth <= 20, 16 features per split).
# random_state is fixed so that refitting reproduces the same model and,
# therefore, the same submission file.
model = RandomForestRegressor(
    n_estimators=120,
    max_depth=20,
    max_features=16,
    n_jobs=4,
    random_state=42,
    verbose=2,
)

In [None]:
# Train the forest on the full training feature matrix and target.
model.fit(X=X, y=y)

In [None]:
# Feature indices ordered from most to least important.
idx = np.argsort(model.feature_importances_)[::-1]

In [None]:
# Horizontal bar chart of feature importances, most important first.
# Names come from the merged frame without 'Store', matching the columns of X.
importances = model.feature_importances_[idx]
feature_names = df.drop(columns='Store').columns[idx]
ax = sns.barplot(x=importances, y=feature_names)

Далее таким же образом обрабатываем тестовую выборку (test).

In [None]:
# One-hot encode DayOfWeek and StateHoliday for the test table.
df_DayOfWeek = pd.get_dummies(df_test.DayOfWeek, prefix='DayOfWeek')
df_StateHoliday = pd.get_dummies(df_test.StateHoliday, prefix="StateHoliday_")

# Force the holiday indicators onto a fixed four-column layout, creating any
# level absent from the test data as an all-zero column. reindex replaces the
# previous concat with an empty DataFrame, which produced object-dtype NaN
# columns and would duplicate a column if the level were already present.
df_StateHoliday = df_StateHoliday.reindex(
    columns=['StateHoliday__0', 'StateHoliday__a', 'StateHoliday__b', 'StateHoliday__c'],
    fill_value=0,
)

In [None]:
# Replace any missing values in the test StateHoliday indicator frame with 0.
df_StateHoliday.fillna(0, inplace=True)

In [None]:
# Drop the raw columns that were replaced by engineered features, append the
# indicator columns, join the store attributes on Store and fill gaps with 0.
df_test = pd.concat(
    [
        df_test.drop(columns=["Date", "Day", "DayOfWeek", "StateHoliday"]),
        df_DayOfWeek,
        df_StateHoliday,
    ],
    axis=1,
)
test_df = df_test.merge(df_store, how='left', on=['Store'])
test_df.fillna(0, inplace=True)

In [None]:
# Select the test features by the training column names (every column of the
# merged training frame except the first, exactly as X was built) rather than
# by position. A missing or reordered column then raises a KeyError instead of
# silently feeding misaligned features to the model.
feature_columns = df.columns[1:]
y_test_pred = model.predict(test_df[feature_columns].values)

In [None]:
# Build the submission table: one predicted Sales value per test Id.
# reshape(-1) flattens the predictions to 1-D; the previous reshape(-1.1)
# passed a float dimension, which NumPy rejects with a TypeError.
submission = pd.DataFrame({"Id": test_df.Id, "Sales": y_test_pred.reshape(-1)})

In [None]:
# Display the submission frame for a visual sanity check.
submission

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf="rossman.csv", index=False)