In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the historical daily sales records.
# StateHoliday is declared as text at read time: left to type inference it can
# come back as a mix of the integer 0 and string codes, which previously had
# to be patched after the fact with replace({0: '0'}).
past_sales = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv',
                         dtype={'StateHoliday': str})
past_sales['Date'] = pd.to_datetime(past_sales['Date'], format='%Y-%m-%d')
# Abbreviated month name, used later as a grouping and join key.
past_sales['month'] = past_sales['Date'].dt.strftime('%b')
print(past_sales.shape)
past_sales.head()

## Descriptive Summary
### On numerical columns

In [None]:
# Summary statistics for the two continuous measures.
num_cols = ['Sales', 'Customers']
past_sales.loc[:, num_cols].describe()

### On categorical columns

In [None]:
# Treat the coded columns as labels so describe() reports count/unique/top/freq.
cat_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Open']
categorical_view = past_sales.loc[:, cat_cols].astype(object)
categorical_view.describe(include=['object'])

In [None]:
# Distinct StateHoliday codes present in the training data.
pd.unique(past_sales['StateHoliday'])

### On date columns

In [None]:
# Range and spread of the observation dates.
date_cols = ['Date']
past_sales.loc[:, date_cols].describe()

### Data Anomalies

In [None]:
# Number of missing values in each column.
past_sales.isnull().sum()

In [None]:
# If every store reported on every observed day there would be
# n_stores * n_days rows; the third value is the number of store-days absent.
# Both counts are derived from the data rather than hard-coded.
n_stores = past_sales['Store'].nunique()
n_days = past_sales['Date'].nunique()
expected_rows = n_stores * n_days
expected_rows, past_sales.shape[0], expected_rows - past_sales.shape[0]

In [None]:
# Row count per store; stores with fewer rows have gaps in their history.
past_sales.loc[:, 'Store'].value_counts()

In [None]:
# Daily sales history of store 233.
sample = past_sales.loc[past_sales['Store'] == 233]
sample.set_index('Date').loc[:, 'Sales'].plot(kind='line', figsize=(14, 4));

In [None]:
# Daily sales history of store 677.
sample = past_sales.loc[past_sales['Store'] == 677]
sample.set_index('Date').loc[:, 'Sales'].plot(kind='line', figsize=(14, 4));

In [None]:
# Smallest sales figure recorded on days flagged as closed (Open == 0).
past_sales.loc[past_sales['Open'] == 0, 'Sales'].min()

In [None]:
# Smallest sales figure recorded on days flagged as open (Open == 1).
past_sales.loc[past_sales['Open'] == 1, 'Sales'].min()

In [None]:
#past_sales[(past_sales['Open'] == 1) &
#           (past_sales['Sales'] == 0)]

In [None]:
# Distribution of daily sales.
past_sales['Sales'].plot(kind='hist')

In [None]:
# Skewness of the daily sales distribution.
past_sales.loc[:, 'Sales'].skew()

## Bivariate analysis

In [None]:
# Mean daily sales per store; plot the 200 highest as bars.
store_mean_sales = past_sales.groupby('Store')['Sales'].mean()
store_mean_sales.sort_values(ascending=False).iloc[:200].plot(kind='bar')

In [None]:
# Min, quartiles and max of the per-store mean sales.
quartile_points = [0, 0.25, 0.5, 0.75, 1]
past_sales.groupby('Store')['Sales'].mean().quantile(quartile_points)

In [None]:
# Mean daily sales per calendar month.
# groupby sorts the abbreviated month labels alphabetically, which scrambles
# the time axis; re-order them into calendar order before plotting. The labels
# are generated with the same strftime('%b') format used to build `month`.
calendar_months = (pd.date_range('2000-01-01', periods=12, freq='MS')
                   .strftime('%b')
                   .rename('month'))
past_sales.groupby('month')['Sales'].mean().reindex(calendar_months).plot.bar()

In [None]:
# Mean daily sales for each value of the Promo flag.
promo_mean_sales = past_sales.groupby('Promo')['Sales'].mean()
promo_mean_sales.plot(kind='bar')

In [None]:
# Mean daily sales for each value of the Open flag.
open_mean_sales = past_sales.groupby('Open')['Sales'].mean()
open_mean_sales.plot(kind='bar')

In [None]:
# Mean daily sales for each StateHoliday code.
state_holiday_mean_sales = past_sales.groupby('StateHoliday')['Sales'].mean()
state_holiday_mean_sales.plot(kind='bar')

In [None]:
# Mean daily sales for each value of the SchoolHoliday flag.
school_holiday_mean_sales = past_sales.groupby('SchoolHoliday')['Sales'].mean()
school_holiday_mean_sales.plot(kind='bar')

In [None]:
# Correlation between customer count and sales, drawn as an annotated heatmap.
customers_sales_corr = past_sales.loc[:, ['Customers', 'Sales']].corr()
sns.heatmap(customers_sales_corr, annot=True, cmap='Blues')

In [None]:
# Load the per-store attribute table.
store_csv_path = '/kaggle/input/rossmann-store-sales/store.csv'
store = pd.read_csv(store_csv_path)
print(store.shape)
store.head(5)

In [None]:
# Number of stores of each StoreType.
store.loc[:, 'StoreType'].value_counts()

In [None]:
# Number of stores of each Assortment level.
store.loc[:, 'Assortment'].value_counts()

In [None]:
# Percentage of missing values per store attribute.
store.isna().sum().div(store.shape[0]).mul(100)

In [None]:
# Store attributes that are left out of the merged modelling table.
drop_cols = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
]
# To inspect the affected rows: store[store['CompetitionDistance'].isna()]
# NOTE(review): a missing distance becomes 0, which reads as "competitor at
# zero distance" rather than "no known competitor" -- confirm this is intended.
store = store.fillna({'CompetitionDistance': 0})

In [None]:
# Attach the retained store attributes to every daily record. A left join
# keeps all sales rows; the shapes are shown to confirm the row count.
store_attrs = store.drop(columns=drop_cols)
past_sales_merged = past_sales.merge(store_attrs, how='left', on='Store')
past_sales.shape, past_sales_merged.shape

In [None]:
# Mean daily sales per StoreType.
# (Null check after the merge, if needed: past_sales_merged.isna().sum())
store_type_mean_sales = past_sales_merged.groupby('StoreType')['Sales'].mean()
store_type_mean_sales.plot(kind='bar')

In [None]:
# Mean daily sales per Assortment level.
assortment_mean_sales = past_sales_merged.groupby('Assortment')['Sales'].mean()
assortment_mean_sales.plot(kind='bar')

In [None]:
# Daily sales against distance to the nearest competitor.
past_sales_merged.plot(kind='scatter', x='CompetitionDistance', y='Sales')

In [None]:
# Pairwise correlation between competitor distance, sales and customers.
corr_cols = ['CompetitionDistance', 'Sales', 'Customers']
past_sales_merged.loc[:, corr_cols].corr()

In [None]:
#future_sales = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv')
#future_sales.head()

### Data Preprocessing

In [None]:
# Bucket every store into a quartile of its mean daily sales
# (label 0 = lowest quartile, 3 = highest) and attach it as a feature.
store_avg_sales = past_sales_merged.groupby(['Store'])['Sales'].mean()
bins = store_avg_sales.quantile([0, 0.25, 0.5, 0.75, 1]).tolist()
# pd.cut intervals are open on the left, so widen the lowest edge to keep the
# store with the minimum mean inside bucket 0 instead of turning it into NaN.
bins[0] = bins[0]-1
store_sales_labels = pd.cut(store_avg_sales, bins=bins, labels=[0,1,2,3]).to_dict()
# Use map() for the Store -> label lookup. replace() with a dict of this size
# is far slower, and because keys (store ids) and values (labels) are both
# integers it would silently leave an unmapped store id in place as if it
# were a label. map() yields NaN instead, which the check below would surface.
past_sales_merged['store_sales_type'] = past_sales_merged['Store'].map(store_sales_labels)
past_sales_merged.isna().sum().sum()

In [None]:
past_sales_merged['dayofmonth'] = past_sales_merged['Date'].dt.day
# Replace the per-day customer count with the store's mean customer count for
# that month label, so the feature can be looked up for other rows by
# (Store, month).
# NOTE(review): the averages are computed over all rows, before the
# train/validation split performed later in the notebook.
store_avg_customers = (past_sales_merged
                       .groupby(['Store', 'month'], as_index=False)['Customers']
                       .mean())
past_sales_with_avg_customers = (past_sales_merged
                                 .drop(columns='Customers')
                                 .merge(store_avg_customers, how='left', on=['Store', 'month']))

In [None]:
target_col = 'Sales'
drop_cols = ['Date', 'Store']
# Turn weekday and month into text labels so get_dummies one-hot encodes them.
observation_dates = past_sales_with_avg_customers['Date'].dt
past_sales_with_avg_customers['DayOfWeek'] = observation_dates.strftime('%a')
past_sales_with_avg_customers['month'] = observation_dates.strftime('%b')
# One-hot encode the text columns, dropping the first level of each.
model_frame = past_sales_with_avg_customers.drop(columns=drop_cols)
dummies = pd.get_dummies(model_frame, drop_first=True)
input_cols = dummies.columns.drop(target_col)
dummies.shape, model_frame.shape

In [None]:
from sklearn.model_selection import train_test_split
# Random 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(dummies[input_cols], dummies[target_col],
                               test_size=0.2, random_state=1)
train_x, validate_x, train_y, validate_y = split_parts
tuple(part.shape for part in split_parts)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training rows only, then apply the
# same transformation to the validation rows.
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
validate_x_scaled = scaler.transform(validate_x)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.metrics import mean_squared_error
# Baseline: ordinary least squares on the standardized features.
linear_model = LinearRegression().fit(train_x_scaled, train_y)
validate_y_pred = linear_model.predict(validate_x_scaled)
# Take the square root explicitly: the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, so passing it fails on current
# releases. This form works on every version.
rmse = np.sqrt(mean_squared_error(validate_y, validate_y_pred))
rmse

In [None]:
#import statsmodels.api as sm
#model = sm.OLS(train_y, pd.DataFrame(train_x_scaled, columns=train_x.columns, index=train_x.index))
#results = model.fit()
#print(results.summary())

In [None]:
# Disabled experiment kept inside a string literal so it does not execute:
# a 5-fold grid search over DecisionTreeRegressor max_depth values 5..14.
'''

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
grids = {'max_depth': list(range(5, 15))}
cv = GridSearchCV(estimator=DecisionTreeRegressor(),
                  param_grid=grids, cv=5, return_train_score=True).fit(train_x, train_y)
'''

In [None]:
#cv.best_params_

In [None]:
#df_cv = pd.DataFrame(cv.cv_results_)
#df_cv

In [None]:
# Disabled experiment kept inside a string literal so it does not execute:
# plots mean train vs. test score against max_depth from a `df_cv` frame
# (built from the grid-search results in a commented-out cell).
'''

import matplotlib.pyplot as plt
plt.plot(df_cv['param_max_depth'], df_cv['mean_train_score'])
plt.plot(df_cv['param_max_depth'], df_cv['mean_test_score'])
plt.legend(['Train score', 'Test Score'])
plt.xlabel('Max Depth')
plt.ylabel('R2 Score');
'''

In [None]:
#df_cv[['param_max_depth', 'mean_test_score', 'rank_test_score']]

In [None]:
# Disabled experiment kept inside a string literal so it does not execute:
# a depth-10 decision tree evaluated on the validation split.
# NOTE(review): if re-enabled, `squared=False` is not accepted by recent
# scikit-learn releases; take the square root of the MSE instead.
'''

dt_model = DecisionTreeRegressor(max_depth=10, random_state=1).fit(train_x, train_y)
validate_y_pred = dt_model.predict(validate_x)
rmse = mean_squared_error(validate_y, validate_y_pred, squared=False)
rmse
'''

In [None]:
# Random forest on the unscaled features (trees do not need standardization).
# random_state pins the bootstrap / feature sampling so a re-run reproduces
# the same model and score; n_jobs=-1 only parallelizes the fit.
rf_model = RandomForestRegressor(n_estimators=50, random_state=1, n_jobs=-1).fit(train_x, train_y)
validate_y_pred = rf_model.predict(validate_x)
# Take the square root explicitly: the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(validate_y, validate_y_pred))
rmse

In [None]:
# Apply the training-time feature pipeline to the rows that must be predicted.
future_sales = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv')
future_sales['Date'] = pd.to_datetime(future_sales['Date'], format='%Y-%m-%d')
future_sales['StateHoliday'] = future_sales['StateHoliday'].replace({0:'0'})
future_sales['month'] = future_sales['Date'].dt.strftime('%b')
drop_cols = ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
             'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
future_sales_merged = pd.merge(left=future_sales, right=store.drop(drop_cols, axis=1),
                               on='Store', how='left')
# map() rather than replace(): a store without a learned label becomes NaN
# (caught by the later null check) instead of keeping its raw id as a "label".
future_sales_merged['store_sales_type'] = future_sales_merged['Store'].map(store_sales_labels)

future_sales_merged['dayofmonth'] = future_sales_merged['Date'].dt.day

# Reuse the per-(Store, month) customer averages learned from the history.
future_sales_with_avg_customers = pd.merge(left=future_sales_merged,
                                           right=store_avg_customers,
                                           on=['Store', 'month'], how='left')

future_sales_with_avg_customers['DayOfWeek'] = future_sales_with_avg_customers['Date'].dt.strftime('%a')
future_sales_with_avg_customers['month'] = future_sales_with_avg_customers['Date'].dt.strftime('%b')
drop_cols = ['Date', 'Store']
# Keep every level here (drop_first=False). drop_first drops the alphabetically
# first level present in this frame, which need not be the level dropped when
# the training dummies were built. If this frame holds only a subset of the
# training categories (presumably the case for `month` -- verify), a real
# level would be dropped, zero-filled in the alignment step, and silently
# encoded as the training baseline. The extra baseline columns are harmless:
# prediction selects `input_cols` only.
dummies_test = pd.get_dummies(future_sales_with_avg_customers.drop(drop_cols, axis=1), drop_first=False)
dummies_test.shape, dummies.shape

In [None]:
# Columns the model was trained on that the test frame lacks: report each one
# and add it as an all-zero indicator (the target column is only reported).
missing_levels = np.setdiff1d(dummies.columns, dummies_test.columns)
for level in missing_levels:
    print(level)
    if level == 'Sales':
        continue
    dummies_test[level] = 0
# Rows with an unknown Open flag are treated as open.
open_flag = dummies_test['Open']
dummies_test['Open'] = open_flag.where(open_flag.notna(), 1)
dummies.shape, dummies_test.shape

In [None]:
# Total number of missing cells left in the test feature frame.
dummies_test.isnull().sum().sum()

In [None]:
# Select the training feature columns, in training order, and predict.
test_features = dummies_test.loc[:, input_cols]
test_y_pred = rf_model.predict(test_features)


In [None]:
# Reference file showing the expected submission layout.
sample_submission_path = '/kaggle/input/rossmann-store-sales/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head(5)

In [None]:
# Distribution of the predicted sales values.
pd.Series(test_y_pred).plot(kind='hist')

In [None]:
# Pair each test row Id with its predicted sales. Predictions are positional,
# so they line up with future_sales row by row.
submission = future_sales[['Id']].assign(Sales=test_y_pred)
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)