In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error # validation error for the regression models

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw competition tables: per-store metadata, the rows that have to
# be scored, and the historical daily sales used for training.
store = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv')
future_sales = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv')
past_sales = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv')

# Parse the date strings and rewrite any integer 0 in StateHoliday to the
# string "0" so the column holds a single representation of that level.
past_sales = past_sales.assign(
    Date=pd.to_datetime(past_sales['Date'], format="%Y-%m-%d"),
    StateHoliday=past_sales['StateHoliday'].replace({0: "0"}),
)
past_sales.shape

In [None]:
# Summary statistics of the parsed Date column (the period covered by the sales history).
past_sales['Date'].describe()

In [None]:
# Missing-value count for every column of every table.
# A notebook cell renders only its last expression, so three separate
# `isna().sum()` lines would show the scoring table alone. Stacking them into
# one Series (outer index level = table name) keeps all three visible.
pd.concat({
    'past_sales': past_sales.isna().sum(),
    'store': store.isna().sum(),
    'future_sales': future_sales.isna().sum(),
})

In [None]:
# First rows of the historical sales table.
past_sales.head()

In [None]:

# First rows of the per-store metadata table.
store.head()

In [None]:

# First rows of the table that has to be scored for the submission.
future_sales.head()

## Summary of numerical columns

In [None]:
# Columns treated as numeric measures; describe() reports their distribution
# over all rows of the sales history.
num_cols = ['Sales', 'Customers']
past_sales[num_cols].describe()

In [None]:
# Sales distribution restricted to rows where the store was open (Open == 1).
is_open = past_sales['Open'] == 1
past_sales.loc[is_open, 'Sales'].describe()

In [None]:
#past_sales[(past_sales['Open'] == 1) & (past_sales['Sales'] == 0)]

## Summary of categorical columns

In [None]:
# Column names of the sales history, used to pick the categorical columns below.
past_sales.columns

In [None]:
# Columns treated as categorical. Casting to object makes describe() report
# count / unique / top / freq instead of numeric statistics.
cat_cols = ['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
past_sales[cat_cols].astype(object).describe()

In [None]:
# Distinct StateHoliday levels after the integer 0 -> "0" replacement done at load time.
past_sales['StateHoliday'].unique()

In [None]:
# Daily sales over time for one example store.
storeid = 100
store_subset = past_sales.loc[past_sales['Store'].eq(storeid)]
store_subset.plot(kind='line', x='Date', y='Sales', figsize=(14, 5));

In [None]:
from scipy.stats import f_oneway

In [None]:
# One-way ANOVA of Sales between promo and non-promo days (5% significance level).
num_col = 'Sales'
cat_col = 'Promo'
sample1 = past_sales.loc[past_sales['Promo'] == 1, num_col]
sample0 = past_sales.loc[past_sales['Promo'] == 0, num_col]
statistic, pvalue = f_oneway(sample1, sample0)

# Pick the message template first, then fill in the column names once.
verdict = ('Reject null hyphothesis. %s influences %s'
           if pvalue < 0.05
           else 'Fail to reject null hyphothesis. %s does not influences %s')
print(verdict % (cat_col, num_col))

In [None]:
# Same ANOVA as the two-sample version, but with one sample per level of
# `cat_col`, so it works for columns with more than two levels.
num_col = 'Sales'
cat_col = 'Promo'
samples_all = {
    level: past_sales.loc[past_sales[cat_col] == level, num_col].tolist()
    for level in past_sales[cat_col].unique()
}
statistic, pvalue = f_oneway(*samples_all.values())

names = (cat_col, num_col)
if not pvalue < 0.05:
    print('Fail to reject null hyphothesis. %s does not influences %s' % names)
else:
    print('Reject null hyphothesis. %s influences %s' % names)

In [None]:
# One-way ANOVA of Sales against every categorical column (5% significance level).
num_col = 'Sales'
cat_cols = ['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
for cat_col in cat_cols:
    # A single groupby pass splits Sales by level. Filtering the whole frame
    # once per level re-scans every row for each level, which is very slow for
    # high-cardinality columns such as Store; the samples are also handed over
    # as arrays rather than copied into Python lists.
    samples_all = {
        category: values.to_numpy()
        for category, values in past_sales.groupby(cat_col)[num_col]
    }
    statistic, pvalue = f_oneway(*samples_all.values())
    if pvalue < 0.05:
        print('%s influences %s' % (cat_col, num_col))
    else:
        print('%s does not influences %s' % (cat_col, num_col))

In [None]:
# Correlation of the KPI with the other numeric column(s), strongest first.
kpi = 'Sales'
corr_matrix = past_sales[['Customers', 'Sales']].corr()
corr_with_kpi = corr_matrix[kpi].drop(kpi)
corr_with_kpi.sort_values(ascending=False)

In [None]:
# Mean of the KPI for each day of the week, drawn as a bar chart.
kpi_by_weekday = past_sales.groupby(['DayOfWeek'])[kpi].mean()
kpi_by_weekday.plot(kind='bar');

## Outlier Analysis

In [None]:
# Box plot of the KPI to eyeball extreme values.
past_sales[kpi].plot(kind='box');

## Missing Value Treatment

In [None]:
# Percentage of missing values per column of the store table.
store.isna().sum() / store.shape[0] * 100

In [None]:
# Distinct values of CompetitionOpenSinceMonth, inspected before choosing how to impute/bin it.
store['CompetitionOpenSinceMonth'].unique()

In [None]:
# Store table again, as a reference for the imputation step that follows.
store.head()

In [None]:
# Work on a copy so the raw `store` table stays available.
store_imputed = store.copy()

# Numeric gaps are filled with the column mean. The competition month keeps a
# mean-imputed numeric copy ("_1") next to the binned copy ("_2") built below.
distance_mean = store_imputed['CompetitionDistance'].mean()
store_imputed['CompetitionDistance'] = store_imputed['CompetitionDistance'].fillna(distance_mean)
month_mean = store_imputed['CompetitionOpenSinceMonth'].mean()
store_imputed['CompetitionOpenSinceMonth_1'] = store_imputed['CompetitionOpenSinceMonth'].fillna(month_mean)

# Competition month -> three-month buckets; a missing month becomes its own 'NA' level.
bins = [0, 3, 6, 9, 12.5]
labels = ['0-3', '3-6', '6-9', '9-12']
month_bucket = pd.cut(store_imputed['CompetitionOpenSinceMonth'], bins=bins, labels=labels)
store_imputed['CompetitionOpenSinceMonth_2'] = month_bucket.astype(object).fillna('NA')
store_imputed = store_imputed.drop(columns='CompetitionOpenSinceMonth')

# Remaining sparse columns: missing is kept as an explicit 'NA' value.
for sparse_col in ['CompetitionOpenSinceYear', 'PromoInterval', 'Promo2SinceYear']:
    store_imputed[sparse_col] = store_imputed[sparse_col].fillna('NA')

# Promo2 start week -> buckets; the last bucket is bounded at 55 but labelled '40-50'.
bins = [0, 10, 20, 30, 40, 55]
labels = ['0-10', '10-20', '20-30', '30-40', '40-50']
week_bucket = pd.cut(store_imputed['Promo2SinceWeek'], bins=bins, labels=labels)
store_imputed['Promo2SinceWeek'] = week_bucket.astype(object).fillna('NA')

# Confirm that no missing values remain.
store_imputed.isna().sum()

## Data Preprocessing
- Combine all necessary data
- Standardization

In [None]:
# Average customer count per store, used as a feature in place of the daily
# Customers column, which is dropped from the model inputs below.
store_avg_customers = (
    past_sales.groupby('Store')['Customers']
    .mean()
    .rename('Avg_customers')
    .reset_index()
)

# Attach store metadata and the per-store average to every daily row.
past_sales_merged = (
    past_sales
    .merge(store_imputed, on='Store', how='left')
    .merge(store_avg_customers, on='Store', how='left')
)

# Abbreviated month name as a categorical seasonality feature.
past_sales_merged['month'] = past_sales_merged['Date'].dt.strftime('%b')
past_sales_merged_new = past_sales_merged.drop(columns=['Customers', 'Date', 'Store'])

# One-hot encode the non-numeric columns, dropping the first level of each.
dummies = pd.get_dummies(past_sales_merged_new, drop_first=True)
dummies.shape, past_sales_merged.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
target_col = 'Sales'
input_cols = dummies.columns.drop(target_col)
features, target = dummies[input_cols], dummies[target_col]
train_x, validate_x, train_y, validate_y = train_test_split(
    features, target, test_size=0.2, random_state=1)

In [None]:
# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(train_x)


def _scaled_like(frame):
    """Return `frame` transformed by the fitted scaler, keeping its index and column labels."""
    return pd.DataFrame(scaler.transform(frame),
                        index=frame.index, columns=frame.columns)


train_x_scaled = _scaled_like(train_x)
validate_x_scaled = _scaled_like(validate_x)

In [None]:
#import statsmodels.api as sm
#X_train = sm.add_constant(train_x_scaled)
#X_validate = sm.add_constant(validate_x_scaled)
#cols2drop = ['StoreType_c', 'CompetitionOpenSinceYear_2000.0',
#             'CompetitionOpenSinceMonth_2_3-6']
#model = sm.OLS(train_y,X_train.drop(cols2drop, axis=1)).fit()
#model.summary()

In [None]:
#from sklearn.linear_model import LinearRegression
#from sklearn.metrics import mean_squared_error
#model = LinearRegression().fit(train_x_scaled.drop(cols2drop, axis=1), train_y)
#validate_y_pred = model.predict(validate_x_scaled.drop(cols2drop, axis=1))
#rmse = mean_squared_error(validate_y, validate_y_pred, squared=False)
#print(rmse)

In [None]:
#from sklearn.tree import DecisionTreeRegressor
#dt_model = DecisionTreeRegressor(max_depth=5).fit(train_x, train_y)
#validate_y_pred = dt_model.predict(validate_x)
#rmse = mean_squared_error(validate_y, validate_y_pred, squared=False)
#print(rmse)

In [None]:
#from sklearn.model_selection import GridSearchCV
#grid = {'max_depth': list(range(5, 15))}
#cv = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=grid, cv=5,
#                 return_train_score=True).fit(train_x, train_y)

In [None]:
#cv.best_params_

In [None]:
#grid_results = pd.DataFrame(cv.cv_results_)
#grid_results[['param_max_depth', 'mean_train_score', 'mean_test_score', 'rank_test_score']]
##import matplotlib.pyplot as plt
#plt.plot(grid_results['param_max_depth'], grid_results['mean_train_score'])
#plt.plot(grid_results['param_max_depth'], grid_results['mean_test_score'])
#plt.legend(['Train Score', 'Test Score'])

In [None]:
#cv.best_params_

In [None]:
#grid_results2 = pd.DataFrame(cv.cv_results_)
#grid_results_combined = pd.concat([grid_results, grid_results2])
#plt.plot(grid_results_combined['param_max_depth'], grid_results_combined['mean_train_score'])
#plt.plot(grid_results_combined['param_max_depth'], grid_results_combined['mean_test_score'])
#plt.legend(['Train Score', 'Test Score'])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree on the unscaled features. The fixed seed makes the fitted tree,
# and therefore the reported error, repeatable between runs.
dt_model = DecisionTreeRegressor(max_depth=20, random_state=1).fit(train_x, train_y)
validate_y_pred = dt_model.predict(validate_x)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated and later removed from scikit-learn,
# while this form works on every version.
rmse = np.sqrt(mean_squared_error(validate_y, validate_y_pred))
print(rmse)

In [None]:
#from sklearn.ensemble import RandomForestRegressor
#rf_model = RandomForestRegressor(n_estimators=50).fit(train_x, train_y)
##validate_y_pred = rf_model.predict(validate_x)
#rmse = mean_squared_error(validate_y, validate_y_pred, squared=False)
#print(rmse)

In [None]:
# Apply to the scoring rows the same preparation the training rows received.
future_sales_proc = future_sales.copy()
future_sales_proc['Date'] = pd.to_datetime(future_sales_proc['Date'], format="%Y-%m-%d")
future_sales_proc['StateHoliday'] = future_sales_proc['StateHoliday'].replace({0: "0"})


future_sales_proc = pd.merge(left=future_sales_proc, right=store_imputed,
                            on='Store', how='left')

future_sales_proc = pd.merge(left=future_sales_proc, right=store_avg_customers,
                             on='Store', how='left')

future_sales_proc['month'] = future_sales_proc['Date'].dt.strftime('%b')
future_sales_proc = future_sales_proc.drop(['Date', 'Store'], axis=1)

# Encode EVERY level here and let the training matrix decide which columns
# survive. With drop_first=True the dropped baseline is the first level present
# in *this* frame; when the scoring data contains fewer levels than the
# training data (e.g. only some months), a different level is dropped and those
# rows end up encoded as the training baseline.
# Reindexing to the training columns removes the training baselines, adds
# all-zero columns for levels absent from the scoring data, and puts the
# columns in the order the model was fitted with. 'Id' is kept for the
# submission file.
training_features = [col for col in dummies.columns if col != 'Sales']
dummies_future = (
    pd.get_dummies(future_sales_proc, drop_first=False)
    .reindex(columns=['Id'] + training_features, fill_value=0)
)

In [None]:
# Give the scoring matrix exactly the training feature columns, in training
# order, plus 'Id'. A single reindex adds all-zero columns for dummy levels
# that never occur in the scoring data, discards columns the model was not
# fitted on, and avoids the arbitrary ordering of adding columns one by one
# from a set difference.
expected_cols = ['Id'] + [col for col in dummies.columns if col != 'Sales']
dummies_future = dummies_future.reindex(columns=expected_cols, fill_value=0)

# Rows with an unknown Open flag are treated as open.
dummies_future['Open'] = dummies_future['Open'].fillna(1)

In [None]:
# 'Id' only identifies the row for the submission; it is not a model input.
scoring_features = dummies_future.drop(columns='Id')
test_y_pred = dt_model.predict(scoring_features)

In [None]:
# Reference submission shipped with the competition data (loaded for inspection).
sample = pd.read_csv('/kaggle/input/rossmann-store-sales/sample_submission.csv')

# One row per scoring Id with its predicted Sales, written without the index.
submission = dummies_future[['Id']].assign(Sales=test_y_pred)
submission.to_csv('submission.csv', index=False)

### Handling outliers by segmenting stores

In [None]:
# Merged daily rows (sales + store metadata + per-store average customers) used for the store segmentation.
past_sales_merged.head()

In [None]:
from sklearn.cluster import KMeans

# One row per store: sales/customer aggregates plus its static attributes.
stores_summary = past_sales_merged.groupby(['Store']).agg(
    avg_sales=('Sales', 'mean'),
    max_sales=('Sales', 'max'),
    store_type=('StoreType', 'first'),
    assortment=('Assortment', 'first'),
    promo2=('Promo2', 'first'),
    avg_customers=('Customers','mean')
)
stores_dummies = pd.get_dummies(stores_summary, drop_first=True)

# Seed the clustering: KMeans starts from random centroids, so without
# random_state the cluster ids (and memberships) can change between runs,
# while a later cell selects a cluster by its literal id. n_init is pinned
# because its default differs between scikit-learn versions.
# NOTE(review): features are clustered unscaled, so columns with a large
# numeric range presumably dominate the distance - consider standardising.
cluster_model = KMeans(n_clusters=4, n_init=10, random_state=1).fit(stores_dummies)
stores_summary = stores_summary.reset_index()


In [None]:
# Attach each store's cluster id and profile the clusters by their mean aggregates.
stores_summary['cluster'] = cluster_model.labels_
# Several columns must be selected with a list: selecting them with a bare
# tuple (`[...]['a', 'b']`) on a GroupBy raises an error in pandas 2.x.
stores_summary.groupby('cluster')[['avg_sales', 'max_sales', 'avg_customers']].mean()

In [None]:
# Number of stores assigned to each cluster.
stores_summary['cluster'].value_counts()

In [None]:
# Restrict the training rows to the stores of one segment (cluster id 3) and
# rebuild the encoded matrix and the train/validation split for it.
# Note: this rebinds `dummies`, `train_x`, `validate_x`, `train_y` and
# `validate_y`, which until here referred to the all-stores data.
stores_cluster1 = stores_summary.loc[stores_summary['cluster'] == 3, 'Store'].tolist()
in_segment = past_sales_merged['Store'].isin(stores_cluster1)
past_sales_merged_subset = past_sales_merged[in_segment]
past_sales_merged_new = past_sales_merged_subset.drop(columns=['Customers', 'Date', 'Store'])
dummies = pd.get_dummies(past_sales_merged_new, drop_first=True)

target_col = 'Sales'
input_cols = dummies.columns.drop(target_col)
train_x, validate_x, train_y, validate_y = train_test_split(
    dummies.loc[:, input_cols], dummies.loc[:, target_col],
    test_size=0.2, random_state=1)
train_x.shape

In [None]:
# Same tree configuration as the all-stores model, fitted on the single-segment
# split. The fixed seed makes the reported error repeatable between runs.
model_cluster = DecisionTreeRegressor(max_depth=20, random_state=1).fit(train_x, train_y)
validate_y_pred = model_cluster.predict(validate_x)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated and later removed from scikit-learn.
rmse = np.sqrt(mean_squared_error(validate_y, validate_y_pred))
print(rmse)