In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# Load the historical sales rows and the per-store table from the competition input.
past_sales, store = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/rossmann-store-sales/train.csv',
        '/kaggle/input/rossmann-store-sales/store.csv',
    )
)
past_sales.head(n=5)

In [5]:
# Parse the date, make the integer 0 in StateHoliday the string '0', and derive
# calendar features. The callables run in order, so `month` and `day_of_month`
# see the already-parsed `Date` column.
past_sales = past_sales.assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'),
    StateHoliday=lambda frame: frame['StateHoliday'].replace({0: '0'}),
    month=lambda frame: frame['Date'].dt.month,
    day_of_month=lambda frame: frame['Date'].dt.day,
)
past_sales.shape

In [6]:
# Preview the first rows after the date features were added.
past_sales.head(n=5)

In [7]:
# Column groups used by the exploratory summaries below.
num_cols = [
    'Sales',
    'Customers',
]
cat_cols = [
    'Store',
    'DayOfWeek',
    'Open',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
]
date_cols = [
    'Date',
    'DayOfWeek',
]

In [8]:
# Cast to object so describe() reports count/unique/top/freq instead of numeric stats.
past_sales.loc[:, cat_cols].astype('object').describe()

In [9]:
# Summary statistics for the numeric columns.
past_sales.loc[:, num_cols].describe()

In [10]:
# Number of missing values per column in the sales data.
past_sales.isnull().sum()

In [11]:
# Preview the per-store table.
store.head(n=5)

In [12]:
# Percentage of missing values per column in the store table.
store.isnull().sum().div(len(store)).mul(100)

In [13]:
# Impute the store table into a new frame; `store` itself is left untouched.
avg_comp_dist = store['CompetitionDistance'].mean()

# Columns whose missing values become their own 'Unknown' category.
unknown_fill_cols = [
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'PromoInterval',
    'Promo2SinceYear',
    'Promo2SinceWeek',
]
store_new = store.assign(
    # Missing competition distances take the column mean.
    CompetitionDistance=store['CompetitionDistance'].fillna(avg_comp_dist),
    **{name: store[name].fillna('Unknown') for name in unknown_fill_cols},
)

In [14]:
# Preview the imputed store table.
store_new.head(n=5)

In [15]:
# Attach the store attributes to every sales row (left join on Store).
past_sales_merged = past_sales.merge(store_new, how='left', on='Store')

In [16]:
# Distribution of Sales split by the Promo flag.
sns.boxplot(x='Promo', y='Sales', data=past_sales_merged)

In [17]:
# Mean customer count per store, as a two-column frame (Store, Avg_customers).
store_avg_customers = (
    past_sales_merged
    .groupby('Store')['Customers']
    .mean()
    .rename('Avg_customers')
    .reset_index()
)
# Copy of the merged sales data with the per-store average attached to each row.
past_sales_avg_cust = past_sales_merged.merge(store_avg_customers, on='Store', how='left')
store_avg_customers

In [18]:
# Drop the identifier/date columns and one-hot encode the rest; drop_first
# removes one level per categorical column.
drop_col_names = ['Date', 'Store']
model_frame = past_sales_merged.drop(columns=drop_col_names)
dummies = pd.get_dummies(model_frame, drop_first=True)
dummies.shape

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [20]:
# Hold out 20% of the rows for validation; every column except the target is an input.
target_col_name = 'Sales'
input_col_names = dummies.columns.drop(target_col_name)
train_x, validate_x, train_y, validate_y = train_test_split(
    dummies.loc[:, input_col_names],
    dummies.loc[:, target_col_name],
    test_size=0.2,
    random_state=1,
)
(train_x.shape, validate_x.shape)

In [21]:
# Standardise the inputs using statistics learned from the training split only,
# then wrap the arrays back into DataFrames with the original index and columns.
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
validate_x_scaled = scaler.transform(validate_x)
df_train_x_scaled = pd.DataFrame(
    data=train_x_scaled, columns=train_x.columns, index=train_x.index
)
df_validate_x_scaled = pd.DataFrame(
    data=validate_x_scaled, columns=validate_x.columns, index=validate_x.index
)

In [22]:
import statsmodels.api as sm

In [23]:
# Add an intercept column to both splits and fit OLS on the training split.
train_x_with_constant, validate_x_with_constant = (
    sm.add_constant(scaled_frame)
    for scaled_frame in (df_train_x_scaled, df_validate_x_scaled)
)
ols_model = sm.OLS(endog=train_y, exog=train_x_with_constant).fit()

In [24]:
# Names of the terms whose p-value is at or above the 0.05 threshold.
pvalues = ols_model.pvalues
pvalues.index[pvalues >= 0.05]

In [25]:
# Refit OLS without the columns listed here.
# NOTE(review): this list is hard-coded, presumably copied from the p-value
# output of the previous cell; confirm it still matches if the data or split changes.
col_insig = [
    'StateHoliday_c',
    'CompetitionOpenSinceMonth_12.0',
    'CompetitionOpenSinceYear_1961.0',
    'Promo2SinceWeek_37.0',
]
train_x_with_constant = sm.add_constant(df_train_x_scaled.drop(columns=col_insig))
validate_x_with_constant = sm.add_constant(df_validate_x_scaled.drop(columns=col_insig))
ols_model = sm.OLS(endog=train_y, exog=train_x_with_constant).fit()

In [26]:
from sklearn.linear_model import Lasso
# Fit a Lasso on the reduced, scaled training inputs and tabulate its coefficients.
X = df_train_x_scaled.drop(columns=col_insig)
lasso_model = Lasso(alpha=10)
lasso_model.fit(X, train_y)
df_coef = pd.DataFrame(dict(variable=X.columns, coef=lasso_model.coef_))
df_coef.head(n=25)

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Cross-validate a plain linear regression on the reduced, scaled training inputs
# with cross_val_score's default settings.
linear_model = LinearRegression()
cv = cross_val_score(
    estimator=linear_model,
    X=df_train_x_scaled.drop(columns=col_insig),
    y=train_y,
)

In [28]:
# Per-fold scores returned by cross_val_score for the linear model.
cv

In [29]:
# Average of the per-fold cross-validation scores.
cv.mean()

In [30]:
# Score the reduced OLS model on the hold-out split.
validate_y_pred = ols_model.predict(validate_x_with_constant)
print('Validate R2', r2_score(validate_y, validate_y_pred))
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that argument is
# deprecated in recent scikit-learn releases and later removed, while this
# form works on every version.
print('Validate RMSE', np.sqrt(mean_squared_error(validate_y, validate_y_pred)))

In [31]:
from sklearn.tree import DecisionTreeRegressor

In [32]:
# Fit a decision tree on the unscaled training inputs.
# random_state is fixed (same seed as the train/validation split) because the
# tree permutes features at each split; without a seed, ties between equally
# good splits can resolve differently from run to run.
dt_model = DecisionTreeRegressor(random_state=1)
# fit() returns the estimator itself, so `dt` and `dt_model` are the same object.
dt = dt_model.fit(train_x, train_y)

In [33]:
# Score the decision tree on the hold-out split.
validate_y_pred = dt.predict(validate_x)
print('Validate R2', r2_score(validate_y, validate_y_pred))
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that argument is
# deprecated in recent scikit-learn releases and later removed, while this
# form works on every version.
print('Validate RMSE', np.sqrt(mean_squared_error(validate_y, validate_y_pred)))

In [34]:
# Load the rows to be predicted for the submission.
future_sales = pd.read_csv(filepath_or_buffer='/kaggle/input/rossmann-store-sales/test.csv')
future_sales.head(n=5)

In [35]:
# Build the test-set feature matrix with exactly the columns, and column order,
# that the models were fitted on.

# Same cleaning and calendar features as applied to the training data.
future_sales['StateHoliday'] = future_sales['StateHoliday'].replace({0: '0'})
future_sales['Date'] = pd.to_datetime(future_sales['Date'], format='%Y-%m-%d')
future_sales['month'] = future_sales['Date'].dt.month
future_sales['day_of_month'] = future_sales['Date'].dt.day
# Rows with a missing `Open` flag are treated as open.
future_sales['Open'] = future_sales['Open'].fillna(1)

future_sales_merged = pd.merge(left=future_sales, right=store_new,
                               how='left', on='Store')

future_sales_avg_cust = pd.merge(left=future_sales_merged,
                                 right=store_avg_customers,
                                 on='Store',
                                 how='left')

future_features = future_sales_avg_cust.drop(['Id', 'Date', 'Store'], axis=1)
# The model was fitted on a `Customers` column. When the rows to predict carry
# no such column, the store's historical average stands in for it under the
# name the model expects, instead of travelling along as an unknown
# `Avg_customers` feature.
if 'Customers' in train_x.columns and 'Customers' not in future_features.columns:
    future_features = future_features.rename(columns={'Avg_customers': 'Customers'})

# Encode every category here (no drop_first): the baseline level dropped on the
# training data may not be the first level present in the test data. Reindexing
# to the training columns then keeps the right dummies, zero-fills the ones
# absent from the test set, and restores the training column order.
future_encoded = pd.get_dummies(future_features)
# Training columns that had to be zero-filled (kept for inspection).
missing_cats = [name for name in train_x.columns if name not in future_encoded.columns]
future_dummies = future_encoded.reindex(columns=train_x.columns, fill_value=0)

In [36]:
# The test and training feature matrices should have the same number of columns.
(future_dummies.shape, train_x.shape)

In [37]:
# Predict sales for the test rows with the fitted decision tree.
test_y_pred = dt_model.predict(X=future_dummies)

In [38]:
# Pair each test row Id with its predicted Sales value.
df_submission = future_sales[['Id']].assign(Sales=test_y_pred)
df_submission.head(n=5)

In [39]:
# Write the submission file without the DataFrame index.
df_submission.to_csv(path_or_buf='submission.csv', index=False)