In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## The plan for the project is the following:

    1 First, we download the dataset.
    2 Next, we do some preliminary analysis of the data.
    3 After that, we set up a validation strategy to make sure our model produces correct predictions.
    4 Then we implement a linear regression model in Python and NumPy.
    5 Next, we cover feature engineering to extract important features from the data to improve the model.
    6 Finally, we see how to make our model stable with regularization and use it to predict store sales.

# Problem Statement
Our task is to predict the sales for a few identified stores on a given day.

From a pure business perspective, the first question you would need to ask is: 
    
    Who is the end stakeholder for Rossmann Store Sales and how is he going to utilize the solution? 

## Designing an SCQ
The four components can be defined as follows
* **Desired Future State**
    
    * The marketing & promotions team now has the means to accurately forecast sales for the required stores and can therefore design promotional campaigns based on expected store performance to increase overall sales.


* **Current Situation**

     * The marketing team at Rossmann wants to design promotional campaigns for store customers and thereby increase sales.
     * The lack of visibility into future store sales makes the campaign design complicated and less effective for stores expected to perform poorly.


* **Complication**
        
     * The team lacks the tools to study and estimate future stores sales.
   

* **Question**

     * How can we estimate future sales for a store?
     * How accurate is the estimate?

# Designing the Solution

We are going to develop an ML model that can learn the sales for a store as a function of internal, external, and temporal (time-based) 
attributes and then predict future sales given the attributes available.

We consider the data in such a way that it can be represented as

    sales as a function of store + other attributes

Instead of a time-series based model defined as 

    Sales as a function of time 

In this way, we can define a model that can learn the patterns from various stores and other external attributes (which we will explore with the data) to predict the expected sales.


# Exploring the Data

In [None]:
# Load the training data. 'Store', 'DayOfWeek' and 'StateHoliday' are read as
# strings; 'Date' is parsed into datetimes.
df = pd.read_csv("/kaggle/input/rossmann-store-sales/train.csv", parse_dates=['Date'],
                 dtype={
                           'Store': str,
                           'DayOfWeek': str,
                           'StateHoliday': str,
                       })

# Split the rows by whether any sales were recorded on that day.
df_Close = df[df.Sales == 0]
df_Open = df[df.Sales != 0]
print(f"{df_Close.shape[0]/df.shape[0]*100:.2f}% of stores are close,   {df_Open.shape[0]/df.shape[0]*100:.2f}% of stores are open")
print("\nFor this Analysis I’m focusing only on open Stores")

# DataFrame.drop (without inplace) returns a new frame, so `df` is no longer a
# slice of the raw data: later column assignments such as df['Log_Sales'] do
# not act on a view and do not trigger SettingWithCopyWarning.
df = df_Open.drop(columns='Open')

# Looking at the Data Dictionary
    
* **Store:**         a unique ID for each store,
* **Sales:**         the turnover for a given day *(our target y variable)*,
* **Customers:**     the number of customers on a given day,
* **Open:**          an indicator for whether the store was open: 0 = closed, 1 = open,
* **Promo:**         indicates whether a store is running a promo on that day,
* **SchoolHoliday:** indicates if the (Store, Date) was affected by the closure of public schools.
* **StateHoliday:**  indicates a state holiday.
    
        Normally all stores, with few exceptions, are closed on state holidays. 
        Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = none

## Finding Data Types

In [None]:
pd.DataFrame(df.dtypes, columns=['Type']).T

### Duplicated row

No duplicated rows found.


In [None]:
df[df.duplicated()]

### Null values

No missing data, Sales — our target variable — doesn’t have any missing values.

In [None]:
pd.DataFrame(df.isnull().sum(), columns=['Null Values']).T

### Target Value

**Sales:** the turnover for a given day (our target y variable)

In [None]:
# distribution of Sales has a very long tail
plt.title("Distribution of sales")
plt.xlabel("Sales")
graph = sns.kdeplot(x="Sales", data=df)

In [None]:
# distribution of Sales has a very long tail
plt.title("Distribution of sales")
plt.xlabel("Sales")
graph = sns.histplot(x="Sales", data=df, kde=True, hue='Promo')

The long tail makes it quite difficult for us to see the distribution, but it has an even stronger effect on a model: 
    
such distribution can greatly confuse the model, so it won’t learn well enough. 

One way to solve this problem is log transformation

In [None]:
df['Log_Sales'] = np.log1p(df.Sales)
plt.title("Distribution of sales after log transformation")
plt.xlabel("$Log(Sales + 1)$")
ax = sns.histplot(x="Log_Sales", data=df, kde=True)

#### Validation framework

Let’s split the DataFrame such that

* 15% of data goes to validation,
* 15% goes to test,
* The remaining 70% goes to training.

In [None]:
# Random 70/15/15 split: validation and test each take int(15%) of the rows and
# the training split takes whatever remains.
n = len(df)
n_val = int(0.15 * n)
n_test = int(0.15 * n)
n_train = n - n_val - n_test

# Fixed seed so the shuffled row order is the same on every run.
np.random.seed(16)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffle = df.iloc[idx]

# Consecutive slices of the shuffled frame: [train | val | test].
val_end = n_train + n_val
df_train = df_shuffle.iloc[:n_train].copy()
df_val = df_shuffle.iloc[n_train:val_end].copy()
df_test = df_shuffle.iloc[val_end:].copy()

print(n_val, n_test, n_train)

Now the DataFrame is split into three parts, and we can continue.

In [None]:
y_train = df_train.Log_Sales.values

y_val = df_val.Log_Sales.values

y_test = df_test.Log_Sales.values

### Machine learning for regression

The problem we are solving is a regression problem: the goal is to predict the sales for a few identified stores on a given day. 

For this project we will use the simplest regression model: linear regression. 

#### Linear regression

#### Training linear regression model

We have multiple ways to do that. 

We will use normal equation, which is the simplest method to implement

To implement the normal equation, we need to do the following:

1. Create a function that takes in a matrix X with features and a vector y with the target.
1. Add a dummy column (the feature that is always set to 1) to the matrix X.
1. Train the model: compute the weights w by using the normal equation.
1. Split this w into the bias w0 and the rest of the weights, and return them.

In [None]:
def train_linear_regression(X, y):
    """Fit an ordinary least-squares linear model with the normal equation.

    Parameters
    ----------
    X : 2-D array of features, one row per observation.
    y : 1-D array of targets, one value per row of X.

    Returns
    -------
    (w0, w) : the bias term and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X is singular (e.g. duplicated or linearly dependent columns).
    """
    # adding the dummy column, so the first weight plays the role of the bias
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    # normal equation: solve (X^T X) w = X^T y directly rather than forming
    # the inverse explicitly, which is slower and less accurate numerically
    XT_X = X.T @ X
    w = np.linalg.solve(XT_X, X.T @ y)

    return w[0], w[1:]

### Predicting the sales

We now have a function for training a linear regression model at our disposal,

so let’s use it to build a simple baseline solution.

## Simple feature engineering

### Working with 'Store'

In [None]:
print(df.Store.nunique())

**'Store'** is a categorical variable with 1115 different values.

Instead of one-hot encoding it, I propose to substitute the average sales of each store (computed on the training split).

In [None]:
# Mean-encode 'Store': each row receives the average 'Sales' of its store,
# computed on the training split only.
store_sales_avg = (
    df_train.groupby('Store')['Sales'].mean().rename('Sales_Avg').reset_index()
)
# Fallback for a store that has no rows in the training split.
global_sales_avg = df_train['Sales'].mean()


def _with_store_sales_avg(frame):
    """Return `frame` with a 'Sales_Avg' column looked up from the training averages."""
    # Dropping a previous 'Sales_Avg' keeps the cell safe to re-run.
    frame = frame.drop(columns='Sales_Avg', errors='ignore')
    # how="left" keeps exactly one output row per input row, in the input order.
    # The training split must not be reordered here because y_train was already
    # extracted from it; an inner merge does not preserve the left row order on
    # every pandas version.
    merged = frame.merge(store_sales_avg, on='Store', how='left')
    merged['Sales_Avg'] = merged['Sales_Avg'].fillna(global_sales_avg)
    return merged


df_train = _with_store_sales_avg(df_train)
df_val = _with_store_sales_avg(df_val)
df_test = _with_store_sales_avg(df_test)

# The features must still line up row-for-row with the target vector.
assert np.array_equal(df_train['Log_Sales'].values, y_train)

df_submit = pd.read_csv("/kaggle/input/rossmann-store-sales/test.csv", parse_dates=['Date'],
                 dtype={
                           'Store': str,
                           'DayOfWeek': str,
                           'StateHoliday': str,
                       })
df_submit = _with_store_sales_avg(df_submit)

### Working with 'Date'

In [None]:
print(df_train.Date.min())



Let’s create additional features that will help our model learn patterns better. 

We will create the **week number**, **month**, **day**, **quarter**, and **year** as features from the date variable.

Similarly, since we are already creating time-related features, we can add a new feature based on climate and seasons.

Considering that the stores are in Europe, we can refer to the standard season cycles and create a new **season** feature with values of Spring, Summer, Fall, and Winter.

Let’s take ‘2013-01-01’ as the time reference and define the number of days elapsed since that date as our numerical variable.

In [None]:
# 'Age_Day' = whole days elapsed since the earliest date in the training split;
# the same reference date is used for every frame.
first_train_date = df_train.Date.min()
for frame in (df_train, df_val, df_test, df_submit):
    frame['Age_Day'] = (frame.Date - first_train_date).dt.days

In [None]:
# Calendar features for the training split, all derived from 'Date'.
train_dt = df_train["Date"].dt
df_train["Month"] = train_dt.month
df_train["Quarter"] = train_dt.quarter
df_train["Year"] = train_dt.year
df_train["Day"] = train_dt.day
df_train["Week"] = train_dt.isocalendar().week

# Season by month, built from the fallback outwards: later assignments take
# precedence, and the month groups do not overlap.
train_month = df_train["Month"]
train_season = np.where(train_month.isin([12,1,2]), "Winter", "None")
train_season = np.where(train_month.isin([9,10,11]), "Fall", train_season)
train_season = np.where(train_month.isin([6,7,8]), "Summer", train_season)
train_season = np.where(train_month.isin([3,4,5]), "Spring", train_season)
df_train["Season"] = train_season

df_train[["Date","Year","Month","Day","Week","Quarter","Season"]].head()

In [None]:
# Calendar features for the validation split, all derived from 'Date'.
val_dt = df_val["Date"].dt
df_val["Month"] = val_dt.month
df_val["Quarter"] = val_dt.quarter
df_val["Year"] = val_dt.year
df_val["Day"] = val_dt.day
df_val["Week"] = val_dt.isocalendar().week

# Season by month, built from the fallback outwards: later assignments take
# precedence, and the month groups do not overlap.
val_month = df_val["Month"]
val_season = np.where(val_month.isin([12,1,2]), "Winter", "None")
val_season = np.where(val_month.isin([9,10,11]), "Fall", val_season)
val_season = np.where(val_month.isin([6,7,8]), "Summer", val_season)
val_season = np.where(val_month.isin([3,4,5]), "Spring", val_season)
df_val["Season"] = val_season

df_val[["Date","Year","Month","Day","Week","Quarter","Season"]].head()

In [None]:
# Calendar features for the test split, all derived from 'Date'.
test_dt = df_test["Date"].dt
df_test["Month"] = test_dt.month
df_test["Quarter"] = test_dt.quarter
df_test["Year"] = test_dt.year
df_test["Day"] = test_dt.day
df_test["Week"] = test_dt.isocalendar().week

# Season by month, built from the fallback outwards: later assignments take
# precedence, and the month groups do not overlap.
test_month = df_test["Month"]
test_season = np.where(test_month.isin([12,1,2]), "Winter", "None")
test_season = np.where(test_month.isin([9,10,11]), "Fall", test_season)
test_season = np.where(test_month.isin([6,7,8]), "Summer", test_season)
test_season = np.where(test_month.isin([3,4,5]), "Spring", test_season)
df_test["Season"] = test_season

df_test[["Date","Year","Month","Day","Week","Quarter","Season"]].head()

In [None]:
# Calendar features for the submission frame, all derived from 'Date'.
submit_dt = df_submit["Date"].dt
df_submit["Month"] = submit_dt.month
df_submit["Quarter"] = submit_dt.quarter
df_submit["Year"] = submit_dt.year
df_submit["Day"] = submit_dt.day
df_submit["Week"] = submit_dt.isocalendar().week

# Season by month, built from the fallback outwards: later assignments take
# precedence, and the month groups do not overlap.
submit_month = df_submit["Month"]
submit_season = np.where(submit_month.isin([12,1,2]), "Winter", "None")
submit_season = np.where(submit_month.isin([9,10,11]), "Fall", submit_season)
submit_season = np.where(submit_month.isin([6,7,8]), "Summer", submit_season)
submit_season = np.where(submit_month.isin([3,4,5]), "Spring", submit_season)
df_submit["Season"] = submit_season

df_submit[["Date","Year","Month","Day","Week","Quarter","Season"]].head()

### Baseline solution

We will start with a very naive way of creating features:

select a few numerical features, and form the matrix X from them

This time, we include a couple more features and use the following columns:

    * Sales_Avg
    * Age_Day
    * SchoolHoliday
    * Promo

In [None]:
base = ['Sales_Avg', 'Age_Day', 'Promo', 'SchoolHoliday']
df_num = df_train[base]

X_train = df_num.values 
print(X_train.shape, y_train.shape)

In [None]:
w_0, w = train_linear_regression(X_train, y_train)

We have just trained the first model! 

Now we can apply it to the training data to see how well it predicts

In [None]:
y_pred  = w_0 + X_train @ w

To see how good the predictions are, we can use histplot to plot the predicted values and compare them with the actual sales (both on the $Log(Sales + 1)$ scale).

In [None]:
sns.histplot(y_pred, label='prediction', color='red')
sns.histplot(y_train, label='target', color= 'green')
plt.title("Training Predictions vs actual distribution")
plt.xlabel("$Log(Sales + 1)$")
plt.xlim(6, 11)
ax = plt.legend()

#### $RMSE$: Evaluating model quality

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between the targets `y` and the predictions `y_pred`."""
    squared_error = np.square(y - y_pred)
    return np.sqrt(squared_error.mean())

print(f"Now we can use RMSE = {rmse(y_train, y_pred):.3f} to evaluate the quality of the model.")

The code prints 0.425. This number tells us that on average, the model’s predictions are off by 0.425 on the $Log(Sales + 1)$ scale (the target is the log-transformed sales, not the raw sales).

This result alone may not be very useful, but we can use it to compare this model with other models. 

If one model has a better (lower) RMSE than the other, it indicates that model is better

#### Validating the model

First, we create the X_val matrix, following the same steps as for X_train:

In [None]:
df_num = df_val[base]
X_val = df_num.values

We’re ready to apply the model to X_val to get predictions:

In [None]:
y_pred  = w_0 + X_val @ w

print(f"Now we can (on the evalation dataset) use RMSE = {rmse(y_val, y_pred):.3f} to evaluate the quality of the model.")

## Feature engineering

### Handling categorical variables

Categorical variables describe characteristics of objects and can take one of a few possible values.

The day of week, for example, is categorical: it can take only one of seven possible values (1, 2, 3, 4, 5, 6, and 7).

We can represent such a variable with a set of binary (0/1) features, one per value. This method of encoding categorical variables is called **one-hot encoding**.

Let’s test if adding more features leads to any improvements:
* **'DayOfWeek'**, 
* **'StateHoliday'**,
* **'Month'**, 
* **'Quarter'**, 
* **'Year'**, 
* **'Day'**, 
* **'Week'**, 
* **'Season'** 

In [None]:
pd.DataFrame(df.DayOfWeek.value_counts()).T

In [None]:
def prepare_X(df):
    """Build the numeric feature matrix used by the linear model.

    The matrix starts with the columns named in the module-level ``base`` list
    and is followed by hand-rolled one-hot (0/1) columns for the categories in
    the table below. Only the listed levels are encoded; a row holding any
    other level gets 0 in every dummy column of that category.

    Parameters
    ----------
    df : DataFrame containing the ``base`` columns plus 'DayOfWeek', 'Season',
        'Day', 'Week', 'Quarter', 'Year', 'StateHoliday' and 'Month'.
        The caller's frame is not modified.

    Returns
    -------
    2-D array with one row per row of `df`; missing values are replaced by 0.
    """
    df = df.copy()
    features = base.copy()

    # 'DayOfWeek' is loaded from CSV with dtype=str, so comparing it to the
    # integers 1..7 never matches and every day-of-week dummy would be zero.
    # Normalise it to text and compare against text levels instead.
    df['DayOfWeek'] = df['DayOfWeek'].astype(str)

    # (source column, dummy-name prefix, levels to encode); the order fixes the
    # column order of the returned matrix.
    one_hot_spec = [
        ('DayOfWeek', 'day_of_week_', ['1', '2', '3', '4', '6']),
        ('Season', 'is_', ['Spring', 'Summer', 'Winter']),
        ('Day', 'is_Day_', range(1, 31)),
        ('Week', 'is_Week_', range(1, 52)),
        ('Quarter', 'is_Quarter_', [1, 2, 3]),
        ('Year', 'is_Year_', [2013, 2014]),
        ('StateHoliday', 'is_StateHoliday_', ['0', 'a', 'b']),
        ('Month', 'is_Month_', [1, 2, 3, 4, 6, 7, 8, 9, 10, 11]),
    ]

    for column, prefix, levels in one_hot_spec:
        for level in levels:
            feature = f'{prefix}{level}'
            df[feature] = (df[column] == level).astype(int)
            features.append(feature)

    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [None]:
# X_train = prepare_X(df_train)
# w_0, w = train_linear_regression(X_train, y_train)
# X_val = prepare_X(df_val)
# y_pred = w_0 + X_val @ w
# print(f'validation: {rmse(y_val, y_pred):.3f}.') 

> X_train = prepare_X(df_train) 
>  
> w_0, w = train_linear_regression(X_train, y_train) 
>  
> X_val = prepare_X(df_val) 
>  
> y_pred = w_0 + X_val @ w 
>  
> print(f'validation: {rmse(y_val, y_pred):.3f}.') 

Now we have **LinAlgError: Singular matrix**

#### Regularization

Regularization is an important concept in machine learning: it means “controlling” the model,

    that is, controlling the weights of the model so that they behave correctly and don’t grow too large, as in our case.

In [None]:
# `w` as returned by the training function already excludes the bias term, so
# the extremes are taken over all of it; slicing w[1:] would silently skip the
# weight of the first feature.
w_0, w.max(), w.min()

In [None]:
# Linear regression with regularization
# Linear regression with regularization
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model with the normal equation plus a ridge term.

    Parameters
    ----------
    X : 2-D array of features, one row per observation.
    y : 1-D array of targets, one value per row of X.
    r : value added to every diagonal entry of X^T X (including the entry
        that belongs to the bias column). ``r=0.0`` gives plain least squares.

    Returns
    -------
    (w0, w) : the bias term and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is still singular (possible when r == 0).
    """
    # Prepend the constant column so the first weight plays the role of the bias.
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T @ X

    # Adding r to the diagonal makes the system better conditioned.
    XTX = XTX + r * np.eye(XTX.shape[0])

    # Solve the linear system directly rather than forming the inverse, which
    # is slower and less accurate numerically.
    w = np.linalg.solve(XTX, X.T @ y)
    return w[0], w[1:]

Let’s check what happens with our weights for different values of r:

In [None]:
# Print the bias and the largest / smallest feature weight for each strength r.
# `w` already excludes the bias, so the extremes are taken over all of it
# (w[1:] would skip the weight of the first feature).
for r in [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    print('%5s, %.2f, %.2f, %.2f' % (r, w_0, w.max(), w.min()))

We see that the values that we selected become smaller as r grows. 
 
Now let’s check whether regularization helps with our problem and what RMSE we get after that. 

Let’s run it with r=1e4:

In [None]:
X_train = prepare_X(df_train) 
w_0, w = train_linear_regression_reg(X_train, y_train, r=10000)

X_val = prepare_X(df_val) 
y_pred = w_0 + X_val @ w 
print(f"Now we can (on the evalation dataset) use RMSE = {rmse(y_val, y_pred):.3f} to evaluate the quality of the model.")

This result is an improvement over the previous score: 0.425. 

Let’s try a couple of different ones to select the best parameter r:

In [None]:
#np.arange(9000, 31001, 1000): # [0.01, 0.1, 1, 5, 10, 100, 1000, 10000, 11000, 12000, 13000, 14000, 15000]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in np.arange(21000, 23001, 200):
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val @ w
    print('%6s' %r, rmse(y_val, y_pred))

We also notice that the performance for values below 22400 doesn’t change much except in the sixth digit, 
which we shouldn’t consider to be significant. 
 
Let’s take the model with r=22400 as the final model. 

Now we can check it against the test dataset to verify if the model works:

In [None]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=22400)

X_val = prepare_X(df_val)
y_pred = w_0 + X_val @ w
print(f'validation: {rmse(y_val, y_pred):.3f}')

X_test = prepare_X(df_test)
y_pred = w_0 + X_test @ w
print(f'test: {rmse(y_test, y_pred):.3f}')

Because these two numbers are pretty close,

We conclude that the model can generalize well to the new unseen data.

## Using the model

The submission data has 11 NULL values for the ‘Open’ feature; let’s fill them with the mode (the code below fills them with 1.0, i.e. open).

In [None]:
df_submit.Open.isnull().sum()

In [None]:
# Missing 'Open' flags are filled with 1.0 (1 = open in the data dictionary).
df_submit['Open'] = df_submit['Open'].fillna(1.0)

In [None]:
df_submit.Open.isnull().sum()

In [None]:
# Rows flagged as closed get a fixed prediction of zero sales.
# Boolean masks replace groupby(['Open']).get_group(...): they do not raise when
# one of the two groups is absent, and .copy() gives independent frames so the
# new 'Sales_pred' column is not written into a slice of df_submit.
Close_df = df_submit[df_submit['Open'] == 0].copy()
Close_df['Sales_pred'] = 0.0

# Open rows are scored by the model; the model predicts log1p(sales), so the
# prediction is mapped back with expm1.
Open_df = df_submit[df_submit['Open'] == 1].copy()
X_submit = prepare_X(Open_df)
y_pred = w_0 + X_submit @ w
Open_df['Sales_pred'] = np.expm1(y_pred)

In [None]:
# Stack the closed and open rows, restore the original row order through the
# index, and write the two-column submission file.
submission_parts = [Close_df[['Id', 'Sales_pred']], Open_df[['Id', 'Sales_pred']]]
my_submission = (
    pd.concat(submission_parts)
    .sort_index()
    .rename(columns={'Sales_pred':'Sales'})
)
my_submission.to_csv("submission.csv", index=False)

## Looking at the Store Data

Let’s now look at the store csv file and see whether I can improve my model with these additional features.

In [None]:
store_df = pd.read_csv("/kaggle/input/rossmann-store-sales/store.csv",
                       dtype={
                           'Store': str
                       })
print("Shape of the Dataset:",store_df.shape)
store_df.head(5)

## Data Dictionary
    
* **StoreType:** differentiates between four different store models: a, b, c, d
* **Assortment:** describes an assortment level: a = basic, b = extra, c = extended,
* **CompetitionDistance:** distance in meters to the nearest competitor store,
* **CompetitionOpenSince[Month/Year]:** gives the approximate year and month of the time the nearest competitor was opened,
* **Promo2:** Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
* **Promo2Since[Year/Week]:** describes the year and calendar week when the store started participating in Promo2
* **PromoInterval:** describes the consecutive intervals at which Promo2 is started, naming the months the promotion is started anew 

    (e.g., “Feb, May, Aug, Nov” means each round starts in February, May, August, and November of any given year for that store)

## Finding Data Types

In [None]:
pd.DataFrame(store_df.dtypes, columns=['Type']).T

### Duplicated Store

In [None]:
store_df[store_df.duplicated()]

### Null values

One important aspect: is there any missing data in the dataset? 

Let’s have a look at the number of missing data points in each column (if any) in its associated percentage form.

In [None]:
tmp = pd.DataFrame(store_df.isnull().sum()/store_df.shape[0] * 100, columns=['Null_Values(%)']).round(2)
tmp[tmp['Null_Values(%)'] > 0.0]

We can see that 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'CompetitionOpenSinceMonth', and 'CompetitionOpenSinceYear' have over 30% null values.

This is a big loss and there is nothing much we can do to fix this.

As a rule of thumb, if there is a loss of anything between 0% and 10%, we can make a few attempts to fill the missing points and use the feature. 

But, 30% technically becomes beyond the usable range.

On the other hand, we can see 'CompetitionDistance' has around 0.27% missing values. This would be much easier to handle and fix.

We will use the mode to fill in the gaps where we have missing values.

In [None]:
# Fill the gaps with the most frequent distance. The result is assigned back to
# the column: an inplace fillna on the selected column acts on an intermediate
# object and is not guaranteed to update store_df itself.
competition_distance_mode = store_df["CompetitionDistance"].mode()[0]
store_df["CompetitionDistance"] = store_df["CompetitionDistance"].fillna(competition_distance_mode)

In [None]:
store_df.drop(labels=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'], axis=1, inplace=True)

In [None]:
pd.DataFrame(store_df.dtypes, columns=['Type']).T

In [None]:
store_df

In [None]:
df_train = df_train.merge(store_df, on=["Store"], how="left")
df_val = df_val.merge(store_df, on=["Store"], how="left")
df_test = df_test.merge(store_df, on=["Store"], how="left")

Let’s test if adding more features leads to any improvements:

We begin by adding the **'CompetitionDistance'** and **'Promo2'** features.

In [None]:
# Extend the base feature list in place. The membership check keeps the cell
# idempotent: re-running it does not add the same column twice.
for extra_feature in ('CompetitionDistance', 'Promo2'):
    if extra_feature not in base:
        base.append(extra_feature)

In [None]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=10000)

X_val = prepare_X(df_val)
y_pred = w_0 + X_val @ w
print(f'validation: {rmse(y_val, y_pred):.3f} Same as before')

Now it's time for the last 2 categorical variables: **'Assortment'** and **'StoreType'**

In [None]:
def prepare_X(df):
    df = df.copy() 
    features = base.copy()
    
    for v in ['a', 'b', 'c']:
        feature = f'is_Assortment_{v}'
        value = (df['Assortment'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in ['a', 'b', 'c', 'd']:
        feature = f'is_StoreType_{v}'
        value = (df['StoreType'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in [1, 2, 3, 4, 6, 7]:
        feature = f'day_of_week_{v}'
        value = (df['DayOfWeek'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in ['Spring', 'Summer', 'Winter', 'Fall']:
        feature = f'is_{v}'
        value = (df['Season'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in np.arange(1,32):
        feature = f'is_Day_{v}'
        value = (df['Day'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in np.arange(1,53):
        feature = f'is_Week_{v}'
        value = (df['Week'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in [1, 2, 3, 4]:
        feature = f'is_Quarter_{v}'
        value = (df['Quarter'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in [2013, 2014, 2015]:
        feature = f'is_Year_{v}'
        value = (df['Year'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in ['0', 'a', 'b', 'c']:
        feature = f'is_StateHoliday_{v}'
        value = (df['StateHoliday'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    for v in [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12]:
        feature = f'is_Month_{v}'
        value = (df['Month'] == v).astype(int) 
        df[feature] = value
        features.append(feature)
    
    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [None]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train, r=10000)

X_val = prepare_X(df_val)
y_pred = w_0 + X_val @ w
print(f'validation: {rmse(y_val, y_pred):.3f} Same as before')

X_test = prepare_X(df_test)
y_pred = w_0 + X_test @ w
print(f'test: {rmse(y_test, y_pred):.3f}')

### Because these two models have pretty close RMSE values, we conclude that using the normal equation is not enough for dealing with non-linearity

In [None]:
# Attach the store attributes once; the guard keeps the cell safe to re-run
# (a second merge would duplicate the store columns with _x/_y suffixes).
if 'StoreType' not in df_submit.columns:
    df_submit = df_submit.merge(store_df, on=["Store"], how="left")

# Rows flagged as closed get a fixed prediction of zero sales.
# Boolean masks replace groupby(['Open']).get_group(...): they do not raise when
# one of the two groups is absent, and .copy() gives independent frames so the
# new 'Sales_pred' column is not written into a slice of df_submit.
Close_df = df_submit[df_submit['Open'] == 0].copy()
Close_df['Sales_pred'] = 0.0

# Open rows are scored by the model; the model predicts log1p(sales), so the
# prediction is mapped back with expm1.
Open_df = df_submit[df_submit['Open'] == 1].copy()
X_submit = prepare_X(Open_df)
y_pred = w_0 + X_submit @ w
Open_df['Sales_pred'] = np.expm1(y_pred)

In [None]:
# Stack the closed and open rows, restore the original row order through the
# index, and write the two-column submission file.
submission_parts = [Close_df[['Id', 'Sales_pred']], Open_df[['Id', 'Sales_pred']]]
my_submission = (
    pd.concat(submission_parts)
    .sort_index()
    .rename(columns={'Sales_pred':'Sales'})
)
my_submission.to_csv("submission.csv", index=False)