# Install Module

In [1]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import Module

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from datetime import timedelta
from sklearn.metrics import mean_squared_error

# Import Dataset

## Download dataset

In [3]:
!gdown "1RQEXXW3aW2LHAawFo0OiAXfYWYgwF0Gh"

Downloading...
From: https://drive.google.com/uc?id=1RQEXXW3aW2LHAawFo0OiAXfYWYgwF0Gh
To: /content/train.csv
100% 17.3M/17.3M [00:00<00:00, 68.2MB/s]


## Open dataset

In [4]:
df = pd.read_csv('/content/train.csv')

In [5]:
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


# Function

## Split data

In [6]:
def train_test_split(data, ratio):
    train_set = pd.DataFrame(columns=data.columns)
    test_set = pd.DataFrame(columns=data.columns)

    lst_store = list(set(data['store']))
    lst_item = list(set(data['item']))

    for i in lst_store:
        for j in lst_item:
            filtered_data = data[(data['store'] == i) & (data['item'] == j)]
            row, col = filtered_data.shape
            train_data = filtered_data.iloc[0:int((1-ratio)*row),:]
            test_data = filtered_data.iloc[int((1-ratio)*row):,:]
            train_set = train_set.append(train_data, ignore_index=True)
            test_set = test_set.append(test_data, ignore_index=True)
        
    return train_set, test_set

## Create Lag

In [7]:
def create_lag(df3):
  dataframe = pd.DataFrame()

  lst_store = list(set(df3['store']))
  lst_item = list(set(df3['item']))
  lst_of_lag = []
  for i in lst_store:
      for j in lst_item:
        lst_lag_per_store_item = []
        for k in range(1,4):
            lst_lag_per_store_item.append(np.array(df3.loc[(df3.store==i) & (df3.item==j)].sales.shift(k)))
        lst_of_lag.append(lst_lag_per_store_item)
  lst = []
  for i in range(3):
    x = np.array([])
    for j in range(len(lst_of_lag)):
      x = np.append(x, lst_of_lag[j][i])
    dataframe['t-' + str(i+1)] = x
    df4 = pd.concat([df, dataframe], axis=1)
    df4.dropna(inplace=True)
  return df4

## Train Model

In [8]:
def train_model(train):
  train_lagged = create_lag(train)
  train = train_lagged[['store','item','t-1','t-2','t-3','sales']]

  train = np.asarray(train)
  train_x, train_y = train[:, :-1], train[:, -1]
  model = RandomForestRegressor(n_estimators=10)
  model.fit(train_x, train_y)
  return model

## Test Model

In [9]:
def test_model(model, test):
  test_lagged = create_lag(test)
  test = test_lagged[['store','item','t-1','t-2','t-3','sales']]
  
  test = np.asarray(test)
  yhat = model.predict(test[:, :-1])
  mse = mean_squared_error(test[:, -1], yhat)
  return mse

# Random Forest Regressor

In [10]:
train, test = train_test_split(df, 0.2)

In [11]:
model = train_model(train)

In [12]:
print("MSE: ", test_model(model, test))

MSE:  321.4987656978132


## Save model

In [15]:
pickle.dump(model, open('random-forest-regressor.pkl', 'wb'))