In [None]:
import numpy as np
import pandas as pd
import os, time, random

In [None]:
# Read the dataset prepared after feature engineering
data_path = 'data.csv'
data = pd.read_csv(data_path)

# Summarise the dataframe: columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
data.info()

In [None]:
# Columns that must NOT be used as model inputs (e.g. to avoid feeding
# label-like columns that could be read as ordinal values).
# The list is grouped by purpose; the order is unchanged.
remove_features = (
    # identifier-like columns
    ['id', 'item_id', 'dept_id', 'cat_id', 'state_id', 'store_id', 'release']
    # event columns for the current day
    + ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # lagged event columns (lags 1 to 3)
    + ['event_name_1_lag_1', 'event_type_1_lag_1',
       'event_name_1_lag_2', 'event_type_1_lag_2',
       'event_name_1_lag_3', 'event_type_1_lag_3']
    # date keys
    + ['date', 'wm_yr_wk', 'd']
    # target column and the helper day-number column
    + ['sales', 'temp_d']
    # raw calendar columns
    + ['day', 'week', 'month', 'year', 'dayofweek', 'weekend']
)

In [None]:
# Columns to be stored with the pandas 'category' dtype
category_columns = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'event_name_1_lag_1', 'event_type_1_lag_1',
    'event_name_1_lag_2', 'event_type_1_lag_2',
    'event_name_1_lag_3', 'event_type_1_lag_3',
]

# Convert the listed columns one at a time, writing each back into `data`
for col in category_columns:
    converted = data[col].astype('category')
    data[col] = converted

# Distinct store ids present in the data, as a plain Python list
stores_ids = list(data['store_id'].unique())


In [None]:
# Display the store ids found in the data. Only 'TX_1' is expected here:
# this single store was chosen for modeling because of processing-power
# limitations and to avoid OOM (Out of Memory) errors.
stores_ids

In [None]:
# Copy the loaded dataframe into a new frame `df`; the following steps
# (dummy encoding, day filtering) operate on `df` rather than on `data`.
df = data.copy()
# Display the copied frame
df

In [None]:
# One-hot encode the selected categorical/calendar columns so the model does
# not treat their values as ordinal. pd.get_dummies replaces each listed
# column with one indicator column per distinct value.
dummy_columns = [
    'cat_id', 'dept_id',
    'event_name_1', 'event_name_2',
    'day', 'week', 'month', 'year', 'dayofweek', 'weekend',
]
df = pd.get_dummies(df, columns=dummy_columns)

In [None]:
# Integer day number, obtained by dropping the first two characters of the
# 'd' label and parsing the rest as a number
# (assumes labels look like 'd_<number>' -- TODO confirm).
# It is used later to split the data into train and test by day.
day_labels = data['d']
df['temp_d'] = pd.to_numeric(day_labels.str[2:])

# Model features: every column of the dummy-encoded frame that is not
# listed in remove_features (column order of df is preserved).
excluded_columns = set(remove_features)
features = [name for name in df.columns if name not in excluded_columns]

In [None]:
# Display the frame to check the dummy-encoded columns
df

In [None]:
# Display the list of feature columns finally considered for modeling
features

In [None]:
# Day-number boundaries used to limit and split the data
START_TRAIN = 1000      # first day of the training data
END_TRAIN   = 1885      # last day of the training data
LimitData   = 1913      # last day of the testing data (testing starts at day 1886, i.e. 28 days)

# Keep only rows whose day number lies in [START_TRAIN, LimitData]
# (both ends inclusive) and renumber the index from 0.
in_window = df['temp_d'].between(START_TRAIN, LimitData)
df = df[in_window].reset_index(drop=True)


In [None]:
# Split by day number: days up to and including END_TRAIN are used for
# training, all later days for testing.
day_number = df['temp_d']
train_mask = day_number <= END_TRAIN
preds_mask = day_number > END_TRAIN

train = df.loc[train_mask]
test = df.loc[preds_mask]

# Separate independent variables (features) from the dependent variable.
# The target is selected with a list so it stays a one-column DataFrame.
target_columns = ['sales']
x_train, y_train = train[features], train[target_columns]
x_test, y_test = test[features], test[target_columns]



In [None]:
# Replace any missing values with 0 in both the features and the target,
# producing new frames (the unfilled originals are left untouched).
x_train1, y_train1 = x_train.fillna(0), y_train.fillna(0)
x_test1, y_test1 = x_test.fillna(0), y_test.fillna(0)

In [None]:
# Display the NA-filled test target
y_test1

In [None]:
# Linear Regression from scikit-learn
from sklearn import linear_model

model = linear_model.LinearRegression()

# Fit on the NA-filled training features and target; as the last expression
# of the cell, the fitted estimator is also displayed.
model.fit(X=x_train1, y=y_train1)

In [None]:
# Predict sales for the test time frame with the fitted linear model,
# then display the predictions
testing_predictions = model.predict(X=x_test1)
testing_predictions

In [None]:
# Evaluate the linear regression on the train and test data.
# For a regressor, model.score() returns the coefficient of determination
# (R^2), not a classification accuracy, so the values are labelled as R^2.

from sklearn import metrics
actuals = np.array(y_test1)

# R^2 on the training data
train_r2 = model.score(x_train1, y_train1)
print("train R^2", train_r2)

# R^2 on the test data
test_r2 = model.score(x_test1, y_test1)
print("test R^2", test_r2)

# Kept for backward compatibility: this name previously held the train score
# and was then overwritten with the test score, so it ends as the test score.
lin_acc = test_r2


In [None]:
# Root Mean Squared Error (RMSE) of the test predictions:
# the square root of scikit-learn's mean squared error.
from sklearn import metrics
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test1, testing_predictions)
rmse = np.sqrt(mse)
print(rmse)