In [1]:
import pandas as pd
import math as m
import numpy as np
import matplotlib.pyplot as plt
from time import time
#import datetime

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor

"""To display up to 50 columns of dataset"""
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 8)

"""To display all outputs of each cell"""
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(100)

"""To get a timing of each function decorated"""
def timing(f):
    """
    Decorator for timing functions
    Usage:
    @timing
    def function(&):
        pass
    """   
    
    def wrapper(*args, **kwargs):
        start = time()
        result = f(*args, **kwargs)
        end = time()
        print('function:%r took: %2.2f sec' % (f.__name__, end - start))
        return result
    return wrapper

class Forecast():
    """Base processor to be used for all preparation"""""
    def __init__(self, input_directory, trainset, testset): #input_directory = path to the folder containing df. 
        self.input_directory = input_directory
        self.trainset = trainset
        self.testset = testset     
        
    @timing
    def read(self):
        """Read raw data"""
        self.train = pd.read_csv(self.input_directory + self.trainset)
        self.test_original = pd.read_csv(self.input_directory + self.testset)
        self.test = self.test_original
        
    @timing
    def preprocessing(self):
        """processing of the data"""
            
        #type change to category where needed
        self.category_columns = ["Store", "StoreType", "Assortment", "DayOfWeek", "PromoInterval", "StateHoliday",
                                 "OpeningType", "Month", "WeekOfYear", "DateOfMonth",
                                 "AffectedByCompetition", "AffectedByPromo2", "SchoolHoliday", "Promo", "Promo2"]
        
        for column in self.category_columns:
            self.train[column] = self.train[column].astype('category')
            self.test[column] = self.test[column].astype('category')
            
        self.binary_columns = ["AffectedByCompetition", "AffectedByPromo2", "SchoolHoliday", "Promo", "Promo2"]
     
        for column in self.binary_columns:
            self.train[column] = self.train[column].astype('bool')
            self.test[column] = self.test[column].astype('bool')
            
        #extract only data of open stores
        self.train = self.train[self.train["Open"] == 1]
        self.test = self.test[self.test["Open"] == 1]
        
        #Re_indexation 
        self.train = self.train.reset_index(drop='True')
        self.test = self.test.reset_index(drop='True')
        
        """put the date as index"""
        self.train["Date"] = pd.to_datetime(self.train["Date"].astype(str), format = '%Y/%m/%d')
        self.train = self.train.set_index("Date")
        self.test["Date"] = pd.to_datetime(self.test["Date"].astype(str), format = '%Y/%m/%d')
        self.test = self.test.set_index("Date")
    
    @timing
    def model_sales(self, estimator=100):
        self.train_sales = self.train.drop(["Customers", "Open"], axis = 1)
        self.test_sales = self.test.drop(["Customers", "Open"], axis = 1)       
             
        self.y_train_sales = self.train_sales[['Sales']]
        
        self.X_train_sales = self.train_sales.drop(["Sales"], axis = 1)
        self.X_test_sales = self.test_sales.drop(["Sales"], axis = 1)

        self.model_sales = RandomForestRegressor(n_estimators = estimator, random_state = 42)
        self.model_sales.fit(self.X_train_sales, self.y_train_sales)
                       
    @timing
    def rmse_train_sales(self):
        self.predicted_train_sales = self.model_sales.predict(self.X_train_sales)
        self.predicted_train_sales = self.predicted_train_sales.reshape(-1, 1)

        self.errors_sales = abs(self.predicted_train_sales - self.y_train_sales)
        print('Mean Absolute Error of Sales:', round(np.mean(self.errors_sales), 2))
        print('RMSE of Sales:', round(m.sqrt(mean_squared_error(self.predicted_train_sales, self.y_train_sales)), 2))
        
    @timing
    def model_cust(self, estimator=100):
        self.train_cust = self.train.drop(["Sales", "Open"], axis = 1)
        self.test_cust = self.test.drop(["Sales", "Open"], axis = 1)       
             
        self.y_train_cust = self.train_cust[['Customers']]
        
        self.X_train_cust = self.train_cust.drop(["Customers"], axis = 1)
        self.X_test_cust = self.test_cust.drop(["Customers"], axis = 1)

        self.model_cust = RandomForestRegressor(n_estimators = estimator, random_state = 42)
        self.model_cust.fit(self.X_train_cust, self.y_train_cust)
                       
    @timing
    def rmse_train_cust(self):
        self.predicted_train_cust = self.model_cust.predict(self.X_train_cust)
        self.predicted_train_cust = self.predicted_train_cust.reshape(-1, 1)

        self.errors_cust = abs(self.predicted_train_cust - self.y_train_cust)
        print('Mean Absolute Error of Customers:', round(np.mean(self.errors_cust), 2))
        print('RMSE of Customers:', round(m.sqrt(mean_squared_error(self.predicted_train_cust, self.y_train_cust)), 2))
     
    
    @timing 
    def predict_test(self):
        self.result = self.test_original
        self.result["Sales"][self.result["Open"]==0]= self.result["Sales"].fillna(0)
        self.result["Customers"][self.result["Open"]==0]= self.result["Customers"].fillna(0)
        
        #forecasting sales
        self.predicted_test_sales = self.model_sales.predict(self.X_test_sales)
        self.result["pred_sales"] = self.predicted_test_sales
        self.result["Sales"] = self.result["Sales"].isna(self.result["pred_sales"])
        self.result = self.result.drop(["pred_sales"], axis = 1)
        
        #forecasting customers
        self.predicted_test_cust = self.model_cust.predict(self.X_test_cust)
        self.result["pred_cust"] = self.predicted_test_cust
        self.result["Customers"] = self.result["Customers"].isna(self.result["pred_cust"])
        self.result = self.result.drop(["pred_cust"], axis = 1)        
        
        return self.result


Bad key "“backend" on line 1 in
/Users/rfdanti/.matplotlib/matplotlibrc.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.2.1/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
project = Forecast("/Users/rfdanti/Documents/UoM/2_data_analytics/coursework/CODE/", "final_trainset.csv", "final_testset.csv")
project.read()
project.preprocessing()

function:'read' took: 1.58 sec
function:'preprocessing' took: 1.47 sec


In [3]:
project.model_sales(estimator=100)
project.rmse_train_sales()



function:'model_sales' took: 480.80 sec
Mean Absolute Error of Sales: Sales    212.5
dtype: float64
RMSE of Sales: 337.99
function:'rmse_train_sales' took: 26.68 sec


In [4]:
project.model_cust(estimator=100)
project.rmse_train_cust()



function:'model_cust' took: 436.21 sec
Mean Absolute Error of Customers: Customers    18.42
dtype: float64
RMSE of Customers: 30.47
function:'rmse_train_cust' took: 22.78 sec


In [None]:
test = pd.read_csv("final_testset.csv")

In [6]:
result = test
result["Sales"][result["Open"]==0]= result["Sales"].fillna(0)
result["Customers"][result["Open"]==0]= result["Customers"].fillna(0)
result.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Date,Store,StoreType,Assortment,StateHoliday,SchoolHoliday,Promo,Open,OpeningType,DayOfWeek,DateOfMonth,Month,WeekOfYear,Promo2,PromoInterval,AffectedByPromo2,CompetitionDistance,CompetitionAge,AffectedByCompetition,Sales,Customers
0,2015-01-08,1,2,0,0,True,False,True,0,6,8,8,2,False,0,False,1270,2525,True,,
1,2015-01-09,1,2,0,0,True,True,True,0,2,9,9,2,False,0,False,1270,2556,True,,
2,2015-02-08,1,2,0,0,True,False,False,0,7,8,8,6,False,0,False,1270,2526,True,0.0,0.0
3,2015-02-09,1,2,0,0,True,True,True,0,3,9,9,7,False,0,False,1270,2557,True,,
4,2015-03-08,1,2,0,0,True,True,True,0,1,8,8,10,False,0,False,1270,2527,True,,


In [7]:
#preparing test data for forecasting
category_columns = ["Store", "StoreType", "Assortment", "DayOfWeek", "PromoInterval", "StateHoliday",
                                 "OpeningType", "Month", "WeekOfYear", "DateOfMonth"]
for column in category_columns:
    result[column] = result[column].astype('category')
    
result = result.reset_index(drop='True')
result["Date"] = pd.to_datetime(result["Date"].astype(str), format = '%Y/%m/%d')
result = result.set_index("Date")
x_test = result.drop(["Customers", "Open", "Sales"], axis=1)
result["pred_sales"] = np.nan
result["pred_cust"] = np.nan
result.head()

Unnamed: 0_level_0,Store,StoreType,Assortment,StateHoliday,SchoolHoliday,Promo,Open,OpeningType,DayOfWeek,DateOfMonth,Month,WeekOfYear,Promo2,PromoInterval,AffectedByPromo2,CompetitionDistance,CompetitionAge,AffectedByCompetition,Sales,Customers,pred_sales,pred_cust
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-01-08,1,2,0,0,True,False,True,0,6,8,8,2,False,0,False,1270,2525,True,,,,
2015-01-09,1,2,0,0,True,True,True,0,2,9,9,2,False,0,False,1270,2556,True,,,,
2015-02-08,1,2,0,0,True,False,False,0,7,8,8,6,False,0,False,1270,2526,True,0.0,0.0,,
2015-02-09,1,2,0,0,True,True,True,0,3,9,9,7,False,0,False,1270,2557,True,,,,
2015-03-08,1,2,0,0,True,True,True,0,1,8,8,10,False,0,False,1270,2527,True,,,,


In [10]:
#forecasting sales
predicted_sales = project.model_sales.predict(x_test)
result["pred_sales"] = predicted_sales
result["Sales"] = result["Sales"].fillna(result["pred_sales"])
result = result.drop(["pred_sales"], axis = 1)   

#forecasting customer
predicted_cust = project.model_cust.predict(x_test)
result["pred_cust"] = predicted_cust
result["Customers"] = result["Customers"].fillna(result["pred_cust"])
result = result.drop(["pred_cust"], axis = 1)  

result.head()

Unnamed: 0_level_0,Store,StoreType,Assortment,StateHoliday,SchoolHoliday,Promo,Open,OpeningType,DayOfWeek,DateOfMonth,Month,WeekOfYear,Promo2,PromoInterval,AffectedByPromo2,CompetitionDistance,CompetitionAge,AffectedByCompetition,Sales,Customers
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2015-01-08,1,2,0,0,True,False,True,0,6,8,8,2,False,0,False,1270,2525,True,4837.63,612.78
2015-01-09,1,2,0,0,True,True,True,0,2,9,9,2,False,0,False,1270,2556,True,4757.43,555.72
2015-02-08,1,2,0,0,True,False,False,0,7,8,8,6,False,0,False,1270,2526,True,0.0,0.0
2015-02-09,1,2,0,0,True,True,True,0,3,9,9,7,False,0,False,1270,2557,True,4671.1,545.2
2015-03-08,1,2,0,0,True,True,True,0,1,8,8,10,False,0,False,1270,2527,True,5995.28,598.8


In [12]:
result["Sales"] = round(result["Sales"],0)
result["Customers"] = round(result["Customers"],0)
result["Sales"] = result["Sales"].astype('int')
result["Customers"] = result["Customers"].astype('int')
result.head()

Unnamed: 0_level_0,Store,StoreType,Assortment,StateHoliday,SchoolHoliday,Promo,Open,OpeningType,DayOfWeek,DateOfMonth,Month,WeekOfYear,Promo2,PromoInterval,AffectedByPromo2,CompetitionDistance,CompetitionAge,AffectedByCompetition,Sales,Customers
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2015-01-08,1,2,0,0,True,False,True,0,6,8,8,2,False,0,False,1270,2525,True,4838,613
2015-01-09,1,2,0,0,True,True,True,0,2,9,9,2,False,0,False,1270,2556,True,4757,556
2015-02-08,1,2,0,0,True,False,False,0,7,8,8,6,False,0,False,1270,2526,True,0,0
2015-02-09,1,2,0,0,True,True,True,0,3,9,9,7,False,0,False,1270,2557,True,4671,545
2015-03-08,1,2,0,0,True,True,True,0,1,8,8,10,False,0,False,1270,2527,True,5995,599


In [13]:
result.to_csv("final_forecast_testset.csv", index=False)