### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

In [2]:
# Import classifiers
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor

### Import data

In [3]:
data = pd.read_csv('../data/orderproducts_top20.csv', parse_dates=[1], infer_datetime_format=True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4321 entries, 0 to 4320
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   order_id           4321 non-null   int64         
 1   order_date         4321 non-null   datetime64[ns]
 2   order_total_price  1904 non-null   float64       
 3   product_price      4321 non-null   float64       
 4   order_discounts    4321 non-null   object        
 5   product_discount   4321 non-null   float64       
 6   order_status       4138 non-null   object        
 7   product_title      4321 non-null   object        
 8   product_sku        4321 non-null   object        
 9   product_quantity   4321 non-null   int64         
 10  product_category   4321 non-null   object        
 11  weekdays           4321 non-null   int64         
 12  sch_holidays       4321 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory us

In [5]:
data.columns

Index(['order_id', 'order_date', 'order_total_price', 'product_price',
       'order_discounts', 'product_discount', 'order_status', 'product_title',
       'product_sku', 'product_quantity', 'product_category', 'weekdays',
       'sch_holidays'],
      dtype='object')

In [7]:
df = data[['order_date', 'product_sku', 'product_quantity'
         ]]

In [8]:
prod_weekly = pd.crosstab(df['order_date'], df['product_sku']).resample('W').sum()

In [11]:
prod_weekly['week'] = [x.week for x in prod_weekly.index]

In [12]:
prod_weekly['sch_holidays'] = ['Sch Holidays' if x.month in [6,11,12] else 'Non Holidays' for x in prod_weekly.index ]

In [13]:
prod_weekly

product_sku,EFX-FLY-BLK,M80-2B-BLK,M80-2G-BLK,M80-AC-BLK,M80-AD-BLK,M80-BTY-BLK-L,M80-BTY-BLK-S,M80-EB-BLK,M80-EG-BLK,M80-SEB-BLK,...,M80-TOUR-V2-BLK,M80-VAD-BLK,M80-VEB-BLK,M80-VEB-GRY,M80-VEG-BLK,M80-VEG-GRY,M80-VHB-BLK,month,week,sch_holidays
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-31,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,12,52,Sch Holidays
2018-01-07,8,3,3,2,6,2,1,3,4,1,...,0,3,1,4,9,4,4,1,1,Non Holidays
2018-01-14,6,1,7,2,3,6,6,4,4,12,...,6,1,3,4,8,0,6,1,2,Non Holidays
2018-01-21,2,1,6,2,4,3,0,3,4,8,...,7,5,3,4,8,1,4,1,3,Non Holidays
2018-01-28,0,6,4,3,0,0,1,1,4,5,...,3,3,1,2,7,0,4,1,4,Non Holidays
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-14,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,2,2,1,2,6,Non Holidays
2021-02-21,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,2,7,Non Holidays
2021-02-28,0,1,1,0,0,1,0,0,0,1,...,0,0,1,1,0,0,1,2,8,Non Holidays
2021-03-07,2,0,0,1,0,0,0,1,1,0,...,1,0,1,0,1,0,0,3,9,Non Holidays


In [14]:
df_EFXFLYBLK = prod_weekly['2018':'2020'][['week', 'sch_holidays', 'EFX-FLY-BLK']].reset_index()

In [15]:
df_EFXFLYBLK

product_sku,order_date,week,sch_holidays,EFX-FLY-BLK
0,2018-01-07,1,Non Holidays,8
1,2018-01-14,2,Non Holidays,6
2,2018-01-21,3,Non Holidays,2
3,2018-01-28,4,Non Holidays,0
4,2018-02-04,5,Non Holidays,0
...,...,...,...,...
151,2020-11-29,48,Sch Holidays,0
152,2020-12-06,49,Sch Holidays,3
153,2020-12-13,50,Sch Holidays,0
154,2020-12-20,51,Sch Holidays,0


### Data Pre-processing

In [16]:
X = df_EFXFLYBLK.drop(['EFX-FLY-BLK','order_date'], axis=1)
Y = df_EFXFLYBLK['EFX-FLY-BLK']

In [17]:
X

product_sku,week,sch_holidays
0,1,Non Holidays
1,2,Non Holidays
2,3,Non Holidays
3,4,Non Holidays
4,5,Non Holidays
...,...,...
151,48,Sch Holidays
152,49,Sch Holidays
153,50,Sch Holidays
154,51,Sch Holidays


In [18]:
Y[0:10]

0     8
1     6
2     2
3     0
4     0
5     3
6     7
7    11
8     6
9     8
Name: EFX-FLY-BLK, dtype: int64

In [19]:
# Build a preprocessing step for numeric features
numerical_cols = []

# Build a preprocessing step for nominal features
nominal_cols = ['sch_holidays', 'week']
for col in nominal_cols: 
    X[col] = X[col].astype('category')

# Test/train split the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=1/5, \
                                                    random_state=1)
    
# Preprocessing for numerical data
## StandardScaler removes the mean and standardise it to between -1 to 1
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('Scaler' , StandardScaler())                      
    ])  

# Preprocessing for nominal data
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('nom', nominal_transformer, nominal_cols),
        ('num', numerical_transformer, numerical_cols)
        ])  
    
# Transform the Training and Test sets to numpy arrays
X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)
features = list(preprocessor.transformers_[0][1]['onehot'].get_feature_names(nominal_cols)) + numerical_cols     

In [20]:
print(X_train.shape)
print(X_test.shape)
print(features)

(124, 53)
(32, 53)
['sch_holidays_Non Holidays', 'sch_holidays_Sch Holidays', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'week_9', 'week_10', 'week_11', 'week_12', 'week_13', 'week_14', 'week_15', 'week_16', 'week_17', 'week_18', 'week_19', 'week_20', 'week_21', 'week_22', 'week_23', 'week_24', 'week_25', 'week_26', 'week_27', 'week_28', 'week_29', 'week_30', 'week_31', 'week_32', 'week_33', 'week_34', 'week_35', 'week_36', 'week_37', 'week_38', 'week_39', 'week_40', 'week_41', 'week_42', 'week_44', 'week_45', 'week_46', 'week_47', 'week_48', 'week_49', 'week_50', 'week_51', 'week_52']


In [22]:
lgreg=LinearRegression()
lgreg.fit(X_train, y_train)

print('Score in training set: {:.3f}'.format(lgreg.score(X_train, y_train)))
print('Score in test set: {:.3f}'.format(lgreg.score(X_test, y_test)))

Score in training set: 0.278
Score in test set: -1.152
