### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import pca
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler



### Import data

In [5]:
data = pd.read_csv('../data/orderproducts_top20.csv', parse_dates=[1], infer_datetime_format=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4321 entries, 0 to 4320
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   order_id           4321 non-null   int64         
 1   order_date         4321 non-null   datetime64[ns]
 2   order_total_price  1904 non-null   float64       
 3   product_price      4321 non-null   float64       
 4   order_discounts    4321 non-null   object        
 5   product_discount   4321 non-null   float64       
 6   order_status       4138 non-null   object        
 7   product_title      4321 non-null   object        
 8   product_sku        4321 non-null   object        
 9   product_quantity   4321 non-null   int64         
 10  product_category   4321 non-null   object        
 11  weekdays           4321 non-null   int64         
 12  sch_holidays       4321 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(6)
memory us

In [7]:
data['month'] = [x.month for x in data['order_date']]
data['week'] = [x.week for x in data['order_date']]

In [8]:
data['prod_disc'] = ['no' if x==0.00 else 'yes' for x in data['product_discount']]
data['order_fulfilled'] = ['yes' if x=='fulfilled' else 'no' for x in data['order_status']]

In [9]:
data.columns

Index(['order_id', 'order_date', 'order_total_price', 'product_price',
       'order_discounts', 'product_discount', 'order_status', 'product_title',
       'product_sku', 'product_quantity', 'product_category', 'weekdays',
       'sch_holidays', 'month', 'week', 'prod_disc', 'order_fulfilled'],
      dtype='object')

In [10]:
df = data[['month', 'week','weekdays', 'sch_holidays', 
           'product_price', 'prod_disc', 'order_fulfilled',
           'product_sku', 'product_quantity'
         ]]

In [11]:
df

Unnamed: 0,month,week,weekdays,sch_holidays,product_price,prod_disc,order_fulfilled,product_sku,product_quantity
0,1,1,1,Non Holidays,249.99,no,yes,M80-VHB-BLK,1
1,1,1,1,Non Holidays,89.99,no,yes,M80-TICK-V2-BLK,1
2,1,1,2,Non Holidays,249.99,no,yes,M80-VHB-BLK,1
3,1,1,2,Non Holidays,229.99,yes,yes,EFX-FLY-BLK,1
4,1,1,1,Non Holidays,249.99,no,yes,M80-VEG-BLK,1
...,...,...,...,...,...,...,...,...,...
4316,3,9,5,Non Holidays,89.99,no,yes,M80-TICK-V2-BLK,1
4317,3,9,7,Non Holidays,249.99,no,yes,M80-VEG-BLK,1
4318,3,9,2,Non Holidays,249.99,no,yes,M80-VEB-BLK,1
4319,3,9,3,Non Holidays,169.99,no,yes,M80-TOUR-V2-BLK,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4321 entries, 0 to 4320
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   month             4321 non-null   int64  
 1   week              4321 non-null   int64  
 2   weekdays          4321 non-null   int64  
 3   sch_holidays      4321 non-null   object 
 4   product_price     4321 non-null   float64
 5   prod_disc         4321 non-null   object 
 6   order_fulfilled   4321 non-null   object 
 7   product_sku       4321 non-null   object 
 8   product_quantity  4321 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 303.9+ KB


In [19]:
df.groupby('month')['product_sku','prod_disc','order_fulfilled'].sum()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,product_sku,prod_disc,order_fulfilled
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,M80-VHB-BLKM80-TICK-V2-BLKM80-VHB-BLKEFX-FLY-B...,nononoyesnoyesnonoyesnoyesyesnoyesnonoyesnonoy...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
2,M80-2G-BLKM80-SEG-BLKM80-SEG-ASHM80-VEB-GRYM80...,yesnononononononononononononoyesnonoyesnononon...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
3,M80-AD-BLKM80-BTY-BLK-LM80-BTY-BLK-LEFX-FLY-BL...,yesnonoyesyesyesnonononoyesnoyesnonoyesnononoy...,yesyesyesyesyesyesyesnoyesyesyesyesyesyesyesye...
4,EFX-FLY-BLKM80-EB-BLKEFX-FLY-BLKM80-SEG-BLKM80...,yesnoyesyesnonoyesyesyesyesyesyesnoyesyesyesno...,yesyesyesyesyesyesyesyesyesyesyesyesnonoyesyes...
5,M80-AD-BLKM80-TOUR-V2-BLKEFX-FLY-BLKM80-VEB-BL...,nononononononononononoyesyesnonoyesyesnononoye...,yesyesyesyesyesyesyesnoyesyesyesyesyesyesyesye...
6,M80-EB-BLKM80-VEG-GRYEFX-FLY-BLKM80-2B-BLKM80-...,nonononononoyesnononoyesnonononoyesyesnoyesyes...,yesyesyesyesyesyesyesyesyesnoyesyesyesyesyesye...
7,M80-VEG-GRYM80-TICK-V2-BLKM80-BTY-BLK-LM80-2G-...,nononoyesyesnonononoyesyesnonoyesyesnoyesyesno...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
8,M80-VHB-BLKM80-TICK-V2-BLKEFX-FLY-BLKM80-VHB-B...,noyesnonononononoyesnononoyesyesnononoyesyesno...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
9,EFX-FLY-BLKM80-TICK-V2-BLKM80-BTY-BLK-LEFX-FLY...,yesnonononoyesyesnonoyesyesyesyesnoyesnononono...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...
10,EFX-FLY-BLKM80-VAD-BLKM80-EG-BLKM80-SEB-BLKM80...,nonoyesnononoyesyesnononononononononononononon...,yesyesyesyesyesyesyesyesyesyesyesyesyesyesyesy...


### Data Pre-processing

In [13]:
X = df.drop(['product_quantity'], axis=1)
Y = df['product_quantity']

In [14]:
X

Unnamed: 0,month,week,weekdays,sch_holidays,product_price,prod_disc,order_fulfilled,product_sku
0,1,1,1,Non Holidays,249.99,no,yes,M80-VHB-BLK
1,1,1,1,Non Holidays,89.99,no,yes,M80-TICK-V2-BLK
2,1,1,2,Non Holidays,249.99,no,yes,M80-VHB-BLK
3,1,1,2,Non Holidays,229.99,yes,yes,EFX-FLY-BLK
4,1,1,1,Non Holidays,249.99,no,yes,M80-VEG-BLK
...,...,...,...,...,...,...,...,...
4316,3,9,5,Non Holidays,89.99,no,yes,M80-TICK-V2-BLK
4317,3,9,7,Non Holidays,249.99,no,yes,M80-VEG-BLK
4318,3,9,2,Non Holidays,249.99,no,yes,M80-VEB-BLK
4319,3,9,3,Non Holidays,169.99,no,yes,M80-TOUR-V2-BLK


In [17]:
Y.value_counts

<bound method IndexOpsMixin.value_counts of 0       1
1       1
2       1
3       1
4       1
       ..
4316    1
4317    1
4318    1
4319    1
4320    1
Name: product_quantity, Length: 4321, dtype: int64>

In [16]:
# Build a preprocessing step for numeric features
numerical_cols = ['product_price']

# Build a preprocessing step for nominal features
nominal_cols = ['month', 'week', 'weekdays', 'sch_holidays', 'prod_disc', 'order_fulfilled', 'product_sku']
for col in nominal_cols: 
    X[col] = X[col].astype('category')

# Test/train split the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=1/5, \
                                                    random_state=1,
                                                    stratify=Y)
    
# Preprocessing for numerical data
## StandardScaler removes the mean and standardise it to between -1 to 1
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('Scaler' , StandardScaler())                      
    ])  

# Preprocessing for nominal data
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('nom', nominal_transformer, nominal_cols),
        ('num', numerical_transformer, numerical_cols)
        ])  
    
# Transform the Training and Test sets to numpy arrays
X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)
features = list(preprocessor.transformers_[0][1]['onehot'].get_feature_names(nominal_cols)) + numerical_cols     

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.