In [252]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm 
from xgboost import XGBClassifier
from sklearn.neural_network import MLPRegressor

<b> <h2> Introduction </h2> </b>

For this project we are using a data set of products and are going to try and predict the future demand requested based off of mainly time series data. For this project the data was obtained from https://www.kaggle.com/felixzhao/productdemandforecasting 

In [248]:
df = pd.read_csv('../data/Historical Product Demand.csv')

In [249]:
df

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500
...,...,...,...,...,...
1048570,Product_1791,Whse_J,Category_006,2016/4/27,1000
1048571,Product_1974,Whse_J,Category_006,2016/4/27,1
1048572,Product_1787,Whse_J,Category_006,2016/4/28,2500
1048573,Product_0901,Whse_J,Category_023,2016/10/7,50


First we will clean the data and also refactor and add features before graphing or making any assumptions. For cleaning we will take out all of the strings from the values and also seperate the date and also add day of the week, month, and year. Hopefully that will get a better reading on what the model should do.

In [250]:
dif_type_df = df
dif_type_df["Product_Code"] = dif_type_df["Product_Code"].str[-4:]
dif_type_df["Warehouse"] = dif_type_df["Warehouse"].str[-1:]
dif_type_df["Product_Category"] = dif_type_df["Product_Category"].str[-3:]

In [251]:
dif_type_df["Product_Code"] = dif_type_df["Product_Code"].astype('int32')
dif_type_df["Product_Category"] = dif_type_df["Product_Category"].astype('int32')

#they use parenthases to indicate negative values. We will remove those
dif_type_df = dif_type_df.loc[dif_type_df["Order_Demand"].str[0] != '('] 
dif_type_df["Order_Demand"] = dif_type_df["Order_Demand"].astype('int32')
dif_type_df.dropna(inplace=True)
dif_type_df

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,993,J,28,2012/7/27,100
1,979,J,28,2012/1/19,500
2,979,J,28,2012/2/3,500
3,979,J,28,2012/2/9,500
4,979,J,28,2012/3/2,500
...,...,...,...,...,...
1048570,1791,J,6,2016/4/27,1000
1048571,1974,J,6,2016/4/27,1
1048572,1787,J,6,2016/4/28,2500
1048573,901,J,23,2016/10/7,50


<b> <h2> Feature Engineering </h2> </b>

Here we now have cleaned the data and will now need to start adding features. From the data. The most naive way would most likely be to split the data on the slashes and create new columns from that. We can go farther than that and also add a total day count, like unix, a day of the week, and scale the values. We also might want to one-hot encode the warehouses. The reason that we may not want to just label encode them is because the classifiers may try and extract information from that. Where the warehouse that they are housed in may not provide any information since we do not know if anything relates them i.e. geographical, truck count, capacity etc.

To recap, some more features to add are:

• Splitting up the date into other columns

• Obtaining more data from the date, like day of the week

• one hot encoding the warehouse

In [175]:
dif_type_df = dif_type_df.sort_values(by=["Product_Code", "Date"])

In [176]:
final_df = dif_type_df

In [177]:
date = pd.to_datetime(final_df['Date'], format='%Y/%m/%d')
final_df['DayOfWeek'] = date.dt.dayofweek
final_df['DayOfMonth'] = date.dt.day
final_df['DayOfYear'] = date.dt.dayofyear
final_df['Week'] = date.dt.weekofyear
final_df['Month'] = date.dt.month
final_df['Year'] = date.dt.year

In [178]:
final_df.drop("Date", axis=1, inplace=True)
final_df["Year"] = final_df["Year"] - 2011
final_df["TotalDays"] = final_df["Year"] * 365 + final_df["Month"] * 30 + final_df["DayOfMonth"]

In [179]:
final_df

Unnamed: 0,Product_Code,Warehouse,Product_Category,Order_Demand,DayOfWeek,DayOfMonth,DayOfYear,Week,Month,Year,TotalDays
47145,1,J,5,200,4,16,350,50,12,0,376
47146,1,J,5,100,1,20,354,51,12,0,380
43860,1,A,5,2000,1,10,10,2,1,1,405
47147,1,J,5,200,1,10,10,2,1,1,405
47148,1,J,5,800,1,10,10,2,1,1,405
...,...,...,...,...,...,...,...,...,...,...,...
909658,2172,J,5,300,1,2,215,31,8,5,2067
909706,2172,J,5,600,4,26,239,34,8,5,2091
909813,2172,J,5,900,0,26,270,39,9,5,2121
909731,2172,J,5,200,1,27,271,39,9,5,2122


In [181]:
final_df = pd.get_dummies(final_df, columns=['Warehouse'])
iterate_groups = final_df.groupby(['Product_Code'])

In [245]:
final_df.describe()

Unnamed: 0,Product_Code,Product_Category,Order_Demand,DayOfWeek,DayOfMonth,DayOfYear,Week,Month,Year,TotalDays,Warehouse_A,Warehouse_C,Warehouse_J,Warehouse_S
count,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0,1031437.0
mean,1115.158,14.88574,4962.992,2.022948,15.57185,182.6344,26.69519,6.508216,2.961957,1291.933,0.1362274,0.04094967,0.7377009,0.08512202
std,575.2633,7.941771,29113.06,1.440711,8.801473,105.0001,14.99797,3.446524,1.388986,516.5793,0.3430301,0.1981737,0.4398846,0.2790633
min,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,38.0,0.0,0.0,0.0,0.0
25%,623.0,7.0,20.0,1.0,8.0,90.0,13.0,3.0,2.0,855.0,0.0,0.0,0.0,0.0
50%,1286.0,19.0,300.0,2.0,16.0,183.0,27.0,7.0,3.0,1285.0,0.0,0.0,1.0,0.0
75%,1469.0,19.0,2000.0,3.0,23.0,275.0,40.0,10.0,4.0,1726.0,0.0,0.0,1.0,0.0
max,2172.0,33.0,4000000.0,6.0,31.0,366.0,53.0,12.0,6.0,2229.0,1.0,1.0,1.0,1.0


In [246]:
final_df.corr()

Unnamed: 0,Product_Code,Product_Category,Order_Demand,DayOfWeek,DayOfMonth,DayOfYear,Week,Month,Year,TotalDays,Warehouse_A,Warehouse_C,Warehouse_J,Warehouse_S
Product_Code,1.0,0.147117,0.032071,0.001891,0.001913,0.0086,0.009019,0.008479,0.016372,0.017797,0.013636,-0.034695,0.028821,-0.037552
Product_Category,0.147117,1.0,0.054308,0.002839,-0.001012,0.00451,0.004193,0.004654,-0.020676,-0.019378,0.136713,0.016962,-0.179099,0.102217
Order_Demand,0.032071,0.054308,1.0,-0.004203,-0.001938,-0.000875,-0.001237,-0.000694,0.008237,0.007912,-0.053568,0.062821,-0.031981,0.071647
DayOfWeek,0.001891,0.002839,-0.004203,1.0,0.021295,0.011162,-4.8e-05,0.009413,-0.015028,-0.012502,0.010795,-0.013071,-0.001705,-0.001299
DayOfMonth,0.001913,-0.001012,-0.001938,0.021295,1.0,0.049037,0.044811,-0.034732,-0.006973,0.003243,0.006831,0.009493,-0.014727,0.008075
DayOfYear,0.0086,0.00451,-0.000875,0.011162,0.049037,1.0,0.995683,0.996466,-0.008407,0.192032,0.013247,0.013284,-0.016749,0.000684
Week,0.009019,0.004193,-0.001237,-4.8e-05,0.044811,0.995683,1.0,0.992567,-0.00911,0.190489,0.013399,0.01018,-0.013627,-0.002219
Month,0.008479,0.004654,-0.000694,0.009413,-0.034732,0.996466,0.992567,1.0,-0.007795,0.191912,0.012712,0.012515,-0.015708,0.000248
Year,0.016372,-0.020676,0.008237,-0.015028,-0.006973,-0.008407,-0.00911,-0.007795,1.0,0.979739,-0.014825,0.031343,0.031305,-0.053381
TotalDays,0.017797,-0.019378,0.007912,-0.012502,0.003243,0.192032,0.190489,0.191912,0.979739,1.0,-0.011889,0.033428,0.027329,-0.052202


<b> <h2> Data and Model Discussion </h2> </b>

After finally completing feature adding and also normalizing values, we can look at some of the statistics of the data and try and see what models may work. After analyzing the stats, the data is pretty all over the place and doesn't seem to coorelate too much with anything else, in terms of Order_Demand. For this reason, I think that models like Linear Regression and Naive Bayes will do very poorly against models like Random Forrest and also Gradient Boosting. The reason is just because the simpler models just try and draw coorelations between all variables, meanwhile, more complex models can try and find correlations between the variables and how the affect each other and the result in large. The range of models I will test will be:

• Linear Regression

• Random Forest Regression

• Multi Layer Perceptron

• Gradient Boost (sklearn)

• Gradient Boost (xgboost)

<b> <h2> The Approach </h2> </b>

For the approach of each model, the full data set will be split according to the product being considered. Afterwards, with each iteration, the model will be trained and tested. The score will then be added to a list of scores for each product. Afterwards, the mean will be taken and that will be the metric for analysis. There are others, but that can be left to future implementations. 

In [217]:
#Linear reg
scores = []
for data in tqdm(iterate_groups):
    if len(data[1]) >= 5:
        X = np.array(data[1].drop("Order_Demand", axis=1))
        y = np.array(data[1]["Order_Demand"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

        line = LinearRegression()
        line.fit(X_train, y_train)
        scores.append(float(line.score(X_test, y_test)))

100%|██████████| 2160/2160 [00:03<00:00, 620.31it/s]


In [218]:
print(np.nansum(scores)/len(scores))

-454.98523373053746


In [207]:
#rand forest
rand_scores = []
for data in tqdm(iterate_groups):
    if len(data[1]) >= 5:
        X = np.array(data[1].drop("Order_Demand", axis=1))
        y = np.array(data[1]["Order_Demand"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

        rand_forest = RandomForestRegressor()
        rand_forest.fit(X_train, y_train)
        rand_scores.append(float(rand_forest.score(X_test, y_test)))

100%|██████████| 2160/2160 [05:52<00:00,  6.13it/s]


In [208]:
print(np.nansum(rand_scores)/len(rand_scores))

-270.59351632064704


In [213]:
#gradient boosting
grad_scores = []
for data in tqdm(iterate_groups):
    if len(data[1]) >= 5:
        X = np.array(data[1].drop("Order_Demand", axis=1))
        y = np.array(data[1]["Order_Demand"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

        boost = XGBClassifier(subsample = 0.7, max_depth = 4)
        boost.fit(X_train, y_train)
        grad_scores.append(float(boost.score(X_test, y_test)))

100%|██████████| 2160/2160 [17:58<00:00,  2.00it/s]  


In [214]:
print(np.nansum(grad_scores)/len(grad_scores))

0.26995112403218174


In [219]:
#sklearn gradient boosting
s_grad_scores = []
for data in tqdm(iterate_groups):
    if len(data[1]) >= 5:
        X = np.array(data[1].drop("Order_Demand", axis=1))
        y = np.array(data[1]["Order_Demand"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

        s_boost = GradientBoostingRegressor()
        s_boost.fit(X_train, y_train)
        s_grad_scores.append(float(s_boost.score(X_test, y_test)))

100%|██████████| 2160/2160 [01:43<00:00, 20.81it/s]


In [220]:
print(np.nansum(s_grad_scores)/len(s_grad_scores))

-24.864922776121155


In [223]:
#mlp
mlp_scores = []
for data in tqdm(iterate_groups):
    if len(data[1]) >= 5:
        X = np.array(data[1].drop("Order_Demand", axis=1))
        y = np.array(data[1]["Order_Demand"])

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

        mlp = MLPRegressor()
        mlp.fit(X_train, y_train)
        mlp_scores.append(float(mlp.score(X_test, y_test)))

100%|██████████| 2160/2160 [08:13<00:00,  4.38it/s]


In [224]:
print(np.nansum(mlp_scores)/len(mlp_scores))

-477.01761833949496


<b> <h2> Analysis </h2> </b>

The best model was Gradient Boosting, not a complete surprise, with a mean score of 0.270, amazing. The other models were abysmal in comparison. The other models don't even reach positive. Some ways to improve on this would be to apply this in a more real world scenario. This would be to train the model on training data and then test it on one data point, then retrain the data on that point included and repeat that process. This way you are not using an old model by the end. Next, I could have also tested a RNN or a LSTM which might have faired much better than an out of the box MLP. Finally, I could have also looked at which periods have the most amount of correlation, or have the most cyclic behavior and targeted those time periods instead of just running through the whole data set.