In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from pycaret.regression import *
# Render up to 50 columns when a DataFrame is displayed, so wide frames are not elided.
pd.options.display.max_columns = 50


In [2]:
# Read the training CSV (path is relative to the working directory) into a DataFrame.
sales_df = pd.read_csv(filepath_or_buffer='./Data/train_data_with_ts.csv')

In [3]:
# Preview the first two rows to check the column layout.
sales_df.iloc[:2]

Unnamed: 0,recipe_id,product_type,calories,carbs,cooking_time,cuisine,dish_type,heat_level,fat,is_classic,number_of_ingredients_per_recipe,preferences,carbs_content,dish_types,seasons,protein_types,proteins,course_type,meta_tags,protein_cuts,sales,week,year_week_ts,no_of_holidays
0,14919,2 person,687.0,99.2,time_level_4,asian,fish,optional_heat,18.6,No,7,shellfish,rice,"fish_and_side,stir_fry",all_seasons,shellfish,29.5,main,healthy_choice,no_cut,3567,1,2018-01-01,1
1,14920,2 person,779.0,81.3,time_level_4,new_american,veggie,no_heat,38.5,No,5,no_protein,pasta_incl_gnocchi_spatzle,pasta,all_seasons,no_protein,30.0,main,quick,no_cut,2532,1,2018-01-01,1


In [4]:
# Count how many rows fall into each cooking_time level.
sales_df['cooking_time'].value_counts()

time_level_4    1150
time_level_5     949
time_level_3     114
time_level_6      18
time_level_7       4
Name: cooking_time, dtype: int64

In [5]:
# Hold out 20% of the rows as unseen data; the fixed random_state keeps the split repeatable.
# NOTE(review): rows carry week / year_week_ts columns and this split is random across
# weeks - confirm that a chronological hold-out is not required.
train_sales_df, test_sales_df = train_test_split(
    sales_df,
    random_state=42,
    test_size=0.20,
)


In [6]:
# Report the (rows, columns) shape of each partition.
print(f'Data for Modeling: {train_sales_df.shape}')
print(f'Unseen Data For Predictions: {test_sales_df.shape}')

Data for Modeling: (1788, 24)
Unseen Data For Predictions: (447, 24)


In [8]:
# Initialise the PyCaret regression experiment on the training split, predicting 'sales'.
# The cooking_time levels are listed in increasing order (time_level_3 .. time_level_7)
# so that the column is encoded ordinally rather than as an unordered category.
regression_models = setup(
    data=train_sales_df,
    target='sales',
    session_id=123,
    transformation=True,
    transform_target=True,
    bin_numeric_features=['calories', 'fat', 'proteins'],
    ordinal_features={
        'cooking_time': [f'time_level_{level}' for level in range(3, 8)],
    },
)

 
Setup Succesfully Completed.


Unnamed: 0,Description,Value
0,session_id,123
1,Transform Target,True
2,Transform Target Method,box-cox
3,Original Data,"(1788, 24)"
4,Missing Values,False
5,Numeric Features,6
6,Categorical Features,16
7,Ordinal Features,True
8,High Cardinality Features,False
9,High Cardinality Method,


In [9]:
# Evaluate the available regressors and display them in a ranked score grid; the
# call's return value (the top-ranked estimator) is shown as the cell result.
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
0,CatBoost Regressor,1768.0863,15056326.0877,3860.9086,0.2995,0.8867,0.8562,7.5313
1,Ridge Regression,2007.2108,16684249.3347,4069.1277,0.2199,1.0333,1.0884,0.0292
2,Extreme Gradient Boosting,1884.8786,16765836.4415,4080.1455,0.2141,0.9907,1.0345,1.1369
3,Light Gradient Boosting Machine,1938.8674,16840869.0519,4086.361,0.213,1.013,1.0325,0.5718
4,Support Vector Machine,1873.5685,17135072.5614,4119.8977,0.2042,1.0042,0.8945,0.7799
5,Gradient Boosting Regressor,1939.2601,17246218.0753,4131.8826,0.2005,0.9672,0.9281,1.2174
6,TheilSen Regressor,2061.2885,17104322.3603,4121.0482,0.1966,1.0784,1.2049,183.1067
7,Orthogonal Matching Pursuit,2020.9116,17368060.2179,4150.5817,0.1912,1.0356,1.0836,0.0315
8,Bayesian Ridge,2016.5641,17395344.7989,4154.615,0.1886,1.0263,1.0433,0.1947
9,Linear Regression,2068.9655,17565190.1252,4177.3455,0.1761,1.7655,1.2705,0.2018


&lt;catboost.core.CatBoostRegressor at 0x219e2678b80&gt;

In [11]:
# List the available estimators with their IDs, names and reference classes.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model.Lasso,True
ridge,Ridge Regression,sklearn.linear_model.Ridge,True
en,Elastic Net,sklearn.linear_model.ElasticNet,True
lar,Least Angle Regression,sklearn.linear_model.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model.OMP,True
br,Bayesian Ridge,sklearn.linear_model.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model.PAR,True


In [12]:
# Train a CatBoost regressor by its PyCaret ID; the per-fold metrics are displayed.
catboost_model = create_model(estimator='catboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1819.1544,15002357.7825,3873.2877,0.3211,0.8821,0.6776
1,1778.8452,14181503.6776,3765.8337,0.4072,0.8222,0.7886
2,1730.5817,15874414.6988,3984.2709,0.241,0.8808,0.7898
3,1377.1939,9138890.9059,3023.0599,0.3506,0.8066,0.7879
4,1529.268,13463061.4388,3669.2045,0.2729,0.875,0.8208
5,1767.118,15164997.6044,3894.2262,0.3546,0.7976,0.6798
6,1932.1353,17658106.4924,4202.155,0.284,0.8994,0.7958
7,2188.1557,21461611.0577,4632.6678,0.1609,1.0012,0.7961
8,1720.6103,14799178.5477,3846.97,0.246,0.9097,1.1979
9,1837.801,13819138.6711,3717.4102,0.3571,0.9924,1.2277


In [13]:
# Tune the CatBoost hyperparameters. choose_better=True makes tune_model hand back the
# original estimator whenever the tuned candidate does not beat it on the optimisation
# metric, so tuned_cb (the object that gets saved) is never worse than catboost_model.
tuned_cb = tune_model(catboost_model, choose_better=True)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,1981.4148,17119986.3634,4137.6305,0.2253,0.9597,0.7638
1,2015.491,17381459.6634,4169.1078,0.2734,0.8871,0.8403
2,1832.7748,16321032.5799,4039.9298,0.2196,0.9433,0.8286
3,1504.8042,9379751.7982,3062.638,0.3335,0.8438,0.8472
4,1671.5732,14353334.2473,3788.5794,0.2248,0.9338,0.9264
5,1867.2489,17955314.4967,4237.3712,0.2359,0.8722,0.7474
6,2050.2316,18970420.3533,4355.5046,0.2308,0.9503,0.8776
7,2200.6189,21240332.8461,4608.7236,0.1696,1.0267,0.8377
8,1768.7376,14053386.2276,3748.7846,0.284,0.9246,1.0625
9,1921.5377,14917383.909,3862.3029,0.306,1.0207,1.1655


In [20]:
# Persist the tuned model to ./Models/catboost_sales_prediction_model.pkl.
# NOTE(review): test_sales_df is not scored in the cells visible here - confirm the
# hold-out is evaluated before this artifact is relied on.
save_model(model=tuned_cb, model_name='./Models/catboost_sales_prediction_model')

INFO:logs:Initializing save_model()
INFO:logs:save_model(model=&lt;catboost.core.CatBoostRegressor object at 0x00000219E47D3FD0&gt;, model_name=./Models/catboost_sales_prediction_model, model_only=False, verbose=True)
INFO:logs:./Models/catboost_sales_prediction_model.pkl saved in current working directory
INFO:logs:Pipeline(memory=None,
         steps=[(&#39;dtypes&#39;,
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      ml_usecase=&#39;regression&#39;,
                                      numerical_features=[], target=&#39;sales&#39;,
                                      time_features=[])),
                (&#39;imputer&#39;,
                 Simple_Imputer(categorical_strategy=&#39;not_available&#39;,
                                numeric_strategy=&#39;mean&#39;,
                                target_variable=None)),
                (&#39;new_levels1&#