## Load the Libraries

In [1]:
# Import only the MLBox entry points this notebook actually uses. Explicit
# names (instead of wildcard imports) make it clear where Reader, Optimiser
# and Predictor come from and keep the notebook namespace clean.
import warnings

from mlbox.preprocessing import Reader
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor

# Silence warnings raised from here on so the cell outputs stay readable.
# NOTE(review): this is a blanket filter; narrow it if a warning ever matters.
warnings.filterwarnings("ignore")

Using TensorFlow backend.


## Preprocessing

In [3]:
# Read the train and test CSVs and let the reader clean them; the result is
# kept under the name `df` because later cells look it up by that name.
csv_paths = ['train_bm.csv', 'test_bm.csv']
target_column = 'Item_Outlet_Sales'

reader = Reader(sep=",")
df = reader.train_test_split(csv_paths, target_column)


reading csv : train_bm.csv ...
cleaning data ...
CPU time: 4.166972875595093 seconds

reading csv : test_bm.csv ...
cleaning data ...
CPU time: 0.07345962524414062 seconds

> Number of common features : 11

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 7
> Number of numerical features: 4
> Number of training samples : 8523
> Number of test samples : 5681

> Top sparse features (% missing values on train set):
Outlet_Size    28.3
Item_Weight    17.2
dtype: float64

> Task : regression
count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64


In [4]:
# Inspect what kind of container Reader.train_test_split handed back.
type(df)

dict

In [5]:
# List the entries stored in the container returned by the reader.
df.keys()

dict_keys(['train', 'test', 'target'])

In [6]:
# Preview the first rows of the training features.
train_features = df['train']
train_features.head()

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type
0,Low Fat,FDA15,249.8092,Dairy,0.016047,9.3,1999.0,OUT049,Tier 1,Medium,Supermarket Type1
1,Regular,DRC01,48.2692,Soft Drinks,0.019278,5.92,2009.0,OUT018,Tier 3,Medium,Supermarket Type2
2,Low Fat,FDN15,141.618,Meat,0.01676,17.5,1999.0,OUT049,Tier 1,Medium,Supermarket Type1
3,Regular,FDX07,182.095,Fruits and Vegetables,0.0,19.2,1998.0,OUT010,Tier 3,,Grocery Store
4,Low Fat,NCD19,53.8614,Household,0.0,8.93,1987.0,OUT013,Tier 3,High,Supermarket Type1


In [7]:
# Count the missing values in each training column.
df['train'].isna().sum()

Item_Fat_Content                0
Item_Identifier                 0
Item_MRP                        0
Item_Type                       0
Item_Visibility                 0
Item_Weight                  1463
Outlet_Establishment_Year       0
Outlet_Identifier               0
Outlet_Location_Type            0
Outlet_Size                  2410
Outlet_Type                     0
dtype: int64

## Optimisation

In [8]:
# Hyper-parameter search space, assembled per pipeline stage. The key prefix
# names the stage: ne = NA encoder, ce = categorical encoder,
# fs = feature selector, est = estimator.

# Missing-value imputation: a single fixed option for each strategy.
missing_value_space = {
    'ne__numerical_strategy': {"space": ['median']},
    'ne__categorical_strategy': {"space": ['mode']},
}

# Categorical encoding and feature selection: searched.
encoding_selection_space = {
    'ce__strategy': {"search": "choice", "space": ['label_encoding', 'entity_embedding']},
    'fs__strategy': {"search": "choice", "space": ['variance', 'rf_feature_importance']},
    'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
}

# Estimator: fixed to a random forest, with depth and size searched.
estimator_space = {
    'est__strategy': {"space": ["RandomForest"]},
    'est__max_depth': {"search": "choice", "space": [3, 5, 7, 9]},
    'est__n_estimators': {"search": "choice", "space": [250, 500, 700, 1000]},
}

# Merge in the original key order.
space = {**missing_value_space, **encoding_selection_space, **estimator_space}

In [9]:
# Search the space for the best configuration: r2 scoring, 5 folds,
# 10 evaluations.
optimiser = Optimiser(scoring="r2", n_folds=5)
best = optimiser.optimise(space, df, max_evals=10)


##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': 'mode'}

>>> CA ENCODER :{'strategy': 'label_encoding'}

>>> FEATURE SELECTOR :{'strategy': 'variance', 'threshold': 0.290810744939252}

>>> ESTIMATOR :{'strategy': 'RandomForest', 'max_depth': 9, 'n_estimators': 700, 'bootstrap': True, 'criterion': 'mse', 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


MEAN SCORE : r2 = 0.5884089368754352
VARIANCE : 0.01589347931541683 (fold 1 = 0.585687382410703, fold 2 = 0.5797469921092984, fold 3 = 0.5868701408071972, fold 4 = 0.618265773515415, fold 5 = 0.5714743955345621)
CPU time: 10.895755290985107 seconds


#####


MEAN SCORE : r2 = 0.4381012707801246
VARIANCE : 0.014735418770391917 (fold 1 = 0.4468513189709198, fold 2 = 0.43115539047643225, fold 3 = 0.4259826768632752, fold 4 = 0.46276233244695086, fold 5 = 0.4237546351430449)
CPU time: 13.820397853851318 seconds


##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy': 'median', 'categorical_strategy': 'mode'}

>>> CA ENCODER :{'strategy': 'entity_embedding'}

>>> FEATURE SELECTOR :{'strategy': 'rf_feature_importance', 'threshold': 0.044404784103226}

>>> ESTIMATOR :{'strategy': 'RandomForest', 'max_depth': 5, 'n_estimators': 1000, 'bootstrap': True, 'criterion': 'mse', 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_s

In [10]:
# Display the hyper-parameter configuration returned by Optimiser.optimise.
best

{'ce__strategy': 'label_encoding',
 'est__max_depth': 9,
 'est__n_estimators': 700,
 'est__strategy': 'RandomForest',
 'fs__strategy': 'variance',
 'fs__threshold': 0.290810744939252,
 'ne__categorical_strategy': 'mode',
 'ne__numerical_strategy': 'median'}

## Predictions

In [11]:
# Fit the pipeline with the selected hyper-parameters and predict on the test
# set. The call's return value would otherwise be echoed as an object repr
# with a run-specific memory address, so the trailing semicolon suppresses it.
predictor = Predictor()
predictor.fit_predict(best, df);


fitting the pipeline ...
CPU time: 1.8255245685577393 seconds

predicting...
CPU time: 0.34248805046081543 seconds

> Overview on predictions : 

   Item_Outlet_Sales_predicted
0                  1743.491722
1                  1498.373917
2                   641.687723
3                  2522.618260
4                  6280.104087
5                  1868.182294
6                   855.838796
7                  2251.364443
8                  1539.536206
9                  2909.515888

dumping predictions into directory : save ...


<mlbox.prediction.predictor.Predictor at 0x7fab721a3908>