# **Xebia Part II: Machine Learning Pipeline**
1. **Data loading**
2. **Preprocessing pipeline**
3. **Grid Search**
    1. Xgboost
    2. Random Forest
    3. Lgb
4. **Stacking**
    1. Simple average
    2. mlens
5. **Prediction and submission**

In [1]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
sys.path.insert(0, '../')

from src.utils import WarrantyToFloat,FillByMax,one_folder_out,make_submission
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression,Ridge
from mlens.ensemble import SuperLearner

%matplotlib inline
# Seed shared by the shuffle, the model grids and the stacking steps below.
random_seed = 42
# Number of cross-validation folds; the grid-search cells below also pass it as n_jobs.
cv_number = 10

[MLENS] backend: threading


## 1. Data Loading

In [2]:
# load
df_train = pd.read_csv('../data/mower_market_snapshot.csv',sep=';',na_values='unknown')
# keep only the rows whose price, prod_cost and margin agree: |price - prod_cost - margin| < 1
df_train = df_train.loc[np.abs(df_train["price"] - df_train["prod_cost"]-df_train["margin"] )<1]
df_test = pd.read_csv('../data/submission_set.csv',sep=';',na_values='unknown')
# number of training rows left after the consistency filter
len_train = df_train.shape[0]

# shuffle train data set
df_train = df_train.sample(frac=1,random_state=random_seed).reset_index(drop=True)
target = df_train.loc[:,"attractiveness"]
# transform target to log(1+target) so that rmse can be used in learning;
# np.log1p is the numerically accurate form of np.log(1 + x) when x is small
target = np.log1p(target)

# drop the identifier and the two outcome columns from the training features
df_train = df_train.drop(["id","market_share","attractiveness"],axis=1)
# explicit copy: a prediction column is assigned to `submission` later, and that
# assignment must not target a slice of df_test
submission = df_test.loc[:,["id"]].copy()
df_test = df_test.drop(["id"],axis=1)

In [3]:
# Preview the first two rows of the test features.
df_test.head(2)

Unnamed: 0,capacity,failure_rate,margin,price,prod_cost,product_type,quality,warranty
0,21.313064,0.037928,613.061762,768.160605,155.098843,auto-portee,Medium,3 ans
1,25.797234,0.038664,701.321608,865.72754,164.405932,auto-portee,Low,3 ans


In [4]:
# Preview the first two rows of the training features.
df_train.head(2)

Unnamed: 0,capacity,failure_rate,margin,price,prod_cost,product_type,quality,warranty
0,47.049497,0.208291,65.730381,76.007012,10.276631,essence,Low,1 an.
1,21.745488,0.036429,660.239043,820.227993,159.98895,auto-portee,Hight,3 anss


Train, test and target are now ready for the preprocessing and machine learning pipeline.

## 2. Preprocessing pipeline
`DataFrameMapper` from the sklearn_pandas package is used to preprocess the data; for more details about the package, see https://github.com/scikit-learn-contrib/sklearn-pandas. The code of `WarrantyToFloat` and `FillByMax` can be found in the src/utils file.

In [5]:
# Column-wise encoders: binarize the two categorical columns and turn the
# warranty text into a number. Columns that are not listed are passed through
# (default=None) and the mapper hands back a DataFrame (df_out=True).
column_encoders = [
    ('product_type', LabelBinarizer()),
    ('quality', LabelBinarizer()),
    ('warranty', WarrantyToFloat()),
]
feature_mapper = DataFrameMapper(
    column_encoders,
    input_df=True,
    default=None,
    df_out=True,
)

preprocessing = Pipeline(steps=[
    ('preprocessing', feature_mapper),
    ('fill_by_max', FillByMax()),  # this is only used for tree methods
])

# Fit the pipeline on the training features and show the first two encoded rows.
df_train_prepro = preprocessing.fit_transform(df_train)
df_train_prepro.head(2)

Unnamed: 0,product_type_auto-portee,product_type_electrique,product_type_essence,quality_Hight,quality_Low,quality_Medium,warranty,capacity,failure_rate,margin,price,prod_cost
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,47.049497,0.208291,65.730381,76.007012,10.276631
1,1.0,0.0,0.0,1.0,0.0,0.0,3.0,21.745488,0.036429,660.239043,820.227993,159.98895


df_train_prepro is ready to train a model. We will reuse the fitted preprocessing pipeline to preprocess the test data set in the last part of this notebook.

## 3. Grid search
GridSearchCV is used to find the optimal parameters of Xgboost, Random Forest and Lgb.
### Xgboost

In [6]:
# Hyper-parameter grid explored for the XGBoost regressor.
parameters = dict(
    learning_rate=[0.1, 0.15, 0.2],  # so called `eta` value
    max_depth=[6],
    min_child_weight=[15, 20],
    subsample=[0.95, 0.98],
    colsample_bytree=[0.95, 0.98],
    n_estimators=[100, 150],  # number of trees, change it to 1000 for better results
    objective=["reg:linear"],
    seed=[random_seed],
)

xgb_model = xgb.XGBRegressor()
# Exhaustive search scored by negative MSE; the best configuration is refit on all rows.
clf_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=cv_number,
    n_jobs=cv_number,
    refit=True,
    return_train_score=True,
    verbose=0,
)
_ = clf_xgb.fit(df_train_prepro, target)

# best parameter on the first line, its score on the second
print(clf_xgb.best_params_, clf_xgb.best_score_, sep="\n")

{'colsample_bytree': 0.95, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 20, 'n_estimators': 100, 'objective': 'reg:linear', 'seed': 42, 'subsample': 0.95}
-0.001757690217706522


### Random Forest

In [7]:
# Hyper-parameter grid explored for the random forest regressor.
parameters = dict(
    n_estimators=[20, 50, 100],
    max_depth=[5, None],
    max_features=[8, 9, 10],
    min_samples_split=[2, 3],
    min_samples_leaf=[2, 3, 5],
    random_state=[random_seed],
)

rf_model = RandomForestRegressor()
# Exhaustive search scored by negative MSE; the best configuration is refit on all rows.
clf_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=cv_number,
    n_jobs=cv_number,
    refit=True,
    return_train_score=True,
    verbose=0,
)
_ = clf_rf.fit(df_train_prepro, target)

# best parameter on the first line, its score on the second
print(clf_rf.best_params_, clf_rf.best_score_, sep="\n")

{'max_depth': None, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
-0.0018174264910305192


### Lgb

In [8]:
# Hyper-parameter grid explored for the LightGBM regressor.
parameters = dict(
    learning_rate=[0.1, 0.3],
    n_estimators=[100, 200],
    num_leaves=[12, 16],
    boosting_type=['gbdt'],
    objective=['regression'],
    random_state=[random_seed],
    colsample_bytree=[0.6, 0.7],
    subsample=[0.9],
)

lgb_model = lgb.LGBMRegressor()
# Exhaustive search scored by negative MSE; the best configuration is refit on all rows.
clf_lgb = GridSearchCV(
    estimator=lgb_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=cv_number,
    n_jobs=cv_number,
    refit=True,
    return_train_score=True,
    verbose=0,
)
_ = clf_lgb.fit(df_train_prepro, target)

# best parameter on the first line, its score on the second
print(clf_lgb.best_params_, clf_lgb.best_score_, sep="\n")

{'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 12, 'objective': 'regression', 'random_state': 42, 'subsample': 0.9}
-0.0017132634310111684


## 4. Stacking
After the grid search we have 3 very good models; let's combine them to get a better model. The simplest way is to average the 3 models (a weighted average is used below). Besides, we can use a package like mlens to stack the models, which is sometimes a more efficient way.

### Simple Average 

In [9]:
# prepare first level predictions for stacking
# prepare first level predictions for stacking: one call per tuned model,
# each rebuilt from the best parameters found by its grid search
pre_rf_k = one_folder_out(
    df_train_prepro, target,
    RandomForestRegressor(**clf_rf.best_params_),
    cv_number, random_seed,
)
pre_xgb_k = one_folder_out(
    df_train_prepro, target,
    xgb.XGBRegressor(**clf_xgb.best_params_),
    cv_number, random_seed,
)
pre_lgb_k = one_folder_out(
    df_train_prepro, target,
    lgb.LGBMRegressor(**clf_lgb.best_params_),
    cv_number, random_seed,
)

In [10]:
# MSE of each first-level prediction against the log-transformed target.
# mean_squared_error is documented as (y_true, y_pred), so the target goes first.
print(mean_squared_error(target, pre_xgb_k))
print(mean_squared_error(target, pre_rf_k))
print(mean_squared_error(target, pre_lgb_k))
# weighted blend of the three models (rf 0.05, xgb 0.25, lgb 0.7);
# the same weights are reused when the final submission is built
blend_k = 0.05*pre_rf_k + 0.25*pre_xgb_k + 0.7*pre_lgb_k
print(mean_squared_error(target, blend_k))

0.001757690217706522
0.0018174264910305192
0.0017132634310111684
0.0017020702891760553


### mlens

In [11]:
# Two-layer stack: the three tuned regressors form the first layer and a
# Ridge regression without intercept is added as the second layer.
ensemble_ridge = SuperLearner(
    scorer=mean_squared_error,
    folds=cv_number,
    random_state=random_seed,
    array_check=0,
)

first_layer = [
    xgb.XGBRegressor(**clf_xgb.best_params_),
    RandomForestRegressor(**clf_rf.best_params_),
    lgb.LGBMRegressor(**clf_lgb.best_params_),
]
ensemble_ridge.add(first_layer)
ensemble_ridge.add(Ridge(fit_intercept=False))

# The ensemble is fitted on plain arrays (.values) rather than pandas objects.
ensemble_ridge.fit(df_train_prepro.values, target.values)

SuperLearner(array_check=0, backend=None, folds=10,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=7270, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=FoldIndex(X=None, folds=10, raise_on_e...b749bf8>)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=42, sample_size=20,
       scorer=<function mean_squared_error at 0x10b749bf8>, shuffle=False,
       verbose=False)

In [12]:
# Show the per-estimator scores and fit statistics recorded by the fitted ensemble.
ensemble_ridge.data

Data([('score-m',
       OrderedDict([('layer-1/lgbmregressor', 0.0017132634310111684),
                    ('layer-1/randomforestregressor', 0.0018174264910305192),
                    ('layer-1/xgbregressor', 0.0017576902177065216),
                    ('layer-2/ridge', 0.0017091848473184974)])),
      ('score-s',
       OrderedDict([('layer-1/lgbmregressor', 0.00023336004227918268),
                    ('layer-1/randomforestregressor', 0.00022689234463838643),
                    ('layer-1/xgbregressor', 0.00027377742526342246),
                    ('layer-2/ridge', 0.00023533418813480374)])),
      ('ft-m',
       OrderedDict([('layer-1/lgbmregressor', 0.3550750264996168),
                    ('layer-1/randomforestregressor', 0.6492212285993446),
                    ('layer-1/xgbregressor', 0.18572375950061543),
                    ('layer-2/ridge', 0.0016378871005144902)])),
      ('ft-s',
       OrderedDict([('layer-1/lgbmregressor', 0.026980491227216248),
                    ('l

## 5. Prediction and submission

In [13]:
# Apply the already-fitted preprocessing pipeline to the test set (transform only, no refit).
df_test_prepro = preprocessing.transform(df_test)

In [14]:
# xgb , lgb and rf: one submission per tuned model
make_submission(df_test_prepro,submission,clf_rf,"rf")
make_submission(df_test_prepro,submission,clf_xgb,"xgb")
make_submission(df_test_prepro,submission,clf_lgb,"lgb")

# mlens
# The models were trained on log(1 + target); np.expm1 is the numerically
# accurate inverse of that transform (replaces np.e**x - 1).
submission["attractiveness"] = np.expm1(ensemble_ridge.predict(df_test_prepro.values))
submission.to_csv('../submissions/xu_qi_attractiveness2.csv',sep=';',index=False)

# average
# NOTE(review): presumably make_submission wrote these three files with sep=';'
# and an "attractiveness" column - confirm in src/utils.
rf_pred = pd.read_csv('../submissions/rf.csv',sep=';')
xgb_pred = pd.read_csv('../submissions/xgb.csv',sep=';')
lgb_pred = pd.read_csv('../submissions/lgb.csv',sep=';')
# Same weights as the blend scored in the stacking section.
# NOTE(review): that blend was scored on log-scale predictions; confirm the values
# stored in the per-model files are on the scale these weights are meant for.
submission["attractiveness"] = 0.05*rf_pred["attractiveness"] + 0.25*xgb_pred["attractiveness"] + 0.7*lgb_pred["attractiveness"]
submission.to_csv('../submissions/xu_qi_attractiveness.csv',sep=';',index=False)