# Model Training

In [1]:
import sys
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import joblib

sys.path.append("../../")

warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

## Load Data

In [2]:
data = pd.read_csv("../../data/train.csv")

data.head()

Unnamed: 0,battery_power,has_bluetooth,clock_speed,has_dual_sim,front_camera_megapixels,has_four_g,internal_memory,depth,weight,number_of_cores,primary_camera_megapixels,pixel_resolution_height,pixel_resolution_width,ram,screen_height,screen_width,talk_time,has_three_g,has_touch_screen,has_wifi,price_range
0,672,0,0.5,0,7,1,35,0.1,99,8,17,574,1267,1403,16,7,6,1,0,0,0
1,1201,1,0.5,0,1,1,10,0.4,198,6,5,1151,1723,726,11,4,3,1,1,0,0
2,1007,0,2.0,0,0,0,45,0.1,95,5,2,1186,1529,3648,9,0,16,0,0,0,3
3,990,1,2.7,1,3,0,15,0.9,153,2,7,1466,1717,2698,11,8,6,1,0,0,2
4,1948,0,2.9,1,4,0,5,0.2,136,1,13,651,700,361,6,1,10,0,1,0,0


## Create Training, Validation, and Test Sets

In [3]:
# separating the feature columns from the target column
feature_columns = [
    "battery_power",
    "has_bluetooth",
    "clock_speed",
    "has_dual_sim",
    "front_camera_megapixels",
    "has_four_g",
    "internal_memory",
    "depth",
    "weight",
    "number_of_cores",
    "primary_camera_megapixels",
    "pixel_resolution_height",
    "pixel_resolution_width",
    "ram",
    "screen_height",
    "screen_width",
    "talk_time",
    "has_three_g",
    "has_touch_screen",
    "has_wifi"
]

target_column = "price_range"

# to make sure that we didnt miss any columns, we'll assert that the counts sum up
assert (len(feature_columns) + 1 == len(data.columns))

Create a 60%, 20%, 20% split for training, validation and test sets.

In [4]:
train, validate, test = np.split(data.sample(frac=1), [int(0.6*len(data)), int(0.8*len(data))])

In [5]:
X_train = train[feature_columns]
y_train = train[target_column]

In [6]:
X_validate = validate[feature_columns]
y_validate = validate[target_column]

In [7]:
print(X_train.shape)
print(X_validate.shape)

(720, 20)
(240, 20)


In [8]:
print(y_train.shape)
print(y_validate.shape)

(720,)
(240,)


## Load the the Transformer

In [9]:
# loading the preprocessing pipeline we built in the previous notebook
transformer = joblib.load("column_transformer.joblib")

## Train a Model

In [10]:
model = XGBClassifier()

pipeline = Pipeline(steps=[("preprocessor", transformer),
                           ("model", model)])

pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['battery_power',
                                                   'clock_speed',
                                                   'front_camera_megapixels',
                                                   'internal_memory', 'depth',
                                                   'weight', 'number_of_cores',
                                                   'primary_camera_megapixels',
                                                   'pixel_resolut

## Test Model With Single Sample

In [11]:
result = model.predict(X_validate.iloc[[0]])

result

array([3])

## Hyperparameter Tuning

To do hyperparameter tuning, we'll use the Hyperopt package.

This section is based on this post: https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [12]:
space = {
    "max_depth": hp.quniform("max_depth", 3, 18, 1),
    "gamma": hp.uniform ("gamma", 1, 9),
    "reg_alpha" : hp.quniform("reg_alpha", 40,180,1),
    "reg_lambda" : hp.uniform("reg_lambda", 0, 1),
    "colsample_bytree" : hp.uniform("colsample_bytree", 0.5, 1),
    "min_child_weight" : hp.quniform("min_child_weight", 0, 10, 1),
    "n_estimators": 180,
    "seed": 0
}

hp.quniform(label, low, high, q) — Returns a value round(uniform(low, high) / q) * q, i.e it rounds the decimal values and returns an integer.

In [13]:
def objective(space):
    classifier = XGBClassifier(
        n_estimators=space["n_estimators"], 
        max_depth=int(space["max_depth"]), 
        gamma=space["gamma"],
        reg_alpha=int(space["reg_alpha"]),
        min_child_weight=int(space["min_child_weight"]),
        colsample_bytree=int(space["colsample_bytree"])
    )
    
    evaluation = [(X_train, y_train), (X_validate, y_validate)]
    
    classifier.fit(X_train,
                   y_train,
                   eval_set=evaluation,
                   eval_metric="merror",
                   early_stopping_rounds=10,
                   verbose=False)

    predictions = classifier.predict(X_validate)
    accuracy = accuracy_score(y_validate, predictions)
    print("SCORE: ", accuracy)
    return {
        "loss": -accuracy, 
        "status": STATUS_OK 
    }

In [14]:
trials = Trials()

best_hyperparameters = fmin(fn = objective,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)

SCORE:                                                                                                                                                  
0.25833333333333336                                                                                                                                     
SCORE:                                                                                                                                                  
0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.7041666666666667                                                                                                                                      
SCORE:                                                                            

0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.25833333333333336                                                               

SCORE:                                                                                                                                                  
0.25833333333333336                                                                                                                                     
SCORE:                                                                                                                                                  
0.7041666666666667                                                                                                                                      
SCORE:                                                                                                                                                  
0.25833333333333336                                                                                                                                     
SCORE:                                                                            

0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.6958333333333333                                                                                                                                      
SCORE:                                                                                                                                                  
0.7041666666666667                                                                                                                                      
SCORE:                                                                                                                                                  
0.6958333333333333                                                                

In [15]:
print("The best hyperparameters are: ", best_hyperparameters)

The best hyperparameters are:  {'colsample_bytree': 0.9009538594513473, 'gamma': 2.407994646875835, 'max_depth': 9.0, 'min_child_weight': 5.0, 'reg_alpha': 40.0, 'reg_lambda': 0.09736662506010292}


In [16]:
# converting the integer hyperparameters
best_hyperparameters["max_depth"] = int(best_hyperparameters["max_depth"])

## Train the Final Model

In [17]:
model = XGBClassifier(**best_hyperparameters)

pipeline = Pipeline(steps=[("preprocessor", transformer),
                           ("model", model)])

pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['battery_power',
                                                   'clock_speed',
                                                   'front_camera_megapixels',
                                                   'internal_memory', 'depth',
                                                   'weight', 'number_of_cores',
                                                   'primary_camera_megapixels',
                                                   'pixel_resolut

## Test Model With Single Sample

In [18]:
result = pipeline.predict(X_validate.iloc[[0]])

result

array([2])

## Save Model

In [19]:
joblib.dump(pipeline, "model.joblib")

['model.joblib']

## Saving the Datasets

In [20]:
train.to_csv("../../data/train.csv", index=False)
validate.to_csv("../../data/validate.csv", index=False)
test.to_csv("../../data/test.csv", index=False)