# Learning regression datasets and training model

### Imports

In [245]:
import os
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [246]:
while "notebooks" in os.getcwd():
    os.chdir("../")

In [247]:
do_training = True

### California Housing Prices Datset

In [248]:
DATA_DIR = Path('data/housing_data')
file_name = 'housing.csv'
scaled_file_name = 'housing_scaled.csv'
train_file_name = 'train_housing_scaled.csv'
test_file_name = 'test_housing_scaled.csv'
scaler_params_file = 'housing_scaling_params.csv'

In [249]:
MODEL_PATH = Path('models/')
housing_model_name = 'housing'

In [250]:
df = pd.read_csv(DATA_DIR / file_name)

In [251]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [252]:
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [253]:
df = df.drop(columns=['ocean_proximity'])

In [254]:
df.loc[:, df.columns == 'median_house_value']

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [115]:
def scale_split_df(_df: pd.DataFrame, prediction_column: str, test_size: float, data_dir: Path,
                   scaled_df_name: str, train_df_name: str, test_df_name: str, scaler_params_name: str):

    scaler = StandardScaler()
    features_df = df.loc[:, df.columns != prediction_column]
    prediction_df = df.loc[:, df.columns == prediction_column]
    scaled_features = pd.DataFrame(scaler.fit_transform(features_df))
    scaled_features.columns = features_df.columns.str.replace(' ', '_')
    scaled_df = pd.concat([scaled_features, prediction_df], axis=1)
    train_df, test_df = train_test_split(scaled_df, test_size=test_size)

    scaled_df.to_csv(data_dir/scaled_df_name, index=False)
    train_df.to_csv(data_dir/train_df_name, index=False)
    test_df.to_csv(data_dir/test_df_name, index=False)

    normalization_params = {
        "mean": scaler.mean_,
        "variance": scaler.var_,
    }

    normalization_params_df = pd.DataFrame.from_dict(
        normalization_params, orient="index")
    normalization_params_df.columns = features_df.columns
    normalization_params_df.to_csv(DATA_DIR/scaler_params_name)

    return scaled_df, train_df, test_df

In [240]:
#scaled_df, train_df, test_df = scale_split_df(
#    df, 'median_house_value', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
#scaled_df.describe()

IndentationError: unexpected indent (2115582923.py, line 2)

In [255]:
train_df = pd.read_csv("data/housing_data/train_housing_scaled.csv")
test_df = pd.read_csv("data/housing_data/test_housing_scaled.csv")

In [256]:
X_train = train_df.loc[:, df.columns != 'median_house_value']
y_train = train_df.loc[:, df.columns == 'median_house_value']
X_test = test_df.loc[:, df.columns != 'median_house_value']
y_test = test_df.loc[:, df.columns == 'median_house_value']

#### Housing model training

In [24]:
def train_and_save_model(param: dict, steps: int, dtrain: xgb.DMatrix, dtest: xgb.DMatrix, model_path: Path, model_name: str):
    if do_training:
        gbdt_model = xgb.train(param, dtrain,
                               evals=[(dtest, 'test'), (dtrain, 'train')],
                               verbose_eval=50, early_stopping_rounds=1, num_boost_round=steps
                               )
    if do_training:
        gbdt_model.dump_model(
            model_path / f"{model_name}_dumped.txt", with_stats=True)
        gbdt_model.save_model(model_path / f"{model_name}_saved.json")
    else:
        gbdt_model = xgb.Booster()
        gbdt_model.load_model(model_path / f"{model_name}_saved.json")
    return gbdt_model

#### Performing grid_search

In [313]:
def grid_search(space: dict, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame,
                num_boost_round = 300):
    clf = xgb.XGBRegressor(
        objective='reg:squarederror',
        early_stopping_rounds=10,
        #n_boost_rounds=30
        n_estimators = num_boost_round
) 
    grid_search = GridSearchCV(
        clf, param_grid=space, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5, verbose = 0)
    grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose = 0)
    return grid_search

In [314]:
def search_hyperparameters_and_save_model(search_space: dict, 
                                          X_train: pd.DataFrame, 
                                          y_train: pd.DataFrame,
                                          X_test: pd.DataFrame, 
                                          y_test: pd.DataFrame,
                                          model_name: str,
                                          MODEL_PATH: Path,
                                          num_boost_round = 300):
    best = grid_search(search_space, X_train, y_train, X_test, y_test, num_boost_round=num_boost_round)

    best_params = best.best_params_
    dump = best.best_estimator_.get_booster().get_dump()
    leaves = sum([i.count('leaf') for i in dump])
    iterations = len(dump)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    name =  "different_sizes/" + model_name +  f"_leaves_{leaves}"
    train_and_save_model(best_params, iterations, dtrain, dtest, MODEL_PATH, name)

#### Training models with various sizes - depth is set to be fixed to control size of a model


In [236]:
grid_params = {
    'max_depth': [1, 2,],# 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
for d in range(1, 10):
    grid_params['max_depth'] = [d] 
    search_hyperparameters_and_save_model(grid_params,
                                      X_train,
                                      y_train, 
                                      X_test, 
                                      y_test,
                                      housing_model_name,
                                      MODEL_PATH);

#### Two best models - depth is included in grid search 

In [284]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01],#, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01]#, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [285]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round=40)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test, num_boost_round = 20)

In [294]:
best.best_params_

{'eta': 0.01, 'max_depth': 4, 'subsample': 0.01}

In [295]:
len(best.best_estimator_.get_booster().get_dump())

40

In [277]:
best_big.best_params_

{'eta': 0.2, 'max_depth': 10, 'subsample': 0.9}

In [278]:
len(best_big.best_estimator_.get_booster().get_dump())

20

In [297]:
# training hyperparameters
param = {
    'eta': 0.1,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.01,
}
steps = 40

In [300]:
# training hyperparameters
param_big = {
    'eta': 0.2,
    'max_depth': 10,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [299]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, housing_model_name)

[0]	test-rmse:112163.00291	train-rmse:109087.38916


[39]	test-rmse:65885.59621	train-rmse:63674.06243
CPU times: user 4.78 s, sys: 10.6 ms, total: 4.79 s
Wall time: 333 ms


<xgboost.core.Booster at 0x7f8ddaa8ebc0>

In [301]:
%%time
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, housing_model_name + '_big')

[0]	test-rmse:100785.67308	train-rmse:96867.96499
CPU times: user 3.87 s, sys: 7.02 ms, total: 3.88 s
Wall time: 278 ms


<xgboost.core.Booster at 0x7f8dda97a080>

### Single tree

In [None]:
grid_params = {
    'max_depth': [3],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round = 1)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)

CPU times: user 744 ms, sys: 90 ms, total: 834 ms
Wall time: 4.27 s


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical

In [None]:
best.best_params_

{'eta': 0.9, 'max_depth': 3, 'subsample': 0.9}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

1

In [None]:
# training hyperparameters
param = {
    'eta': 0.9,
    'max_depth': 3,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, housing_model_name + '_single')

[0]	test-rmse:81710.35211	train-rmse:82611.51414
CPU times: user 992 ms, sys: 0 ns, total: 992 ms
Wall time: 76.5 ms


<xgboost.core.Booster at 0x7f99e8b2aa10>

### Red Wine Dataset

In [302]:
DATA_DIR = Path('data/wine_quality')
file_name = 'winequality_red.csv'
scaled_file_name = 'winequality_red_scaled.csv'
train_file_name = 'train_winequality_red_scaled.csv'
test_file_name = 'test_winequality_red_scaled.csv'
scaler_params_file = 'winequality_red_scaling_params.csv'
MODEL_PATH = Path('models/')
wine_model_name = 'winequality_red'

In [303]:
df = pd.read_csv(DATA_DIR / file_name, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [226]:
#scaled_df, train_df, test_df = scale_split_df(
#    df, 'quality', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
#scaled_df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,3.554936e-16,1.733031e-16,-8.887339000000001e-17,-1.244227e-16,3.732682e-16,-6.221137e-17,4.4436690000000005e-17,-3.473172e-14,2.861723e-15,6.754377e-16,1.066481e-16,5.636023
std,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,0.807569
min,-2.137045,-2.27828,-1.391472,-1.162696,-1.603945,-1.4225,-1.230584,-3.538731,-3.700401,-1.936507,-1.898919,3.0
25%,-0.7007187,-0.7699311,-0.9293181,-0.4532184,-0.371229,-0.8487156,-0.7440403,-0.6077557,-0.6551405,-0.6382196,-0.8663789,5.0
50%,-0.2410944,-0.04368911,-0.05636026,-0.240375,-0.1799455,-0.1793002,-0.2574968,0.001760083,-0.007212705,-0.2251281,-0.2093081,6.0
75%,0.5057952,0.6266881,0.7652471,0.04341614,0.05384542,0.4901152,0.4723184,0.5768249,0.5759223,0.4240158,0.6354971,6.0
max,4.355149,5.877976,3.743574,9.195681,11.12703,5.367284,7.375154,3.680055,4.528282,7.918677,4.202453,8.0


In [304]:
train_df = pd.read_csv("data/wine_quality/train_winequality_red_scaled.csv")
test_df = pd.read_csv("data/wine_quality/test_winequality_red_scaled.csv")

In [305]:
X_train = train_df.loc[:, train_df.columns != 'quality']
y_train = train_df.loc[:, train_df.columns == 'quality']
X_test = test_df.loc[:, test_df.columns != 'quality']
y_test = test_df.loc[:, test_df.columns == 'quality']

### Wine model training


#### Grid search

In [306]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
for d in range(1, 10):
    grid_params['max_depth'] = [d] 
    search_hyperparameters_and_save_model(grid_params,
                                      X_train,
                                      y_train, 
                                      X_test, 
                                      y_test,
                                      wine_model_name,
                                      MODEL_PATH)

In [221]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [222]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round=40)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test, num_boost_round =20)

In [317]:
best.best_params_

{'eta': 0.2, 'max_depth': 4, 'subsample': 0.9}

In [318]:
len(best.best_estimator_.get_booster().get_dump())

40

In [319]:
best_big.best_params_

{'eta': 0.2, 'max_depth': 6, 'subsample': 0.9}

In [320]:
len(best_big.best_estimator_.get_booster().get_dump())

20

In [322]:
# training hyperparameters
param = {
    'eta': 0.2,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8
}
steps = 40

In [327]:
# training hyperparameters
param_big = {
    'eta': 0.2,
    'max_depth': 6,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9
}
steps = 20

In [324]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [325]:
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, wine_model_name)

[0]	test-rmse:0.77840	train-rmse:0.74013
[31]	test-rmse:0.62305	train-rmse:0.45214


<xgboost.core.Booster at 0x7f8dda8b9ba0>

In [328]:
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, wine_model_name + "_big")

[0]	test-rmse:0.76863	train-rmse:0.71958


[19]	test-rmse:0.59941	train-rmse:0.34475


<xgboost.core.Booster at 0x7f8dda97bca0>

### Single tree

In [None]:
grid_params = {
    'max_depth': [3],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test, num_boost_round = 1)

In [None]:
best.best_params_

{'eta': 0.9, 'max_depth': 3, 'subsample': 0.9}

In [None]:
len(best.best_estimator_.get_booster().get_dump())

1

In [None]:
# training hyperparameters
param = {
    'eta': 0.9,
    'max_depth': 3,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 1

In [None]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, wine_model_name + '_single')

[0]	test-rmse:0.65343	train-rmse:0.66162
CPU times: user 2.21 s, sys: 6.17 ms, total: 2.22 s
Wall time: 169 ms


<xgboost.core.Booster at 0x7f9a7855e0e0>

#### Mniejsze modele (maks. głębokość 4 zamiast 10) mają trochę gorsze wyniki ale mają znacznie mniej liści:
- Housing  RMSE: 51 51393.33848 vs 47937.65673, spadek liczby liści z ok 20 0000 do 1500
- Wine  RSE: 0.61376 vs 0.63269, spadek liczby liści z ok. 15 000 do 600