# Learning regression datasets and training model

### Imports

In [2]:
import pandas as pd
import os
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [3]:
while "notebooks" in os.getcwd():
    os.chdir("../")

In [4]:
do_training = True

### California Housing Prices Datset

In [5]:
DATA_DIR = Path('data/housing_data')
file_name = 'housing.csv'
scaled_file_name = 'housing_scaled.csv'
train_file_name = 'train_housing_scaled.csv'
test_file_name = 'test_housing_scaled.csv'
scaler_params_file = 'housing_scaling_params.csv'

In [6]:
MODEL_PATH = Path('models/')
housing_model_name = 'housing'

In [7]:
df = pd.read_csv(DATA_DIR / file_name)

In [8]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
df = df.drop(columns=['ocean_proximity'])

In [11]:
df.loc[:, df.columns == 'median_house_value']

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [12]:
def scale_split_df(_df: pd.DataFrame, prediction_column: str, test_size: float, data_dir: Path,
                   scaled_df_name: str, train_df_name: str, test_df_name: str, scaler_params_name: str):

    scaler = StandardScaler()
    features_df = df.loc[:, df.columns != prediction_column]
    prediction_df = df.loc[:, df.columns == prediction_column]
    scaled_features = pd.DataFrame(scaler.fit_transform(features_df))
    scaled_features.columns = features_df.columns.str.replace(' ', '_')
    scaled_df = pd.concat([scaled_features, prediction_df], axis=1)
    train_df, test_df = train_test_split(scaled_df, test_size=test_size)

    scaled_df.to_csv(data_dir/scaled_df_name, index=False)
    train_df.to_csv(data_dir/train_df_name, index=False)
    test_df.to_csv(data_dir/test_df_name, index=False)

    normalization_params = {
        "mean": scaler.mean_,
        "variance": scaler.var_,
    }

    normalization_params_df = pd.DataFrame.from_dict(
        normalization_params, orient="index")
    normalization_params_df.columns = features_df.columns
    normalization_params_df.to_csv(DATA_DIR/scaler_params_name)

    return scaled_df, train_df, test_df

In [13]:
scaled_df, train_df, test_df = scale_split_df(
    df, 'median_house_value', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
scaled_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-8.526513e-15,-1.079584e-15,5.508083e-18,3.2015730000000005e-17,-7.233049000000001e-17,-1.101617e-17,6.885104000000001e-17,6.6097e-17,206855.816909
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,115395.615874
min,-2.385992,-1.447568,-2.19618,-1.207283,-1.274093,-1.256123,-1.303984,-1.774299,14999.0
25%,-1.113209,-0.7967887,-0.8453931,-0.5445698,-0.5740034,-0.5638089,-0.5742294,-0.6881186,119600.0
50%,0.5389137,-0.6422871,0.02864572,-0.2332104,-0.2441308,-0.2291318,-0.2368162,-0.1767951,179700.0
75%,0.7784964,0.9729566,0.6643103,0.2348028,0.2589843,0.2644949,0.2758427,0.4593063,264725.0
max,2.62528,2.958068,1.856182,16.81558,14.01871,30.25033,14.60152,5.858286,500001.0


In [14]:
X_train = train_df.loc[:, df.columns != 'median_house_value']
y_train = train_df.loc[:, df.columns == 'median_house_value']
X_test = test_df.loc[:, df.columns != 'median_house_value']
y_test = test_df.loc[:, df.columns == 'median_house_value']

#### Housing model training

In [15]:
def train_and_save_model(param: dict, steps: int, dtrain: xgb.DMatrix, dtest: xgb.DMatrix, model_path: Path, model_name: str):
    if do_training:
        gbdt_model = xgb.train(param, dtrain,
                               evals=[(dtest, 'test'), (dtrain, 'train')],
                               verbose_eval=50, early_stopping_rounds=1, num_boost_round=steps
                               )
    if do_training:
        gbdt_model.dump_model(
            model_path / f"{model_name}_dumped.txt", with_stats=True)
        gbdt_model.save_model(model_path / f"{model_name}_saved.json")
    else:
        gbdt_model = xgb.Booster()
        gbdt_model.load_model(model_path / f"{model_name}_saved.json")
    return gbdt_model

#### Performing grid_search

In [31]:
def grid_search(space: dict, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame):
    clf = xgb.XGBRegressor(
        objective='reg:squarederror',
        early_stopping_rounds=10,
        n_boost_rounds=300


    )
    grid_search = GridSearchCV(
        clf, param_grid=space, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5)
    grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    return grid_search

In [32]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [33]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test)

In [39]:
best.best_params_

{'eta': 0.3, 'max_depth': 4, 'subsample': 0.9}

In [40]:
len(best.best_estimator_.get_booster().get_dump())

100

In [41]:
best_big.best_params_

{'eta': 0.1, 'max_depth': 7, 'subsample': 0.8}

In [42]:
len(best_big.best_estimator_.get_booster().get_dump())

100

In [26]:
# training hyperparameters
param = {
    'eta': 0.1,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.9,
}
steps = 100

In [43]:
# training hyperparameters
param_big = {
    'eta': 0.1,
    'max_depth': 7,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8,
}
steps = 100

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [45]:
%%time
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, housing_model_name)

[0]	test-rmse:107902.39448	train-rmse:109480.31063
[50]	test-rmse:55464.40336	train-rmse:53688.37479
[99]	test-rmse:51646.68341	train-rmse:47863.76901
CPU times: user 8.02 s, sys: 446 µs, total: 8.02 s
Wall time: 559 ms


<xgboost.core.Booster at 0x7f664bfd6e90>

In [46]:
%%time
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, housing_model_name + '_big')

[0]	test-rmse:106355.53035	train-rmse:107748.06093
[50]	test-rmse:49341.76109	train-rmse:39524.67722
[99]	test-rmse:47751.58023	train-rmse:33234.35557
CPU times: user 16.1 s, sys: 0 ns, total: 16.1 s
Wall time: 1.11 s


<xgboost.core.Booster at 0x7f664bfd76d0>

### Red Wine Dataset

In [47]:
DATA_DIR = Path('data/wine_quality')
file_name = 'winequality_red.csv'
scaled_file_name = 'winequality_red_scaled.csv'
train_file_name = 'train_winequality_red_scaled.csv'
test_file_name = 'test_winequality_red_scaled.csv'
scaler_params_file = 'winequality_red_scaling_params.csv'
MODEL_PATH = Path('models/')
wine_model_name = 'winequality_red'

In [48]:
df = pd.read_csv(DATA_DIR / file_name, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [49]:
scaled_df, train_df, test_df = scale_split_df(
    df, 'quality', 0.2, DATA_DIR, scaled_file_name, train_file_name, test_file_name, scaler_params_file)
scaled_df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,3.554936e-16,1.733031e-16,-8.887339000000001e-17,-1.244227e-16,3.732682e-16,-6.221137e-17,4.4436690000000005e-17,-3.473172e-14,2.861723e-15,6.754377e-16,1.066481e-16,5.636023
std,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,0.807569
min,-2.137045,-2.27828,-1.391472,-1.162696,-1.603945,-1.4225,-1.230584,-3.538731,-3.700401,-1.936507,-1.898919,3.0
25%,-0.7007187,-0.7699311,-0.9293181,-0.4532184,-0.371229,-0.8487156,-0.7440403,-0.6077557,-0.6551405,-0.6382196,-0.8663789,5.0
50%,-0.2410944,-0.04368911,-0.05636026,-0.240375,-0.1799455,-0.1793002,-0.2574968,0.001760083,-0.007212705,-0.2251281,-0.2093081,6.0
75%,0.5057952,0.6266881,0.7652471,0.04341614,0.05384542,0.4901152,0.4723184,0.5768249,0.5759223,0.4240158,0.6354971,6.0
max,4.355149,5.877976,3.743574,9.195681,11.12703,5.367284,7.375154,3.680055,4.528282,7.918677,4.202453,8.0


In [50]:
X_train = train_df.loc[:, df.columns != 'quality']
y_train = train_df.loc[:, df.columns == 'quality']
X_test = test_df.loc[:, df.columns != 'quality']
y_test = test_df.loc[:, df.columns == 'quality']

### Wine model training


#### Grid search

In [55]:
grid_params = {
    'max_depth': [1, 2, 3, 4],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [56]:
grid_params_big = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'subsample': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [None]:
%%time
best = grid_search(grid_params, X_train, y_train, X_test, y_test)

In [None]:
%%time
best_big = grid_search(grid_params_big, X_train, y_train, X_test, y_test)

In [64]:
best.best_params_

{'eta': 0.1, 'max_depth': 4, 'subsample': 0.6}

In [65]:
len(best.best_estimator_.get_booster().get_dump())

34

In [66]:
best_big.best_params_

{'eta': 0.1, 'max_depth': 6, 'subsample': 0.8}

In [67]:
len(best_big.best_estimator_.get_booster().get_dump())

32

In [70]:
# training hyperparameters
param = {
    'eta': 0.1,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.6
}
steps = 34

In [71]:
# training hyperparameters
param_big = {
    'eta': 0.1,
    'max_depth': 4,
    'objective': 'reg:squarederror',
    'seed': 42,
    'subsample': 0.8
}
steps = 32

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [73]:
train_and_save_model(param, steps, dtrain, dtest, MODEL_PATH, wine_model_name)

[0]	test-rmse:0.72275	train-rmse:0.79116


[31]	test-rmse:0.60616	train-rmse:0.51858


<xgboost.core.Booster at 0x7f664bfd7a30>

In [74]:
train_and_save_model(param_big, steps, dtrain, dtest, MODEL_PATH, wine_model_name + "_big")

[0]	test-rmse:0.72319	train-rmse:0.79035
[31]	test-rmse:0.61175	train-rmse:0.51737


<xgboost.core.Booster at 0x7f66331e8130>

#### Mniejsze modele (maks. głębokość 4 zamiast 10) mają trochę gorsze wyniki ale mają znacznie mniej liści:
- Housing  RMSE: 51 51393.33848 vs 47937.65673, spadek liczby liści z ok 20 0000 do 1500
- Wine  RSE: 0.61376 vs 0.63269, spadek liczby liści z ok. 15 000 do 600