# Cleaning regression datasets and training models

### Imports

In [328]:
import pandas as pd
import os
from pathlib import Path
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [329]:
# Keep moving to the parent directory for as long as the working-directory
# path contains the substring "notebooks" (presumably so the relative
# data/ and models/ paths used below resolve from the project root).
while "notebooks" in os.getcwd():
    os.chdir(os.pardir)

In [330]:
# Flag read by train_and_save_model: True trains a booster and writes it to
# disk; False loads the previously saved JSON model instead.
do_training = True

### California Housing Prices Dataset

In [331]:
# Directory and file names used for the California housing data set.
DATA_DIR = Path("data/housing_data")
file_name = "housing.csv"  # raw input CSV
scaled_file_name = "housing_scaled.csv"  # whole data set after feature scaling
train_file_name = "train_housing_scaled.csv"  # training split
test_file_name = "test_housing_scaled.csv"  # test split
scaler_params_file = "housing_scaling_params.csv"  # per-feature scaler mean/variance

In [332]:
# Output directory and file-name stem for the saved housing model.
MODEL_PATH = Path("models/")
housing_model_name = "housing"

In [333]:
# Load the raw housing table from DATA_DIR.
df = pd.read_csv(DATA_DIR / file_name)

In [334]:
# Show column dtypes and non-null counts for the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [335]:
# List the distinct values of the ocean_proximity column.
df.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [336]:
# Remove the string-valued ocean_proximity column; only float64 columns remain.
df = df.drop(columns=['ocean_proximity'])

In [337]:
# Preview the target column as a one-column DataFrame.
df.loc[:, df.columns == 'median_house_value']

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [338]:
def scale_split_df(_df: pd.DataFrame, prediction_column: str, test_size: float, data_dir: Path,
                   scaled_df_name: str, train_df_name: str, test_df_name: str, scaler_params_name: str,
                   random_state=None):
    """Standardise the feature columns, split into train/test, and save the results as CSV.

    Every column of ``_df`` except ``prediction_column`` is passed through a
    ``StandardScaler``; the prediction column is appended unscaled as the
    last column of the returned frames.

    Four CSV files are written into ``data_dir``: the full scaled frame, the
    train split, the test split, and a two-row table (``mean`` / ``variance``)
    holding the fitted scaler parameters for each feature column.

    Args:
        _df: Input table containing the features and the prediction column.
        prediction_column: Name of the target column; it is left unscaled.
        test_size: Forwarded to ``train_test_split``.
        data_dir: Directory that receives all four CSV files.
        scaled_df_name: File name for the full scaled frame.
        train_df_name: File name for the training split.
        test_df_name: File name for the test split.
        scaler_params_name: File name for the scaler mean/variance table.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` (default) keeps it random.

    Returns:
        Tuple ``(scaled_df, train_df, test_df)``.

    Raises:
        KeyError: If ``prediction_column`` is not a column of ``_df``.
    """
    if prediction_column not in _df.columns:
        raise KeyError(f"prediction column {prediction_column!r} not found in DataFrame")

    out_dir = Path(data_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Work on the frame that was passed in, not on a module-level global.
    features_df = _df.drop(columns=[prediction_column])
    prediction_df = _df[[prediction_column]]

    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling statistics -- confirm this is intended.
    scaler = StandardScaler()
    scaled_features = pd.DataFrame(
        scaler.fit_transform(features_df),
        columns=features_df.columns,
        # Keep the original index so the concat below aligns rows correctly
        # even when ``_df`` does not have a default RangeIndex.
        index=features_df.index,
    )
    scaled_df = pd.concat([scaled_features, prediction_df], axis=1)
    train_df, test_df = train_test_split(scaled_df, test_size=test_size, random_state=random_state)

    scaled_df.to_csv(out_dir / scaled_df_name)
    train_df.to_csv(out_dir / train_df_name)
    test_df.to_csv(out_dir / test_df_name)

    # One row per statistic, one column per feature.
    normalization_params = {
        "mean": scaler.mean_,
        "variance": scaler.var_,
    }
    normalization_params_df = pd.DataFrame.from_dict(
        normalization_params, orient="index", columns=features_df.columns
    )
    # Write next to the other outputs (the ``data_dir`` argument).
    normalization_params_df.to_csv(out_dir / scaler_params_name)

    return scaled_df, train_df, test_df

In [339]:
# Scale the housing features, write the CSV outputs, and hold out 20% for testing.
scaled_df, train_df, test_df = scale_split_df(
    df,
    prediction_column='median_house_value',
    test_size=0.2,
    data_dir=DATA_DIR,
    scaled_df_name=scaled_file_name,
    train_df_name=train_file_name,
    test_df_name=test_file_name,
    scaler_params_name=scaler_params_file,
)
scaled_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-8.526513e-15,-1.079584e-15,5.508083e-18,3.2015730000000005e-17,-7.233049000000001e-17,-1.101617e-17,6.885104000000001e-17,6.6097e-17,206855.816909
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,115395.615874
min,-2.385992,-1.447568,-2.19618,-1.207283,-1.274093,-1.256123,-1.303984,-1.774299,14999.0
25%,-1.113209,-0.7967887,-0.8453931,-0.5445698,-0.5740034,-0.5638089,-0.5742294,-0.6881186,119600.0
50%,0.5389137,-0.6422871,0.02864572,-0.2332104,-0.2441308,-0.2291318,-0.2368162,-0.1767951,179700.0
75%,0.7784964,0.9729566,0.6643103,0.2348028,0.2589843,0.2644949,0.2758427,0.4593063,264725.0
max,2.62528,2.958068,1.856182,16.81558,14.01871,30.25033,14.60152,5.858286,500001.0


In [340]:
# Separate features and target by column *name* on each frame itself. A
# boolean mask derived from another frame's columns would silently select the
# wrong columns whenever the two frames do not share the same column order.
TARGET_COLUMN = 'median_house_value'
X_train = train_df.drop(columns=[TARGET_COLUMN])
y_train = train_df[[TARGET_COLUMN]]  # double brackets keep a one-column DataFrame
X_test = test_df.drop(columns=[TARGET_COLUMN])
y_test = test_df[[TARGET_COLUMN]]

#### Housing model training

In [341]:
def train_and_save_model(param: dict, dtrain: xgb.DMatrix, dtest: xgb.DMatrix, steps, model_path: Path, model_name: str):
    """Train (or reload) an XGBoost booster and return it.

    Behaviour depends on the module-level ``do_training`` flag:

    * truthy -- train for ``steps`` boosting rounds, logging the ``test`` and
      ``train`` metrics every 50 rounds, then write
      ``<model_name>_dumped.txt`` (text dump with statistics) and
      ``<model_name>_saved.json`` into ``model_path``;
    * falsy -- load ``<model_name>_saved.json`` from ``model_path``.

    Args:
        param: Booster parameters passed to ``xgb.train``.
        dtrain: Training data.
        dtest: Evaluation data, reported as ``test`` during training.
        steps: Number of boosting rounds (``num_boost_round``).
        model_path: Directory where model files are written to / read from.
        model_name: File-name stem for the model files.

    Returns:
        The trained or loaded ``xgb.Booster``.
    """
    model_dir = Path(model_path)
    saved_model_file = model_dir / f"{model_name}_saved.json"

    if not do_training:
        gbdt_model = xgb.Booster()
        gbdt_model.load_model(saved_model_file)
        return gbdt_model

    gbdt_model = xgb.train(param, dtrain,
                           num_boost_round=steps,
                           evals=[(dtest, 'test'), (dtrain, 'train')],
                           verbose_eval=50)

    # Make sure the output directory exists before writing the model files.
    model_dir.mkdir(parents=True, exist_ok=True)
    gbdt_model.dump_model(model_dir / f"{model_name}_dumped.txt", with_stats=True)
    gbdt_model.save_model(saved_model_file)
    return gbdt_model

In [342]:
# Booster settings and number of boosting rounds for the housing model.
param = {
    "eta": 0.01,  # learning rate
    "max_depth": 8,  # maximum tree depth
    "objective": "reg:squarederror",  # squared-error regression
    "seed": 42,  # random seed
}
steps = 10000  # used as num_boost_round by train_and_save_model


In [343]:
# Wrap the train and test splits in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [344]:
%%time
# Train the housing booster (or reload it when do_training is False), using the "housing" file-name stem under MODEL_PATH.
train_and_save_model(param, dtrain, dtest, steps, MODEL_PATH, housing_model_name)

[0]	test-rmse:113291.37249	train-rmse:114904.40257
[50]	test-rmse:83811.31803	train-rmse:83148.69348
[100]	test-rmse:68132.37475	train-rmse:65130.09453
[150]	test-rmse:60214.44676	train-rmse:54791.01936
[200]	test-rmse:55672.09710	train-rmse:48430.51908
[250]	test-rmse:53098.36844	train-rmse:44221.65442
[300]	test-rmse:51548.85364	train-rmse:41353.85370
[350]	test-rmse:50389.72656	train-rmse:39211.29196
[400]	test-rmse:49571.26916	train-rmse:37462.36284
[450]	test-rmse:48962.01552	train-rmse:35896.49008
[500]	test-rmse:48395.65594	train-rmse:34599.17474
[550]	test-rmse:48028.10669	train-rmse:33546.38695
[600]	test-rmse:47748.41193	train-rmse:32611.33554
[650]	test-rmse:47506.22287	train-rmse:31727.74931
[700]	test-rmse:47315.11400	train-rmse:30954.02914
[750]	test-rmse:47193.82151	train-rmse:30361.67198
[800]	test-rmse:47047.83481	train-rmse:29690.12876
[850]	test-rmse:46955.10010	train-rmse:29066.25693
[900]	test-rmse:46867.88535	train-rmse:28512.95685
[950]	test-rmse:46831.07213	trai

### Red Wine Dataset

In [345]:
# Directory and file names used for the red-wine quality data set.
DATA_DIR = Path("data/wine_quality")
file_name = "winequality_red.csv"  # raw input CSV
scaled_file_name = "winequality_red_scaled.csv"  # whole data set after feature scaling
train_file_name = "train_winequality_red_scaled.csv"  # training split
test_file_name = "test_winequality_red_scaled.csv"  # test split
scaler_params_file = "winequality_red_scaling_params.csv"  # per-feature scaler mean/variance
# Output directory and file-name stem for the saved wine model.
MODEL_PATH = Path("models/")
wine_model_name = "winequality_red"

In [346]:
# Read the wine CSV using ';' as the field separator, then show dtypes and non-null counts.
df = pd.read_csv(DATA_DIR / file_name, sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [347]:
# Scale the wine features, write the CSV outputs, and hold out 20% for testing.
scaled_df, train_df, test_df = scale_split_df(
    df,
    prediction_column='quality',
    test_size=0.2,
    data_dir=DATA_DIR,
    scaled_df_name=scaled_file_name,
    train_df_name=train_file_name,
    test_df_name=test_file_name,
    scaler_params_name=scaler_params_file,
)
scaled_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,3.554936e-16,1.733031e-16,-8.887339000000001e-17,-1.244227e-16,3.732682e-16,-6.221137e-17,4.4436690000000005e-17,-3.473172e-14,2.861723e-15,6.754377e-16,1.066481e-16,5.636023
std,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,1.000313,0.807569
min,-2.137045,-2.27828,-1.391472,-1.162696,-1.603945,-1.4225,-1.230584,-3.538731,-3.700401,-1.936507,-1.898919,3.0
25%,-0.7007187,-0.7699311,-0.9293181,-0.4532184,-0.371229,-0.8487156,-0.7440403,-0.6077557,-0.6551405,-0.6382196,-0.8663789,5.0
50%,-0.2410944,-0.04368911,-0.05636026,-0.240375,-0.1799455,-0.1793002,-0.2574968,0.001760083,-0.007212705,-0.2251281,-0.2093081,6.0
75%,0.5057952,0.6266881,0.7652471,0.04341614,0.05384542,0.4901152,0.4723184,0.5768249,0.5759223,0.4240158,0.6354971,6.0
max,4.355149,5.877976,3.743574,9.195681,11.12703,5.367284,7.375154,3.680055,4.528282,7.918677,4.202453,8.0


In [348]:
# Separate features and target by column *name* on each frame itself. A
# boolean mask derived from another frame's columns would silently select the
# wrong columns whenever the two frames do not share the same column order.
TARGET_COLUMN = 'quality'
X_train = train_df.drop(columns=[TARGET_COLUMN])
y_train = train_df[[TARGET_COLUMN]]  # double brackets keep a one-column DataFrame
X_test = test_df.drop(columns=[TARGET_COLUMN])
y_test = test_df[[TARGET_COLUMN]]

#### Wine model training

In [349]:
# Booster settings and number of boosting rounds for the wine model.
param = {
    "eta": 0.01,  # learning rate
    "max_depth": 8,  # maximum tree depth
    "objective": "reg:squarederror",  # squared-error regression
    "seed": 42,  # random seed
}
steps = 3000  # used as num_boost_round by train_and_save_model

# Wrap the train and test splits in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [350]:
# Train the wine booster (or reload it when do_training is False), using the "winequality_red" file-name stem under MODEL_PATH.
train_and_save_model(param, dtrain, dtest, steps, MODEL_PATH, wine_model_name)

[0]	test-rmse:0.80343	train-rmse:0.80218
[50]	test-rmse:0.67788	train-rmse:0.59131
[100]	test-rmse:0.62565	train-rmse:0.45740
[150]	test-rmse:0.60196	train-rmse:0.37078
[200]	test-rmse:0.59076	train-rmse:0.31179
[250]	test-rmse:0.58498	train-rmse:0.27243
[300]	test-rmse:0.58063	train-rmse:0.24386
[350]	test-rmse:0.57627	train-rmse:0.21826
[400]	test-rmse:0.57291	train-rmse:0.19530
[450]	test-rmse:0.57248	train-rmse:0.17818
[500]	test-rmse:0.57119	train-rmse:0.16226
[550]	test-rmse:0.57053	train-rmse:0.15269
[600]	test-rmse:0.57062	train-rmse:0.14417
[650]	test-rmse:0.57068	train-rmse:0.13401
[700]	test-rmse:0.57109	train-rmse:0.12501
[750]	test-rmse:0.57111	train-rmse:0.11941
[800]	test-rmse:0.57066	train-rmse:0.11391
[850]	test-rmse:0.57008	train-rmse:0.10860
[900]	test-rmse:0.56978	train-rmse:0.10381
[950]	test-rmse:0.56965	train-rmse:0.09791
[1000]	test-rmse:0.56948	train-rmse:0.09353
[1050]	test-rmse:0.56946	train-rmse:0.08901
[1100]	test-rmse:0.56947	train-rmse:0.08320
[1150]	test