In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from lightgbm.callback import early_stopping

In [None]:
boston = pd.read_csv('boston.csv')
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [4]:
from scipy.stats import spearmanr
corr, _= spearmanr(boston)
for column, corr in zip(boston.columns, corr[-1]):
    print(f'{column}: {corr:.4f}')

CRIM: -0.5589
ZN: 0.4382
INDUS: -0.5783
CHAS: 0.1406
NOX: -0.5626
RM: 0.6336
AGE: -0.5476
DIS: 0.4459
RAD: -0.3468
TAX: -0.5624
PTRATIO: -0.5559
B: 0.1857
LSTAT: -0.8529
MEDV: 1.0000


In [5]:
boston_cdf = boston[['LSTAT', 'RM', 'INDUS', 'NOX', 'MEDV']]
boston_cdf.head()

Unnamed: 0,LSTAT,RM,INDUS,NOX,MEDV
0,4.98,6.575,2.31,0.538,24.0
1,9.14,6.421,7.07,0.469,21.6
2,4.03,7.185,7.07,0.469,34.7
3,2.94,6.998,2.18,0.458,33.4
4,5.33,7.147,2.18,0.458,36.2


In [6]:
train_x_full, test_x, train_y_full, test_y = train_test_split(boston_cdf.iloc[:,0:4], boston_cdf.MEDV, test_size= 0.2, random_state= 42)
train_x, val_x, train_y, val_y = train_test_split(train_x_full, train_y_full, test_size= 0.2, random_state= 42)
train_x

Unnamed: 0,LSTAT,RM,INDUS,NOX
54,14.80,5.888,4.00,0.410
28,12.80,6.495,8.14,0.538
423,23.29,6.103,18.10,0.614
244,12.50,5.593,5.86,0.431
111,10.16,6.715,10.01,0.547
...,...,...,...,...
157,4.59,6.943,19.58,0.605
199,4.56,6.975,1.47,0.403
200,4.45,7.135,1.47,0.403
498,12.92,6.019,9.69,0.585


In [7]:
model = lgb.LGBMRegressor(boosting_type= 'gbdt',
                          random_state= 42, 
                          subsample= 0.8
                          )

In [8]:
grid_param= {
    'n_estimators': [200],
    'learning_rate': [0.05, 0.03, 0.02, 0.01, 0.1],
    'max_depth': [9],
    'num_leaves': [32, 64, 128, 512],
    'min_child_samples': np.arange(2, 20),
    'feature_fraction': [0.9, 0.8],
    'bagging_fraction': [0.8, 0.7],
    'bagging_freq': [7, 6]
}

In [9]:
grid = GridSearchCV(estimator= model,
                    param_grid= grid_param,
                    cv= 3,
                    verbose= 2,
                    n_jobs= -1)

In [10]:
grid.fit(train_x, train_y, 
         eval_set= [(val_x, val_y)], 
         eval_metric= 'rmse', 
         callbacks = [early_stopping(stopping_rounds= 20)])

Fitting 3 folds for each of 2880 candidates, totalling 8640 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 323, number of used features: 4
[LightGBM] [Info] Start training from score 22.786068
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[136]	valid_0's rmse: 3.48749	valid_0's l2: 12.1626


In [11]:
print(grid.best_estimator_)

LGBMRegressor(bagging_fraction=0.8, bagging_freq=7, feature_fraction=0.8,
              max_depth=9, min_child_samples=14, n_estimators=200,
              num_leaves=32, random_state=42, subsample=0.8)


In [12]:
best_model = grid.best_estimator_

In [13]:
y_hat1= best_model.predict(test_x)
y_hat2 = best_model.predict(train_x_full)



In [14]:
from sklearn.metrics import r2_score
print(f'{r2_score(train_y_full, y_hat2):.5f}')
print(f'{r2_score(test_y, y_hat1):.5f}')

0.93963
0.81344
