In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import json
import warnings
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from IPython.display import display
from sklearn.model_selection import GridSearchCV
sns.set_theme()

n =45
random_state = 42

In [23]:
df = pd.read_csv("kc_house_data.csv")
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['age'] = df['year'] - df['yr_built']
df = df.drop(labels=["date","yr_built"],axis=1)
df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day,age
0,7129300520,221900.0,3,1.00,1180,5650,1.0,0,0,3,...,0,98178,47.5112,-122.257,1340,5650,2014,10,13,59
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,...,1991,98125,47.7210,-122.319,1690,7639,2014,12,9,63
2,5631500400,180000.0,2,1.00,770,10000,1.0,0,0,3,...,0,98028,47.7379,-122.233,2720,8062,2015,2,25,82
3,2487200875,604000.0,4,3.00,1960,5000,1.0,0,0,5,...,0,98136,47.5208,-122.393,1360,5000,2014,12,9,49
4,1954400510,510000.0,3,2.00,1680,8080,1.0,0,0,3,...,0,98074,47.6168,-122.045,1800,7503,2015,2,18,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,360000.0,3,2.50,1530,1131,3.0,0,0,3,...,0,98103,47.6993,-122.346,1530,1509,2014,5,21,5
21609,6600060120,400000.0,4,2.50,2310,5813,2.0,0,0,3,...,0,98146,47.5107,-122.362,1830,7200,2015,2,23,1
21610,1523300141,402101.0,2,0.75,1020,1350,2.0,0,0,3,...,0,98144,47.5944,-122.299,1020,2007,2014,6,23,5
21611,291310100,400000.0,3,2.50,1600,2388,2.0,0,0,3,...,0,98027,47.5345,-122.069,1410,1287,2015,1,16,11


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def evaluate(model,x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    
    y_test_pred = model.predict(x_test)
    y_train_pred = model.predict(x_train)
    
    Rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    R2_score_train = r2_score(y_train, y_train_pred)
    R2_score_test = r2_score(y_test, y_test_pred)
    return Rmse, R2_score_train, R2_score_test

from sklearn.model_selection import train_test_split

x = df.drop(labels='price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector


scaler = ['-',MinMaxScaler(), StandardScaler(), RobustScaler()]

In [13]:
def create_small_table(model):
    n = 45
    i = 0
    random_state = 42
    
    # RESET TABLE
    rate_table = pd.DataFrame({'Model': ['-'],
                               'Params': ['-'],
                               'FeatureSelection':['-'] ,
                               'Scaler': ['-'],
                               'Number of features': ['-'],
                               'Rmse':['-'],
                               'R2 Score(train)':['-'],
                               'R2 Score(test)':['-']})
    rate_table,i = rate_original(model,rate_table,i)
    rate_table,i = rate_SKB(model,rate_table,i)
    
    rate_table = rate_table.iloc[1:].sort_values(by='R2 Score(test)',ascending=False)
    display(rate_table)
    
    existing_df = pd.read_csv(f'exp/{model.__class__.__name__}.csv')
    merged_df = pd.concat([existing_df, rate_table], axis=0, ignore_index=True)
    merged_df.to_csv(f'exp/{model.__class__.__name__}.csv', index=False)
    
def rate_original(model,rate_table,i):
    # Original:
    Rmse, R2_score_train, R2_score_test = evaluate(model,X_train,y_train,X_test,y_test)
    num_feats = X_train.shape[1]
    row = pd.DataFrame({'Model':model.__class__.__name__,
                        'Params': str(model.get_params()),
                        'FeatureSelection': ['-'],
                        'Scaler': ['-'],
                        'Number of features': num_feats,
                        'Rmse':Rmse,
                        'R2 Score(train)':R2_score_train, 
                        'R2 Score(test)':R2_score_test})
    rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format("=" * (i), (i / n) * (1250/17)))
    i+=2
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}% \tDone original!".format("=" * (i), (i / n) * (1250/17)))
    return rate_table,i

def rate_SKB(model,rate_table,i):    
    num_bests = [19,7]
    for num in num_bests:
        selection = [(["SelectKBest"],SelectKBest(score_func = f_classif, k=num)),
                     (["SFS"],SequentialFeatureSelector(model,n_features_to_select=num,scoring='r2',cv=3))]
        for name, selector in selection:
            x_train = selector.fit_transform(X_train,y_train)
            x_test = selector.transform(X_test)
            num_feats = x_train.shape[1]
            for sc in scaler:
                if sc != '-':
                    x_train = sc.fit_transform(x_train)
                    x_test = sc.transform(x_test)
                Rmse, R2_score_train, R2_score_test = evaluate(model,x_train,y_train,x_test,y_test)
                row = pd.DataFrame({'Model':model.__class__.__name__,
                                    'Params': str(model.get_params()),
                                    'FeatureSelection': name,
                                    'Scaler': sc,
                                    'Number of features': num_feats,
                                    'Rmse':Rmse,
                                    'R2 Score(train)':R2_score_train, 
                                    'R2 Score(test)':R2_score_test})
                rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
                sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format("=" * (i), (i / n) * (1250/17)))
                i+=2
        
        sys.stdout.write("\rProgress: [{:<33}] {:.2f}%\tDone num_feat = {}!".format("=" * (i), (i / n) * (1250/17),num))
        
    return rate_table,i

In [37]:
vote_reg_df = pd.read_csv("exp/VotingRegressor.csv")
vote_reg_df = vote_reg_df.drop_duplicates()
vote_reg_df.sort_values(by='R2 Score(test)',ascending=False)
vote_reg_df['overfitting'] = vote_reg_df['R2 Score(train)']/vote_reg_df['R2 Score(test)']
vote_reg_df = vote_reg_df.drop('Params', axis=1)
vote_reg_df.sort_values(by='R2 Score(test)',ascending=False)

Unnamed: 0,Model,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test),overfitting
45,VotingRegressor,SFS,-,19,123309.282419,0.957677,0.899421,1.064770
46,VotingRegressor,SFS,StandardScaler(),19,123558.044144,0.957828,0.899015,1.065420
47,VotingRegressor,SFS,MinMaxScaler(),19,123701.944366,0.957873,0.898780,1.065749
48,VotingRegressor,SFS,RobustScaler(),19,123767.194320,0.957483,0.898673,1.065441
137,VotingRegressor,SFS,RobustScaler(),12,126464.159414,0.929686,0.894209,1.039674
...,...,...,...,...,...,...,...,...
40,VotingRegressor,VarianceThreshold,-,16,235379.926322,0.727226,0.633517,1.147918
41,VotingRegressor,VarianceThreshold,MinMaxScaler(),13,238208.714668,0.715061,0.624655,1.144729
42,VotingRegressor,VarianceThreshold,RobustScaler(),13,238307.321451,0.714398,0.624344,1.144237
43,VotingRegressor,VarianceThreshold,StandardScaler(),13,238816.324231,0.714030,0.622738,1.146597


In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

model1 = AdaBoostRegressor(random_state=random_state,learning_rate=0.01,n_estimators=500)
model2 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)

meta_model = RandomForestRegressor(random_state=random_state)

model = StackingRegressor(
    estimators=[('lgb', model1), ('rf', model2)],
    final_estimator=meta_model
    
)

create_small_table(model)



In [14]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


r1 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)
r2 = RandomForestRegressor(n_estimators=50,min_samples_split=20,min_samples_leaf=2,random_state=random_state)
r3 = lgb.LGBMRegressor(objective='regression',n_estimators=1000,learning_rate = 0.01,max_depth=5,num_leaves  = 5,min_data_in_leaf =100, random_state=random_state,verbose=-1)


model = VotingRegressor([('xgb', r1), ('rf', r2), ('L', r3)],weights=[5,1,1],n_jobs=20)
create_small_table(model)



Unnamed: 0,Model,Params,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test)
6,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,19,121626.026732,0.969564,0.902148
8,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,StandardScaler(),19,121743.948985,0.969695,0.901959
9,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,RobustScaler(),19,122065.894521,0.969305,0.901439
7,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),19,122209.690056,0.969781,0.901207
15,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),7,133494.303186,0.948572,0.88212
14,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,7,133510.293539,0.948572,0.882092
17,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,RobustScaler(),7,133560.588837,0.94857,0.882003
16,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,StandardScaler(),7,133697.939184,0.948571,0.88176
1,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",-,-,22,134359.05422,0.97047,0.880588
3,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,MinMaxScaler(),19,134385.499009,0.966459,0.880541


In [28]:
import optuna
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score


def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 100, step=10)
    model = AdaBoostRegressor(n_estimators=n_estimators, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=2, scoring='neg_mean_squared_error')
    return np.mean(scores)


study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=10)

print(f'Giá trị tối ưu: {study.best_value}')
print(f'Tham số tối ưu: {study.best_params}')


[I 2023-12-11 10:57:36,334] A new study created in memory with name: no-name-161a891a-0b3f-4c81-9a96-d6ec447e2903
[I 2023-12-11 10:57:41,688] Trial 0 finished with value: -59115260456.347275 and parameters: {'n_estimators': 20}. Best is trial 0 with value: -59115260456.347275.
[I 2023-12-11 10:57:44,544] Trial 1 finished with value: -48239286146.12841 and parameters: {'n_estimators': 10}. Best is trial 1 with value: -48239286146.12841.
[I 2023-12-11 10:57:51,610] Trial 2 finished with value: -70973949508.25137 and parameters: {'n_estimators': 30}. Best is trial 1 with value: -48239286146.12841.
[I 2023-12-11 10:58:03,479] Trial 3 finished with value: -164565701288.78687 and parameters: {'n_estimators': 70}. Best is trial 1 with value: -48239286146.12841.
[I 2023-12-11 10:58:13,558] Trial 4 finished with value: -141954944938.7948 and parameters: {'n_estimators': 60}. Best is trial 1 with value: -48239286146.12841.
[I 2023-12-11 10:58:23,620] Trial 5 finished with value: -141954944938.79

Giá trị tối ưu: -48239286146.12841
Tham số tối ưu: {'n_estimators': 10}


In [31]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0)

    model = AdaBoostRegressor(n_estimators=n_estimators, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return score


study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=10)

print(f'Giá trị tối ưu: {study.best_value}')
print(f'Tham số tối ưu: {study.best_params}')

[I 2023-12-11 11:06:03,560] A new study created in memory with name: no-name-7e7f5724-2283-450e-a221-d6623f97c837
[I 2023-12-11 11:06:10,480] Trial 0 finished with value: -0.45513361801838803 and parameters: {'n_estimators': 250, 'learning_rate': 0.004148219360070355}. Best is trial 0 with value: -0.45513361801838803.
[I 2023-12-11 11:06:21,991] Trial 1 finished with value: -0.45943068573684687 and parameters: {'n_estimators': 450, 'learning_rate': 0.07800498150357418}. Best is trial 0 with value: -0.45513361801838803.
[I 2023-12-11 11:06:31,550] Trial 2 finished with value: -0.4110875976554276 and parameters: {'n_estimators': 350, 'learning_rate': 0.03154247583559642}. Best is trial 2 with value: -0.4110875976554276.
[I 2023-12-11 11:06:38,907] Trial 3 finished with value: -0.46209967913085337 and parameters: {'n_estimators': 200, 'learning_rate': 0.1192979439874059}. Best is trial 2 with value: -0.4110875976554276.
[I 2023-12-11 11:06:48,995] Trial 4 finished with value: -0.411087597

Giá trị tối ưu: -0.010189022173726414
Tham số tối ưu: {'n_estimators': 50, 'learning_rate': 0.046808865661595124}


In [32]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 500, 600, step=1)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 1.0)

    model = AdaBoostRegressor(n_estimators=n_estimators, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return score


study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=10)

print(f'Giá trị tối ưu: {study.best_value}')
print(f'Tham số tối ưu: {study.best_params}')

[I 2023-12-11 11:34:19,763] A new study created in memory with name: no-name-9a677679-ee87-4244-bdaf-2af290528a0a
[I 2023-12-11 11:34:32,105] Trial 0 finished with value: -0.5069774060899424 and parameters: {'n_estimators': 585, 'learning_rate': 0.07459421838288798}. Best is trial 0 with value: -0.5069774060899424.
[I 2023-12-11 11:34:43,387] Trial 1 finished with value: -0.4986832898788711 and parameters: {'n_estimators': 537, 'learning_rate': 0.0026347559642805433}. Best is trial 1 with value: -0.4986832898788711.
[I 2023-12-11 11:34:55,702] Trial 2 finished with value: -0.5160042988752678 and parameters: {'n_estimators': 590, 'learning_rate': 0.5282201222642283}. Best is trial 1 with value: -0.4986832898788711.
[I 2023-12-11 11:35:07,380] Trial 3 finished with value: -0.5027504067112889 and parameters: {'n_estimators': 548, 'learning_rate': 0.15093715944367245}. Best is trial 1 with value: -0.4986832898788711.
[I 2023-12-11 11:35:18,197] Trial 4 finished with value: -0.4738639960001

Giá trị tối ưu: -0.47386399600016627
Tham số tối ưu: {'n_estimators': 509, 'learning_rate': 0.02047121359302594}
