In [23]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [112]:
df_house = pd.read_csv("kc_house_data.csv")
X = df['sqft_living']
Y = df['price']
N = len(X)


In [83]:
from sklearn.model_selection import train_test_split

x = df.drop(labels='price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def evaluate(model,x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    
    y_test_pred = model.predict(x_test)
    y_train_pred = model.predict(x_train)
    
    Rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    R2_score_train = r2_score(y_train, y_train_pred)
    R2_score_test = r2_score(y_test, y_test_pred)
    return Rmse, R2_score_train, R2_score_test

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector


scaler = ['-',MinMaxScaler(), StandardScaler(), RobustScaler()]

def create_table(model):
    n = 45
    i = 0
    random_state = 42
    
    # RESET TABLE
    rate_table = pd.DataFrame({'Model': ['-'],
                               'Params': ['-'],
                               'FeatureSelection':['-'] ,
                               'Scaler': ['-'],
                               'Number of features': ['-'],
                               'Rmse':['-'],
                               'R2 Score(train)':['-'],
                               'R2 Score(test)':['-']})
    rate_table,i = rate_original(model,rate_table,i)
   # rate_table,i = rate_variance_threshold(model,rate_table,i)
    rate_table,i = rate_SKB(model,rate_table,i)
    
    rate_table = rate_table.iloc[1:].sort_values(by='R2 Score(test)',ascending=False)
    display(rate_table)
    
    existing_df = pd.read_csv(f'exp/{model.__class__.__name__}.csv')
    merged_df = pd.concat([existing_df, rate_table], axis=0, ignore_index=True)
    merged_df.to_csv(f'exp/{model.__class__.__name__}.csv', index=False)
    
def rate_original(model,rate_table,i):
    # Original:
    Rmse, R2_score_train, R2_score_test = evaluate(model,X_train,y_train,X_test,y_test)
    num_feats = X_train.shape[1]
    row = pd.DataFrame({'Model':model.__class__.__name__,
                        'Params': str(model.get_params()),
                        'FeatureSelection': ['-'],
                        'Scaler': ['-'],
                        'Number of features': num_feats,
                        'Rmse':Rmse,
                        'R2 Score(train)':R2_score_train, 
                        'R2 Score(test)':R2_score_test})
    rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
    sys.stdout.write("\rProgress: [{:<22}] {:.2f}%".format("=" * (i // 2), (i / n) * 100))
    i+=1
    sys.stdout.write("\rProgress: [{:<22}] {:.2f}% \tDone original!".format("=" * (i // 2), (i / n) * 100))
    return rate_table,i

def rate_variance_threshold(model,rate_table,i):    
    threshold = [0.05, 0.5,1]
    for thres in threshold:
        selector = VarianceThreshold(threshold=thres)
        x_train = selector.fit_transform(X_train)
        x_test = selector.transform(X_test)
        num_feats = x_train.shape[1]
        for sc in scaler:
            if sc != '-':
                x_train = sc.fit_transform(x_train)
                x_test = sc.transform(x_test)
            Rmse, R2_score_train, R2_score_test = evaluate(model,x_train,y_train,x_test,y_test)
            row = pd.DataFrame({'Model':model.__class__.__name__,
                                'Params': str(model.get_params()),
                                'FeatureSelection': ['VarianceThreshold'],
                                'Scaler': sc,
                                'Number of features': num_feats,
                                'Rmse':Rmse,
                                'R2 Score(train)':R2_score_train, 
                                'R2 Score(test)':R2_score_test})
            rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
            sys.stdout.write("\rProgress: [{:<22}] {:.2f}%!".format("=" * (i // 2), (i / n) * 100))
            i+=1            
    sys.stdout.write("\rProgress: [{:<22}] {:.2f}%\tDone varThreshold!".format("=" * (i // 2), (i / n) * 100))
    return rate_table,i

def rate_SKB(model,rate_table,i):    
    num_bests = [19,12,7]
    for num in num_bests:
        selection = [(["SelectKBest"],SelectKBest(score_func = f_classif, k=num)),
                     #(["RFE"],RFE(model,n_features_to_select=num)),
                     (["SFS"],SequentialFeatureSelector(model,n_features_to_select=num,scoring='r2',cv=3))]
        for name, selector in selection:
            x_train = selector.fit_transform(X_train,y_train)
            x_test = selector.transform(X_test)
            num_feats = x_train.shape[1]
            for sc in scaler:
                if sc != '-':
                    x_train = sc.fit_transform(x_train)
                    x_test = sc.transform(x_test)
                Rmse, R2_score_train, R2_score_test = evaluate(model,x_train,y_train,x_test,y_test)
                row = pd.DataFrame({'Model':model.__class__.__name__,
                                    'Params': str(model.get_params()),
                                    'FeatureSelection': name,
                                    'Scaler': sc,
                                    'Number of features': num_feats,
                                    'Rmse':Rmse,
                                    'R2 Score(train)':R2_score_train, 
                                    'R2 Score(test)':R2_score_test})
                rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
                sys.stdout.write("\rProgress: [{:<22}] {:.2f}%".format("=" * (i // 2), (i / n) * 100))
                i+=1
        
        sys.stdout.write("\rProgress: [{:<22}] {:.2f}%\tDone num_feat = {}!".format("=" * (i // 2), (i / n) * 100,num))   
        
    return rate_table,i

In [72]:
def create_small_table(model):
    n = 20
    i = 0
    random_state = 42
    
    # RESET TABLE
    rate_table = pd.DataFrame({'Model': ['-'],
                               'Params': ['-'],
                               'FeatureSelection':['-'] ,
                               'Scaler': ['-'],
                               'Number of features': ['-'],
                               'Rmse':['-'],
                               'R2 Score(train)':['-'],
                               'R2 Score(test)':['-']})
    rate_table,i = rate_original(model,rate_table,i)
    rate_table,i = rate_SKB(model,rate_table,i)
    
    rate_table = rate_table.iloc[1:].sort_values(by='R2 Score(test)',ascending=False)
    display(rate_table)
    
    existing_df = pd.read_csv(f'exp/{model.__class__.__name__}.csv')
    merged_df = pd.concat([existing_df, rate_table], axis=0, ignore_index=True)
    merged_df.to_csv(f'exp/{model.__class__.__name__}.csv', index=False)
    
def rate_original(model,rate_table,i):
    # Original:
    Rmse, R2_score_train, R2_score_test = evaluate(model,X_train,y_train,X_test,y_test)
    num_feats = X_train.shape[1]
    row = pd.DataFrame({'Model':model.__class__.__name__,
                        'Params': str(model.get_params()),
                        'FeatureSelection': ['-'],
                        'Scaler': ['-'],
                        'Number of features': num_feats,
                        'Rmse':Rmse,
                        'R2 Score(train)':R2_score_train, 
                        'R2 Score(test)':R2_score_test})
    rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format("=" * (i), (i / n) * (31250000/236113)))
    i+=2
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}% \tDone original!".format("=" * (i), (i / n) * (31250000/236113)))
    return rate_table,i

def rate_SKB(model,rate_table,i):    
    num_bests = [19,7]
    for num in num_bests:
        selection = [(["SelectKBest"],SelectKBest(score_func = f_classif, k=num)),
                     (["SFS"],SequentialFeatureSelector(model,n_features_to_select=num,scoring='r2',cv=3))]
        for name, selector in selection:
            x_train = selector.fit_transform(X_train,y_train)
            x_test = selector.transform(X_test)
            num_feats = x_train.shape[1]
            for sc in scaler:
                if sc != '-':
                    x_train = sc.fit_transform(x_train)
                    x_test = sc.transform(x_test)
                Rmse, R2_score_train, R2_score_test = evaluate(model,x_train,y_train,x_test,y_test)
                row = pd.DataFrame({'Model':model.__class__.__name__,
                                    'Params': str(model.get_params()),
                                    'FeatureSelection': name,
                                    'Scaler': sc,
                                    'Number of features': num_feats,
                                    'Rmse':Rmse,
                                    'R2 Score(train)':R2_score_train, 
                                    'R2 Score(test)':R2_score_test})
                rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
                sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format("=" * (i), (i / n) * (31250000/236113)))
                i+=2
        
        sys.stdout.write("\rProgress: [{:<33}] {:.2f}%\tDone num_feat = {}!".format("=" * (i), (i / n) * (31250000/236113),num))
        
    return rate_table,i

In [None]:
n_estimators = [100,200]
learning_rate = [0.01,0.3]
max_depth = [2,5]
num_leaves = [5,20,50]
min_data_in_leaf = [100,500,1000]
for n_estimator in n_estimators:
    for eta in learning_rate:
        for m_d in max_depth:
            for n_l in num_leaves:
                for m_d_i_l in min_data_in_leaf:
                    model = lgb.LGBMRegressor(objective='regression',
                                              n_estimators=n_estimator,
                                              learning_rate = eta,
                                              max_depth=m_d,
                                              num_leaves  = n_l,
                                              min_data_in_leaf =m_d_i_l, random_state=random_state,)
                    create_table(model)

In [None]:
params = {
    'objective': 'regression'
}
model = lgb.LGBMRegressor(**params)

param_grid = {
    'num_leaves': [5,50],
    'min_data_in_leaf ': [20,100,1000],
    'learning_rate': [0.1,0.3],
    'n_estimators': [300,200],
    'max_depth': [2,5]
    }
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,scoring = 'r2',n_jobs=20,cv=2)

grid_search.fit(X_train, y_train)

score_df = pd.DataFrame(grid_search.cv_results_)
score_df.nlargest(10,"mean_test_score")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.727010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537953.549335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.275195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537582.546559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.074861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.776506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537582.546559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537582.546559


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.647495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537953.549335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.364774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2570
[LightGBM] [Info] Number of data points in the train set: 8645, number of used features: 22
[LightGBM] [Info] Start training from score 537953.549335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.715692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

In [17]:
model = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=10,learning_rate=0.1, random_state=random_state)
create_table(model)

Progress: [                      ] 0.00%Progress: [                      ] 2.22% 	Done original!

Unnamed: 0,Model,Params,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test)
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",-,-,22,136252.641064,0.94442,0.877198


In [76]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


r1 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)
r2 = RandomForestRegressor(n_estimators=50,min_samples_split=20,min_samples_leaf=2,random_state=random_state)
r3 = lgb.LGBMRegressor(objective='regression',n_estimators=1000,learning_rate = 0.01,max_depth=5,num_leaves  = 5,verbose=-1,min_data_in_leaf =100, random_state=random_state)


model = VotingRegressor([('xgb', r1), ('rf', r2), ('L', r3)],weights=[10,1,5],n_jobs=20)
create_small_table(model)



Unnamed: 0,Model,Params,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test)
3,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,MinMaxScaler(),19,133221.444025,0.956238,0.882601
4,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,StandardScaler(),19,133430.745685,0.956158,0.882232
1,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",-,-,22,133462.59885,0.960267,0.882176
15,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),7,133750.612677,0.938118,0.881667
14,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,7,133757.41184,0.938118,0.881655
2,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,-,19,133786.693394,0.957169,0.881603
7,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),19,133810.755681,0.95893,0.88156
6,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,19,133821.847967,0.958935,0.881541
17,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,RobustScaler(),7,133902.483851,0.938113,0.881398
8,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,StandardScaler(),19,134033.809748,0.958979,0.881165


In [63]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor


r1 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)
r2 = RandomForestRegressor(n_estimators=50,min_samples_split=20,min_samples_leaf=2,random_state=random_state)
#r3 = lgb.LGBMRegressor(objective='regression',n_estimators=1000,learning_rate = 0.01,max_depth=5,num_leaves  = 5,min_data_in_leaf =100,verbose=-1, random_state=random_state)


model = VotingRegressor([('xgb', r1), ('rf', r2)],n_jobs=20)
create_small_table(model)



Unnamed: 0,Model,Params,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test)
8,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,StandardScaler(),19,126254.73154,0.966526,0.894559
6,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,19,126342.806314,0.966389,0.894412
9,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,RobustScaler(),19,126442.973917,0.966237,0.894244
7,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),19,126659.12509,0.96657,0.893882
15,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,MinMaxScaler(),7,133908.840328,0.948118,0.881387
17,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,RobustScaler(),7,133911.010235,0.948114,0.881383
14,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,-,7,133965.343849,0.948117,0.881287
16,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SFS,StandardScaler(),7,134011.32082,0.94811,0.881205
3,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,MinMaxScaler(),19,136729.713857,0.963823,0.876337
4,VotingRegressor,"{'estimators': [('xgb', XGBRegressor(alpha=100...",SelectKBest,StandardScaler(),19,136912.37655,0.963715,0.876006


In [77]:
from sklearn.ensemble import AdaBoostRegressor


In [79]:
model = AdaBoostRegressor(random_state=random_state)

param_grid = {
    'n_estimators': [10,500],
    'learning_rate': [0.01,0.3]
    }
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,scoring = 'r2',n_jobs=30,cv=2)

grid_search.fit(X_train, y_train)

score_df = pd.DataFrame(grid_search.cv_results_)
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
1,15.747756,0.099135,0.59712,0.006079,0.01,500,"{'learning_rate': 0.01, 'n_estimators': 500}",0.682334,0.667695,0.675014,0.00732,1
2,0.297786,0.000529,0.012302,2.6e-05,0.3,10,"{'learning_rate': 0.3, 'n_estimators': 10}",0.659698,0.665351,0.662524,0.002826,2
0,0.332794,0.009742,0.014447,0.000268,0.01,10,"{'learning_rate': 0.01, 'n_estimators': 10}",0.578617,0.634275,0.606446,0.027829,3
3,8.984669,0.046347,0.606751,0.00603,0.3,500,"{'learning_rate': 0.3, 'n_estimators': 500}",0.127852,-0.650937,-0.261543,0.389394,4


In [None]:
model = AdaBoostRegressor(random_state=random_state,learning_rate=0.005,n_estimators=100)
create_table(model)



In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


r1 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)
r2 = RandomForestRegressor(n_estimators=50,min_samples_split=20,min_samples_leaf=2,random_state=random_state)
r3 = lgb.LGBMRegressor(objective='regression',n_estimators=1000,learning_rate = 0.01,max_depth=5,num_leaves  = 5,min_data_in_leaf =100, random_state=random_state,verbose=-1)


model = VotingRegressor([('xgb', r1), ('rf', r2), ('L', r3)],weights=[5,1,1],n_jobs=20)
create_table(model)