In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import json
import warnings
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from IPython.display import display
from sklearn.model_selection import GridSearchCV
# Activate seaborn's theme with its default settings.
sns.set_theme()

# Module-level denominator read by the progress output of rate_original / rate_SKB
# (they compute `(i / n) * (1250/17)`).
n =45
# Seed passed as `random_state` to the estimators constructed in this notebook.
random_state = 42

In [13]:
# Load the raw sales data, split the sale date into year/month/day, derive the
# house age at sale time, then drop the two source columns.
df = (
    pd.read_csv("kc_house_data.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
        # Keyword arguments are evaluated in order, so `year` already exists here.
        age=lambda frame: frame['year'] - frame['yr_built'],
    )
    .drop(columns=["date", "yr_built"])
)
df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day,age
0,7129300520,221900.0,3,1.00,1180,5650,1.0,0,0,3,...,0,98178,47.5112,-122.257,1340,5650,2014,10,13,59
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,...,1991,98125,47.7210,-122.319,1690,7639,2014,12,9,63
2,5631500400,180000.0,2,1.00,770,10000,1.0,0,0,3,...,0,98028,47.7379,-122.233,2720,8062,2015,2,25,82
3,2487200875,604000.0,4,3.00,1960,5000,1.0,0,0,5,...,0,98136,47.5208,-122.393,1360,5000,2014,12,9,49
4,1954400510,510000.0,3,2.00,1680,8080,1.0,0,0,3,...,0,98074,47.6168,-122.045,1800,7503,2015,2,18,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,360000.0,3,2.50,1530,1131,3.0,0,0,3,...,0,98103,47.6993,-122.346,1530,1509,2014,5,21,5
21609,6600060120,400000.0,4,2.50,2310,5813,2.0,0,0,3,...,0,98146,47.5107,-122.362,1830,7200,2015,2,23,1
21610,1523300141,402101.0,2,0.75,1020,1350,2.0,0,0,3,...,0,98144,47.5944,-122.299,1020,2007,2014,6,23,5
21611,291310100,400000.0,3,2.50,1600,2388,2.0,0,0,3,...,0,98027,47.5345,-122.069,1410,1287,2015,1,16,11


In [102]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def evaluate(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    x_train, y_train : training features and targets.
    x_test, y_test : held-out features and targets.

    Returns
    -------
    tuple
        ``(rmse_test, r2_train, r2_test)`` where the RMSE is computed on the
        test split only.
    """
    model.fit(x_train, y_train)

    # Held-out predictions first, then the training ones.
    test_pred = model.predict(x_test)
    train_pred = model.predict(x_train)

    test_mse = mean_squared_error(y_test, test_pred)
    return (
        np.sqrt(test_mse),
        r2_score(y_train, train_pred),
        r2_score(y_test, test_pred),
    )

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector


# Scaling options iterated by rate_SKB. The '-' string is the "no scaling"
# sentinel (rate_SKB skips fit/transform for it); the other entries are
# scaler instances.
scaler = ['-',MinMaxScaler(), StandardScaler(), RobustScaler()]

In [110]:
def create_small_table(model):
    """Benchmark ``model`` over the feature-selection/scaler variants and log the results.

    Runs ``rate_original`` and ``rate_SKB`` on ``model``, displays the collected
    rows sorted by test R2 (best first) and appends them to
    ``exp/<ModelClassName>.csv``. The ``exp`` directory and the CSV file are
    created on the first run for a given model class instead of raising
    ``FileNotFoundError``.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``rate_original`` and ``rate_SKB``; its class name
        determines the results file name.

    Returns
    -------
    None
    """
    i = 0

    # Placeholder row that fixes the column layout for the concat-based helpers;
    # it is sliced off again (``iloc[1:]``) before the table is shown or saved.
    rate_table = pd.DataFrame({'Model': ['-'],
                               'Params': ['-'],
                               'FeatureSelection':['-'] ,
                               'Scaler': ['-'],
                               'Number of features': ['-'],
                               'Rmse':['-'],
                               'R2 Score(train)':['-'],
                               'R2 Score(test)':['-']})
    rate_table,i = rate_original(model,rate_table,i)
    rate_table,i = rate_SKB(model,rate_table,i)

    rate_table = rate_table.iloc[1:].sort_values(by='R2 Score(test)',ascending=False)
    display(rate_table)

    results_path = f'exp/{model.__class__.__name__}.csv'
    os.makedirs('exp', exist_ok=True)
    if os.path.exists(results_path):
        # Append this run to the history already stored for this model class.
        existing_df = pd.read_csv(results_path)
        merged_df = pd.concat([existing_df, rate_table], axis=0, ignore_index=True)
    else:
        # First run for this model class: nothing to merge with yet.
        merged_df = rate_table
    merged_df.to_csv(results_path, index=False)
    
def rate_original(model,rate_table,i):
    """Score ``model`` on the unmodified feature set and append one result row.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    the progress denominator ``n``.

    Parameters
    ----------
    model : estimator
        Fitted and scored through ``evaluate``.
    rate_table : pandas.DataFrame
        Results collected so far; a new frame with the extra row is returned.
    i : int
        Current progress counter.

    Returns
    -------
    tuple
        ``(rate_table, i)`` with the new row appended and ``i`` advanced by 2.
    """
    rmse, r2_train, r2_test = evaluate(model, X_train, y_train, X_test, y_test)

    # Baseline entry: no feature selection and no scaling applied.
    record = {
        'Model': model.__class__.__name__,
        'Params': str(model.get_params()),
        'FeatureSelection': ['-'],
        'Scaler': ['-'],
        'Number of features': X_train.shape[1],
        'Rmse': rmse,
        'R2 Score(train)': r2_train,
        'R2 Score(test)': r2_test,
    }
    rate_table = pd.concat([rate_table, pd.DataFrame(record)], axis=0, ignore_index=True)

    # Redraw the progress bar before and after bumping the counter.
    bar_before = "=" * (i)
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format(bar_before, (i / n) * (1250/17)))
    i += 2
    bar_after = "=" * (i)
    sys.stdout.write("\rProgress: [{:<33}] {:.2f}% \tDone original!".format(bar_after, (i / n) * (1250/17)))
    return rate_table, i

def rate_SKB(model,rate_table,i):
    """Score ``model`` on SelectKBest- and SFS-reduced features under every scaler.

    For each feature count in ``[19, 7]`` and each selector (``SelectKBest``,
    ``SequentialFeatureSelector``), the selector is fitted on the module-level
    ``X_train``/``y_train`` and applied to ``X_test``. Every entry of the
    module-level ``scaler`` list is then applied to the *selected* features,
    the model is evaluated, and one result row is appended per combination.

    Fixes over the previous version:
    * each scaler now starts from the selector output; previously the scaled
      arrays overwrote ``x_train``/``x_test``, so scalers were stacked on top
      of each other while the row reported only the last one;
    * ``SelectKBest`` scores features with ``f_regression`` instead of
      ``f_classif``, which is meant for classification targets.

    Parameters
    ----------
    model : estimator
        Fitted and scored through ``evaluate``; also the estimator used by
        ``SequentialFeatureSelector``.
    rate_table : pandas.DataFrame
        Results collected so far; a new frame with the extra rows is returned.
    i : int
        Current progress counter (advanced by 2 per evaluated combination).

    Returns
    -------
    tuple
        ``(rate_table, i)``.
    """
    # Local import: the regression counterpart of the f_classif score function.
    from sklearn.feature_selection import f_regression

    num_bests = [19,7]
    for num in num_bests:
        selection = [(["SelectKBest"],SelectKBest(score_func = f_regression, k=num)),
                     (["SFS"],SequentialFeatureSelector(model,n_features_to_select=num,scoring='r2',cv=3))]
        for name, selector in selection:
            # Keep the selector output untouched so every scaler sees the same input.
            x_train_sel = selector.fit_transform(X_train,y_train)
            x_test_sel = selector.transform(X_test)
            num_feats = x_train_sel.shape[1]
            for sc in scaler:
                if isinstance(sc, str):
                    # '-' sentinel: evaluate on the unscaled selected features.
                    x_train_sc, x_test_sc = x_train_sel, x_test_sel
                else:
                    x_train_sc = sc.fit_transform(x_train_sel)
                    x_test_sc = sc.transform(x_test_sel)
                Rmse, R2_score_train, R2_score_test = evaluate(model,x_train_sc,y_train,x_test_sc,y_test)
                row = pd.DataFrame({'Model':model.__class__.__name__,
                                    'Params': str(model.get_params()),
                                    'FeatureSelection': name,
                                    'Scaler': sc,
                                    'Number of features': num_feats,
                                    'Rmse':Rmse,
                                    'R2 Score(train)':R2_score_train, 
                                    'R2 Score(test)':R2_score_test})
                rate_table = pd.concat([rate_table,row],axis = 0,ignore_index=True)
                sys.stdout.write("\rProgress: [{:<33}] {:.2f}%".format("=" * (i), (i / n) * (1250/17)))
                i+=2
        
        sys.stdout.write("\rProgress: [{:<33}] {:.2f}%\tDone num_feat = {}!".format("=" * (i), (i / n) * (1250/17),num))
        
    return rate_table,i

In [37]:
# Load the logged VotingRegressor runs, drop exact duplicate rows, add the
# train/test R2 ratio as an overfitting indicator and hide the long Params column.
# (The earlier stand-alone `sort_values` call discarded its result and was removed.)
vote_reg_df = (
    pd.read_csv("exp/VotingRegressor.csv")
    .drop_duplicates()
    .assign(overfitting=lambda frame: frame['R2 Score(train)'] / frame['R2 Score(test)'])
    .drop('Params', axis=1)
)
# Display only: best test R2 first; `vote_reg_df` itself keeps its original order.
vote_reg_df.sort_values(by='R2 Score(test)',ascending=False)

Unnamed: 0,Model,FeatureSelection,Scaler,Number of features,Rmse,R2 Score(train),R2 Score(test),overfitting
45,VotingRegressor,SFS,-,19,123309.282419,0.957677,0.899421,1.064770
46,VotingRegressor,SFS,StandardScaler(),19,123558.044144,0.957828,0.899015,1.065420
47,VotingRegressor,SFS,MinMaxScaler(),19,123701.944366,0.957873,0.898780,1.065749
48,VotingRegressor,SFS,RobustScaler(),19,123767.194320,0.957483,0.898673,1.065441
137,VotingRegressor,SFS,RobustScaler(),12,126464.159414,0.929686,0.894209,1.039674
...,...,...,...,...,...,...,...,...
40,VotingRegressor,VarianceThreshold,-,16,235379.926322,0.727226,0.633517,1.147918
41,VotingRegressor,VarianceThreshold,MinMaxScaler(),13,238208.714668,0.715061,0.624655,1.144729
42,VotingRegressor,VarianceThreshold,RobustScaler(),13,238307.321451,0.714398,0.624344,1.144237
43,VotingRegressor,VarianceThreshold,StandardScaler(),13,238816.324231,0.714030,0.622738,1.146597


In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Base learners of the stack.
model1 = AdaBoostRegressor(random_state=random_state,learning_rate=0.01,n_estimators=500)
model2 = xgb.XGBRegressor(n_estimators=200, max_depth=5,alpha=100,reg_lambda=3,learning_rate=0.3, random_state=random_state)

# Final estimator trained on the base learners' predictions.
meta_model = RandomForestRegressor(random_state=random_state)

# The estimator names show up in `get_params()` and therefore in the logged
# 'Params' column, so they must match the actual estimators (they were
# previously labelled 'lgb' and 'rf').
model = StackingRegressor(
    estimators=[('ada', model1), ('xgb', model2)],
    final_estimator=meta_model
)

# Benchmark the stack and append the results to exp/StackingRegressor.csv.
create_small_table(model)