In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from clean import Data, PTHURL
from metric_report import MetricReport

dat = Data(PTHURL)
X = dat.X
y = dat.y
X.head()

...

Unnamed: 0,price_per_sq_ft,beds_total,baths.lavs,original_list_price,year_built,sqft-est_tot_fin,sqft-est_fin_abv_grd,acreage
0,98.0,3,2.0,199900,1952,1.726,1.026,0.5
1,99.0,3,1.1,199900,1949,1.91,1.21,0.25
2,103.0,4,1.0,199900,1923,1.764,1.764,0.25
3,105.0,4,3.0,325000,1953,936.0,936.0,0.21
4,105.0,3,1.0,174999,1949,1.792,1.064,0.13


In [5]:
class GBR: 
    
    def __init__(self, X: pd.DataFrame = X, y: pd.DataFrame = y):
        from xgboost import train as Train, DMatrix
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y)
        self.dtrain = DMatrix(self.X_train.values, self.y_train.values)
        self.dtest = DMatrix(self.X_test.values)

        self.feat_names_mapper = {ident: name for ident,name in zip(self.dtrain.feature_names,X.columns)}
        
        self.params={'objective': 'reg:linear', 'max_depth':3, 'learning_rate':0.1, 'n_estimators':100, 'silent':1}
        self.num_round=2
        self.model = Train(self.params, self.dtrain, self.num_round)
        self.prediction = self.model.predict(self.dtest)

gbr = GBR(X, y)

print(MetricReport(gbr.prediction, gbr.y_test.values).show)

The error explained_variance_score scored at -42.65
The error mean_absolute_error scored at 2.248e+05
The error mean_squared_error scored at 6.071e+10
The error mean_squared_log_error scored at 2.779
The error median_absolute_error scored at 2.07e+05
The error r2_score scored at -259.8


In [6]:
class RFR:
    def __init__(self, X: pd.DataFrame = X, y: pd.DataFrame = y):
        from sklearn.ensemble import RandomForestRegressor
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y)
        self.model = RandomForestRegressor(n_estimators=100)
        self.model.fit(self.X_train, self.y_train)
        self.prediction = self.model.predict(self.X_test)

rfr = RFR(X, y)

print(MetricReport(rfr.prediction, rfr.y_test).show)

The error explained_variance_score scored at 0.7942
The error mean_absolute_error scored at 1.701e+04
The error mean_squared_error scored at 1.88e+09
The error mean_squared_log_error scored at 0.006886
The error median_absolute_error scored at 8.462e+03
The error r2_score scored at 0.7934


In [7]:
from training import gbr as g, rfr as r, REPORTS

REPORTS

...

{'gradient boost performance': 'The error explained_variance_score scored at 0.3099\nThe error mean_absolute_error scored at 2.185e+05\nThe error mean_squared_error scored at 5.284e+10\nThe error mean_squared_log_error scored at 2.776\nThe error median_absolute_error scored at 2.044e+05\nThe error r2_score scored at -6.158',
 'random forest performance': 'The error explained_variance_score scored at 0.966\nThe error mean_absolute_error scored at 1.279e+04\nThe error mean_squared_error scored at 3.325e+08\nThe error mean_squared_log_error scored at 0.004213\nThe error median_absolute_error scored at 9.832e+03\nThe error r2_score scored at 0.966'}

In [15]:
r.model.predict(np.array([[1,2,3,4,5,6,7,8]]))

array([151084.])