Regression is often evaluated with RMSE, R_squared, however we can use our own metric and parameter search. See Regression_review notebook

In [15]:
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

In [3]:
boston = load_boston()
X= boston.data
y =boston.target

X_train,X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=123)

## KNN model

In [4]:
knn_reg = KNeighborsRegressor()
params ={'n_neighbors':list(range(3,20,1))}
knn_rs = RandomizedSearchCV(knn_reg, params, cv=10, n_iter=15)
knn_rs.fit(X_train, y_train)
knn_rs.best_score_

0.47958639677345277

## Ridge (linear) model

In [5]:
cross_val_score(Ridge(), X_train,y_train,cv=10).mean()

0.71256092193549692

## Gradient Boosting model

In [8]:
cross_val_score(GradientBoostingRegressor(max_depth=7),X_train,y_train, cv=10).mean()

0.80315211731612224

In [9]:
cross_val_score(RandomForestRegressor(), X_train,y_train, cv=10).mean()

0.85268135130634948

## Parameter search

In [10]:
params= {'n_estimators':[4000],'learning_rate':[0.01], 'max_depth':[1,2,3,5,7]}
rs_inst_a = RandomizedSearchCV(GradientBoostingRegressor(), params, n_iter =5, n_jobs=-1)
rs_inst_a.fit(X_train,y_train)
rs_inst_a.best_params_

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 4000}

In [13]:
rs_inst_a.best_score_

0.88157192651081739

## Using other metrics

In [14]:
def mape_score(y_test,y_pred):
    return (np.abs(y_test - y_pred)/y_test).mean()

In [29]:
from numba import autojit
@autojit
def mape_score(y_test,y_pred):
    sum_total = 0
    y_vec_length= len(y_test)
    for index in range(y_vec_length):
        sum_total += (1 - (y_pred[index]/y_test[index]))
    return sum_total/ y_vec_length    

In [30]:
mape_scorer = make_scorer(mape_score, greater_is_better=False)
params = {'n_estimators':[4000],'learning_rate':[0.01],'max_depth':[1,2,3,4,5]}
rs_inst_b = RandomizedSearchCV(GradientBoostingRegressor(),params,n_iter=3, n_jobs=-1, scoring=mape_scorer)

In [31]:
rs_inst_b.fit(X_train,y_train)
rs_inst_b.best_score_

0.036681992893417767

In [32]:
## Generating a scorere. The lower the better, unlike R-squared: the higher the better
mape_scorer = make_scorer(mape_score,greater_is_better = False)

In [34]:
## run grid search
params = {'n_estimators':[4000],'learning_rate':[0.01],'max_depth':[1,2,3,4,5]}
rs_inst_b = RandomizedSearchCV(GradientBoostingRegressor(),params, n_iter=3, n_jobs=-1, scoring =mape_scorer)
rs_inst_b.fit(X_train,y_train)
rs_inst_b.best_score_

0.036681992893417802

In [35]:
rs_inst_b.best_params_

{'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 4000}