In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the pickled assignment data.
# NOTE(review): unpickling can execute arbitrary code - only load a file you trust.
with open('data/ass3.pickle', 'rb') as handle:
    data = pd.read_pickle(handle)

# The 'train' and 'dev' entries each unpack into a (features, targets) pair.
(X_train, y_train), (X_dev, y_dev) = data['train'], data['dev']

In [None]:

# Report the (rows, columns) shape of the training features.
print('Train data:')
rows_train, columns_train = X_train.shape
print('Number of Rows: {}, Number of Coloumns: {}'.format(rows_train, columns_train))

# Same report for the dev features.
print('Dev data:')
rows_dev, columns_dev = X_dev.shape
print('Number of Rows: {}, Number of Coloumns: {}'.format(rows_dev, columns_dev))


## First we run LazyPredict to choose the few best algorithms to continue with

In [None]:
from lazypredict.Supervised import LazyRegressor

# Run LazyPredict with the train split for fitting and the dev split for
# evaluation, then print the resulting model table.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = reg.fit(X_train, X_dev, y_train, y_dev)

print(models)

Our data has 13 features; we might need to drop some of them based on their importance.

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

def select_k_best(k, X, y, X_eval=None, random_state=42):
    """Keep the ``k`` features of ``X`` that score best on mutual information with ``y``.

    Parameters
    ----------
    k : int
        Number of features to keep; passed to ``SelectKBest`` as ``k``.
    X : array-like
        Feature matrix the selector is fitted on.
    y : array-like
        Targets used to score the features of ``X``.
    X_eval : array-like, optional
        Second feature matrix to reduce to the same selected columns.
        When omitted, the notebook-level ``X_dev`` is used, which is what the
        function always did before this parameter existed.
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_regression`` so that repeated runs
        select the same features.

    Returns
    -------
    tuple
        ``(X_selected, X_eval_selected)`` - both matrices restricted to the
        selected features.
    """
    from functools import partial

    if X_eval is None:
        # Backward compatibility: fall back to the global dev features.
        X_eval = X_dev

    # SelectKBest calls score_func(X, y); bind the seed so scoring is repeatable.
    score_func = partial(mutual_info_regression, random_state=random_state)
    k_best = SelectKBest(score_func, k=k)
    k_best.fit(X, y)
    return (k_best.transform(X), k_best.transform(X_eval))

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold


def cross_validation(reg, X, y, k, random_state=42):
    """Return the mean absolute error of ``reg`` averaged over ``k`` shuffled folds.

    Parameters
    ----------
    reg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every fold, so it is left fitted on the last training fold.
    X : array-like
        Feature matrix; converted with ``np.asarray`` so rows can be selected
        by integer position.
    y : array-like
        Targets aligned with the rows of ``X``; converted the same way.
    k : int
        Number of folds. (This argument used to be ignored in favour of a
        hard-coded 5.)
    random_state : int or None, default 42
        Seed for the fold shuffling so the score is repeatable.

    Returns
    -------
    float
        Average of the per-fold mean absolute errors.
    """
    # Positional indexing below needs arrays; a DataFrame would treat the
    # index array as column labels.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    fold_errors = []
    for train_index, test_index in kf.split(X):
        reg.fit(X[train_index], y[train_index])
        fold_errors.append(
            mean_absolute_error(y[test_index], reg.predict(X[test_index]))
        )
    return sum(fold_errors) / len(fold_errors)

def fit_predict(regressor, X_train, y_train, X_dev, y_dev=None):
    """Fit ``regressor`` on standardized train data and report train/dev MAE.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : array-like
        Training features and targets.
    X_dev : array-like
        Dev features, scaled with the scaler fitted on ``X_train``.
    y_dev : array-like, optional
        Dev targets. When omitted, the notebook-level ``y_dev`` is used, which
        is what the function always did before this parameter existed.

    Returns
    -------
    str
        Summary with the train MAE, the dev MAE (labelled "test score") and
        their difference ``train - dev`` (labelled "overfit"). Since MAE is an
        error, that difference is negative when the dev error is the larger one.
    """
    if y_dev is None:
        # The parameter shadows the global of the same name, so look it up explicitly.
        y_dev = globals()['y_dev']

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(X_train)
    scaled_X_dev = sc.transform(X_dev)
    regressor.fit(scaled_X_train, y_train)

    # Conventional (y_true, y_pred) argument order.
    train_score = mean_absolute_error(y_train, regressor.predict(scaled_X_train))
    dev_score = mean_absolute_error(y_dev, regressor.predict(scaled_X_dev))
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

def fit_predict_poly(regressor, X_train, y_train, X_dev, y_dev, degree=2):
    """Fit ``regressor`` on standardized polynomial features and report train/dev MAE.

    Parameters
    ----------
    regressor : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : array-like
        Training features and targets.
    X_dev, y_dev : array-like
        Dev features and targets.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    str
        Summary with the train MAE, the dev MAE (labelled "test score") and
        their difference ``train - dev`` (labelled "overfit").
    """
    # Fit the expansion and the scaler on train only, then apply both to dev.
    poly = PolynomialFeatures(degree)
    transformed_train = poly.fit_transform(X_train)
    transformed_dev = poly.transform(X_dev)

    sc = StandardScaler()
    scaled_X_train = sc.fit_transform(transformed_train)
    scaled_X_dev = sc.transform(transformed_dev)

    regressor.fit(scaled_X_train, y_train)

    # Conventional (y_true, y_pred) argument order.
    train_score = mean_absolute_error(y_train, regressor.predict(scaled_X_train))
    dev_score = mean_absolute_error(y_dev, regressor.predict(scaled_X_dev))
    over_fitting = train_score - dev_score

    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

## Initial test of the best algorithms using cross-validation

In [None]:
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def check_models(X, y, k):
    """Evaluate every regressor in ``regs`` on each top-``i`` feature subset.

    For each subset size ``i`` from 1 to the number of columns of ``X``, the
    features are selected with ``select_k_best`` and each regressor is scored
    three ways: k-fold cross-validation, a plain fit, and a polynomial fit.
    One line is printed per (regressor, subset size) pair.

    Previously the evaluation sat outside the inner loop, so every subset was
    computed but only the last one (all features) was ever scored.

    Parameters
    ----------
    X : array-like
        Two-dimensional training feature matrix.
    y : array-like
        Training targets.
    k : int
        Fold count forwarded to ``cross_validation``.
    """
    n_features = np.shape(X)[1]

    # Feature selection does not depend on the regressor, so do it once per
    # subset size instead of once per (regressor, subset size).
    subsets = [select_k_best(i, X, y) for i in range(1, n_features + 1)]

    for reg in regs:
        print(reg)
        for i, (X_train, X_dev) in enumerate(subsets, start=1):
            cv_score = cross_validation(regs[reg], np.array(X_train), np.array(y), k)
            plain_report = fit_predict(regs[reg], X_train, y, X_dev)
            # y_dev is the notebook-level dev target vector.
            poly_report = fit_predict_poly(regs[reg], X_train, y, X_dev, y_dev)
            print(f'k={i}: cv {cv_score} Predict {plain_report} Polynomial {poly_report}')

# Candidate regressors keyed by display name; each one is built with random_state=42.
regs = {
    label: estimator_cls(random_state=42)
    for label, estimator_cls in (
        ('Random Forest', RandomForestRegressor),
        ('AdaBoost', AdaBoostRegressor),
        ('Histogram Gradient Boosting', HistGradientBoostingRegressor),
        ('XGBoost', XGBRegressor),
        ('Light GBM', LGBMRegressor),
        ('ExtraTreesRegressor', ExtraTreesRegressor),
        ('Bagging Regressor', BaggingRegressor),
    )
}

# Score every candidate on the training split with 5 folds.
check_models(X_train, y_train, 5)

## Find the best hyperparameters for each model using RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the random forest.

# The function to measure the quality of a split
criterion = ['squared_error', 'absolute_error']
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree (None = no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
rf_grid =  {'criterion' : criterion,
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap}

# Score with MAE like the other searches and the rest of the notebook (the
# default would be the estimator's R^2), and seed the sampling so the search
# is repeatable. n_iter and cv are spelled out at their default values.
rf_random = RandomizedSearchCV(
    estimator = RandomForestRegressor(random_state=42),
    param_distributions = rf_grid,
    n_iter = 10,
    scoring = 'neg_mean_absolute_error',
    cv = 5,
    n_jobs = -1,
    random_state = 42)
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)
print(rf_random.best_estimator_)

In [None]:
# Randomized hyperparameter search for XGBoost, scored by (negated) MAE.
xgb_grid = {
    "learning_rate"     : [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] ,
    "max_depth"         : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight"  : [1, 3, 5, 7],
    "gamma"             : [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree"  : [0.3, 0.4, 0.5, 0.7]
}

# random_state seeds the sampling of the 50 candidates so reruns try the same
# settings; without it each run explores a different subset of the grid.
xgb_random = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=xgb_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42)
xgb_random.fit(X_train, y_train)
print(xgb_random.best_params_)
print(xgb_random.best_estimator_)

In [None]:
# Randomized hyperparameter search for the bagging regressor, scored by (negated) MAE.
bgr_grid = {
    'n_estimators': [20,50,100],
    'max_samples': [0.5,0.1],
    'max_features': [1,2,4,6,8],
    'bootstrap': [True, False],
}

# random_state seeds the sampling of the 50 candidates so reruns try the same
# settings; without it each run explores a different subset of the grid.
bgr_random = RandomizedSearchCV(
    BaggingRegressor(random_state=42),
    param_distributions=bgr_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42)

bgr_random.fit(X_train, y_train)
print(bgr_random.best_params_)
print(bgr_random.best_estimator_)

In [None]:
# Randomized hyperparameter search for the extra-trees regressor, scored by (negated) MAE.
xtr_grid = {
    'n_estimators': [10,50,100],
    'max_depth': [2,8,16,32,50],
    'min_samples_split': [2,4,6],
    'min_samples_leaf': [1,2],
    # 1.0 (use all features) replaces the former 'auto' option, which meant
    # the same thing for regressors but is rejected by current scikit-learn.
    'max_features': [1.0,'sqrt','log2'],
    'bootstrap': [True, False],
    # 'warm_start' was dropped from the grid: each candidate is fitted from
    # scratch, so it cannot change the score and only spent search budget on
    # duplicate settings.
}

# random_state seeds the sampling of the 50 candidates so reruns try the same settings.
xtr_random = RandomizedSearchCV(
    ExtraTreesRegressor(random_state=42),
    param_distributions=xtr_grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42)

xtr_random.fit(X_train, y_train)
print(xtr_random.best_params_)
print(xtr_random.best_estimator_)

## Test the new models with the 'dev' data

In [None]:


# Random forest with explicitly chosen hyperparameters.
rf = RandomForestRegressor(
    random_state=42,
    bootstrap=True,
    max_depth=70,
    min_samples_leaf=4,
    min_samples_split=10,
    n_estimators=400,
)

# XGBoost with every constructor argument spelled out.
xgb = XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7,
    gamma=0.0,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=12,
    min_child_weight=7,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)

# Extra-trees and bagging are used with their default hyperparameters here.
xtr = ExtraTreesRegressor(random_state=42)

bg = BaggingRegressor(random_state=42)

clfs = {'xgb': xgb, 'xtr': xtr, 'bg': bg, 'rf': rf}

# Fit each model on the train split and print its train/dev report.
for clf, estimator in clfs.items():
    print(f'{clf}: {fit_predict(estimator, X_train, y_train, X_dev)}')