In [1]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from models import RidgeWithMLP, RidgeWithGBRT

from get_dataset import get_dataset_with_features, split_dataset_to_train_and_validation

In [2]:
# Weights & Biases client; used below for run tracking (wandb.init / wandb.log)
# and for hyperparameter sweeps (wandb.sweep / wandb.agent).
import wandb
# Authenticate this session with W&B before any run or sweep is created.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrmnigm[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
def train_run(config=None):
    """Fit one model inside a W&B run and log its validation metrics.

    The function takes a single optional argument so that it can be handed
    to ``wandb.agent`` as the per-run callback.

    Args:
        config: Optional initial configuration forwarded to ``wandb.init``.
            The model to train is then chosen from ``wandb.config`` through
            its ``"model"`` entry.

    Side effects:
        Logs regressor diagnostic plots, a learning curve, and the scalar
        metrics ``RMSE``, ``R2`` and ``MAPE`` to the active W&B run.
    """
    with wandb.init(config=config):
        run_config = wandb.config
        estimator = model_provider(run_config)

        # The dataset is fetched and split on every call of this function.
        dataset_parts = get_dataset_with_features()
        train_X, train_y, validation_X, validation_y = (
            split_dataset_to_train_and_validation(*dataset_parts)
        )

        estimator.fit(train_X, train_y)
        predictions = estimator.predict(validation_X)

        # Scalar metrics are computed on the validation split only.
        mse = mean_squared_error(validation_y, predictions)
        metrics = {
            "RMSE": mse ** 0.5,
            "R2": r2_score(validation_y, predictions),
            "MAPE": mean_absolute_percentage_error(validation_y, predictions),
        }

        wandb.sklearn.plot_regressor(
            estimator,
            train_X,
            validation_X,
            train_y,
            validation_y,
            model_name=f"{run_config['model']}",
        )
        wandb.sklearn.plot_learning_curve(estimator, train_X, train_y)
        wandb.log(metrics)

In [4]:
def model_provider(config):
    """Build an unfitted regressor for the model named in ``config``.

    Args:
        config: Mapping-like object (for example ``wandb.config``) whose
            ``"model"`` entry is one of ``"catboost"``, ``"random_forest"``,
            ``"linear_with_gbrt"`` or ``"linear_with_mlp"``.

    Returns:
        A newly constructed estimator for the requested model.

    Raises:
        ValueError: If ``config["model"]`` is not one of the supported names.
    """
    model_name = config['model']
    if model_name == 'catboost':
        return CatBoostRegressor(verbose=False)
    if model_name == 'random_forest':
        return RandomForestRegressor()
    if model_name == 'linear_with_gbrt':
        return RidgeWithGBRT(tree_coef=0.8)
    if model_name == 'linear_with_mlp':
        return RidgeWithMLP(mlp_coef=0.8)
    # Fail here with a clear message instead of returning None and letting
    # the caller crash later with an unrelated AttributeError on `.fit`.
    raise ValueError(
        f"Unknown model {model_name!r}; expected one of: "
        "'catboost', 'random_forest', 'linear_with_gbrt', 'linear_with_mlp'"
    )

# Grid sweep with one run per listed model; MAPE (to be minimized) is the
# metric declared for the sweep.
sweep_config = dict(
    method="grid",
    metric=dict(name="MAPE", goal="minimize"),
    parameters=dict(
        model=dict(
            values=[
                "catboost",
                "random_forest",
                "linear_with_gbrt",
                "linear_with_mlp",
            ],
        ),
    ),
)
sweep_id = wandb.sweep(sweep_config, project="mlsd-hw02")
# count equals the number of model names in the grid above.
wandb.agent(sweep_id, function=train_run, count=4)

Create sweep with ID: yrsmme6p
Sweep URL: https://wandb.ai/rmnigm/mlsd-hw02/sweeps/yrsmme6p


[34m[1mwandb[0m: Agent Starting Run: 0wkhtp96 with config:
[34m[1mwandb[0m: 	model: catboost


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting catboost.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


0,1
MAPE,▁
R2,▁
RMSE,▁

0,1
MAPE,0.02496
R2,0.52843
RMSE,6.63588


[34m[1mwandb[0m: Agent Starting Run: tjjmy8tn with config:
[34m[1mwandb[0m: 	model: random_forest


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting random_forest.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.


0,1
MAPE,▁
R2,▁
RMSE,▁

0,1
MAPE,0.02919
R2,0.29219
RMSE,8.12988


[34m[1mwandb[0m: Agent Starting Run: 8djenuwj with config:
[34m[1mwandb[0m: 	model: linear_with_gbrt


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting linear_with_gbrt.
[34m[1mwandb[0m: Logged summary metrics.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0,1
MAPE,▁
R2,▁
RMSE,▁

0,1
MAPE,0.02097
R2,0.6471
RMSE,5.74055


[34m[1mwandb[0m: Agent Starting Run: xuccbhin with config:
[34m[1mwandb[0m: 	model: linear_with_mlp


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting linear_with_mlp.
[34m[1mwandb[0m: Logged summary metrics.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
[34m[1mwandb[0m: Logged learning curve.
[34m[1mwandb[0m: Logged outlier candidates.
[34m[1mwandb[0m: Logged residuals.
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0,1
MAPE,▁
R2,▁
RMSE,▁

0,1
MAPE,368.3125
R2,-154668282.39446
RMSE,120178.25152


In [4]:
def model_provider(config):
    """Build an unfitted regressor from sweep-sampled hyperparameters.

    Args:
        config: Mapping-like object (for example ``wandb.config``). Its
            ``"model"`` entry must be ``"linear_with_gbrt"`` or
            ``"catboost"``. Both models read ``tree_n_estimators``,
            ``tree_depth``, ``tree_learning_rate`` and ``tree_l2_leaf_reg``;
            ``"linear_with_gbrt"`` additionally reads ``tree_coef`` and
            ``ridge_alpha``.

    Returns:
        A newly constructed estimator for the requested model.

    Raises:
        ValueError: If ``config["model"]`` is not one of the supported names.
        KeyError: If a hyperparameter required by the selected model is
            missing from a plain-dict ``config``.
    """
    model_name = config['model']
    if model_name == 'linear_with_gbrt':
        return RidgeWithGBRT(
            tree_coef=config["tree_coef"],
            # Quantized sweep distributions can deliver whole numbers as
            # floats (e.g. 250.0); tree counts and depths must be integers.
            tree_n_estimators=int(config["tree_n_estimators"]),
            tree_depth=int(config["tree_depth"]),
            tree_learning_rate=config["tree_learning_rate"],
            tree_l2_leaf_reg=config["tree_l2_leaf_reg"],
            ridge_alpha=config["ridge_alpha"],
        )
    if model_name == 'catboost':
        return CatBoostRegressor(
            verbose=False,
            # Same integer coercion as above for the sampled tree settings.
            n_estimators=int(config["tree_n_estimators"]),
            max_depth=int(config["tree_depth"]),
            learning_rate=config["tree_learning_rate"],
            l2_leaf_reg=config["tree_l2_leaf_reg"],
        )
    # Fail here with a clear message instead of returning None and letting
    # the caller crash later with an unrelated AttributeError on `.fit`.
    raise ValueError(
        f"Unknown model {model_name!r}; expected one of: "
        "'linear_with_gbrt', 'catboost'"
    )

In [5]:
# Bayesian search over the model choice and its hyperparameters, with MAPE
# (to be minimized) as the sweep metric. The `ridge_alpha` and `tree_coef`
# entries are only read by model_provider for "linear_with_gbrt".
sweep_config = dict(
    method="bayes",
    metric=dict(name="MAPE", goal="minimize"),
    parameters=dict(
        model=dict(values=["linear_with_gbrt", "catboost"]),
        ridge_alpha=dict(distribution="uniform", min=0.001, max=1.0),
        tree_coef=dict(distribution="uniform", min=0.3, max=1.0),
        tree_n_estimators=dict(
            distribution="q_log_uniform_values", min=100, max=1000
        ),
        tree_depth=dict(distribution="q_log_uniform_values", min=3, max=8),
        tree_learning_rate=dict(distribution="uniform", min=0.001, max=0.1),
        tree_l2_leaf_reg=dict(distribution="uniform", min=0.5, max=5.0),
    ),
)

# Register the sweep with W&B; the returned id is what the agent consumes.
sweep_id = wandb.sweep(sweep_config, project="mlsd-hw02")

Create sweep with ID: ia7kw4ne
Sweep URL: https://wandb.ai/rmnigm/mlsd-hw02/sweeps/ia7kw4ne


In [None]:
# Launch an agent for the sweep id created earlier; it calls train_run once
# per trial and is capped at 50 trials.
wandb.agent(sweep_id, function=train_run, count=50)