# Intro

## pip installations, libraries, paths

In [1]:
# pip install lazypredict

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import multiprocessing

from sklearn.model_selection import PredefinedSplit, GridSearchCV, RandomizedSearchCV
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

In [3]:
# Set paths.
# The data root defaults to the original local folder; set the
# HITFINDER_DATA_LOC environment variable to run the notebook against a
# different location without editing this cell.
DATA_LOC = os.environ.get(
    'HITFINDER_DATA_LOC',
    r'C:\Users\enriq\OneDrive\Desktop\Work\Code\HitFinder\Data',
)
TRACKS_LOC = os.path.join(DATA_LOC, 'Tracks')
MODELS_LOC = os.path.join(DATA_LOC, 'Models')
MODELS_VERSIONED_LOC = os.path.join(MODELS_LOC, 'Versioned')
MODELS_FINAL_LOC = os.path.join(MODELS_LOC, 'Final')
DATA_PATH = os.path.join(TRACKS_LOC, 'modeling_tracks.csv')
FEAT_LISTS_PATH = os.path.join(DATA_LOC, 'Feature Lists', 'feature_lists.p')
# Other.
# Rows are assigned to folds through the 'TEST' column: values 1..NUM_FOLDS-1
# are used for cross-validation and value NUM_FOLDS is the holdout.
NUM_FOLDS = 5
NUM_TRAIN_SPLITS = NUM_FOLDS - 1
RANDOM_STATE = 0
np.random.seed(RANDOM_STATE)
# When False, previously saved LazyPredict results are read from disk
# instead of being recomputed.
DO_LAZYPRED = False

## Read in data

In [4]:
# Load the tracks table and discard fully duplicated rows; the row index is
# renumbered after the duplicates are dropped.
tracks = pd.read_csv(DATA_PATH).drop_duplicates(ignore_index = True)

In [5]:
# Load the pickled feature lists.
# NOTE: unpickling can execute arbitrary code, so FEAT_LISTS_PATH must only
# ever point at a file produced by this project.
with open(FEAT_LISTS_PATH, mode = 'rb') as feat_lists_file:
    feature_lists = pickle.load(feat_lists_file)

In [6]:
# Target column for modeling ('is_top_10' would be the binary-classification
# alternative).
TRGT_FEATURE = 'popularity'

# Columns that are never used as model inputs.
_NON_FEATURE_COLS = [
    'TEST',
    'name',
    'id',
    'artist',
    'artist_id',   # keep ?
    'album',
]
# Both candidate targets are excluded, whichever one TRGT_FEATURE selects.
_CANDIDATE_TARGETS = ['popularity', 'is_top_10']
NON_PREDICTORS = _NON_FEATURE_COLS + _CANDIDATE_TARGETS

In [7]:
# Row masks: folds 1..NUM_FOLDS-1 are the cross-validation rows, fold NUM_FOLDS
# is the holdout (validation) set.
train_idx = tracks['TEST'].isin(range(1, NUM_FOLDS))
val_idx = tracks['TEST'] == NUM_FOLDS

# Cross-validation rows with every column kept (the fold labels are needed later).
tracks_no_holdout = tracks.loc[train_idx]

# Predictors and target over all rows, then sliced by the two masks.
X = tracks.drop(columns = NON_PREDICTORS)
y = tracks[TRGT_FEATURE]
X_cv = X.loc[train_idx]
X_val = X.loc[val_idx]
y_cv = y.loc[train_idx]
y_val = y.loc[val_idx]

# LazyPredict

In [8]:
def do_lazypred(
        feature_list_num: int,
        flists: dict,
        X_train_test: pd.DataFrame = X_cv,
        y_train_test: pd.Series = y_cv,
        n_models: int = 10,
        random_state: int = RANDOM_STATE,
        min_folds: int = 3,
) -> pd.DataFrame:
    """Run LazyRegressor on every CV fold for one feature list and average the RMSE.

    For each fold 1..NUM_FOLDS-1 the rows of that fold are used as the test set
    and the remaining rows as the training set. The first ``n_models`` rows of
    the LazyRegressor results table are recorded per fold. Models recorded in
    at least ``min_folds`` folds are kept and their RMSE is averaged.

    Parameters
    ----------
    feature_list_num : key into ``flists`` selecting the feature list to use.
    flists : mapping from feature-list number to the column names to select
        from ``X_train_test``.
    X_train_test, y_train_test : predictors and target. Fold membership is
        read from the global ``tracks_no_holdout['TEST']``, so these must
        share its row index.
    n_models : number of leading rows of each fold's results table to record.
    random_state : seed forwarded to LazyRegressor.
    min_folds : minimum number of folds a model must be recorded in to be
        kept in the output.

    Returns
    -------
    DataFrame with columns ``feat_list`` (int32), ``model`` and
    ``avg_test_RMSE``, sorted by ascending ``avg_test_RMSE``.
    """
    out_cols = ['feat_list', 'model', 'avg_test_RMSE']

    # Collect one small frame per fold; this avoids pre-allocating a NaN
    # scaffold and also copes with LazyRegressor returning fewer than
    # n_models rows.
    fold_frames = []
    print('\nLazyPredict on feature_list_num', feature_list_num)
    X_in = X_train_test[flists[feature_list_num]]
    for fold_n in range(1, NUM_FOLDS):
        test_idx = tracks_no_holdout['TEST'] == fold_n
        X_train, X_test = X_in.loc[~test_idx], X_in.loc[test_idx]
        y_train, y_test = y_train_test.loc[~test_idx], y_train_test.loc[test_idx]
        model = LazyRegressor(random_state = random_state)
        print('\tTesting on fold #', fold_n)
        models, _ = model.fit(X_train, X_test, y_train, y_test)

        # Record metrics: model names are the index of the results table.
        top_models = models.head(n_models)
        fold_frames.append(pd.DataFrame({
            'model': top_models.index.tolist(),
            'test_RMSE': top_models['RMSE'].to_numpy(),
        }))

    # Prepare for output.
    print('Doing metrics preparation.')
    if not fold_frames:
        # No CV folds to evaluate (NUM_FOLDS <= 1): return an empty, well-formed frame.
        empty = pd.DataFrame(columns = out_cols)
        empty['feat_list'] = empty['feat_list'].astype('int32')
        return empty

    metrics = pd.concat(fold_frames, ignore_index = True)
    model_counts = metrics['model'].value_counts()
    save_models = model_counts[model_counts >= min_folds].index.tolist()
    metrics = (
        metrics.loc[metrics['model'].isin(save_models)]
        .groupby('model', as_index = False)['test_RMSE'].mean()   # get RMSE means
        .sort_values('test_RMSE')
        .reset_index(drop = True)
        .rename(columns = {'test_RMSE': 'avg_test_RMSE'})
    )
    metrics['feat_list'] = feature_list_num
    metrics['feat_list'] = metrics['feat_list'].astype('int32')

    return metrics[out_cols]

In [9]:
# Get best models for the feature lists obtained.
# Single definition of the results file; path components are joined separately
# so no OS-specific separator is embedded in a string.
LAZYPRED_RES_PATH = os.path.join(DATA_LOC, 'Model Results', 'lazypred_res.csv')
if DO_LAZYPRED:
    # Collect the per-feature-list results and concatenate once, rather than
    # growing a DataFrame inside the loop.
    lazypred_parts = [
        do_lazypred(flist_num, feature_lists) for flist_num in feature_lists
    ]
    lazypred_res = (
        pd.concat(lazypred_parts, ignore_index = True)
        if lazypred_parts else pd.DataFrame()
    )
    # Save LazyPredict results.
    lazypred_res.to_csv(LAZYPRED_RES_PATH, index = False)
else:
    # Reuse the results saved by an earlier run.
    lazypred_res = pd.read_csv(LAZYPRED_RES_PATH)

lazypred_res.sort_values('avg_test_RMSE').head(10)

Unnamed: 0,feat_list,model,avg_test_RMSE
45,8,RandomForestRegressor,18.57
46,8,LGBMRegressor,18.58
47,8,HistGradientBoostingRegressor,18.6
38,7,LGBMRegressor,18.62
39,7,HistGradientBoostingRegressor,18.65
48,8,GradientBoostingRegressor,18.66
31,6,LGBMRegressor,18.68
32,6,HistGradientBoostingRegressor,18.68
49,8,MLPRegressor,18.68
50,8,NuSVR,18.68


# Modeling

Use LazyPredict results to dig further into the indicated models.

## Intro

In [10]:
# Scorer for the grid searches. greater_is_better = False makes the scorer
# return negated RMSE, so a larger score means a smaller error.
rmse_scorer = make_scorer(rmse, greater_is_better = False)
# Cross-validation splitter driven by the fold labels already stored in the
# 'TEST' column of the non-holdout rows (only 1-4).
cv_no_holdout = PredefinedSplit(test_fold = tracks_no_holdout['TEST'])

In [12]:
def do_model(
        model_name: str,
        model_lib_str: str,
        params: dict,
        X_in: pd.DataFrame,
        y_in: pd.Series = y_cv,
):
    """Fit a grid search for one estimator, or load it if it was saved before.

    The fitted ``GridSearchCV`` object is pickled to
    ``MODELS_VERSIONED_LOC/model_name``. If that file already exists it is
    loaded and returned without any fitting. The cache is keyed on the file
    name only, so use a new ``model_name`` whenever ``params`` or ``X_in``
    change.

    Parameters
    ----------
    model_name : file name of the pickled search inside MODELS_VERSIONED_LOC.
    model_lib_str : expression naming the estimator class as it is visible in
        this notebook, e.g. ``'lgb.LGBMRegressor'``.
    params : parameter grid handed to GridSearchCV.
    X_in, y_in : predictors and target to fit on. Rows must line up with the
        folds of ``cv_no_holdout``.

    Returns
    -------
    The fitted (or unpickled) GridSearchCV object.
    """
    model_path = os.path.join(MODELS_VERSIONED_LOC, model_name)

    # Read the model in if we already made it. Checking the path itself works
    # even when the folder does not exist yet.
    if os.path.isfile(model_path):
        print('Reading in', model_name + '.')
        # NOTE: pickle.load can execute arbitrary code; only load files
        # written by this notebook.
        with open(model_path, 'rb') as handle:
            return pickle.load(handle)

    # Run the model if it doesn't exist already.
    print('Creating', model_name + '.')
    # Make sure the output folder exists before the (potentially long) search,
    # so a missing folder cannot discard a finished fit.
    os.makedirs(os.path.dirname(model_path), exist_ok = True)

    # NOTE(review): eval executes model_lib_str as code. It must only ever
    # receive literals typed in this notebook, never external input.
    estimator = eval(model_lib_str + '(random_state = RANDOM_STATE)')
    model = GridSearchCV(
        estimator, params, cv = cv_no_holdout, scoring = rmse_scorer,
    )
    model.fit(X_in, y_in)

    # Save model.
    with open(model_path, 'wb') as handle:
        pickle.dump(model, handle, protocol = pickle.HIGHEST_PROTOCOL)

    return model

In [40]:
def get_metrics(
        model,
        X_val_in: pd.DataFrame,
        y_val_in: pd.Series = y_val,
) -> dict:
    """Summarise a fitted search: CV RMSE of the best candidate plus holdout RMSE.

    Parameters
    ----------
    model : fitted search object exposing ``best_index_``, ``cv_results_``,
        ``n_splits_`` and ``best_estimator_`` (e.g. the GridSearchCV returned
        by ``do_model``).
    X_val_in : holdout predictors, with the same columns the model was fit on.
    y_val_in : holdout target.

    Returns
    -------
    dict with ``test_mean_RMSE``, one ``test_fold_<k>_RMSE`` entry per CV
    split (k starting at 1) and ``val_RMSE``.
    """
    # Get test metrics from the model's best estimator. The search was scored
    # with a negated-RMSE scorer, so the sign is flipped back here.
    best_idx = model.best_index_
    res = model.cv_results_
    metrics = {'test_mean_RMSE': float(-res['mean_test_score'][best_idx])}
    # One entry per CV split, however many splits the search was run with.
    for split_n in range(model.n_splits_):
        split_score = res[f'split{split_n}_test_score'][best_idx]
        metrics[f'test_fold_{split_n + 1}_RMSE'] = float(-split_score)

    # Get validation metrics.
    best_model = model.best_estimator_
    val_pred = best_model.predict(X_val_in)
    metrics['val_RMSE'] = float(rmse(y_val_in, val_pred))

    return metrics

## feat_list_8

### RandomForestRegressor (M45)

In [13]:
# Restrict the cross-validation and holdout predictors to feature list 8.
# Copies are taken so later changes cannot touch X_cv / X_val.
features = feature_lists[8]
X_in, X_val_tmp = (frame[features].copy() for frame in (X_cv, X_val))

In [14]:
# Set modeling parameters.
# Commented-out entries are left at the estimator's defaults.
params = {
    'criterion': ['squared_error', 'absolute_error'],
    'max_depth': [None, 2, 3, 5],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 3, 5],
    # 'min_weight_fraction_leaf': [0.0],
    # 'max_features': [1.0],
    # 'max_leaf_nodes': [None],
    # 'min_impurity_decrease': [0.0],
    # 'bootstrap': [True],
    # 'oob_score': [False],
    'n_jobs': [-1],
    # 'random_state': [RANDOM_STATE],
    # 'verbose': [0],
    # warm_start is not searched: GridSearchCV clones the estimator before
    # every fit, so there is never a previous fit to reuse and True/False
    # give identical forests. Searching both only doubles the run time.
    'warm_start': [False],
    # 'ccp_alpha': [0.0],
    'max_samples': [None],
    # 'monotonic_cst': [None],
}

In [15]:
# model = do_model('m45_v1.pkl', 'RandomForestRegressor', params, X_in)

In [16]:
# model.best_estimator_.predict(X_in)

### LGBMRegressor (M46)

In [17]:
# Parameter grid for the LGBMRegressor search.
# Commented-out entries are not searched and stay at the estimator's defaults.
params = dict(
    # boosting_type = ['gbdt'],
    num_leaves = [26, 31, 36],
    max_depth = [-1],
    learning_rate = [0.1, 0.01],
    n_estimators = [100, 150],
    # subsample_for_bin = [200000],
    # objective = [None],
    # class_weight = [None],
    min_split_gain = [0.0],
    min_child_weight = [0.01, 0.001, 0.0001],
    min_child_samples = [15, 20],
    # subsample = [1.0],
    # subsample_freq = [0],
    # colsample_bytree = [1.0],
    reg_alpha = [0.0, 0.5],
    reg_lambda = [0.0, 0.5],
    # random_state = [None],
    n_jobs = [-1],
    importance_type = ['split'],
)

In [18]:
# Fit the LGBM grid search on the current feature list, or load it if
# 'm46_v1.pkl' was already saved by an earlier run.
model = do_model(
    model_name = 'm46_v1.pkl',
    model_lib_str = 'lgb.LGBMRegressor',
    params = params,
    X_in = X_in,
)

Reading in m46_v1.pkl.


In [41]:
# Cross-validation RMSE of the best candidate and RMSE on the holdout rows.
get_metrics(model = model, X_val_in = X_val_tmp)

{'test_mean_RMSE': 18.56527882153417,
 'test_fold_1_RMSE': 18.664613060420848,
 'test_fold_2_RMSE': 18.508549385183954,
 'test_fold_3_RMSE': 18.376529221277455,
 'test_fold_4_RMSE': 18.711423619254436,
 'val_RMSE': 18.682831472086587}

### HistGradientBoostingRegressor (M47)

### GradientBoostingRegressor (M48)

## feat_list_7

### LGBMRegressor (M38)

### HistGradientBoostingRegressor (M39)