# TM10007: Machine learning
## Week 3, lecture 2: Model complexity and optimization
#### Author: Martijn P. A. Starmans

In this exercise, you will learn how to use support vector machines and kernels using scikit-learn.

For more of these methods, visit https://scikit-learn.org/stable/modules/svm.html, https://scikit-learn.org/stable/modules/kernel_approximation.html 

In [None]:
!pip install sklearn numpy matplotlib

In [None]:
# General packages
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import datasets as ds
from scipy.stats import randint

# Classifiers and kernels
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Regularization
from sklearn.linear_model import Lasso, RidgeClassifier
from sklearn.feature_selection import SelectFromModel

# Model selection
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

# For the text classification dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from time import time

# Functions for plotting ROC curve
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize


In [None]:
# Some functions we will use
def colorplot(clf, ax, x, y, h=100, precomputer=None):
    '''
    Overlay the decision areas of a classifier as colors in an axes.

    Input:
        clf: trained classifier; its decision_function is used when present,
            otherwise predict_proba, otherwise predict
        ax: axis to overlay color mesh on
        x: feature on x-axis
        y: feature on y-axis
        h(optional): number of mesh steps along the longest axis
        precomputer(optional): maps the two mesh features to the feature
            space clf was trained in. Either an object with a ``transform``
            method (e.g. a fitted RBFSampler) or a kernel function (e.g.
            rbf_kernel), which is called as ``precomputer(mesh, X)`` with
            the notebook-level variable ``X``.
    '''
    # Create a meshgrid the size of the axis, padded by 5% on every side
    xstep = (x.max() - x.min()) / 20.0
    ystep = (y.max() - y.min()) / 20.0
    x_min, x_max = x.min() - xstep, x.max() + xstep
    y_min, y_max = y.min() - ystep, y.max() + ystep

    # Convert the number of steps into a step size along the longest side
    step = max(x_max - x_min, y_max - y_min) / h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    features = np.c_[xx.ravel(), yy.ravel()]
    if precomputer is not None:
        # Duck-typing instead of comparing against RBFSampler / rbf_kernel:
        # those names are not imported in this notebook, so referencing them
        # raised a NameError as soon as a precomputer was passed.
        if hasattr(precomputer, "transform"):
            features = precomputer.transform(features)
        elif callable(precomputer):
            # Kernel between the mesh points and the global samples X
            features = precomputer(features, X)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(features)
    elif hasattr(clf, "predict_proba"):
        Z = clf.predict_proba(features)
    else:
        Z = clf.predict(features)

    # For two-dimensional output, keep only the second column (class 1)
    if len(Z.shape) > 1:
        Z = Z[:, 1]

    # Put the result into a color plot; the colormap is passed by name
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap='RdBu_r', alpha=.8)
    
def load_breast_cancer(n_features=2):
    '''
    Return the sklearn breast cancer data set as (features, targets), with
    the features projected onto n_features PCA components.
    '''
    bunch = ds.load_breast_cancer()
    measurements, labels = bunch['data'], bunch['target']

    # Fit the PCA on all samples, then project those same samples
    reducer = PCA(n_components=n_features).fit(measurements)
    return reducer.transform(measurements), labels

def load_boston(n_features=1):
    '''
    Return the sklearn boston data set as (features, targets), with the
    features projected onto n_features PCA components.
    '''
    # NOTE(review): recent scikit-learn releases no longer ship load_boston;
    # confirm the installed version still provides it before calling this.
    bunch = ds.load_boston()
    targets = bunch['target']

    # Fit the PCA on all samples, then project those same samples
    reducer = PCA(n_components=n_features)
    reducer = reducer.fit(bunch['data'])
    reduced = reducer.transform(bunch['data'])
    return reduced, targets

def load_diabetes(n_features=1):
    '''
    Return the sklearn diabetes data set as (features, targets), with the
    features projected onto n_features PCA components.
    '''
    raw = ds.load_diabetes()
    samples = raw['data']

    # Fit the PCA on all samples, then project those same samples
    projection = PCA(n_components=n_features).fit(samples)
    return projection.transform(samples), raw['target']

def plot_learning_curve(estimator, title, X, y, axes, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw the training and cross-validation learning curves in one axes.

    The scores come from sklearn's ``learning_curve``; for both curves the
    mean over the cross-validation splits is plotted as a line, with a band
    of one standard deviation around it.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title for the chart.

    X : array-like
        Samples handed to ``learning_curve``.

    y : array-like
        Targets handed to ``learning_curve``.

    axes : matplotlib axes
        The single axes to draw the curves on.

    ylim : tuple (ymin, ymax), optional
        Limits of the y-axis; left untouched when None.

    cv : optional
        Cross-validation setting, forwarded unchanged to ``learning_curve``.

    n_jobs : int or None, optional (default=None)
        Number of parallel jobs, forwarded unchanged to ``learning_curve``.

    train_sizes : array-like, optional (default: np.linspace(0.1, 1.0, 5))
        Training set sizes, forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module.
    """
    # Axes decoration
    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    # Scores per training size (rows) and cross-validation split (columns)
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, standard deviation, color, legend label) for each curve
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(val_scores, axis=1), np.std(val_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    # Plot learning curve: first both bands, then both lines
    axes.grid()
    for mean, std, colour, _ in curves:
        axes.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                          color=colour)
    for mean, _, colour, label in curves:
        axes.plot(sizes, mean, 'o-', color=colour, label=label)
    axes.legend(loc="best")

    return plt

def load_text_dataset(N_features=100):
    '''
    Load dataset for classifying text documents by topic.

    Fetches the 'alt.atheism' and 'talk.religion.misc' categories of the
    20 newsgroups data (headers, footers and quotes removed), converts the
    documents to TF-IDF features and, when N_features is smaller than the
    number of extracted features, keeps the N_features best ones according
    to a chi-squared test fitted on the training subset.

    Input:
        N_features(optional): maximum number of features to keep

    Returns:
        X_train, X_test: feature matrices of the train and test subsets
        y_train, y_test: the corresponding targets
    '''
    categories = [
        'alt.atheism',
        'talk.religion.misc'
    ]

    remove = ('headers', 'footers', 'quotes')

    print("Loading 20 newsgroups dataset for categories:")
    print(categories if categories else "all")

    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42,
                                    remove=remove)

    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42,
                                   remove=remove)
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    def size_mb(docs):
        # Size of the UTF-8 encoded documents in megabytes
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(target_names))
    print()

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    use_hashing = False
    if use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if use_hashing:
        feature_names = None
    elif hasattr(vectorizer, 'get_feature_names_out'):
        # get_feature_names() was removed from recent scikit-learn releases
        feature_names = np.asarray(vectorizer.get_feature_names_out())
    else:
        # Older scikit-learn releases only offer get_feature_names()
        feature_names = np.asarray(vectorizer.get_feature_names())

    if N_features < X_train.shape[1]:
        print("Extracting %d best features by a chi-squared test" %
              N_features)
        t0 = time()
        ch2 = SelectKBest(chi2, k=N_features)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        # `is not None` because the truth value of an array is ambiguous
        if feature_names is not None:
            # keep selected feature names (not returned; handy for inspection)
            feature_names = feature_names[ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    return X_train, X_test, y_train, y_test


def plot_roc_curve(y_score, y_truth):
    '''
    Show the ROC curve of the scores in the second column of y_score
    against the true labels y_truth, with the area under it in the legend.
    '''
    # Scores belonging to class = 1
    positive_scores = y_score[:, 1]

    # Points of the ROC curve and the area under it
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_truth,
                                                           positive_scores)
    area = auc(false_pos_rate, true_pos_rate)

    # Draw the curve together with the diagonal as reference
    line_width = 2
    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange',
             lw=line_width, label='ROC curve (area = %0.2f)' % area)
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # Axes decoration
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

Let us first create again three example datasets to play with and plot the feature distributions in scatter plots.

In [None]:
# Create three two-feature classification datasets ...
X2, Y2 = ds.make_classification(n_samples=100, n_features=2,
                                n_informative=1, n_redundant=0,
                                n_clusters_per_class=1)
X3, Y3 = ds.make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=5)
X4, Y4 = load_breast_cancer()

# ... and show them side by side in scatter plots colored by class
fig = plt.figure(figsize=(24,8))
panels = [
    (131, X2, Y2, "One informative feature, one cluster per class"),
    (132, X3, Y3, "Two blobs, two classes"),
    (133, X4, Y4, "A more complicated problem"),
]
for position, features, labels, caption in panels:
    ax = fig.add_subplot(position)
    ax.set_title(caption, fontsize='small')
    ax.scatter(features[:, 0], features[:, 1], marker='o', c=labels,
               s=25, edgecolor='k', cmap=plt.cm.Paired)

## Regularization: random forest
Let us first check what a random forest with a varying number of trees (1, 5, 200) would do on each dataset when using a single train-test split in each dataset. Note that we fixed all randomness, so you will get the same answer every time: you would not normally do this in a random forest.

In [None]:
# Construct classifiers: random forests with 1, 5 and 200 trees
clsfs = [RandomForestClassifier(n_estimators=n_trees, random_state=42)
         for n_trees in (1, 5, 200)]

# Create lists of datasets to loop over
Xs = [X2, X3, X4]
Ys = [Y2, Y3, Y4]

# Subplot grid: one row for the raw data plus one row per classifier and one
# column per dataset. Deriving the grid from the lists keeps the figure
# filled; a fixed 7-row grid left the bottom rows empty.
n_rows = len(clsfs) + 1
n_cols = len(Xs)

# First make plot without classifiers:
num = 0
fig = plt.figure(figsize=(24,8*len(clsfs)))
for X, Y in zip(Xs, Ys):
    ax = fig.add_subplot(n_rows, n_cols, num + 1)
    ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y,
        s=25, edgecolor='k', cmap=plt.cm.Paired)
    num += 1

# Now use the classifiers on all datasets
for clf in clsfs:
    for X, Y in zip(Xs, Ys):
        # Split data in training and testing
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

        # Train on the training part only
        clf.fit(X_train, y_train)

        # Show all samples with the decision areas of the classifier
        ax = fig.add_subplot(n_rows, n_cols, num + 1)
        ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y,
            s=25, edgecolor='k', cmap=plt.cm.Paired)
        colorplot(clf, ax, X[:, 0], X[:, 1])

        # Report the number of errors on the held-out test part
        y_pred = clf.predict(X_test)
        t = ("Misclassified: %d / %d" % ((y_test != y_pred).sum(), y_test.shape[0]))
        ax.set_title(t)
        num += 1
        
# Note: you may get a FutureWarning, which you can for now just ignore

The complexity of the problem varies from the left to the right. We observe that:
- Left: all random forests work equally well. However, the decision boundary with a large number of trees becomes more complicated, and may not generalize as well as the simple boundary when only using a single tree.
- Middle: the single tree forest does not perform so well. The forests with 5 and 200 trees perform similarly, but the one with 5 trees performs a bit better. Based on performance, 5 trees would thus be most suitable. Additionally, you may favour a less complex solution if it performs similarly.
- Right: the problem is very complex, resulting in this case in the more complex 200 tree forest performing the best. However, even more trees may let it perform even better...

So which classifier would you pick for each problem, and why?

## Regularization: L1 and L2 norm
In the lectures, we have seen the L1 and L2 norm for regularization. These can be used in combination with a linear classifier in the Lasso and Ridge classifiers from sklearn. Both have a parameter $\alpha$ which controls the weight of the regularization term. Let us start by using the Ridge classifier.

In [None]:
# Load a larger dataset with more features
X_train, X_test, y_train, y_test = load_text_dataset(N_features=1000)

# This dataset contains samples of different text types. We have extracted two classes: atheist texts and religious texts.
# Originally, this dataset contained thousands of features. Using univariate selection, we can quickly select a subset of 1000.

# Display the weights and compute error for multiple values for alpha
n_alphas = 200
alphas = np.logspace(-10, -1, n_alphas)

# Construct classifiers
coefs = []
accuracies = []
times = []
for a in alphas:
    # Fit classifier and time the fitting
    clf = RidgeClassifier(alpha=a, fit_intercept=False)
    t0 = time()
    clf.fit(X_train, y_train)
    duration = time() - t0
    y_pred = clf.predict(X_test)
    print("\t Misclassified: %d / %d" % ((y_test != y_pred).sum(), y_test.shape[0]))

    # Append statistics. Accuracy is the fraction of CORRECTLY classified
    # test samples; counting the mismatches would give the error rate.
    accuracy = float((y_test == y_pred).sum()) / float(y_test.shape[0])
    times.append(duration)
    accuracies.append(accuracy)
    coefs.append(clf.coef_)

# #############################################################################
# Display results: one figure each for the weights, performance and times
for values, ylabel, title in [
        (np.squeeze(coefs), 'weights',
         'Ridge coefficients as a function of the regularization'),
        (accuracies, 'accuracies',
         'Performance as a function of the regularization'),
        (times, 'times (s)',
         'Fitting time as a function of the regularization')]:
    plt.figure()
    ax = plt.gca()
    ax.plot(alphas, values)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.axis('tight')
    plt.show()

# Note: you may get a FutureWarning, which you can for now just ignore

What do you observe:
- With respect to the value of the weights?
- With respect to the accuracy on the test set of the classifier?
- With respect to the times of fitting?

What would you conclude about when and how to use L2 regularization?

Let us now do the same for the Lasso estimator.

In [None]:
# Lasso is a regressor: its predictions are continuous values, not class
# labels. Comparing them directly to the labels would count nearly every
# sample as misclassified, so we threshold halfway between the two classes.
classes = np.unique(y_train)
threshold = classes.mean()

# Construct classifiers
coefs = []
accuracies = []
times = []
for a in alphas:
    # Fit classifier and time the fitting
    clf = Lasso(alpha=a, fit_intercept=False)
    t0 = time()
    clf.fit(X_train, y_train)
    duration = time() - t0

    # Convert the continuous output to class labels
    y_pred = np.where(clf.predict(X_test) > threshold, classes[-1], classes[0])
    print("\t Misclassified: %d / %d" % ((y_test != y_pred).sum(), y_test.shape[0]))

    # Append statistics. Accuracy is the fraction of CORRECTLY classified
    # test samples; counting the mismatches would give the error rate.
    accuracy = float((y_test == y_pred).sum()) / float(y_test.shape[0])
    times.append(duration)
    accuracies.append(accuracy)
    coefs.append(clf.coef_)

# #############################################################################
# Display results: one figure each for the weights, performance and times
for values, ylabel, title in [
        (np.squeeze(coefs), 'weights',
         'Lasso coefficients as a function of the regularization'),
        (accuracies, 'accuracies',
         'Performance as a function of the regularization'),
        (times, 'times (s)',
         'Fitting time as a function of the regularization')]:
    plt.figure()
    ax = plt.gca()
    ax.plot(alphas, values)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.axis('tight')
    plt.show()

# Note: you may get a FutureWarning, which you can for now just ignore

What do you observe:
- With respect to the value of the weights?
- With respect to the accuracy on the test set of the classifier?
- With respect to the times of fitting?

What would you conclude about when and how to use L1 regularization / Lasso? How do the L2 and L1 regularization compare?

The Lasso estimator encourages weights to go to zero. Features with a weight of zero do not contribute to the final prediction. Thus, the Lasso estimator can also be used as a feature selection method. In sklearn, this is implemented through the SelectFromModel module. Let us demonstrate how this works.

In [None]:
# Synthetic problem: 100 features of which only 2 are informative
X, Y = ds.make_classification(n_samples=100, n_features=100,
                              n_informative=2, n_redundant=0,
                              n_clusters_per_class=1, random_state=42)

# Hold out a third of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

# Baseline: LDA on all features, evaluated with an ROC curve
clf = LDA().fit(X_train, y_train)
y_score = clf.predict_proba(X_test)
plot_roc_curve(y_score, y_test)

# Feature selection: fit a Lasso inside SelectFromModel, which uses the
# median as selection threshold
lasso = Lasso(alpha=10**(-10), random_state=42)
selector = SelectFromModel(estimator=lasso, threshold='median')
selector = selector.fit(X_train, y_train)

# Reduce both sets to the selected features and report the counts
n_original = X_train.shape[1]
X_train, X_test = selector.transform(X_train), selector.transform(X_test)
n_selected = X_train.shape[1]
print(f"Selected {n_selected} from {n_original} features.")

# Same LDA and ROC curve, now on the selected features only
clf = LDA().fit(X_train, y_train)
y_score = clf.predict_proba(X_test)
plot_roc_curve(y_score, y_test)

Simply selecting half of the features, which still includes 48 non-informative features, has resulted in our LDA now performing much better on the test dataset than when using all original features! Whether to use the Lasso regularization or another feature selection method, depends on your dataset. Note that we have now fixed the regularization weight parameter to 10**(-10), which you would normally have to tune to your dataset.

## Learning Curves

As you can see above, hyperparameter tuning is important, and the optimal choice may vary per problem. Let us now demonstrate this using learning curves. This may take a minute.

In [None]:
# Largely based on example from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

# Subplot grid: one row for the raw data plus one row per classifier and one
# column per dataset. Deriving the grid from the lists keeps the figure
# filled; a fixed 7-row grid left the bottom rows empty.
n_rows = len(clsfs) + 1
n_cols = len(Xs)

# First make plot without classifiers:
num = 0
fig = plt.figure(figsize=(24,8*len(clsfs)))
for X, Y in zip(Xs, Ys):
    ax = fig.add_subplot(n_rows, n_cols, num + 1)
    ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y,
        s=25, edgecolor='k', cmap=plt.cm.Paired)
    num += 1

# Create a cross-validation object
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# Now plot the learning curve of every classifier on every dataset
for clf in clsfs:
    for X, Y in zip(Xs, Ys):
        # All classifiers are of the same type, so put the number of trees
        # in the title to tell the rows apart
        title = f"{type(clf).__name__} (n_estimators={clf.n_estimators})"
        ax = fig.add_subplot(n_rows, n_cols, num + 1)
        plot_learning_curve(clf, title, X, Y, ax, ylim=(0.3, 1.01), cv=cv)
        num += 1



What do the learning curves tell you? Can they help in deciding which classifier to use for each dataset?

## Hyperparameter optimization

We saw that the number of trees for a random forest heavily influences the performance and complexity. Let us try to automatically optimize the number of trees using a randomized search. This may take a minute or two. Note that we again fixed the random states, which you should not do normally.

In [None]:
# Our parameter to optimize is the number of estimators, which we vary uniformly
# between 1 and 399 (the upper bound of randint is exclusive)
param_distributions = {'n_estimators': randint(1, 400)}

# Subplot grid: one row for the raw data, one for the optimized classifiers,
# and one column per dataset
n_rows = 2
n_cols = len(Xs)

# First make plot without classifiers:
num = 0
fig = plt.figure(figsize=(24, 8 * n_rows))
for X, Y in zip(Xs, Ys):
    ax = fig.add_subplot(n_rows, n_cols, num + 1)
    ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y,
        s=25, edgecolor='k', cmap=plt.cm.Paired)
    num += 1

# Now use the classifiers on all datasets
fitted_clfs = []
for X, Y in zip(Xs, Ys):
    # Split data in training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

    # Within a 5-fold cross-validation, try out 20 different number of trees.
    # Both the search and the forest get a fixed random state, so that the
    # results are the same on every run.
    clf = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                             param_distributions, cv=5, n_iter=20,
                             random_state=42)

    # Fit the classifier
    clf.fit(X_train, y_train)

    # Save for next part
    fitted_clfs.append(clf)

    # Plotting
    ax = fig.add_subplot(n_rows, n_cols, num + 1)
    ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y,
        s=25, edgecolor='k', cmap=plt.cm.Paired)
    colorplot(clf, ax, X[:, 0], X[:, 1])
    y_pred = clf.predict(X_test)
    t = ("Misclassified: %d / %d" % ((y_test != y_pred).sum(), y_test.shape[0]))
    ax.set_title(t)
    num += 1

We can check the results of the fitting by looking at several attributes of the fitted ``RandomizedSearchCV`` object:

In [None]:
# Full cross-validation report, printed for the first problem only
first_search = fitted_clfs[0]
print(first_search.cv_results_)

# Per dataset: the refitted best estimator and the parameters it was built with
for num, clf in enumerate(fitted_clfs):
    print(f'\n The best estimator and parameters for dataset {num} are:')
    print(f'\t {clf.best_estimator_}')
    print(f'\t {clf.best_params_}')

Note: the ``RandomizedSearchCV`` selects the best estimator and best parameters purely based on performance, not on aspects such as generalizability and complexity. Moreover, there may be several solutions that perform equally well, in which case a random one is picked, or that perform similarly with only minor differences. Hence, it may be that the selected solution is very complex, while the second best solution performs only, say, 0.0000001% lower and is much less complex, and thus would be a better choice. Thus, be careful in your optimization!