In [None]:
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.utils.random import sample_without_replacement
from sklearn.model_selection import train_test_split

# `display` and `HTML` belong to the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# CSS override: let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Allow Altair charts to embed more rows than its default row limit, and use the
# default renderer.
alt.data_transformers.disable_max_rows()
alt.renderers.enable('default')

# small parameter overview



**acousticness**: A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic.


**energy**: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.

**valence**: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry)

**instrumentalness**: Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0.


**key**: The key the track is in. Integers map to pitches using standard Pitch Class notation.


**liveness**: Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live.


**loudness**: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB.


**mode**: Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0.


**speechiness**: Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks.

[source](https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-analysis/)

**task**: predict the popularity of a song (regression) based solely on other features/characteristics of the music track

- the impact of the artist's name on the popularity seems quite obvious, but using it is also quite a complex task (high-cardinality data requiring a lot of processing), so I'll drop this parameter
    - it also seems quite fun to focus primarily on the parameters characterizing the music track itself
- **production year** and **release date** are in most cases very close to one another (at least at year resolution), so I'll drop the release date

In [None]:
# Read the track table, discard the columns excluded from modelling (release date,
# track name, artists) and hold out 30 % of the rows as a test set.
# From here on `df` is the training part only.
df, test_df = train_test_split(
    pd.read_csv('/kaggle/input/spotify-dataset-19212020-160k-tracks/data.csv')
      .drop(columns=['release_date', 'name', 'artists']),
    test_size=0.3,
    random_state=42,
)

In [None]:
# Summary statistics of the training split produced by the train/test split above.
df.describe()

In [None]:
# Per-column overview: number of distinct values and number of missing entries.
pd.DataFrame({
    'nunique': df.nunique(),
    'nnull': df.isna().sum(),
})

# (spotify specific) music track characteristics

In [None]:
# Violin plots of the int/float columns that remain after removing the target and the
# predictors listed below; quartiles are drawn inside each violin.
excluded = [
    'year', 'duration_ms', 'loudness', 'popularity',
    'tempo', 'key', 'mode', 'explicit',
]
track_features = df.select_dtypes([int, float]).drop(columns=excluded)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 8))
sns.violinplot(data=track_features, inner='quartile', ax=ax)

In [None]:
# Box plots of the same column selection as the violin plot: all int/float columns
# except the target and the predictors listed below.
excluded = [
    'year', 'duration_ms', 'loudness', 'popularity',
    'tempo', 'key', 'mode', 'explicit',
]
track_features = df.select_dtypes([int, float]).drop(columns=excluded)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 8))
sns.boxplot(data=track_features, ax=ax)

The characteristic "features" of a track provided by Spotify with a (0,1) range exhibit very strange and mostly highly non-normal distributions.

**instrumentalness, speechiness, liveness**
- all three show a highly skewed distribution (narrow "gaussian" with a heavy right tail)
- hint of superposition (small bumps at the right end)
- instrumentalness and speechiness have a very similar shape

**danceability**
- very flat symmetric and centered distribution (0.5)

**valence,energy**
- almost uniform distribution
    
**acousticness**
- feels like a binary entity, with two bulges occurring at both ends of the distribution
- otherwise very flat
- median and mean have no meaning

Unfortunately I do not have enough computing resources to perform a full bivariate investigation
(even with random subsampling).

# other predictors + target

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histograms of the remaining predictors and of the target, one chart row per group.
# The charts are built from a random subsample of the training frame. The draw is
# seeded so the figure is reproducible, and capped at the frame size because
# sample_without_replacement raises when asked for more rows than exist.
sample_size = min(15000, df.shape[0])
idx = sample_without_replacement(df.shape[0], sample_size, random_state=42)

base = alt.Chart(df.iloc[idx]).mark_bar()
fig = alt.vconcat()

# group title -> columns plotted side by side in that row
val = {
    'continuous predictor': ['tempo', 'loudness', 'duration_ms', 'year'],
    'discrete predictor': ['key', 'mode', 'explicit'],
    'target': ['popularity'],
}
for p, cols in val.items():
    row = alt.hconcat(title=p)
    for c in cols:
        row |= base.encode(
                alt.X(c, bin=alt.Bin(maxbins=50)),
                alt.Y('count():Q'),
                tooltip=['count():Q'],
            ).properties(
                title=c,
                width=200,
                height=500
        )
    fig &= row
fig

- **popularity**: the target has a very strange anomaly, i.e.
a very large count at its lowest value 0

- maybe this value encodes a different type of information? needs to be investigated -
  could not find any information in the [spotify api doc](https://developer.spotify.com/documentation/web-api/reference/tracks/get-several-audio-features)
  
- **tempo** shows a symmetric distribution centered around 120 bpm with a gap in the approximate range (0,30). Music within this range probably doesn't exist. Tempo 0 could be podcasts or audiobooks, or music with complex rhythms/unclear tempo?

- **loudness** left skewed distribution 

- **duration_ms** seems to be Poisson distributed (and not a truncated normal distribution)
     - makes sense since the Poisson distribution expresses "the probability of events occurring in a specified interval with a known constant mean rate and independent of other events" (Wikipedia)
     
- **year** almost uniformly distributed with a drastic drop occuring around the 1940s

# anomaly investigation

## bivariate scatterplots/KDEs and univariate KDEs for popularity == 0 

In [None]:
 data = (
     df.where(lambda r: r.popularity == 0)
      .dropna()
      .loc[:,['tempo', 'loudness', 'danceability', 'year', 'speechiness', 'instrumentalness']]
 )
    
sample_size = 5000
idx = sample_without_replacement(data.shape[0], sample_size)

g = sns.PairGrid(data.iloc[idx],diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw=2)

## bivariate scatterplots/KDEs and univariate KDEs for popularity != 0 

In [None]:
# Show the rows selected by `idx` from `data` as left by the preceding cell,
# i.e. still the popularity == 0 subset at this point.
data.iloc[idx]

In [None]:
# Pair grid for the tracks with non-zero popularity.
# Rows are selected with a boolean mask rather than `where(...).dropna()`, so the
# columns keep their original dtypes instead of being cast to float.
data = df.loc[
    df.popularity != 0,
    [
        'tempo', 'loudness', 'danceability',
        'year', 'popularity', 'speechiness',
        'instrumentalness', 'duration_ms',
        'acousticness',
    ],
].dropna()

# Seeded for reproducibility; capped because sample_without_replacement raises when
# asked for more rows than the subset contains.
sample_size = min(5000, data.shape[0])
idx = sample_without_replacement(data.shape[0], sample_size, random_state=42)

# Scatter plots in both triangles, univariate KDEs on the diagonal.
g = sns.PairGrid(data.iloc[idx], diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)
g.map_lower(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=2)

In [None]:
# Per-year summary of popularity for tracks with non-zero popularity.
# The two lambdas are the 5 % and 95 % quantiles; they are stored under the column
# labels 'min' and 'max' and delimit the shaded band.
nonzero_tracks = df[df.popularity != 0]
min_max_pop_year = (
    nonzero_tracks
        .groupby('year')
        .agg({
            'popularity': [
                lambda x: x.quantile(0.05),
                lambda x: x.quantile(0.95),
                'median',
                'mean',
            ]
        })
        .rename(columns={'<lambda_0>': 'min', '<lambda_1>': 'max'})
)
years = min_max_pop_year.index
pop_stats = min_max_pop_year['popularity']

fig, ax = plt.subplots(1, 1, figsize=(12, 10))
ax.set_title('Popularity vs year')
ax.fill_between(years, pop_stats['min'], pop_stats['max'], label='90%')
sns.lineplot(x=years, y=pop_stats['median'], ax=ax, color='r', label='median')
sns.lineplot(x=years, y=pop_stats['mean'], ax=ax, color='y', label='mean')
plt.legend()
# Set last so it replaces the y label seaborn derives from the plotted series.
ax.set_ylabel('popularity')
#plt.yscale('log')

In [None]:
# Share of zero-popularity tracks per year.
# The ratio is aggregated in pandas over the whole training frame: the result has one
# row per year, so no subsampling is needed and the cell does not rely on an `idx`
# left behind by another cell (which was drawn for a different frame).
tracks_per_year = df.groupby('year')['id'].count()
zero_pop_per_year = df[df.popularity == 0].groupby('year')['id'].count()
zero_pop_share = (
    (zero_pop_per_year / tracks_per_year)
        .fillna(value=0)  # years that contain no zero-popularity track
        .rename('percentile')
        .reset_index()
)

alt.Chart(zero_pop_share).mark_bar().encode(
    x='year',
    y='percentile',
).properties(title='percentile zero popularity', width=600, height=400)

In [None]:
# Track counts per year: zero-popularity tracks, non-zero-popularity tracks, all tracks.
# A seeded subsample is drawn inside this cell, so the charts are reproducible and do
# not depend on an `idx` left behind by another cell.
year_sample = df.iloc[
    sample_without_replacement(df.shape[0], min(5000, df.shape[0]), random_state=42)
]

base = (
    alt.Chart(year_sample)
        .mark_bar()
        .encode(x='year', y='count():Q', tooltip='year')
)
row = alt.hconcat()
row |= (
    base
        .transform_filter(alt.datum.popularity == 0)
        .properties(width=600, height=500, title='track count with zero popularity vs year')
)
row |= (
    base
        .transform_filter(alt.datum.popularity != 0)
        .properties(width=600, height=500, title='track count with nonzero popularity vs year')
)
# The third panel is unfiltered, so its title must not claim "nonzero popularity".
row |= (
    base
        .properties(width=600, height=500, title='track count (all tracks) vs year')
)
row

In [None]:
# Mean and median popularity per year, stacked vertically.
# A seeded subsample is drawn inside this cell, so the charts are reproducible and do
# not depend on an `idx` left behind by another cell.
popularity_sample = df.iloc[
    sample_without_replacement(df.shape[0], min(5000, df.shape[0]), random_state=42)
]

row = alt.vconcat()
for statistic in ('mean', 'median'):
    row &= (
        alt.Chart(popularity_sample)
            .mark_bar()
            .encode(x='year', y='{}(popularity):Q'.format(statistic), tooltip=['year'])
            .properties(width=800, height=400)
    )
row

In [None]:
from sklearn.utils.random import sample_without_replacement

# Int/float columns of the training frame as a plain array, plus their labels and the
# number of rows.
numeric_frame = df.select_dtypes([int, float])
X = numeric_frame.values
cols = numeric_frame.columns
N = len(X)
#idx = sample_without_replacement(N, 0.5*N) 


In [None]:
# Pairwise correlation of the numeric columns, colour-coded.
# The frame still contains the string column `id`; restricting to numeric columns first
# keeps `corr()` working on pandas versions that no longer drop non-numeric columns
# silently.
df.select_dtypes(include='number').corr().style.background_gradient(cmap='coolwarm')

# correlation matrix insights

**Directly visible effects on popularity(target variable)**

- popularity of tracks strongly depends on the year the music track was produced
- the positive correlation indicates a trend of increasing popularity with increasing year
    - the more recent the track is the more popular it is / modern music is popular
- loudness and energy seems also to have positive effect on popularity
- whereas acousticness seems to have a negative impact on popularity
- unexpectedly both instrumentalness and danceability exhibit relatively low corr.
 
(however, we should not forget that the correlation coefficient solely captures linear relationships)
   
**Correlation between predictors**

- there seems to be a strong correlation between loudness, energy and acousticness
- since energetic tracks "feel fast, **loud**, and noisy" this leads to an obvious
  connection between loudness and energy
- to my knowledge the majority of acoustic songs are not known to be 'energetic and loud'
  (with the exception of crazy classical music)
 

In [None]:
import graphviz
from sklearn import tree
# Depth-2 decision tree that uses the year as its only input and the popularity value
# as the class label; the fitted tree is drawn to inspect where it splits.
year_feature = df[['year']].values          # shape (n_rows, 1)
popularity_labels = df['popularity'].values
clf = tree.DecisionTreeClassifier(max_depth=2).fit(year_feature, popularity_labels)

fig = plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, filled=True)

The first optimal split via gini, around the year 1953, shows exactly what we were expecting.

In [None]:
# Binary helper target on both splits: 1 when popularity is non-zero, 0 otherwise.
for frame in (df, test_df):
    frame['has_popularity'] = frame['popularity'].ne(0).astype(int)

In [None]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    train_test_split, cross_validate, KFold
)
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve

# Feature matrices and binary targets for the has_popularity classifiers.
# The raw target, the derived flag and the track id are not used as features.
non_feature_columns = ['popularity', 'has_popularity', 'id']

X_train, y_train = df.drop(columns=non_feature_columns), df['has_popularity']
X_test, y_test = test_df.drop(columns=non_feature_columns), test_df['has_popularity']

# The same data wrapped for the native xgboost API.
dtrain = xgboost.DMatrix(data=X_train, label=y_train)
dtest = xgboost.DMatrix(data=X_test, label=y_test)

# logistic regression

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
import sklearn.preprocessing as prep
#from sklearn.preprocessing import FunctionTransformer

def _log_abs(x):
    """Return the natural logarithm of the absolute value of ``x``."""
    return np.log(np.abs(x))


# Column-wise preprocessing; columns not named here are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # one-hot encode the key
        ('key', prep.OneHotEncoder(handle_unknown='ignore', dtype=int), ['key']),
        # rescale the year to [0, 1]
        ('year', prep.MinMaxScaler(), ['year']),
        # log|loudness|, then standardise
        (
            'loudness',
            make_pipeline(prep.FunctionTransformer(func=_log_abs), prep.StandardScaler()),
            ['loudness'],
        ),
        # log(duration), then standardise
        (
            'duration_ms',
            make_pipeline(prep.FunctionTransformer(func=np.log), prep.StandardScaler()),
            ['duration_ms'],
        ),
        ('tempo', prep.StandardScaler(), ['tempo']),
    ],
    remainder='passthrough',
)

# Preprocessing followed by a logistic-regression classifier on has_popularity.
clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regress', LogisticRegression(max_iter=150)),
])

# 5-fold cross-validation on the training split.
scores = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=5,
)
scores

# xgboost classifier

In [None]:
# Grid search over (max_depth, min_child_weight) using 5-fold CV scored by log-loss.
# The label stored in `dtrain` is the binary has_popularity flag, so the objective is
# set explicitly: without it xgboost falls back to squared-error regression and the
# log-loss would be evaluated on unbounded regression outputs.
params = {'objective': 'binary:logistic'}
num_boost_round = 999
min_logloss = float("Inf")  # best (lowest) CV log-loss seen so far
best_params = None
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgboost.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'logloss'},
        early_stopping_rounds=10
    )
    # Update best log-loss
    mean_logloss = cv_results['test-logloss-mean'].min()
    # argmin() is a 0-based row position; row i holds the score after i + 1 rounds.
    boost_rounds = cv_results['test-logloss-mean'].argmin() + 1
    print("\tlogloss {} for {} rounds".format(mean_logloss, boost_rounds))
    if mean_logloss < min_logloss:
        min_logloss = mean_logloss
        best_params = (max_depth,min_child_weight)


In [None]:
# Start a fresh parameter dict from the best (max_depth, min_child_weight) pair found.
best_max_depth, best_min_child_weight = best_params
params = {
    'max_depth': best_max_depth,
    'min_child_weight': best_min_child_weight,
}

In [None]:
# Fit the final booster on the training split and score the test split.
# The label is the binary has_popularity flag, so the objective is set explicitly;
# with it `y_pred` holds probabilities of the positive class. Without it xgboost would
# fit a squared-error regressor.
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
# NOTE(review): trains for xgboost's default number of boosting rounds; the round count
# found during cross-validation is not carried over.
model = xgboost.train(params, dtrain)
y_pred = model.predict(dtest)

In [None]:
# ROC operating points of the test-set scores against the true labels.
fpr, tpr, thr = roc_curve(y_true=y_test, y_score=y_pred)

In [None]:
# Left: ROC curve. Right: precision-recall curve with the F1-optimal point marked.
fig, ax = plt.subplots(1,2, figsize=(15,7.5))
ax1, ax2 = ax
ax1.plot(fpr, tpr)
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC curve')

prec, rec, thr = precision_recall_curve(y_test, y_pred)
# `prec` and `rec` have one more entry than `thr`: the final (precision 1, recall 0)
# point has no threshold. It is excluded so that `idx` is always valid for `thr`.
f1_denominator = prec[:-1] + rec[:-1]
# Points with precision == recall == 0 would give 0/0 = NaN, and argmax returns the
# first NaN; treat their F1 as 0 instead.
f1 = np.divide(
    2 * rec[:-1] * prec[:-1],
    f1_denominator,
    out=np.zeros_like(f1_denominator),
    where=f1_denominator > 0,
)
idx = np.argmax(f1)
ax2.plot(rec, prec)
ax2.plot(rec[idx], prec[idx], marker='o', label='optimal')
ax2.set_xlabel('recall')
ax2.set_ylabel('precision')
ax2.set_title('precision recall curve')
ax2.legend()

In [None]:
# Report the ROC AUC of the test-set scores together with the threshold at position
# `idx` of `thr`.
auc_value = roc_auc_score(y_test, y_pred)
chosen_threshold = thr[idx]
print('roc auc score {:.3f} with threshold {:.3f}'.format(auc_value, chosen_threshold))

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    f1_score,
    recall_score,
    plot_confusion_matrix
)
# Binarise the scores at the selected threshold. `>=` matches precision_recall_curve,
# which counts a sample as positive when its score is at least the threshold.
y_pred_label = y_pred >= thr[idx]
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments swapped,
# recall_score actually reports precision.
print('accuracy_score: {:.3f} \nf1_score: {:.3f} \nrecall_score: {:.3f}'.format(
    accuracy_score(y_test, y_pred_label),
    f1_score(y_test, y_pred_label),
    recall_score(y_test, y_pred_label)
))
#plot_confusion_matrix(model, X_test, y_test, normalize='true')

In [None]:
# Confusion matrix at the selected threshold. Passing (y_true, y_pred) in that order
# gives rows = true class, columns = predicted class; `>=` matches the convention of
# precision_recall_curve.
confusion_matrix(y_test, y_pred >= thr[idx])

In [None]:
# `model` comes from xgboost.train and is therefore a Booster, which has no
# `feature_importances_` attribute (that belongs to the scikit-learn wrappers).
# Booster.get_score returns a {feature name: score} dict and leaves out features that
# were never used in a split, so those are filled in with 0.
gain_by_feature = model.get_score(importance_type='gain')
pd.DataFrame(
    {
        'features': X_train.columns,
        'feature_importances': [
            gain_by_feature.get(feature, 0.0) for feature in X_train.columns
        ],
    }
).sort_values('feature_importances', ascending=False)

# Regression

## linear regression

In [None]:
#cond = (df.has_popularity.astype(bool))
#X_train = df[cond].drop(
#    [
#        'popularity', 'has_popularity', 'id'
#    ],
#    axis=1,
#)
#y_train = df[cond].popularity
#
#dtrain = xgboost.DMatrix(X_train, label=y_train)
#dtest = xgboost.DMatrix(X_test, label=y_test)

In [None]:
# from sklearn import set_config
# from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.compose import TransformedTargetRegressor
# from sklearn.base import BaseEstimator, TransformerMixin

# set_config(display='diagram')  
# class GaussianMixtureTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, n_components):
#         self.n_components = n_components
#         #self.column = column
#         self._model = GaussianMixture(n_components=self.n_components, covariance_type='diag')

#     def fit(self, X, y=None):
#         self._model.fit(X)
#         print('fit', self._model.means_)
#         return self
    
#     def transform(self, X, y=None):
#         pred = self._model.predict(X)
#         print('transform', self._model.means_, id(self._model))
#         #for i in range(self.n_components):
#         #import pdb; pdb.set_trace()
#             #X['{}_{}'.format(self.column, i)] = pred[:,i]
#         X.values[:] = pred.reshape(-1,1)
#         return X
    
#     @property
#     def means(self):
#         return self._model.means_
    
# class IdentityTransformer(BaseEstimator, TransformerMixin):

#     def fit(self, X, y=None):
#         return self
    
#     def transform(self, X, y=None):
#         if len(X.values.shape) == 1:
#             return X.values.reshape(-1,1)
#         return X.values
    
# preprocessor = ColumnTransformer([
#     ('year', prep.MinMaxScaler(), ['year']),
#     #('key', prep.OneHotEncoder(handle_unknown='ignore', dtype=int), ['key']),
#     (
#         'loudness', 
#         make_pipeline(
#             prep.FunctionTransformer(func=lambda x: np.sqrt(np.abs(x))),
#             prep.StandardScaler()
#         ),
#         ['loudness'],
#     ),
#     (
#         'duration_ms', 
#         make_pipeline(
#             prep.FunctionTransformer(np.log),
#             prep.StandardScaler(),
#         ),
#         ['duration_ms'],
#     ),
#     ('tempo', prep.StandardScaler(), ['tempo']),
#     (
#         'acousticness', 
#         Pipeline(steps=[
#             ('GaussianMixtureTransformer', GaussianMixtureTransformer(2)),
#             ('OneHotEncoder', prep.OneHotEncoder(handle_unknown='ignore', dtype=int)),
#         ]),
#        ['acousticness']
#     ),
#     ('instrumentalness', IdentityTransformer(), 'instrumentalness'),
#     #(
#     #    'acousticness', 
#     #    IdentityTransformer(),
#     #   'acousticness'
#     #),
#     ('valence', IdentityTransformer(), 'valence'),
#     ('speechiness', IdentityTransformer(), 'speechiness'),
#     #(
#     #    'speechiness', 
#     #    make_pipeline(
#     #        GaussianMixtureTransformer(3),
#     #        prep.OneHotEncoder(handle_unknown='ignore', dtype=int),
#     #    ),
#     #   ['speechiness']
#     #)  
# ], remainder='drop')

# reg = Pipeline(steps=[
#     ('preprocess', preprocessor),
#     (
#         'regress', TransformedTargetRegressor(
#             Ridge(),
#             func=lambda x: x/100,
#             inverse_func=lambda x: x*100,
#         )
#     ),
# ])

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE

# dim_red = Pipeline(steps=[
#     ('preprocess', preprocessor),
#     ('tsne', TSNE(n_components=2, perplexity=25)),    
# ])
# reg.fit(X_train, y_train)

# #red_data = dim_red.fit_transform(X_train[:5000])
# #plt.plot(
# #    red_data[:,0], red_data[:,1], 'o'
#)

In [None]:
# def get_columns_from_transformer(column_transformer, input_colums, include_remainder=False):    
#     col_name = []

#     for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
#         raw_col_name = transformer_in_columns[2]
#         if isinstance(transformer_in_columns[1],Pipeline): 
#             transformer = transformer_in_columns[1].steps[-1][1]
#         else:
#             transformer = transformer_in_columns[1]
#         try:
#             names = transformer.get_feature_names(raw_col_name)
#         except AttributeError: # if no 'get_feature_names' function, use raw column name
#             names = raw_col_name
#         if isinstance(names,np.ndarray): # eg.
#             col_name += names.tolist()
#         elif isinstance(names,list):
#             col_name += names    
#         elif isinstance(names,str):
#             col_name.append(names)

#     [_, _, reminder_columns] = column_transformer.transformers_[-1]
    
#     if include_remainder:
#         for col_idx in reminder_columns:
#             col_name.append(input_colums[col_idx])

#     return col_name

# cols = get_columns_from_transformer(preprocessor, X_train.columns)

In [None]:
# plt.subplots(figsize=(12,10))
# df.groupby('year')['acousticness'].median().plot(label='median acoust')
# df.groupby('year')['speechiness'].median().plot(label='median speech')
# df.groupby('year')['instrumentalness'].median().plot(label='median inst')
# df.groupby('year')['valence'].median().plot(label='median valence')
# df.groupby('year')['energy'].median().plot(label='median energy')
# df.groupby('year')['danceability'].median().plot(label='median danceability')
# df.groupby('year')['liveness'].median().plot(label='median liveness')
# df.groupby('year')['popularity'].median().apply(lambda x: x/100).plot(style='-o', label='median norm. popularity')
# plt.legend()
# plt.title('pseudo track characteristics')

In [None]:
# df = df.merge(
#     df
#         .groupby(['year'])['popularity']
#         .agg(lambda x: x.quantile(0.99))
#         .rename('popularity_q95'),
#     left_on='year',
#     right_index=True,
# )

In [None]:
# fig, ax = plt.subplots(figsize=(12,10))
# (
#     df[df.popularity >= df.popularity_q95]
#         .loc[:, 
#              [
#                  'acousticness', 
#                  'liveness', 
#                  'speechiness', 
#                  'year',
#                  'instrumentalness',
#                  'valence',
#                  'energy',
#                  'danceability',
#                  'popularity',
#              ]
#         ]
#         .assign(popularity=lambda r: r.popularity/100)
#         .groupby('year')
#         .median()
#         .plot(ax=ax)
# )

In [None]:
# df[df.popularity >= df.popularity_q95].groupby('year')['tempo'].median().plot(style='-', label='median tempo')

In [None]:
# df[df.popularity >= df.popularity_q95].groupby('year')['loudness'].median().plot(label='median inst')

In [None]:
# pd.DataFrame(reg['preprocess'].transform(X_train), columns=cols)

In [None]:
# reg.score(X_train, y_train)

In [None]:
# plt.plot(reg.predict(X_train), y_train, 'o')

In [None]:
# cross_validate(
#     reg, 
#     X_train, 
#     y_train,
#     cv=5,
#     scoring=[
#         'max_error',
#         'neg_mean_squared_error',
#         'neg_median_absolute_error',
#         'explained_variance'
#     ],
# )

In [None]:
# dict(zip(cols, reg['regress'].regressor_.coef_))

In [None]:
# import statsmodels.api as sm

# model = sm.OLS(y_train, np.squeeze(reg['preprocess'].fit_transform(X_train))).fit()
# model.summary()

In [None]:
# reg['preprocess'].fit_transform(X_train).shape, y_train.shape

In [None]:
# params = {}
# num_boost_round = 999
# min_rmse = float("Inf")
# best_params = None
# gridsearch_params = [
#     (max_depth, min_child_weight)
#     for max_depth in range(9,12)
#     for min_child_weight in range(5,8)
# ]
# for max_depth, min_child_weight in gridsearch_params:
#     print("CV with max_depth={}, min_child_weight={}".format(
#                              max_depth,
#                              min_child_weight))
#     # Update our parameters
#     params['max_depth'] = max_depth
#     params['min_child_weight'] = min_child_weight
#     # Run CV
#     cv_results = xgboost.cv(
#         params,
#         dtrain,
#         num_boost_round=num_boost_round,
#         seed=42,
#         nfold=5,
#         metrics={'rmse'},
#         early_stopping_rounds=10
#     )
#     # Update best MAE
#     mean_rmse = cv_results['test-rmse-mean'].min()
#     boost_rounds = cv_results['test-rmse-mean'].argmin()
#     print("\trmse {} for {} rounds".format(mean_logloss, boost_rounds))
#     if mean_logloss < min_rmse:
#         min_rmse = mean_rmse
#         best_params = (max_depth,min_child_weight)