In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/sample_submission.csv
/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/simple_pipeline.ipynb
/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/train.csv
/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/test.csv


In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from scipy.stats import entropy
from sklearn.model_selection import KFold

In [3]:
# Используйте эти пути для запуска ноутбука на Kaggle
PATH_TO_KAGGLE_TRAIN = "/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/train.csv"
PATH_TO_KAGGLE_TEST = "/kaggle/input/cmc-ml-spotify-tracks-popularity-prediction-2025/test.csv"
PATH_TO_KAGGLE_SUBMISSION = "/kaggle/working/submission.csv"

train = pd.read_csv(PATH_TO_KAGGLE_TRAIN)
test = pd.read_csv(PATH_TO_KAGGLE_TEST)


X_train = train.drop(["index", "popularity"], axis=1)
X_test = test.drop(["index"], axis=1)
y_train = train["popularity"]

train.head()

Unnamed: 0,index,composer,album,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity
0,113287,14,1412,2967.28,0.045235,1,-38.44055,1,0.036275,-0.223432,0.706,474.708,0.093009,95.49426,1113008.4,4,2
1,49396,7,9772,5596.904,0.071796,9,-43.769375,1,0.050086,-0.212998,0.777,511.94,0.60976,114.64017,116020.83,3,2
2,122241,16,10251,3345.864,0.109368,4,-42.194985,0,0.037856,-0.212332,0.925,497.978,0.11227,101.37465,348964.32,3,0
3,123302,16,9342,2486.376,0.056395,9,-33.63287,0,0.032282,-0.212554,0.906,334.1572,0.079207,128.98356,382820.43,4,0
4,72027,9,4646,6169.896,0.086304,2,-33.402815,1,0.034611,-0.23542,0.932,465.4,0.38625,106.71843,221056.8,3,11


In [4]:
def target_encode_new(train, test, cat_col, target_col, alpha=5):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_encoded = train.copy()

    # Глобальное среднее и стандартное отклонение
    global_mean = train[target_col].mean()
    global_std = train[target_col].std()
    global_median = train[target_col].median()


    # Создаём пустой DataFrame для агрегированных статистик
    all_agg_stats = pd.DataFrame()

    for train_idx, val_idx in kf.split(train):
        X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]

        agg_stats = X_train.groupby(cat_col)[target_col].agg([
            'mean', 'std', 'count', 'median', 'min', 'max',
            lambda x: x.quantile(0.25),
            lambda x: x.quantile(0.75),
            lambda x: entropy(np.histogram(x, bins=10)[0])
        ]).rename(columns={
            '<lambda_0>': 'q1',
            '<lambda_1>': 'q3',
            '<lambda_2>': 'entropy'
        })

        agg_stats['prob_ratio'] = (agg_stats['mean'] / global_mean).replace(np.inf, 1.0)

        agg_stats['lower_ci'] = agg_stats['mean'] - 1.96 * agg_stats['std']/np.sqrt(agg_stats['count'])
        agg_stats['upper_ci'] = agg_stats['mean'] + 1.96 * agg_stats['std']/np.sqrt(agg_stats['count'])


        agg_stats['smoothed_mean'] = (agg_stats['mean'] * agg_stats['count'] + global_mean * alpha) / (agg_stats['count'] + alpha)

        # Заполняем пропуски в std
        agg_stats['std'] = agg_stats['std'].fillna(global_std)

        # Сохраняем статистики для всех фолдов
        all_agg_stats = pd.concat([all_agg_stats, agg_stats])

        # Добавляем статистики в валидационную часть
        for stat in agg_stats.columns:
            train_encoded.loc[val_idx, f'{cat_col}_{stat}'] = X_val[cat_col].map(agg_stats[stat])

    # Усредняем статистики по всем фолдам
    final_stats = all_agg_stats.groupby(level=0).mean()

    # Кодируем тестовые данные
    test_encoded = test.copy()
    for stat in final_stats.columns:
        test_encoded[f'{cat_col}_{stat}'] = test[cat_col].map(final_stats[stat]).fillna({
            'mean': global_mean,
            'std': global_std,
            'count': 0,
            'median': global_median,
            'smoothed_mean': global_mean,
            'lower_ci': global_mean,
            'upper_ci': global_mean,
            'q1': global_median,
            'q3': global_median,
            'entropy': 0,
        })

    return train_encoded, test_encoded

In [5]:
train_encoded, test_encoded = target_encode_new(train, test, 'album', 'popularity')
train_encoded, test_encoded = target_encode_new(train_encoded, test_encoded, 'composer', 'popularity')

In [6]:
X_train = train_encoded.drop(["index", "popularity"], axis=1)
X_test = test_encoded.drop(["index"], axis=1)
y_train = train_encoded["popularity"]

In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [8]:
X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(
    X_train,  # Признаки
    y_train,  # Целевая переменная
    test_size=0.1,  # Доля тестовых данных
    random_state=42,  # Фиксируем случайность для воспроизводимости
    shuffle=True      # Перемешивание данных перед разделением
)

In [9]:
from sklearn.model_selection import KFold

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

def calculate_metrics(targets, preds):
    metrics = {
        'r2': r2_score(targets, preds),
        'mse': mean_squared_error(targets, preds),
        'rmse': mean_squared_error(targets, preds, squared=False),  # Вот правильный способ получить RMSE
        'mae': mean_absolute_error(targets, preds)
    }
    return metrics

In [10]:
from xgboost import XGBRegressor
import xgboost as xgb
xgb.set_config(verbosity=0)

from sklearn.model_selection import cross_val_score
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll import scope

trials = Trials()

def quality(params):
    regressor = XGBRegressor(objective='reg:squarederror', **params)
    regressor.fit(X_train_valid, y_train_valid)
    preds = regressor.predict(X_test_valid)
    preds = np.clip(preds, 0, None)
    metrics = calculate_metrics(y_test_valid, preds)
    return {'loss': metrics['rmse'], 'status': STATUS_OK }

grid = {
    'n_estimators': 50 + hp.randint('n_estimators', 1450),
    'max_depth': 2 + hp.randint('max_depth', 8),
    'learning_rate': hp.loguniform('learning_rate', -5, -0.5),
    'num_leaves': 3 + hp.randint('num_leaves', 30),
    'min_child_samples': 2 + hp.randint('min_child_samples', 28),
    'subsample': hp.uniform('subsample', 0.2, 1.),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1.),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1.),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.2, 1.),
    'reg_alpha': hp.uniform('reg_alpha', 0.1, 0.6),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 0.6),
    # 'booster': hp.choice('booster', ['gbtree', 'gblinear']),
    # 'tree_method': hp.choice('tree_method', ['exact', 'approx', 'hist']),
}

best_xgb_params = fmin(
    fn=quality,
    space=grid,
    algo=tpe.suggest,
    max_evals=70,
    trials=trials,
    verbose=1
)

100%|██████████| 70/70 [07:41<00:00,  6.59s/trial, best loss: 3.102757182235797]


In [11]:
xgb_regressor = XGBRegressor(objective='reg:squarederror', **best_xgb_params)
xgb_regressor.fit(X_train_valid, y_train_valid)
preds = xgb_regressor.predict(X_test_valid)
preds = np.clip(preds, 0, None)
xgb_metrics = calculate_metrics(y_test_valid, preds)
xgb_metrics

{'r2': 0.6394748717531736,
 'mse': 9.666770534969482,
 'rmse': 3.1091430547611476,
 'mae': 1.3253509553376892}

In [12]:
model = xgb_regressor

In [13]:
test_pred = model.predict(X_test)

submission = pd.DataFrame({"index": test["index"],
                           "popularity": test_pred})

In [14]:
submission.to_csv(PATH_TO_KAGGLE_SUBMISSION, index=False)