In [1]:
import pandas as pd

df = pd.read_csv('SpotifyFeatures.csv')
df['genre'] = df['genre'].str.replace('Children’s Music', "Children's Music")

target = 'genre'
X = df.drop(columns=[target, 'track_name'])
X = X.drop(columns=['track_id'])
y = df[target]

In [2]:

df['genre'].nunique()

26

In [67]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features

['popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence']

In [68]:
categories_features = X.select_dtypes(include=['object']).columns.tolist()
categories_features

['artist_name', 'key', 'mode', 'time_signature']

In [64]:
categories_features = X.select_dtypes(include=['object']).columns.tolist()
categories_for_TE = ['artist_name']
categories_features = list(set(categories_features) - set(categories_for_TE))
categories_features

['key', 'mode', 'time_signature']

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, stratify=y ,random_state=42,
                                                    test_size=0.2)

In [6]:
# преобразуем artist_name TargetEncoder, чтоб было меньше столбцов

import category_encoders as ce

TE_encoder = ce.TargetEncoder(cols=['artist_name'], smoothing=1.0)
X_train_TE = TE_encoder.fit_transform(X_train[['artist_name']], Y_train)
X_test_TE = TE_encoder.transform(X_test[['artist_name']])

X_train_modified = X_train.copy()
X_test_modified = X_test.copy()

X_train_modified['artist_name'] = X_train_TE['artist_name']
X_test_modified['artist_name'] = X_test_TE['artist_name']

numeric_features = numeric_features + ['artist_name'] 
categories_features = [col for col in categories_features 
                              if col != 'artist_name']
print(f"Обновлённые числовые признаки: {numeric_features}")
print(f"Обновлённые категориальные признаки: {categories_features}")

Обновлённые числовые признаки: ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'artist_name']
Обновлённые категориальные признаки: ['key', 'mode', 'time_signature']


In [70]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
numeric_transform = Pipeline([
    ('nan_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categories_transformer = Pipeline([
    ('nan_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprosessor = ColumnTransformer([
    ('num', numeric_transform, numeric_features),
    ('cat', categories_transformer, categories_features)
])

In [8]:
X_train['artist_name'].nunique()


13748

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=34,
    weights='distance',
    algorithm='auto',
    leaf_size=66,
    metric='manhattan',
    n_jobs=-1
)
#Итоговый гипер-пайплайнчик
clf = Pipeline([
    ('preproc', preprosessor),
    ('classifier', knn)
])


In [10]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Spotify")  

<Experiment: artifact_location=('file:///C:/Users/Artem/Desktop/vs code project/KNN, '
 'ANN/mlflow_server/artefacts/848808723939071173'), creation_time=1753901309241, experiment_id='848808723939071173', last_update_time=1753901309241, lifecycle_stage='active', name='Spotify', tags={}>

In [11]:
X_train_modified

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
45184,12.760436,20,0.000014,0.417,232453,0.9060,0.001460,C,0.1540,-5.453,Major,0.2430,165.087,4/4,0.2490
223673,12.739746,57,0.012000,0.494,227227,0.9130,0.000000,E,0.0702,-6.220,Minor,0.0477,155.880,4/4,0.6630
138964,19.993408,78,0.423000,0.807,297485,0.6810,0.000000,B,0.0962,-6.268,Major,0.0983,140.006,4/4,0.6760
125920,14.599567,45,0.978000,0.190,160227,0.1670,0.046600,G#,0.1740,-13.944,Major,0.0471,86.607,3/4,0.0382
117862,12.764047,54,0.000853,0.886,194675,0.3540,0.000000,C#,0.0731,-15.511,Major,0.5520,100.055,4/4,0.2320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129407,5.000000,28,0.994000,0.357,380627,0.0074,0.872000,C#,0.0980,-31.676,Major,0.0435,118.160,3/4,0.1360
68992,14.466019,58,0.072700,0.660,227347,0.8750,0.000037,A#,0.3580,-6.057,Minor,0.0889,120.034,4/4,0.6110
18871,14.756757,55,0.480000,0.862,212683,0.4770,0.000000,F,0.1050,-9.393,Minor,0.3310,123.034,4/4,0.6070
172541,6.000000,18,0.782000,0.485,148250,0.5170,0.000000,F#,0.6140,-19.929,Major,0.9430,71.935,3/4,0.2950


In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import BaggingClassifier
X_train_small, _, y_train_small, _ = train_test_split(X_train_modified, Y_train, stratify=Y_train ,random_state=42,
                                                    test_size=0.7)



def objective(trial):
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['cosine', 'euclidean']),# 'manhattan']),
        'leaf_size': trial.suggest_int('leaf_size', 10, 100)
    }
    
    knn1 = KNeighborsClassifier(**params)
    model = BaggingClassifier(knn1, n_estimators=10, random_state=42)
    pipe = Pipeline([('preproc', preprosessor), 
                     #('svd', TruncatedSVD(n_components=500)),
                        ('classifier', model)])
    return cross_val_score(pipe, X_train_small, y_train_small, 
                         cv=5, scoring='accuracy', n_jobs=7).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  
#c OHE получаем 13000+ признаков и манхеттан с uniform ломается, из за слишком разряженных данных


In [15]:
X_train_modified.head()

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
45184,12.760436,20,1.4e-05,0.417,232453,0.906,0.00146,C,0.154,-5.453,Major,0.243,165.087,4/4,0.249
223673,12.739746,57,0.012,0.494,227227,0.913,0.0,E,0.0702,-6.22,Minor,0.0477,155.88,4/4,0.663
138964,19.993408,78,0.423,0.807,297485,0.681,0.0,B,0.0962,-6.268,Major,0.0983,140.006,4/4,0.676
125920,14.599567,45,0.978,0.19,160227,0.167,0.0466,G#,0.174,-13.944,Major,0.0471,86.607,3/4,0.0382
117862,12.764047,54,0.000853,0.886,194675,0.354,0.0,C#,0.0731,-15.511,Major,0.552,100.055,4/4,0.232


In [None]:
import mlflow 
import mlflow.sklearn as ms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import BaggingClassifier


with mlflow.start_run(run_name='KNN_TE4_bagg'):
    n_neighbors, leaf_size, weights, metric = 50, 73, 'uniform', 'euclidean'
    mlflow.log_param('n_neighbors', n_neighbors)
    mlflow.log_param("weights", weights)
    mlflow.log_param("metric", metric)
    mlflow.log_param("leaf_size", leaf_size)

    knn1 = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm='auto',
        leaf_size=leaf_size,
        metric=metric,
        n_jobs=-1
    )
    knn_bagg = BaggingClassifier(knn1, n_estimators=10, random_state=42)
    clf = Pipeline([
        ('preproc', preprosessor),
        ('classifier', knn_bagg)
    ])
    clf.fit(X_train_modified, Y_train)
    y_pred = clf.predict(X_test_modified)

    mlflow.log_metric("accuracy", accuracy_score(Y_test, y_pred))
    mlflow.log_metric("precision", precision_score(Y_test, y_pred, average='weighted'))
    mlflow.log_metric("recall", recall_score(Y_test, y_pred, average='weighted'))
    mlflow.log_metric("f1", f1_score(Y_test, y_pred, average='weighted'))

    cl_rep = classification_report(y_true=Y_test, y_pred=y_pred)

    ms.log_model(clf, 'knn1')

    with open('classification_report.txt', 'w') as file:
        file.write(cl_rep)
    mlflow.log_artifact("classification_report.txt")
mlflow.end_run()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run KNN_TE4_bagg at: http://localhost:5000/#/experiments/848808723939071173/runs/3544c26d02094b43ba8c4c4887c96195
🧪 View experiment at: http://localhost:5000/#/experiments/848808723939071173


HNSW

In [None]:
import hnswlib
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class HNSWKNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, space='cosine', ef=200, M=16, k=10):
        self.space = space  # 'cosine', 'l2', 'ip'
        self.ef = ef       # точность поиска (больше = точнее, но медленнее)
        self.M = M         # связность графа (обычно 16-64)
        self.k = k      #  соседи
        self.label_encoder_ = None

    def fit(self, X, y):
        # Кодируем строковые метки в числа
        self.classes_, y_encoded = np.unique(y, return_inverse=True)
        self.y_train_encoded = y_encoded
        
    
        self.index = hnswlib.Index(space=self.space, dim=X.shape[1])
        self.index.init_index(max_elements=X.shape[0], ef_construction=self.ef, M=self.M)
        self.index.add_items(X, np.arange(len(X)))
        self.index.set_ef(self.ef)
        return self

    def predict(self, X):
        # Получаем индексы k ближайших соседей
        neighbor_indices, _ = self.index.knn_query(X, k=self.k)
        
        # Для каждого объекта выбираем наиболее частый класс среди соседей
        preds = []
        for indices in neighbor_indices:
            neighbor_labels = self.y_train_encoded[indices]
            counts = np.bincount(neighbor_labels, minlength=len(self.classes_))
            preds.append(self.classes_[np.argmax(counts)])
        return np.array(preds)

In [None]:
import mlflow 
import mlflow.sklearn as ms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import BaggingClassifier

with mlflow.start_run(run_name='KNN_HNSW_one'):
    n_neighbors, space, ef, Mm = 75, 'cosine', 15, 16
    mlflow.log_param('n_neighbors', n_neighbors)
    mlflow.log_param("точность", ef)
    mlflow.log_param("space", space)
    mlflow.log_param("связность", Mm)

    knn1_HNSW = HNSWKNNClassifier(k=n_neighbors, space=space, M=Mm, ef=ef )
    knn_bagg = BaggingClassifier(knn1_HNSW, n_estimators=10, random_state=42)
    clf = Pipeline([
        ('preproc', preprosessor),
        ('classifier', knn_bagg)
    ])
    clf.fit(X_train_modified, Y_train)
    y_pred = clf.predict(X_test_modified)

    mlflow.log_metric("accuracy", accuracy_score(Y_test, y_pred))
    mlflow.log_metric("precision", precision_score(Y_test, y_pred, average='weighted'))
    mlflow.log_metric("recall", recall_score(Y_test, y_pred, average='weighted'))
    mlflow.log_metric("f1", f1_score(Y_test, y_pred, average='weighted'))

    cl_rep = classification_report(y_true=Y_test, y_pred=y_pred)

    ms.log_model(clf, 'knn1')

    with open('classification_report.txt', 'w') as file:
        file.write(cl_rep)
    mlflow.log_artifact("classification_report.txt")
mlflow.end_run()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run KNN_HNSW_one at: http://localhost:5000/#/experiments/848808723939071173/runs/36b7170e13e2418c83dbb33ab1de9676
🧪 View experiment at: http://localhost:5000/#/experiments/848808723939071173


In [None]:
#ScaNN нет на Windows
#from scann.scann_ops.py import scann_ops

Теперь CatBoost. Также ищем параметры и готовим модельку

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier, Pool

X_train_small, _, y_train_small, _ = train_test_split(X_train, Y_train, \
                stratify=Y_train ,random_state=42, test_size=0.7)

categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()



def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10.0, log=True),
    }
    
    model = CatBoostClassifier(
        **params,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        thread_count=-1,
        cat_features=categorical_features  
    )
    
    
    pipe = Pipeline([
        #('preproc', preprosessor),
        ('classifier', model)
    ])
    
    return cross_val_score(
        pipe, 
        X_train_small, 
        y_train_small,
        cv=3,  
        scoring='accuracy',
        n_jobs=1  # CatBoost не любит многопоточность в кросс-валидации
    ).mean()


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=3600)  

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
import mlflow

# для катбуста нужны категориальные признаки
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# создаем обучающие и тестовые пулы
train_pool = Pool(data=X_train, label=Y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=Y_test, cat_features=categorical_features)


model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

with mlflow.start_run(run_name="CatBoost_base_params"):
    
    mlflow.log_param('iterations', 500)
    mlflow.log_param("depth", 6)
    mlflow.log_param("learning_rate", 0.1)

    
    model.fit(train_pool, eval_set=test_pool)
    y_pred = model.predict(X_test)
 

    cl_rep = classification_report(y_true=Y_test,y_pred=y_pred)
    with open("classification_report.txt", "w") as f:
        f.write(cl_rep)
    mlflow.log_artifact("classification_report.txt")
    print(cl_rep)

    acc = accuracy_score(Y_test, y_pred)
    prec = precision_score(Y_test, y_pred, average='weighted')
    rec = recall_score(Y_test, y_pred, average='weighted')
    f1 = f1_score(Y_test, y_pred, average='weighted')

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1", f1)




0:	learn: 0.2420238	test: 0.2412289	best: 0.2412289 (0)	total: 4.26s	remaining: 35m 25s
100:	learn: 0.7174133	test: 0.7529273	best: 0.7529273 (100)	total: 7m 53s	remaining: 31m 9s
200:	learn: 0.7345150	test: 0.7694274	best: 0.7694274 (200)	total: 17m	remaining: 25m 17s
300:	learn: 0.7400956	test: 0.7737673	best: 0.7737673 (292)	total: 25m 44s	remaining: 17m 1s
400:	learn: 0.7446503	test: 0.7770974	best: 0.7773767 (397)	total: 34m 59s	remaining: 8m 38s
499:	learn: 0.7495703	test: 0.7792459	best: 0.7795037 (487)	total: 43m 57s	remaining: 0us

bestTest = 0.7795037061
bestIteration = 487

Shrink model to first 488 iterations.
                  precision    recall  f1-score   support

       A Capella       0.96      0.96      0.96        24
     Alternative       0.62      0.55      0.58      1853
           Anime       0.96      0.99      0.97      1787
           Blues       0.84      0.89      0.86      1805
Children's Music       0.68      0.69      0.69      2951
       Classical     