In [128]:
import pandas as pd

# Load the Spotify track features and replace the typographic-apostrophe
# spelling of the children's genre with the plain ASCII spelling.
df = pd.read_csv('SpotifyFeatures.csv')
df['genre'] = df['genre'].str.replace('Children’s Music', "Children's Music")

# The genre column is the prediction target; it is removed from the feature
# frame together with the track title and track id.
target = 'genre'
y = df[target]
X = df.drop(columns=[target, 'track_name', 'track_id'])

In [129]:

# Number of distinct genre labels in the data set.
df[target].nunique()

26

In [130]:
# Feature columns stored as int64 or float64 are handled as numeric inputs.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
numeric_features

['popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence']

In [131]:
# String-typed feature columns are the categorical candidates; `artist_name`
# is set aside for target encoding and therefore left out of this list.
# A comprehension (instead of a set difference) keeps the columns in the same
# order as they appear in X, so the resulting feature layout is the same on
# every run rather than depending on set iteration order.
categories_for_TE = ['artist_name']
categories_features = [
    col
    for col in X.select_dtypes(include=['object']).columns
    if col not in categories_for_TE
]

['mode', 'time_signature', 'key']

In [133]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified by the target, with a
# fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [134]:
# Encode artist_name with TargetEncoder so that it produces fewer columns

import category_encoders as ce

# Fit the target encoder on the training split only and reuse the fitted
# mapping for the test split.
# NOTE(review): Y_train holds the genre labels; confirm that the numeric
# values the encoder derives from a multiclass label are meaningful here.
# NOTE(review): the encoder sees all of Y_train, so any later cross-validation
# on these encoded frames is not fully isolated from the target - verify.
TE_encoder = ce.TargetEncoder(cols=['artist_name'], smoothing=1.0)
X_train_TE = TE_encoder.fit_transform(X_train[['artist_name']], Y_train)
X_test_TE = TE_encoder.transform(X_test[['artist_name']])

# Work on copies so the raw splits keep the original artist strings.
X_train_modified = X_train.copy()
X_test_modified = X_test.copy()

X_train_modified['artist_name'] = X_train_TE['artist_name']
X_test_modified['artist_name'] = X_test_TE['artist_name']

# The encoded column is numeric now. The membership check makes this cell
# safe to re-run: without it every execution would append 'artist_name'
# again and the column would be selected more than once downstream.
if 'artist_name' not in numeric_features:
    numeric_features = numeric_features + ['artist_name']
categories_features = [col for col in categories_features
                       if col != 'artist_name']
print(f"Обновлённые числовые признаки: {numeric_features}")
print(f"Обновлённые категориальные признаки: {categories_features}")

Обновлённые числовые признаки: ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'artist_name']
Обновлённые категориальные признаки: ['mode', 'time_signature', 'key']


In [135]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
# Numeric columns: fill missing values with the column median, then
# standardise.
numeric_transform = Pipeline(steps=[
    ('nan_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored.
categories_transformer = Pipeline(steps=[
    ('nan_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprosessor = ColumnTransformer(transformers=[
    ('num', numeric_transform, numeric_features),
    ('cat', categories_transformer, categories_features),
])

In [136]:
# Number of distinct artists in the raw (un-encoded) training split.
X_train.loc[:, 'artist_name'].nunique()


13748

In [19]:
# Fit and apply the preprocessor on the target-encoded frames. The raw
# X_train / X_test still hold artist names as strings, while 'artist_name'
# is listed among the numeric features, so the median imputer cannot
# process them; the *_modified frames carry the numeric encoding instead.
preprosessor.fit(X_train_modified)
X_train_proc = preprosessor.transform(X_train_modified)
X_test_proc = preprosessor.transform(X_test_modified)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline k-NN classifier: 34 neighbours, distance-weighted votes,
# Manhattan metric, parallelised with n_jobs=-1.
knn_settings = dict(
    n_neighbors=34,
    weights='distance',
    algorithm='auto',
    leaf_size=66,
    metric='manhattan',
    n_jobs=-1,
)
knn = KNeighborsClassifier(**knn_settings)


In [73]:
# Final end-to-end pipeline: preprocessing followed by the k-NN classifier.
clf = Pipeline(steps=[('preproc', preprosessor), ('classifier', knn)])

In [None]:
import mlflow

# Point MLflow at the local tracking server and select the experiment that
# the runs in this notebook are logged under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Spotify"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location=('file:///C:/Users/Artem/Desktop/vs code project/KNN, '
 'ANN/mlflow_server/artefacts/848808723939071173'), creation_time=1753901309241, experiment_id='848808723939071173', last_update_time=1753901309241, lifecycle_stage='active', name='Spotify', tags={}>

In [140]:
# Display the training frame after artist_name has been target-encoded.
X_train_modified

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
45184,12.760436,20,0.000014,0.417,232453,0.9060,0.001460,C,0.1540,-5.453,Major,0.2430,165.087,4/4,0.2490
223673,12.739746,57,0.012000,0.494,227227,0.9130,0.000000,E,0.0702,-6.220,Minor,0.0477,155.880,4/4,0.6630
138964,19.993408,78,0.423000,0.807,297485,0.6810,0.000000,B,0.0962,-6.268,Major,0.0983,140.006,4/4,0.6760
125920,14.599567,45,0.978000,0.190,160227,0.1670,0.046600,G#,0.1740,-13.944,Major,0.0471,86.607,3/4,0.0382
117862,12.764047,54,0.000853,0.886,194675,0.3540,0.000000,C#,0.0731,-15.511,Major,0.5520,100.055,4/4,0.2320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129407,5.000000,28,0.994000,0.357,380627,0.0074,0.872000,C#,0.0980,-31.676,Major,0.0435,118.160,3/4,0.1360
68992,14.466019,58,0.072700,0.660,227347,0.8750,0.000037,A#,0.3580,-6.057,Minor,0.0889,120.034,4/4,0.6110
18871,14.756757,55,0.480000,0.862,212683,0.4770,0.000000,F,0.1050,-9.393,Minor,0.3310,123.034,4/4,0.6070
172541,6.000000,18,0.782000,0.485,148250,0.5170,0.000000,F#,0.6140,-19.929,Major,0.9430,71.935,3/4,0.2950


In [145]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD

# Keep a stratified half of the encoded training data for the hyperparameter
# search (presumably to shorten each trial); the other half is discarded.
X_train_small, _, y_train_small, _ = train_test_split(
    X_train_modified,
    Y_train,
    test_size=0.5,
    stratify=Y_train,
    random_state=42,
)



def objective(trial):
    """Score one Optuna trial of k-NN hyperparameters.

    Draws `n_neighbors`, `weights`, `metric` and `leaf_size` from the trial,
    places a `KNeighborsClassifier` built from them behind the shared
    `preprosessor`, and returns the mean 5-fold cross-validated accuracy on
    `X_train_small` / `y_train_small`.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # 'manhattan' is currently left out of the metric choices.
    metric = trial.suggest_categorical('metric', ['cosine', 'euclidean'])
    leaf_size = trial.suggest_int('leaf_size', 10, 100)

    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric,
        leaf_size=leaf_size,
    )
    # A TruncatedSVD(n_components=500) step between the two stages is kept
    # disabled here.
    pipe = Pipeline([('preproc', preprosessor), ('classifier', model)])

    fold_scores = cross_val_score(
        pipe, X_train_small, y_train_small,
        cv=5, scoring='accuracy', n_jobs=7,
    )
    return fold_scores.mean()

# With one-hot encoding of artist_name we get 13000+ features and the search
# breaks down because the data is too sparse.
# The sampler is seeded so that the sequence of suggested hyperparameters is
# reproducible between runs of this cell.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)


[I 2025-07-31 20:50:35,629] A new study created in memory with name: no-name-9cee1624-4587-4129-b83b-d004844e7d5a
[I 2025-07-31 20:51:20,497] Trial 0 finished with value: 0.3985283059404877 and parameters: {'n_neighbors': 8, 'weights': 'distance', 'metric': 'cosine', 'leaf_size': 83}. Best is trial 0 with value: 0.3985283059404877.
[I 2025-07-31 20:51:28,925] Trial 1 finished with value: 0.4210334085293802 and parameters: {'n_neighbors': 43, 'weights': 'distance', 'metric': 'euclidean', 'leaf_size': 18}. Best is trial 1 with value: 0.4210334085293802.
[I 2025-07-31 20:52:08,677] Trial 2 finished with value: 0.41631754216349764 and parameters: {'n_neighbors': 45, 'weights': 'distance', 'metric': 'cosine', 'leaf_size': 84}. Best is trial 1 with value: 0.4210334085293802.
[I 2025-07-31 20:52:15,864] Trial 3 finished with value: 0.42131270813191535 and parameters: {'n_neighbors': 42, 'weights': 'distance', 'metric': 'euclidean', 'leaf_size': 85}. Best is trial 3 with value: 0.4213127081319

In [137]:
# Preview the first five rows of the target-encoded training frame.
X_train_modified.head(5)

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
45184,12.760436,20,1.4e-05,0.417,232453,0.906,0.00146,C,0.154,-5.453,Major,0.243,165.087,4/4,0.249
223673,12.739746,57,0.012,0.494,227227,0.913,0.0,E,0.0702,-6.22,Minor,0.0477,155.88,4/4,0.663
138964,19.993408,78,0.423,0.807,297485,0.681,0.0,B,0.0962,-6.268,Major,0.0983,140.006,4/4,0.676
125920,14.599567,45,0.978,0.19,160227,0.167,0.0466,G#,0.174,-13.944,Major,0.0471,86.607,3/4,0.0382
117862,12.764047,54,0.000853,0.886,194675,0.354,0.0,C#,0.0731,-15.511,Major,0.552,100.055,4/4,0.232


In [None]:
import mlflow 
import mlflow.sklearn as ms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train the k-NN pipeline with the chosen hyperparameters and record the
# parameters, test-set metrics, fitted model and per-class report in one
# MLflow run.
with mlflow.start_run(run_name='KNN_TE3'):
    n_neighbors, leaf_size, weights, metric = 49, 42, 'uniform', 'euclidean'
    mlflow.log_params({
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric,
        'leaf_size': leaf_size,
    })

    knn1 = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        algorithm='auto',
        leaf_size=leaf_size,
        metric=metric,
        n_jobs=-1
    )
    clf = Pipeline([
        ('preproc', preprosessor),
        ('classifier', knn1)
    ])
    clf.fit(X_train_modified, Y_train)
    y_pred = clf.predict(X_test_modified)

    # Precision, recall and F1 use the 'weighted' average over classes.
    mlflow.log_metrics({
        'accuracy': accuracy_score(Y_test, y_pred),
        'precision': precision_score(Y_test, y_pred, average='weighted'),
        'recall': recall_score(Y_test, y_pred, average='weighted'),
        'f1': f1_score(Y_test, y_pred, average='weighted'),
    })

    cl_rep = classification_report(y_true=Y_test, y_pred=y_pred)

    ms.log_model(clf, 'knn1')

    # An explicit encoding keeps the report file independent of the
    # platform's default text encoding.
    with open('classification_report.txt', 'w', encoding='utf-8') as file:
        file.write(cl_rep)
    mlflow.log_artifact("classification_report.txt")
# No explicit mlflow.end_run() here: leaving the `with` block already ends
# the run, and an extra call would end whatever other run is still active.
