In [28]:
import pandas as pd

df = pd.read_csv('SpotifyFeatures.csv')

target = 'genre'
X = df.drop(columns=[target, 'track_name'])
y = df[target]

In [29]:
X = X.drop(columns=['track_id'])
X.head()

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Henri Salvador,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Martin & les fées,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Joseph Williams,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Henri Salvador,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Fabien Nataf,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [30]:
y.head()

0    Movie
1    Movie
2    Movie
3    Movie
4    Movie
Name: genre, dtype: object

In [31]:
X.dtypes

artist_name          object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

In [32]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features

['popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'liveness',
 'loudness',
 'speechiness',
 'tempo',
 'valence']

In [33]:
categories_features = X.select_dtypes(include=['object']).columns.tolist()
categories_features


['artist_name', 'key', 'mode', 'time_signature']

In [None]:
un = df[categories_features]
uni_cat = un.nunique()
uni_cat, len(df['artist_name'])


(artist_name       14564
 key                  12
 mode                  2
 time_signature        5
 dtype: int64,
 232725)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_transform = Pipeline([
    ('nan_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categories_transformer = Pipeline([
    ('nan_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprosessor = ColumnTransformer([
    ('num', numeric_transform, numeric_features),
    ('cat', categories_transformer, categories_features)
])

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, stratify=y ,random_state=42,
                                                    test_size=0.2)

In [36]:
X_train.head()

Unnamed: 0,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
30142,OxT,25,0.0296,0.502,217880,0.961,0.000381,D#,0.107,-5.302,Minor,0.084,140.015,4/4,0.624
157325,Rancid,40,0.18,0.411,152760,0.958,2e-06,G,0.126,-3.291,Major,0.0812,171.686,4/4,0.777
5389,Disturbed,52,0.000116,0.589,191960,0.983,0.00211,A#,0.0628,-2.521,Minor,0.121,110.007,4/4,0.469
205668,Tom Tykwer,21,0.281,0.143,213970,0.207,0.773,C#,0.113,-14.786,Major,0.0362,168.028,4/4,0.0646
202563,Alexandre Desplat,33,0.907,0.128,32173,0.136,0.944,A#,0.119,-16.687,Minor,0.0363,147.175,4/4,0.139


In [46]:
X_train['artist_name'].nunique()


13738

In [37]:
preprosessor.fit(X_train)

In [38]:
X_train_proc = preprosessor.transform(X_train)
X_test_proc = preprosessor.transform(X_test)

In [39]:
X_train_proc.shape, np.mean(X_train_proc, axis=0)

((186180, 13768),
 matrix([[-3.56287469e-17,  3.93082580e-17, -3.54502770e-16, ...,
           1.03684606e-01,  8.62283811e-01,  2.26501235e-02]]))

In [60]:
X_train_proc[:3,3939].toarray()

array([[0.],
       [0.],
       [0.]])

In [62]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    n_jobs=-1
)



In [63]:
#Итоговый гипер-пайплайнчик
clf = Pipeline([
    ('preproc', preprosessor),
    ('classifier', knn)
])

In [64]:
clf.fit(X_train, Y_train)

In [65]:
y_pred = clf.predict(X_test)

In [66]:
from sklearn.metrics import classification_report
cl_rep = classification_report(y_true=Y_test, y_pred=y_pred)

In [None]:
print(cl_rep)

                  precision    recall  f1-score   support

       A Capella       0.32      0.25      0.28        24
     Alternative       0.09      0.21      0.12      1853
           Anime       0.51      0.64      0.57      1787
           Blues       0.31      0.39      0.34      1805
Children's Music       0.82      0.83      0.82      1081
Children’s Music       0.05      0.07      0.06      1871
       Classical       0.59      0.63      0.61      1851
          Comedy       0.95      0.95      0.95      1936
         Country       0.31      0.45      0.37      1733
           Dance       0.15      0.19      0.17      1740
      Electronic       0.52      0.47      0.50      1875
            Folk       0.21      0.20      0.21      1860
         Hip-Hop       0.19      0.22      0.20      1859
           Indie       0.10      0.06      0.07      1909
            Jazz       0.43      0.35      0.39      1888
           Movie       0.69      0.59      0.64      1561
           Op