In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


# Очистка и обработка данных

In [3]:
data = pd.read_csv('SpotifyFeatures.csv')
data.info()
data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232724 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 11  liveness          232725 non-null  float64
 12  loudness          232725 non-null  float64
 13  mode              232725 non-null  object 
 14  speechiness       232725 non-null  float64
 15  tempo             232725 non-null  float64
 16  time_signature    23

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368


In [4]:
ntrack_id = {}

for index, original_id in enumerate(data['track_id']):
    ntrack_id[original_id] = index + 1
    
data['track_id'] = data['track_id'].map(ntrack_id)

In [5]:
data.head(3)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,1,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),2,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,3,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368


In [6]:
df = data
df['popularity'].value_counts()

popularity
0      6312
50     5415
53     5414
51     5401
52     5342
       ... 
96        8
94        7
99        4
98        3
100       2
Name: count, Length: 101, dtype: int64

In [7]:
df = df.drop_duplicates(subset=['track_name', 'artist_name'], keep='first')

In [8]:
popular_value = df[(df['popularity'] >= 0) & (df['popularity'] <= 10)]

popular_value['popularity'].value_counts().sum()

15599

In [9]:
df = df[df['popularity'] > 10]

In [10]:
corr = df[['acousticness', 'danceability', 'energy', 'liveness', 
           'loudness', 'speechiness', 'tempo', 'valence', 'popularity']].corr()
corr

Unnamed: 0,acousticness,danceability,energy,liveness,loudness,speechiness,tempo,valence,popularity
acousticness,1.0,-0.36824,-0.710752,0.086488,-0.68557,0.197792,-0.261971,-0.351756,-0.273447
danceability,-0.36824,1.0,0.352924,-0.022435,0.453855,0.123253,0.027189,0.564769,0.255396
energy,-0.710752,0.352924,1.0,0.220612,0.818308,0.165919,0.23931,0.456166,0.114647
liveness,0.086488,-0.022435,0.220612,1.0,0.060896,0.57237,-0.062469,0.009694,-0.213763
loudness,-0.68557,0.453855,0.818308,0.060896,1.0,-0.005059,0.245979,0.42627,0.256942
speechiness,0.197792,0.123253,0.165919,0.57237,-0.005059,1.0,-0.108133,0.010291,-0.220942
tempo,-0.261971,0.027189,0.23931,-0.062469,0.245979,-0.108133,1.0,0.143931,0.077828
valence,-0.351756,0.564769,0.456166,0.009694,0.42627,0.010291,0.143931,1.0,0.070435
popularity,-0.273447,0.255396,0.114647,-0.213763,0.256942,-0.220942,0.077828,0.070435,1.0


In [11]:
df = df.drop(['loudness', 'track_id', 'duration_ms', 'key', 'mode', 'time_signature'], axis=1)
df = df.dropna()

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160914 entries, 7 to 232724
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             160914 non-null  object 
 1   artist_name       160914 non-null  object 
 2   track_name        160914 non-null  object 
 3   popularity        160914 non-null  int64  
 4   acousticness      160914 non-null  float64
 5   danceability      160914 non-null  float64
 6   energy            160914 non-null  float64
 7   instrumentalness  160914 non-null  float64
 8   liveness          160914 non-null  float64
 9   speechiness       160914 non-null  float64
 10  tempo             160914 non-null  float64
 11  valence           160914 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 16.0+ MB


# Начинаем работу с обучением модели

### Подготовка текстовых данных:

In [13]:
def tokenize_track_name(track_name): #Токенизация, превращение предложений в слова
    fixed = track_name.lower().strip().replace(' ', '_').replace('(', '').replace(')', '').replace("'", '')
    tokens = word_tokenize(fixed)
    return tokens

df['tokenized'] = df['track_name'].apply(tokenize_track_name)

In [14]:
model = Word2Vec(sentences=df['tokenized'], vector_size=100, window=5, min_count=1, workers=4)

track_vector = model.wv['ice_on_my_baby_feat._kevin_gates_-_remix'] #Получение вектора из примера
track_vector

array([ 9.1009140e-05, -4.8354626e-04,  3.9073359e-03,  2.1400785e-03,
       -3.7047565e-03,  3.4237886e-03, -3.9438247e-03,  6.0416711e-03,
        5.3183208e-03, -4.3890132e-03, -9.3062995e-03,  5.2790524e-04,
       -2.7662076e-03,  6.0389042e-03, -5.9912931e-03,  5.3417492e-03,
        3.5509539e-03,  6.5546873e-04,  9.6462509e-03,  7.6512335e-04,
        6.3972948e-03, -7.1902419e-03, -5.4282178e-03, -9.6371584e-03,
       -1.7605149e-03,  4.3710805e-03, -4.1288352e-03, -6.2523987e-03,
       -8.3235977e-04, -1.1033989e-03, -3.8899840e-03, -8.4642163e-03,
       -4.4307900e-03,  1.0168660e-03, -7.7912067e-03, -8.1407037e-03,
        8.3342791e-03, -6.6730496e-04, -6.6423072e-03,  1.1249899e-03,
        2.8591920e-03,  6.2815011e-03,  2.0230019e-03, -5.1247133e-03,
       -9.5850853e-03,  7.3707439e-03, -3.8310564e-03,  5.4073689e-04,
        3.1455385e-03, -9.2466595e-04,  6.3091554e-03, -4.3644262e-03,
        4.8595546e-03,  8.6594978e-03,  2.2547257e-03, -2.6387954e-03,
      

### Подготовка числовых данных:

In [15]:
numerical_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence']
scaler = StandardScaler()

df[numerical_features] = scaler.fit_transform(df[numerical_features]) #Применяем стандартизацию к числовым признакам

## Создание функций для получения рекомендаций

### Функция для поиска похожих треков по названию:

In [16]:
def get_similar_tracks_by_name(track, model, top_n=5): 
    if track in model.wv:
        similar_tracks = model.wv.most_similar(track, topn=top_n)
        return [(similar[0], round(similar[1], 3)) for similar in similar_tracks]
    else:
        print('Трек не найден в модели.')
        return 

get_similar_tracks_by_name('mia', model, 10)

[('_wd_31_/_act_4', 0.428),
 ('a_poem_to_my_chinese_girlfriend', 0.428),
 ('_ho_pagato_il_trimestre_marcello', 0.419),
 ('_je_vais_danser_en_votre_honneur_-_act_two', 0.379),
 ('thickfreakness', 0.378),
 ('_-_thin_white_duke_mix', 0.374),
 ('main_show', 0.373),
 ('tried_my_best', 0.372),
 ('tuba_concerto', 0.372),
 ('twang', 0.37)]

### Функция для нахождения похожих треков по числовым признакам:

In [17]:
def get_similar_tracks_by_features(track, df, top_n=5): 
    track_normalized = track.lower().strip()
    df['track_name_normalized'] = df['track_name'].str.lower().str.strip()
    
    if track_normalized not in df['track_name_normalized'].values:
        print(f"Трек '{track}' не найден в датасете.")
        return []

    track_index = df[df['track_name_normalized'] == track_normalized].index[0]
    target_features = df.loc[track_index, numerical_features].values.reshape(1, -1)
    
    similarities = cosine_similarity(target_features, df[numerical_features])
    similar_indices = similarities[0].argsort()[-top_n-1:-1][::-1]    
    by_features = [(df.iloc[i]['track_name'], round(similarities[0][i], 3)) for i in similar_indices]
    return by_features

get_similar_tracks_by_features('mia', df, 10)

[('No Cap', 0.984),
 ("Tell'em Who You Got It From", 0.982),
 ('Up (feat. Lil Uzi Vert)', 0.98),
 ('Talk Up (feat. Jay-Z)', 0.977),
 ('Perplexing Pegasus - From SR3MM', 0.975),
 ('Wake Up in the Sky', 0.975),
 ('Griselda - Remix', 0.974),
 ('PICK IT UP (feat. A$AP Rocky)', 0.974),
 ('Back Soon', 0.969),
 ('Blessings', 0.969)]

### Комбинированная функция, включает и текстовые и числовые признаки:

In [18]:
def get_combined_recommendations(track, artist, df, model, top_n=5): 
    
    track_normalized = track.lower().strip().replace(' ', '_').replace('(', '').replace(')', '').replace("'", '')
    artist_normalized = artist.lower().strip().replace(' ', '_').replace('(', '').replace(')', '').replace("'", '')
    
    if track_normalized not in model.wv:
        print('Трек не найден в модели.')
        return []
  
    by_name = model.wv.most_similar(track_normalized, topn=top_n) 
    by_name = [(similar[0], round(similar[1], 3)) for similar in by_name]

    df['track_name_normalized'] = df['track_name'].str.lower().str.strip().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace("'", '')
    df['artist_name_normalized'] = df['artist_name'].str.lower().str.strip().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace("'", '')
    
    track_index = df[(df['track_name_normalized'] == track_normalized) & (df['artist_name_normalized'] == artist_normalized)].index
    if len(track_index) == 0:
        print(f"Трек '{track}' от '{artist}' не найден в датасете.")
        return []
    
    track_index = track_index[0]
    target_features = df.loc[track_index, numerical_features].values.reshape(1, -1) 
    by_features = cosine_similarity(target_features, df[numerical_features]) 
    similar_indices = by_features[0].argsort()[-top_n-1:-1][::-1]    
    by_features = [(df.iloc[i]['track_name'], round(by_features[0][i], 3)) for i in similar_indices]

    combined_recommendations = {name: similarity for name, similarity in by_name}
    for name, similarity in by_features:
        if name not in combined_recommendations:
            combined_recommendations[name] = similarity

    genre_weight = 5 
    popularity_weight = 1.3
    artist_weight = 10  
    
    final_scores = {}
    for name, sim in combined_recommendations.items(): 
        genre_score = 0
        popularity_score = 0
        artist_score = 0
        
        genre_row = df[df['track_name'] == name] 
        if not genre_row.empty:
            genre_score = genre_row['genre'].values[0] 
            genre_score = 1 if genre_score in df.loc[track_index, 'genre'] else 0  
        
        popularity_row = df[df['track_name'] == name]
        if not popularity_row.empty:
            popularity_score = popularity_row['popularity'].values[0]  

        artist_row = df[df['track_name'] == name]
        if not artist_row.empty:
            artist_score = artist_weight if artist_row['artist_name_normalized'].values[0] == artist_normalized else 0
        
        final_score = (sim * 0.5 + 
                       (genre_weight * genre_score) + 
                       (popularity_weight * popularity_score) + 
                       artist_score)
        final_scores[name] = min(final_score, 100)  

    sorted_recommendations = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

    formatted_recommendations = []
    for name, score in sorted_recommendations[:top_n]:
        artist_name = df[df['track_name'] == name]['artist_name'].values[0]  
        formatted_recommendations.append((name, artist_name, round(score, 1))) 

    return formatted_recommendations

get_combined_recommendations('rap god', 'eminem', df, model, top_n=20)

[('Offended', 'Eminem', 92.2),
 ('Really Got It', 'Jerreau', 79.6),
 ('I Smile', 'Kirk Franklin', 75.9),
 ('Imma Tell', 'Tech N9ne', 74.4),
 ('Makin Love', 'Kevin Gates', 67.9),
 ('Cuidao - Reggaeton Version', 'Super Yei', 57.7),
 ('Sweetest Thing', 'Dave B.', 57.7),
 ('Them Belly Full - Live At Music Hall, Boston / 1978',
  'Bob Marley & The Wailers',
  55.1),
 ('Un Vacilon (Young Maelo)', 'Arcangel', 55.1),
 ('Power (Remix) [feat. Daddy Yankee, Kendo Kaponi, Gotay El Autentiko, Pusho, Alexio, D Ozi, Almighty, Ozuna & Anuel Aa]',
  'Benny Benni',
  52.5),
 ('Speak To Me / Breathe', 'Easy Star All-Stars', 52.5),
 ('El Comediante', 'Pusho', 51.2),
 ('Strangeulation Vol. II Cypher I', 'Tech N9ne Collabos', 49.9),
 ('Mi Niña - Remix', 'D-Enyel', 47.3),
 ('No Mercy', 'Eptic', 44.7),
 ('No Diga Más', 'Dillon Francis', 43.4),
 ('I Got It', 'Nitti Gritti', 39.5),
 ('Lo Que No Sabes Tu - En Vivo Desde El Anfiteatro El Hatillo, Caracas-Venezuela/2014',
  'Chino & Nacho',
  38.2),
 ("Everybody's

## Наглядный пример возможного оценивания по метрикам рекомендательных систем:

In [19]:
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    hits = sum(1 for track in recommended_k if track in relevant_set)
    return hits / k if k > 0 else 0

def recall_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    hits = sum(1 for track in recommended_k if track in relevant_set)
    return hits / len(relevant) if relevant else 0

def mean_average_precision(recommended, relevant):
    ap_sum = 0
    hits = 0
    for i, track in enumerate(recommended):
        if track in relevant:
            hits += 1
            ap_sum += hits / (i + 1)
    return ap_sum / len(relevant) if relevant else 0

def mean_reciprocal_rank(recommended, relevant):
    for i, track in enumerate(recommended):
        if track in relevant:
            return 1 / (i + 1)
    return 0

In [20]:
test_tracks = ['Wake Up in the Sky']
artists = ['Gucci Mane']

relevant_tracks = {
    'Wake Up in the Sky': ['a lot', 'Perplexing Pegasus - From SR3MM', 'No I in Team']
}

In [21]:
k = 5  
for track, artist in zip(test_tracks, artists):
    recommended = [rec[0] for rec in get_combined_recommendations(track, artist, df, model, top_n=25)] 
    relevant = relevant_tracks.get(track, [])
    
    precision = precision_at_k(recommended, relevant, k)
    recall = recall_at_k(recommended, relevant, k)
    map_score = mean_average_precision(recommended, relevant)
    mrr = mean_reciprocal_rank(recommended, relevant)

    print(f"Track: {track}")
    print(f"Precision@{k}: {precision}")
    print(f"Recall@{k}: {recall}")
    print(f"MAP: {map_score}")
    print(f"MRR: {mrr}\n")

Track: Wake Up in the Sky
Precision@5: 0.6
Recall@5: 1.0
MAP: 1.0
MRR: 1.0



##  Использование метрик, основанных на качественных характеристиках, таких как разнообразие и средняя дистанция:

In [22]:
def diversity_score(recommended_tracks, feature_vectors):
    distances = cosine_distances(feature_vectors)
    return np.mean(distances)

def mean_distance_score(original_track_features, recommended_features):
    distances = cosine_distances(original_track_features.reshape(1, -1), recommended_features)
    return np.mean(distances)

track = "Roar"
artist = 'katy perry'
recommendations = get_combined_recommendations(track, artist, df, model, top_n=10)

recommended_names = [rec[0] for rec in recommendations]
recommended_features = df[df['track_name'].isin(recommended_names)][numerical_features].values
original_track_features = df[df['track_name'] == track][numerical_features].values[0]

diversity = diversity_score(recommended_names, recommended_features)
mean_distance = mean_distance_score(original_track_features, recommended_features)

print("Diversity:", diversity) #Разнообразие
print("Mean Distance:", mean_distance) #Среднее расстояние


Diversity: 0.8383406702568168
Mean Distance: 0.658571091204805
