In [1]:
import numpy as np
import pandas as pd
from glob import glob

## Подготовка данных для обучения модели предсказаний

Перед загрузкой была осуществлена предподготовка данных на предмет наличия опечаток в названиях и жанрах.

In [2]:
genres = pd.read_csv('data/fixed_data.csv')

In [3]:
genres

Unnamed: 0,song,student,coarse_genre,genre1,genre2,genre3
0,7Horse_-_Answer_the_Bell.mp3,Шемякина Аня,Rock,Blues Rock,,
1,A_Simple_Life-Brian_Crain.mp3,Людмила_Проценко,classic,neo-classical,new age piano,
2,Adagio_in_G_Minor-Albinoni.mp3,Никанорова Даша,Classic,Instrumental,,
3,Age_Atomic-Solar_Bears.mp3,Паша_Дерябин,electro,Synthwave,,
4,Agnus_Dei-Enigma.mp3,Darya Khaleneva,new_age,downtempo,ambient,
...,...,...,...,...,...,...
325,Пыяла_АИГЕЛ.mp3,Цитрина Александра,Electronic,Hip-hop,,
326,Оргазм_Нострадамуса—Раздражение_аморала.mp3,Журбенко Петр,Punk,rock,,
327,Tycho-Awake.mp3,Купаева Дарья,electronica,chilwave,ambient,post-rock
328,Fugleflugten_Gladens_Port.mp3,Катерина_Данько,Indi_rock,shoegaze,,


In [4]:
genres['coarse_genre'] = genres['coarse_genre'].str.lower()
genres['genre1'] = genres['genre1'].str.lower()
genres['genre2'] = genres['genre2'].str.lower()
genres['genre3'] = genres['genre3'].str.lower()

In [5]:
print(genres['coarse_genre'].value_counts().to_string())

rock                 54
electronica          45
metal                32
hip-hop              24
pop                  18
electro              18
symphonic_metal      18
folk                 15
indie                14
new_age               8
electronic            7
indie_rock            7
classic               6
instrumental          5
jazz                  5
orchestral            4
alternative rock      3
art_song              3
country               3
alt_rock              3
christian_rock        3
indie_pop             2
britpop               2
ethnic                2
classics              2
rap rock              2
indi_rock             2
alternative           1
indie rock            1
indi_pop              1
heavy_metal           1
rap                   1
funk_rock             1
electropop            1
trap                  1
punk                  1
dance                 1
progressive_rock      1
disco                 1
soft_rock             1
rock'n'roll           1
synth-punk      

Отсортируем песни по названиям, чтобы они соотносились с треками по индексам

In [6]:
genres.sort_values('song', axis=0, inplace=True)
genres = genres.reset_index(drop=True)
genres

Unnamed: 0,song,student,coarse_genre,genre1,genre2,genre3
0,7Horse_-_Answer_the_Bell.mp3,Шемякина Аня,rock,blues rock,,
1,A_Simple_Life-Brian_Crain.mp3,Людмила_Проценко,classic,neo-classical,new age piano,
2,Adagio_in_G_Minor-Albinoni.mp3,Никанорова Даша,classic,instrumental,,
3,Age_Atomic-Solar_Bears.mp3,Паша_Дерябин,electro,synthwave,,
4,Agnus_Dei-Enigma.mp3,Darya Khaleneva,new_age,downtempo,ambient,
...,...,...,...,...,...,...
325,Электрослабость-Терентий.mp3,Aleksei Zverev,rock,punk,,
326,Я_говорю_тебе_да-Зоя_Ященко_и_группа_Белая_Гва...,Данил_Литвинов,art_song,indie_rock,,
327,Я_не_один-Эм_Калинин.mp3,Данил_Литвинов,indie,hip-hop,,
328,Я_так_соскучился-Порнофильмы.mp3,Данил_Литвинов,rock,punk_rock,,


In [7]:
genre_list = genres['coarse_genre'].tolist()+genres['genre1'].tolist()+genres['genre2'].tolist()+ genres['genre3'].tolist()
genre_list = list(set(genre_list))
genre_list

[nan,
 'breakbeat',
 'indie pop',
 'russian rock',
 'bard song',
 'garage rock',
 'soft_rock',
 'indie',
 'symphonic_metal',
 'technical_death_metal',
 'melodic_death_metal',
 'alt_rock',
 'post-rock',
 'chanson',
 'jewish music',
 'ambient',
 'classic russian pop',
 'bebop',
 'traditional_pop',
 'alt_pop',
 'folk_metal',
 'classic_rock',
 'sovietwave',
 'j-pop',
 'german hard rock',
 'disco',
 'progressive_rock',
 'glam rock',
 'post-punk',
 'dance',
 'trip-hop',
 'blues rock',
 'funk_rock',
 'gothic_metal',
 'country',
 'noir_chanson',
 'punk_rock',
 'christian_rock',
 'emo',
 'melodic-hardcore',
 'russian indie',
 'audiobook',
 'electropop',
 'rave',
 'trap',
 'power_metal',
 'neo mellow',
 'downtempo',
 'indie_rock',
 'industrial',
 'blues',
 'classical russian rock',
 'progressive',
 'drum_and_bass',
 "r'n'b",
 'alternative_rock',
 'art_song',
 'metal',
 'drill and bass',
 'noise',
 'french jazz',
 'pop',
 'classics',
 'dubstep',
 'dance music',
 'hardbass',
 'jazz',
 'electronic_

In [8]:
len(genre_list)

171

## Создадим таблицу жанров

В таблице имена колонок будут соответствовать уникальным жанрам, а строки - трекам.
Закодируем данные таким образом, чтобы для каждой песни в колонках с соответствующим жанром были единицы, а во всех остальных - нули.

In [9]:
genres_table = pd.DataFrame(0, index=np.arange(len(genres)), columns=genre_list)
genres_table.insert(0, 'song', genres['song'])
genres_table

Unnamed: 0,song,NaN,breakbeat,indie pop,russian rock,bard song,garage rock,soft_rock,indie,symphonic_metal,...,post-hardcore,orchestral,shoegaze,balcan music,heavy_metal,alternative-rock,russian synthpop,indi_pop,rap,electronica
0,7Horse_-_Answer_the_Bell.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A_Simple_Life-Brian_Crain.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adagio_in_G_Minor-Albinoni.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Age_Atomic-Solar_Bears.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agnus_Dei-Enigma.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Электрослабость-Терентий.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
326,Я_говорю_тебе_да-Зоя_Ященко_и_группа_Белая_Гва...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
327,Я_не_один-Эм_Калинин.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
328,Я_так_соскучился-Порнофильмы.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
genres_table.drop(genres_table.columns[1], axis=1, inplace=True)
genres_table

Unnamed: 0,song,breakbeat,indie pop,russian rock,bard song,garage rock,soft_rock,indie,symphonic_metal,technical_death_metal,...,post-hardcore,orchestral,shoegaze,balcan music,heavy_metal,alternative-rock,russian synthpop,indi_pop,rap,electronica
0,7Horse_-_Answer_the_Bell.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A_Simple_Life-Brian_Crain.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adagio_in_G_Minor-Albinoni.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Age_Atomic-Solar_Bears.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agnus_Dei-Enigma.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Электрослабость-Терентий.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
326,Я_говорю_тебе_да-Зоя_Ященко_и_группа_Белая_Гва...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
327,Я_не_один-Эм_Калинин.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
328,Я_так_соскучился-Порнофильмы.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
for i in range(len(genres['song'])):
    coarse_genre = genres['coarse_genre'][i]
    genres_table[coarse_genre][i] = 1
    if str(genres['genre1'][i]) != 'nan':
        genre1 = genres['genre1'][i]    
        genres_table[genre1][i] = 1
    if str(genres['genre2'][i]) != 'nan':
        genre2 = genres['genre2'][i]    
        genres_table[genre2][i] = 1
    if str(genres['genre3'][i]) != 'nan':
        genre3 = genres['genre3'][i]    
        genres_table[genre3][i] = 1
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres_table[coarse_genre][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres_table[genre1][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres_table[genre2][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres_table[genre3][i] = 1


In [12]:
genres_table

Unnamed: 0,song,breakbeat,indie pop,russian rock,bard song,garage rock,soft_rock,indie,symphonic_metal,technical_death_metal,...,post-hardcore,orchestral,shoegaze,balcan music,heavy_metal,alternative-rock,russian synthpop,indi_pop,rap,electronica
0,7Horse_-_Answer_the_Bell.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A_Simple_Life-Brian_Crain.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adagio_in_G_Minor-Albinoni.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Age_Atomic-Solar_Bears.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Agnus_Dei-Enigma.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Электрослабость-Терентий.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
326,Я_говорю_тебе_да-Зоя_Ященко_и_группа_Белая_Гва...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
327,Я_не_один-Эм_Калинин.mp3,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
328,Я_так_соскучился-Порнофильмы.mp3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
songs_dir = '/home/asha/IB/python/Music_classification/data/songs'

In [14]:
paths = sorted(glob(songs_dir + '/*.mp3') + glob(songs_dir + '/*.flac'))

In [15]:
names = [p.split('/')[-1] for p in paths]

### Проверяем, что все наши названия треков на диске совпали с именами в таблице

Всего должно быть 330 совпадений

In [16]:
np.setdiff1d(names, genres_table['song'])  # различий не найдено

array([], dtype='<U69')

In [17]:
len(set(genres_table['song']).intersection(set(names)))  # Все треки совпадают

330

### Сохраняем таблицу с жанрами для каждой песни

In [18]:
genres_table.to_csv('data/genres.csv', index=False)