# Хакатона Практикум х Яндекс Музыка

## Проект: Обнаружение каверов музыкальных треков

### Команда: Deadline "Yesterday"



## Этап: Поиск дополнительной информации для трека

## Импортируем библиотеки

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import musicbrainzngs

In [3]:
# Initialize the MusicBrainz API client
musicbrainzngs.set_useragent("MyApp", "1.0")

In [4]:
import warnings
warnings.simplefilter("ignore")

## Функции написанные для проекта

In [5]:
# Define a function to fetch MusicBrainz metadata for a given ISRC
def fetch_musicbrainz_metadata(isrc):
    try:
        # Query MusicBrainz for metadata using the ISRC
        result = musicbrainzngs.search_recordings(isrc=isrc)
         
        if 'recording-list' in result and len(result['recording-list']) > 0:
            recording = result['recording-list'][len(result['recording-list'])-1]
            title = recording['title']
            vrem = recording['release-list']
            all_date = []
            for i in range(len(vrem)):
                if 'date' in vrem[i]:
                    all_date.append(vrem[i]['date'])
                else:
                    all_date.append('unknown')
                
            all_date.sort()     
            first_year = all_date[0]
            
            artist_credit_phrase = result['recording-list'][len(result['recording-list'])-1]['artist-credit-phrase']
            
            isrc_list = recording['isrc-list']

            # Fetch additional metadata from release group
            release_group_title = recording['release-list'][::-1][0]['release-group']['title']
            release_group_type = recording['release-list'][::-1][0]['release-group']['type']


            return {
                'title': title,
                'release_group_title': release_group_title,
                'release_group_type': release_group_type,
                'release_year': first_year ,
                'artist_credit_phrase': artist_credit_phrase,
                'isrc_list': isrc_list
            }
    
    except Exception as e:
        print(f"Error fetching metadata for ISRC {isrc}: {str(e)}")
    
    return None

Проверка работы запроса

In [6]:
fetch_musicbrainz_metadata('AUBM02000117')

{'title': 'Endless Summer',
 'release_group_title': 'Revisions 2020',
 'release_group_type': 'EP',
 'release_year': '2020-11-06',
 'artist_credit_phrase': 'Josh Pyke & Elana Stone',
 'isrc_list': ['AUBM02200152', 'AUBM02000117']}

## Закгрузка БД

Загрузка исходных данных, откуда будем брать  isrc и вести поиск доп.информации по треку.

In [7]:
url_meta = "./data/meta.json"
df_meta = pd.read_json(url_meta, lines=True)

In [8]:
df_meta.head()

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration
0,c3b9d6a354ca008aa4518329aaa21380,1639688000000.0,Happy New Year,EN,RUB422103970,[DANCE],161120.0
1,c57e3d13bbbf5322584a7e92e6f1f7ff,1637762000000.0,Bad Habits,EN,QZN882178276,[ELECTRONICS],362260.0
2,955f2aafe8717908c140bf122ba4172d,1637768000000.0,Por Esa Loca Vanidad,,QZNJZ2122549,"[FOLK, LATINFOLK]",260000.0
3,fae5a077c9956045955dde02143bd8ff,1637768000000.0,Mil Lagrimas,,QZNJZ2166033,"[FOLK, LATINFOLK]",190000.0
4,6bede082154d34fc18d9a6744bc95bf5,1637768000000.0,Sexo Humo y Alcohol,,QZNJZ2122551,"[FOLK, LATINFOLK]",203000.0


In [9]:
unique_isrc = df_meta['isrc'].sort_values().unique()[:-1]

In [10]:
unique_isrc

array(['AEA0D1828626', 'AEA0D1860569', 'AEA0D1866274', ...,
       'ZZOPM2235994', 'ZZOPM2235995', 'ZZOPM2237412'], dtype=object)

Создадим датафрейм куда будем записывать необходимую инфомрацию

In [12]:
data_isrc = pd.DataFrame(
    {
        "isrc": [np.nan]*len(unique_isrc),
        "artist": [np.nan]*len(unique_isrc),
        'title':[np.nan]*len(unique_isrc),
        "release_year": [np.nan]*len(unique_isrc),
        "release_group_title": [np.nan]*len(unique_isrc),
        "release_group_type": [np.nan]*len(unique_isrc),
        "isrc_list": [np.nan]*len(unique_isrc)
    }
)


In [13]:
data_isrc.head(7)

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,


Разобъем наш поиск на три части. Значений много и поиск будет вестись долго.

In [14]:
print('Всего шагов:', len(unique_isrc))
print('Делим на три части:', len(unique_isrc)/3.0)

Всего шагов: 71283
Делим на три части: 23761.0


In [15]:
part_1 = unique_isrc[0:int((len(unique_isrc)/3.0))]

In [16]:
len(part_1)

23761

In [17]:
part_1

array(['AEA0D1828626', 'AEA0D1860569', 'AEA0D1866274', ...,
       'QZDFP1932829', 'QZDFP1932830', 'QZDFP1932831'], dtype=object)

In [18]:
part_2 = unique_isrc[len(part_1):(2*len(part_1))]

In [19]:
part_2

array(['QZDFP1932832', 'QZDFP1932833', 'QZDFP1932834', ...,
       'QZNJW2184128', 'QZNJW2184129', 'QZNJW2184130'], dtype=object)

In [20]:
len(part_2)

23761

In [21]:
part_3 = unique_isrc[(2*len(part_1)):len(unique_isrc)]

In [22]:
part_3

array(['QZNJW2184131', 'QZNJW2184132', 'QZNJW2184216', ...,
       'ZZOPM2235994', 'ZZOPM2235995', 'ZZOPM2237412'], dtype=object)

In [23]:
len(part_3)

23761

## Поиск

In [24]:
index = 0
# Iterate through rows and fetch MusicBrainz metadata
for value in tqdm(part_3):
    metadata = fetch_musicbrainz_metadata(value)
    if (metadata != None):
        data_isrc.loc[index, 'isrc'] = value
        data_isrc.loc[index, 'title'] = metadata['title']
        data_isrc.loc[index, 'release_year'] = metadata['release_year']
        data_isrc.loc[index, 'release_group_title'] = metadata['release_group_title']
        data_isrc.loc[index, 'release_group_type'] = metadata['release_group_type']
        data_isrc.loc[index, 'artist'] = metadata['artist_credit_phrase']
        data_isrc.at[index, 'isrc_list'] = metadata['isrc_list']
        index += 1

data_isrc.to_csv('data_artist_song_p3.csv', sep='\t')

 51%|█████     | 12162/23761 [3:23:29<3:04:40,  1.05it/s]

Error fetching metadata for ISRC RUB422202211: caused by: <urlopen error [Errno 61] Connection refused>


 52%|█████▏    | 12355/23761 [3:26:43<3:10:06,  1.00s/it]

Error fetching metadata for ISRC RUB631700330: 'type'


 85%|████████▌ | 20281/23761 [5:39:15<57:01,  1.02it/s]  

Error fetching metadata for ISRC USAT21504066: 'type'


 87%|████████▋ | 20741/23761 [5:46:57<51:25,  1.02s/it]  

Error fetching metadata for ISRC USDY42032551: 'release-list'


 89%|████████▉ | 21173/23761 [5:54:12<43:47,  1.02s/it]  

Error fetching metadata for ISRC USHR12040190: 'release-list'


 99%|█████████▊| 23461/23761 [6:32:31<05:03,  1.01s/it]

Error fetching metadata for ISRC USWWW0202466: 'type'


100%|██████████| 23761/23761 [6:37:32<00:00,  1.00s/it]


In [25]:
data_isrc.head()

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,QZNJX2130808,plenka,Call Me (slowed),2021-10-06,Call Me (slowed),Single,QZNJX2130808
1,QZNWQ2070741,Tesher,Jalebi Baby,2020-11-13,NRJ Summer Hits Only 2021,Compilation,[QZNWQ2070741]
2,QZNWQ2087174,UNAVERAGE GANG,Pay the Price,2020-10-31,Revenant,EP,[QZNWQ2087174]
3,QZNWR2031217,Scott Rill & Yasmin Levy,La alegria,2022-02-25,House Party 2022,Compilation,[QZNWR2031217]
4,QZNWS2230712,g3ox_em,GigaChad Theme - Phonk House Version,unknown,GigaChad Theme (Phonk House Version),Single,[QZNWS2230712]


In [26]:
data_isrc.tail()

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
71278,,,,,,,
71279,,,,,,,
71280,,,,,,,
71281,,,,,,,
71282,,,,,,,


In [28]:
data_isrc_2 = pd.DataFrame(
    {
        "isrc": [np.nan]*len(unique_isrc),
        "artist": [np.nan]*len(unique_isrc),
        'title':[np.nan]*len(unique_isrc),
        "release_year": [np.nan]*len(unique_isrc),
        "release_group_title": [np.nan]*len(unique_isrc),
        "release_group_type": [np.nan]*len(unique_isrc),
        "isrc_list": [np.nan]*len(unique_isrc)
    }
)


In [29]:
index = 0
# Iterate through rows and fetch MusicBrainz metadata
for value in tqdm(part_2):
    metadata = fetch_musicbrainz_metadata(value)
    if (metadata != None):
        data_isrc_2.loc[index, 'isrc'] = value
        data_isrc_2.loc[index, 'title'] = metadata['title']
        data_isrc_2.loc[index, 'release_year'] = metadata['release_year']
        data_isrc_2.loc[index, 'release_group_title'] = metadata['release_group_title']
        data_isrc_2.loc[index, 'release_group_type'] = metadata['release_group_type']
        data_isrc_2.loc[index, 'artist'] = metadata['artist_credit_phrase']
        data_isrc_2.at[index, 'isrc_list'] = metadata['isrc_list']
        index += 1


100%|██████████| 23761/23761 [6:37:32<00:00,  1.00s/it]  


In [30]:
data_isrc_2.head()

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,QZES52065750,Yasumu,Lonely Nights,2020-05-29,Lonely Nights,Single,QZES52065750
1,QZES62110621,Wasback,Just Dance,2021-06-25,Just Dance,Single,[QZES62110621]
2,QZES71982312,Tones and I,Dance Monkey,,NRJ Hit Music Only 2020,Compilation,"[USAT21904532, QZES71982312]"
3,QZES72019010,The Tea Party,Isolation,2021-11-26,Blood Moon Rising,Album,[QZES72019010]
4,QZES82063366,Aljosha Konstanty & Liam Thomas,Peace of Mind,unknown,Peace of Mind,Single,[QZES82063366]


In [31]:
data_isrc_2.to_csv('data_artist_song_p2.csv', sep='\t')

In [32]:
data_isrc_3 = pd.DataFrame(
    {
        "isrc": [np.nan]*len(unique_isrc),
        "artist": [np.nan]*len(unique_isrc),
        'title':[np.nan]*len(unique_isrc),
        "release_year": [np.nan]*len(unique_isrc),
        "release_group_title": [np.nan]*len(unique_isrc),
        "release_group_type": [np.nan]*len(unique_isrc),
        "isrc_list": [np.nan]*len(unique_isrc)
    }
)


In [33]:
index = 0
# Iterate through rows and fetch MusicBrainz metadata
for value in tqdm(part_1):
    metadata = fetch_musicbrainz_metadata(value)
    if (metadata != None):
        data_isrc_3.loc[index, 'isrc'] = value
        data_isrc_3.loc[index, 'title'] = metadata['title']
        data_isrc_3.loc[index, 'release_year'] = metadata['release_year']
        data_isrc_3.loc[index, 'release_group_title'] = metadata['release_group_title']
        data_isrc_3.loc[index, 'release_group_type'] = metadata['release_group_type']
        data_isrc_3.loc[index, 'artist'] = metadata['artist_credit_phrase']
        data_isrc_3.at[index, 'isrc_list'] = metadata['isrc_list']
        index += 1
        
data_isrc_3.to_csv('data_artist_song_p1.csv', sep='\t')

 44%|████▍     | 10469/23761 [2:55:03<3:41:24,  1.00it/s]

Error fetching metadata for ISRC FR59R1767337: 'type'


 73%|███████▎  | 17325/23761 [4:50:03<1:46:56,  1.00it/s]

Error fetching metadata for ISRC NLB151800189: 'type'


 73%|███████▎  | 17326/23761 [4:50:04<1:47:27,  1.00s/it]

Error fetching metadata for ISRC NLB151800191: 'type'


 90%|████████▉ | 21340/23761 [5:57:24<42:01,  1.04s/it]  

Error fetching metadata for ISRC QMZM51800007: 'type'


100%|██████████| 23761/23761 [6:37:59<00:00,  1.00s/it]


In [34]:
data_isrc_3.head()

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,AEA0D1828626,Markul,Худший друг,2018-10-16,Great Depression,Album,AEA0D1828626
1,AEA0D2000803,plenka,When You Find Me,2020-09-02,Angle,EP,[AEA0D2000803]
2,AEA0Q1921861,Intelligency,August,2020-03-19,Renovatio,Album,[AEA0Q1921861]
3,AEA2D2200207,R3HAB,My Pony,2022-07-29,Bravo Hits 118,Compilation,[AEA2D2200207]
4,ARF410500095,Karen Souza,Do You Really Want to Hurt Me,,Essentials,Album,[ARF410500095]


In [35]:
data_isrc_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71283 entries, 0 to 71282
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 1418 non-null   object
 1   artist               1418 non-null   object
 2   title                1418 non-null   object
 3   release_year         1418 non-null   object
 4   release_group_title  1418 non-null   object
 5   release_group_type   1418 non-null   object
 6   isrc_list            1418 non-null   object
dtypes: object(7)
memory usage: 3.8+ MB


In [36]:
data_isrc_3= data_isrc_3.dropna(how='all')

In [37]:
data_isrc_3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1418 entries, 0 to 1417
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 1418 non-null   object
 1   artist               1418 non-null   object
 2   title                1418 non-null   object
 3   release_year         1418 non-null   object
 4   release_group_title  1418 non-null   object
 5   release_group_type   1418 non-null   object
 6   isrc_list            1418 non-null   object
dtypes: object(7)
memory usage: 88.6+ KB


In [38]:
data_isrc_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71283 entries, 0 to 71282
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 85 non-null     object
 1   artist               85 non-null     object
 2   title                85 non-null     object
 3   release_year         85 non-null     object
 4   release_group_title  85 non-null     object
 5   release_group_type   85 non-null     object
 6   isrc_list            85 non-null     object
dtypes: object(7)
memory usage: 3.8+ MB


In [39]:
data_isrc_2= data_isrc_2.dropna(how='all')

In [40]:
data_isrc_2

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,QZES52065750,Yasumu,Lonely Nights,2020-05-29,Lonely Nights,Single,QZES52065750
1,QZES62110621,Wasback,Just Dance,2021-06-25,Just Dance,Single,[QZES62110621]
2,QZES71982312,Tones and I,Dance Monkey,,NRJ Hit Music Only 2020,Compilation,"[USAT21904532, QZES71982312]"
3,QZES72019010,The Tea Party,Isolation,2021-11-26,Blood Moon Rising,Album,[QZES72019010]
4,QZES82063366,Aljosha Konstanty & Liam Thomas,Peace of Mind,unknown,Peace of Mind,Single,[QZES82063366]
...,...,...,...,...,...,...,...
80,QZMEP2041695,Eric D. Lawrence,Darksiders Genesis (cover),2020-10-13,Darksiders Genesis (cover),Single,[QZMEP2041695]
81,QZMHL2048016,"Coopex, Yohan Gerber & Lunis",Radioactive,2020-12-24,Radioactive,Single,[QZMHL2048016]
82,QZMHN2066557,"Arc North, Cour & New Beat Order feat. Lunis",Faded,2020-11-24,Faded,Single,[QZMHN2066557]
83,QZMHN2097278,"Crystal Rock, Felix Schorn & NOTSOBAD ft. City...",Astronaut in the Ocean,2021-05-28,Astronaut in the Ocean,Single,[QZMHN2097278]


In [41]:
data_isrc_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85 entries, 0 to 84
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 85 non-null     object
 1   artist               85 non-null     object
 2   title                85 non-null     object
 3   release_year         85 non-null     object
 4   release_group_title  85 non-null     object
 5   release_group_type   85 non-null     object
 6   isrc_list            85 non-null     object
dtypes: object(7)
memory usage: 5.3+ KB


In [42]:
data_isrc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71283 entries, 0 to 71282
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 1622 non-null   object
 1   artist               1622 non-null   object
 2   title                1622 non-null   object
 3   release_year         1622 non-null   object
 4   release_group_title  1622 non-null   object
 5   release_group_type   1622 non-null   object
 6   isrc_list            1622 non-null   object
dtypes: object(7)
memory usage: 3.8+ MB


In [43]:
data_isrc= data_isrc.dropna(how='all')

In [44]:
data_isrc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1622 entries, 0 to 1621
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 1622 non-null   object
 1   artist               1622 non-null   object
 2   title                1622 non-null   object
 3   release_year         1622 non-null   object
 4   release_group_title  1622 non-null   object
 5   release_group_type   1622 non-null   object
 6   isrc_list            1622 non-null   object
dtypes: object(7)
memory usage: 101.4+ KB


## Объединение полученных данных в одну таблицу

In [45]:
data_isrc = pd.concat([data_isrc,data_isrc_2], ignore_index=True)

In [46]:
data_isrc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1707 entries, 0 to 1706
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 1707 non-null   object
 1   artist               1707 non-null   object
 2   title                1707 non-null   object
 3   release_year         1707 non-null   object
 4   release_group_title  1707 non-null   object
 5   release_group_type   1707 non-null   object
 6   isrc_list            1707 non-null   object
dtypes: object(7)
memory usage: 93.5+ KB


In [47]:
data_isrc = pd.concat([data_isrc,data_isrc_3], ignore_index=True)

In [48]:
data_isrc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3125 entries, 0 to 3124
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isrc                 3125 non-null   object
 1   artist               3125 non-null   object
 2   title                3125 non-null   object
 3   release_year         3125 non-null   object
 4   release_group_title  3125 non-null   object
 5   release_group_type   3125 non-null   object
 6   isrc_list            3125 non-null   object
dtypes: object(7)
memory usage: 171.0+ KB


In [49]:
data_isrc.head()

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
0,QZNJX2130808,plenka,Call Me (slowed),2021-10-06,Call Me (slowed),Single,QZNJX2130808
1,QZNWQ2070741,Tesher,Jalebi Baby,2020-11-13,NRJ Summer Hits Only 2021,Compilation,[QZNWQ2070741]
2,QZNWQ2087174,UNAVERAGE GANG,Pay the Price,2020-10-31,Revenant,EP,[QZNWQ2087174]
3,QZNWR2031217,Scott Rill & Yasmin Levy,La alegria,2022-02-25,House Party 2022,Compilation,[QZNWR2031217]
4,QZNWS2230712,g3ox_em,GigaChad Theme - Phonk House Version,unknown,GigaChad Theme (Phonk House Version),Single,[QZNWS2230712]


In [50]:
data_isrc.sort_values(by=['isrc'])

Unnamed: 0,isrc,artist,title,release_year,release_group_title,release_group_type,isrc_list
1707,AEA0D1828626,Markul,Худший друг,2018-10-16,Great Depression,Album,AEA0D1828626
1708,AEA0D2000803,plenka,When You Find Me,2020-09-02,Angle,EP,[AEA0D2000803]
1709,AEA0Q1921861,Intelligency,August,2020-03-19,Renovatio,Album,[AEA0Q1921861]
1710,AEA2D2200207,R3HAB,My Pony,2022-07-29,Bravo Hits 118,Compilation,[AEA2D2200207]
1711,ARF410500095,Karen Souza,Do You Really Want to Hurt Me,,Essentials,Album,[ARF410500095]
...,...,...,...,...,...,...,...
1617,ZZOPM2235991,Lofi Fruits Music & Chill Fruits Music,Industry Baby,2022-06-24,"Lofi Fruits Remix, Vol. 1",Album,[ZZOPM2235991]
1618,ZZOPM2235992,Lofi Fruits Music & Chill Fruits Music,Thousand Miles,2022-06-24,"Lofi Fruits Remix, Vol. 1",Album,[ZZOPM2235992]
1619,ZZOPM2235993,Lofi Fruits Music & Chill Fruits Music,Everybody Dies in Their Nightmares,2022-06-24,"Lofi Fruits Remix, Vol. 1",Album,[ZZOPM2235993]
1620,ZZOPM2235994,Lofi Fruits Music & Chill Fruits Music,Can We Kiss Forever?,2022-06-24,"Lofi Fruits Remix, Vol. 1",Album,[ZZOPM2235994]


In [51]:
data_isrc = data_isrc.sort_values(by=['isrc'])

Запись данных на локальный компьютер

In [52]:
data_isrc.to_csv('data_artist_song.csv', sep='\t')