<a href="https://colab.research.google.com/github/richy1996/Springboard_capstone3/blob/main/music_data_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install the FluidSynth system package with apt; -y answers the confirmation prompt.
!sudo apt install -y fluidsynth

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin
  libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5
  libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 qsynth qt5-gtk-platformtheme qttranslations5-l10n
  timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs qt5-image-formats-plugins qtwayland5 jackd
The following NEW packages will be installed:
  fluid-soundfont-gm fluidsynth libevdev2 libfluidsynth3 libgudev-1.0-0
  libinput-bin libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a
  libqt5dbus5 libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libx

In [None]:
!pip install --upgrade pyfluidsynth

Collecting pyfluidsynth
  Downloading pyFluidSynth-1.3.2-py3-none-any.whl (19 kB)
Installing collected packages: pyfluidsynth
Successfully installed pyfluidsynth-1.3.2


In [None]:
!pip install pretty_midi

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.0-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592287 sha256=ee6d0b793dd10191d11a9edd405f8bd784ac5314cd604445d5490937f79563d4
  Stored in directory: /root/.cache/pip/wheels/cd/a5/30/7b8b7f58709f5150f67f98fde4b891ebf0be9ef07a8af49f25
Successfully built pretty_midi
Installing collected packages: mido, pretty_midi
Successfully installed mido-1.3.0 pretty_midi-0.2.10


In [None]:
import collections
import datetime
import glob
import pathlib
from typing import Optional
from urllib.parse import urljoin

import fluidsynth
import numpy as np
import pandas as pd
import pretty_midi
import seaborn as sns
import tensorflow as tf
from IPython import display
from matplotlib import pyplot as plt

In [None]:
# Seed NumPy's and TensorFlow's global random generators with one shared value.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Sampling rate used when rendering audio for playback.
_SAMPLING_RATE = 16000

## Download the dataset

- Scraped from https://www.midiworld.com/
- Specifically, all of the classical works (~1,300 files)

In [None]:
import requests
from bs4 import BeautifulSoup

def mid_scraper(URL, timeout=30):
    """Collect the MIDI links found on a single web page.

    Args:
        URL: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A set with one URL for every anchor whose ``href`` contains ``.mid``.
        Relative hrefs are resolved against ``URL``.

    Raises:
        requests.HTTPError: If the page responds with a 4xx/5xx status.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Resolve each href against the page URL so relative links become absolute;
    # the downloader needs a full URL to fetch the file.
    return {
        urljoin(URL, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if '.mid' in anchor['href']
    }

In [None]:
# Collect the MIDI links that appear directly on the classics index page
url = "https://www.midiworld.com/classic.htm"

classic_links = mid_scraper(url)
len(classic_links)

241

In [None]:
import requests
from bs4 import BeautifulSoup
import re

URL = "https://www.midiworld.com/classic.htm"
response = requests.get(URL, timeout=30)
# Stop on a 4xx/5xx reply rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all anchor tags with "The" in the text (indicating artist pages like "The Bach Page")
artist_page_links = [a['href'] for a in soup.find_all('a', href=True) if 'The' in a.text]

# Keep only links whose last path segment is a lowercase "<name>.htm" file.
# The pattern is compiled once instead of on every iteration.
artist_page_pattern = re.compile(r'^[a-z]+\.htm$')
filtered_links = [
    link for link in artist_page_links
    if artist_page_pattern.match(link.split('/')[-1])
]

print(filtered_links)


['bach.htm', 'bartok.htm', 'beethoven.htm', 'brahms.htm', 'chopin.htm', 'handel.htm', 'hummel.htm', 'liszt.htm', 'mendelssohn.htm', 'rachmaninov.htm', 'scarlatti.htm', 'schumann.htm', 'scriabin.htm', 'shostakovich.htm']


In [None]:
# Scrape every artist page and merge its MIDI links into the running set.
for name in filtered_links:
    # The filter above only validated the last path segment of `name`, so the
    # href itself may be relative, root-relative or absolute. urljoin handles
    # all three, whereas plain concatenation is only right for relative hrefs.
    url = urljoin("https://www.midiworld.com/", name)

    links = mid_scraper(url)
    classic_links = classic_links.union(links)

In [None]:
# Number of unique MIDI links after merging in the artist pages
len(classic_links)

1328

In [None]:
# Peek at three of the links (a set has no defined order, so which three is arbitrary)
list(classic_links)[:3]

['https://www.midiworld.com/midis/other/bach/bwv781.mid',
 'https://www.midiworld.com/midis/other/bach/bwv833.mid',
 'https://www.midiworld.com/midis/other/c1/coumoiss.mid']

In [None]:
!mkdir midis

In [None]:
import requests
# Downloads the actual midi file given a midi link

def midi_downloader(midi_link, timeout=30):
    """Download one MIDI file into the local ``midis/`` directory.

    The file is saved under the last path segment of ``midi_link``, so two
    links that end in the same file name are written to the same path and the
    later download replaces the earlier one.

    Args:
        midi_link: URL of the MIDI file to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. A status line is printed for every link.
    """
    response = requests.get(midi_link, allow_redirects=True, timeout=timeout)

    # Skip error responses so an HTTP error body is never saved as a .mid file.
    if not response.ok:
        print(f"Skipping {midi_link}: HTTP {response.status_code}")
        return

    # Create the target directory on demand so the function also works when
    # the `mkdir` cell has not been run.
    pathlib.Path('midis').mkdir(parents=True, exist_ok=True)

    # Save the file to your local machine
    filename = 'midis/' + midi_link.split('/')[-1]  # Extract the filename from the URL
    with open(filename, 'wb') as file:
        file.write(response.content)

    print(f"{filename} has been downloaded!")

In [None]:
# Iterate in sorted order: `classic_links` is a set, so its iteration order is
# arbitrary, and midi_downloader() names each file by the URL's last path
# segment. A fixed order makes it repeatable which file survives when two
# links share a file name.
for midi_link in sorted(classic_links):
    midi_downloader(midi_link)

midis/bwv781.mid has been downloaded!
midis/bwv833.mid has been downloaded!
midis/coumoiss.mid has been downloaded!
midis/latalant.mid has been downloaded!
midis/all_his.mid has been downloaded!
midis/bwv803.mid has been downloaded!
midis/op87_04a.mid has been downloaded!
midis/mazrka17.mid has been downloaded!
midis/waltz_12.mid has been downloaded!
midis/bjsbmm15.mid has been downloaded!
midis/mthuo74a.mid has been downloaded!
midis/peter.mid has been downloaded!
midis/orphan.mid has been downloaded!
midis/kreuz1vc.mid has been downloaded!
midis/msw34.mid has been downloaded!
midis/gfh-jm07.mid has been downloaded!
midis/cantique.mid has been downloaded!
midis/bwv841.mid has been downloaded!
midis/bwv806.mid has been downloaded!
midis/han4-4b.mid has been downloaded!
midis/mikro122.mid has been downloaded!
midis/J_M_Bach_Auf_lasst_uns_den_Herren_loben.mid has been downloaded!
midis/bwv851.mid has been downloaded!
midis/diary.mid has been downloaded!
midis/op6n07m4.mid has been downlo

In [None]:
# Count the entries in the midis directory with a shell pipeline.
# Files are named by the URL's last path segment, so this can be lower than
# len(classic_links) when two links share a file name.

!ls midis | wc -l

1326


In [None]:
# Zip the downloaded midis folder into a single archive.
# NOTE(review): the absolute /content paths assume the notebook's working
# directory is /content (where `!mkdir midis` created the folder) — confirm.

!zip -r /content/midis.zip /content/midis

  adding: content/midis/ (stored 0%)
  adding: content/midis/op87_04b.mid (deflated 64%)
  adding: content/midis/waltz_08.mid (deflated 79%)
  adding: content/midis/poissons.mid (deflated 78%)
  adding: content/midis/op6n09m4.mid (deflated 80%)
  adding: content/midis/maz25n3.mid (deflated 39%)
  adding: content/midis/bydlo.mid (deflated 44%)
  adding: content/midis/kinder5.mid (deflated 25%)
  adding: content/midis/rachmm6.mid (deflated 63%)
  adding: content/midis/waltz_03.mid (deflated 65%)
  adding: content/midis/dsch10-2.mid (deflated 64%)
  adding: content/midis/tristes2.mid (deflated 44%)
  adding: content/midis/hans3a.mid (deflated 69%)
  adding: content/midis/hmwoo23b.mid (deflated 62%)
  adding: content/midis/op6n08m3.mid (deflated 72%)
  adding: content/midis/satieson.mid (deflated 38%)
  adding: content/midis/beeth9-1.mid (deflated 33%)
  adding: content/midis/hmcello1.mid (deflated 66%)
  adding: content/midis/chpson3d.mid (deflated 44%)
  adding: content/midis/suite_3.mid

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy midis.zip into the top level of the mounted Google Drive (MyDrive)

!cp /content/midis.zip /content/drive/MyDrive