In [8]:
from music21 import converter, corpus, instrument, midi, note, chord, pitch, tree, stream, meter

import os
import numpy as np
import pandas as pd

In [2]:
# Source MIDI file that is passed to truncate_midi_by_bars further down.
midi_path = './data/AustinPowers-TheSpyWhoShaggedMe.mid'

In [3]:
# def open_midi(midi_path, remove_drums):
#     # There is a one-line method to read MIDIs
#     # but to remove the drums we need to manipulate some
#     # low level MIDI events.
#     mf = midi.MidiFile()
#     mf.open(midi_path)
#     mf.read()
#     mf.close()
#     if (remove_drums):
#         for i in range(len(mf.tracks)):
#             mf.tracks[i].events = [ev for ev in mf.tracks[i].events if ev.channel != 10]          

#     return midi.translate.midiFileToStream(mf)

# sample = open_midi(midi_path, True)
# len(sample)

In [4]:
def get_file_name(path):
    """Return the file name of ``path`` without its directory and extension.

    Forward slashes and backslashes are both treated as directory separators,
    so paths produced on Windows give the same result as POSIX paths. Only the
    final extension is removed, which keeps dots that belong to the name.

    Args:
        path: File path as a string or ``os.PathLike`` object.

    Returns:
        The last path component with its final extension stripped.
    """
    # Normalise separators first so the result does not depend on the OS.
    base_name = os.fspath(path).replace('\\', '/').rsplit('/', 1)[-1]
    stem, dot, _extension = base_name.rpartition('.')
    # No dot at all, or a leading-dot name such as ".hidden": keep it whole.
    return stem if dot and stem else base_name

def open_midi(midi_path):
    """Parse a MIDI file and drop its percussion parts.

    Args:
        midi_path: Path of the MIDI file handed to ``converter.parse``.

    Returns:
        The parsed score with every part whose name contains 'Percussion'
        or 'Drum' removed.
    """
    score = converter.parse(midi_path)

    # Snapshot the parts so removing from ``score`` cannot disturb iteration.
    for part in list(score.parts):
        # partName can be None for an unnamed part; treat it as non-percussion
        # instead of failing on the substring test.
        part_name = part.partName or ''
        if 'Percussion' in part_name or 'Drum' in part_name:
            score.remove(part)
    return score

def extract_tempo(score):
    """Return the ``number`` of the first metronome mark in the flattened score.

    Args:
        score: Object whose ``flat`` attribute iterates over elements that
            expose ``classes``.

    Returns:
        The ``number`` attribute of the first element whose ``classes``
        contains 'MetronomeMark', or None when there is no such element.
    """
    tempo_numbers = (
        candidate.number
        for candidate in score.flat
        if 'MetronomeMark' in candidate.classes
    )
    # next() stops at the first match, exactly like an early return in a loop.
    return next(tempo_numbers, None)

def extract_key(score):
    """Return the string form of the first key signature in the flattened score.

    Args:
        score: Object whose ``flat`` attribute iterates over elements that
            expose ``classes``.

    Returns:
        ``str()`` of the first element whose ``classes`` contains
        'KeySignature', or None when there is no such element.
    """
    matches = (item for item in score.flat if 'KeySignature' in item.classes)
    first_signature = next(matches, None)
    if first_signature is None:
        return None
    return str(first_signature)

def to_snake_case(text):
    """Lower-case ``text`` and swap every space for an underscore.

    Args:
        text: The string to convert.

    Returns:
        The lower-cased string with each ' ' replaced by '_'.
    """
    lowered = text.lower()
    # Splitting on an explicit " " keeps empty pieces, so runs of spaces turn
    # into runs of underscores.
    return "_".join(lowered.split(" "))

def truncate_midi_by_bars(midi_path, num_bars, output_root='./data/cropped'):
    """Cut every part of a MIDI file into consecutive ``num_bars``-bar crops.

    Each crop that contains at least one note is written to
    ``<output_root>/<file name>/<instrument>_<NN>.mid`` and described by one
    row of the returned table. Crops without notes are reported and skipped.
    A trailing remainder shorter than ``num_bars`` bars is not exported.

    Args:
        midi_path: Path of the MIDI file to split.
        num_bars: Number of bars per crop; must be at least 1.
        output_root: Folder that receives one sub-folder per source file.
            Defaults to the previously hard-coded './data/cropped'.

    Returns:
        pandas.DataFrame with the columns song_title, file_name,
        instrument_name, start_position, end_position, num_bars, tempo and
        key, sorted by instrument name and start position.

    Raises:
        ValueError: If ``num_bars`` is smaller than 1.
    """
    if num_bars < 1:
        raise ValueError(f'num_bars must be at least 1, got {num_bars}')

    score = open_midi(midi_path)
    file_name = get_file_name(midi_path)
    tempo = extract_tempo(score)
    key = extract_key(score)

    # Length of one bar in quarter notes. The numerator alone is only correct
    # for x/4 meters (6/8 has numerator 6 but lasts 3 quarter notes).
    time_signature = next(
        iter(score.recurse().getElementsByClass(meter.TimeSignature)), None
    )
    if time_signature is not None:
        bar_length = time_signature.barDuration.quarterLength
    else:
        # No time signature found: fall back to 4/4.
        bar_length = 4.0

    num_measures = int(score.duration.quarterLength / bar_length)
    num_crops = int(num_measures / num_bars)

    output_folder = f'{output_root}/{file_name}'
    os.makedirs(output_folder, exist_ok=True)

    columns = ['song_title', 'file_name', 'instrument_name', 'start_position',
               'end_position', 'num_bars', 'tempo', 'key']
    records = []

    print(f'Processing the file: {file_name}...')
    for part_index, part in enumerate(score.parts):
        # An unnamed part has no partName; give it a positional fallback name.
        instrument_name = to_snake_case(part.partName or f'part {part_index}')
        print(f'Processing {instrument_name}...')

        for i in range(num_crops):
            # Measure numbers are 1-based and both ends are inclusive.
            start_position = 1 + i * num_bars
            end_position = start_position + num_bars - 1
            output_file_name = f'{instrument_name}_{i:02d}.mid'

            cropped_part = part.measures(int(start_position), int(end_position))
            sounding = [item for item in cropped_part.flat.notes if not item.isRest]

            if len(sounding) == 0:
                print(f'No notes in {output_file_name}')
                continue

            cropped_part.write('midi', f'{output_folder}/{output_file_name}')
            records.append({
                'song_title': file_name,
                'file_name': output_file_name,
                'instrument_name': instrument_name,
                'start_position': start_position,
                'end_position': end_position,
                'num_bars': num_bars,
                'tempo': tempo,
                'key': key,
            })

    # Build the table once from the collected rows; ``columns`` keeps the
    # schema stable even when nothing was exported.
    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values(by=['instrument_name', 'start_position']).reset_index(drop=True)

    print(f'Finished processing {file_name}', 'Saved:', len(records), 'files')

    # Return the metadata table describing the exported crops.
    return df

In [5]:
# Split the score into 8-bar crops per part and keep the returned metadata table.
df = truncate_midi_by_bars(midi_path, 8)

Processing the file: AustinPowers-TheSpyWhoShaggedMe...
Processing flute_1...
No notes in flute_1_00.mid
Processing flute_2...
No notes in flute_2_00.mid
No notes in flute_2_08.mid
Processing trumpet_1...
No notes in trumpet_1_00.mid
No notes in trumpet_1_08.mid
Processing trumpet_2...
No notes in trumpet_2_00.mid
No notes in trumpet_2_08.mid
Processing trumpet_3...
No notes in trumpet_3_00.mid
No notes in trumpet_3_08.mid
Processing trombone_1...
Processing trombone_2...
Processing bass_trombone...
Processing piano...
No notes in piano_02.mid
No notes in piano_03.mid
No notes in piano_04.mid
No notes in piano_05.mid
No notes in piano_06.mid
No notes in piano_07.mid
No notes in piano_08.mid
No notes in piano_09.mid
No notes in piano_10.mid
No notes in piano_11.mid
Processing bass...
Finished processing AustinPowers-TheSpyWhoShaggedMe Saved: 101 files


In [6]:
df[df['instrument_name'] == 'flute_2']

Unnamed: 0,song_title,file_name,instrument_name,start_position,end_position,num_bars,tempo,key
35,AustinPowers-TheSpyWhoShaggedMe,flute_2_01.mid,flute_2,9,16,8,150.0,B- major
36,AustinPowers-TheSpyWhoShaggedMe,flute_2_02.mid,flute_2,17,24,8,150.0,B- major
37,AustinPowers-TheSpyWhoShaggedMe,flute_2_03.mid,flute_2,25,32,8,150.0,B- major
38,AustinPowers-TheSpyWhoShaggedMe,flute_2_04.mid,flute_2,33,40,8,150.0,B- major
39,AustinPowers-TheSpyWhoShaggedMe,flute_2_05.mid,flute_2,41,48,8,150.0,B- major
40,AustinPowers-TheSpyWhoShaggedMe,flute_2_06.mid,flute_2,49,56,8,150.0,B- major
41,AustinPowers-TheSpyWhoShaggedMe,flute_2_07.mid,flute_2,57,64,8,150.0,B- major
42,AustinPowers-TheSpyWhoShaggedMe,flute_2_09.mid,flute_2,73,80,8,150.0,B- major
43,AustinPowers-TheSpyWhoShaggedMe,flute_2_10.mid,flute_2,81,88,8,150.0,B- major
44,AustinPowers-TheSpyWhoShaggedMe,flute_2_11.mid,flute_2,89,96,8,150.0,B- major


In [7]:
# df.to_csv('austin.csv', index=False)

In [8]:
import mido
import editdistance
        
def manhattan_distance(list1, list2):
    """Sum of absolute element-wise differences between two lists of groups.

    Groups and elements are compared position by position. Unlike a plain
    ``zip``, nothing is silently dropped when the inputs differ in length:
    every value without a counterpart adds its absolute value, and at least 1,
    to the distance. The result is therefore 0 only when both inputs hold the
    same values in the same grouping.

    Args:
        list1: Iterable of iterables of numbers.
        list2: Iterable of iterables of numbers.

    Returns:
        The accumulated distance as a number.
    """
    # Local import keeps the function usable without touching the import cell.
    from itertools import zip_longest

    missing = object()  # sentinel: cannot collide with a real value such as 0

    def _pair_cost(x, y):
        """Cost of one position, penalising a value that has no counterpart."""
        if x is missing:
            return max(abs(y), 1)
        if y is missing:
            return max(abs(x), 1)
        return abs(x - y)

    distance = 0
    # A group missing on one side is compared against an empty group.
    for sub1, sub2 in zip_longest(list1, list2, fillvalue=()):
        for x, y in zip_longest(sub1, sub2, fillvalue=missing):
            distance += _pair_cost(x, y)
    return distance

def _extract_pitch_groups(midi_file):
    """Collect sounding note pitches, grouping runs that rise by one semitone."""
    groups = []
    for msg in mido.merge_tracks(midi_file.tracks):
        # A note_on with velocity 0 is a note-off, not a new note.
        if msg.type != 'note_on' or msg.velocity == 0:
            continue
        if groups and msg.note == groups[-1][-1] + 1:
            # Pitch continues the current ascending run.
            groups[-1].append(msg.note)
        else:
            # Start a new group with this pitch.
            groups.append([msg.note])
    return groups


def compare_midi_files2(file1, file2):
    """Return a pitch-based distance between two MIDI files.

    Both files are loaded with mido, their tracks are merged, and the pitches
    of the note-on messages are grouped into ascending semitone runs. The
    groups are then compared with ``manhattan_distance``. Timing, duration
    and velocity are not part of the comparison.

    Args:
        file1: Path of the first MIDI file.
        file2: Path of the second MIDI file.

    Returns:
        The value of ``manhattan_distance`` for the two group lists.
    """
    groups1 = _extract_pitch_groups(mido.MidiFile(file1))
    groups2 = _extract_pitch_groups(mido.MidiFile(file2))
    return manhattan_distance(groups1, groups2)

In [3]:
import os

In [4]:
# NOTE(review): truncate_midi_by_bars writes its crops under './data/cropped/<file_name>',
# not './cropped/' — confirm which working directory this cell expects.
output_folder = './cropped/'

In [5]:
os.listdir(output_folder)

['AllThatJazz', 'AustinPowers-TheSpyWhoShaggedMe', 'Avengers', 'Braveheart']

In [11]:
# Reload the crop metadata from disk; this rebinds df.
# NOTE(review): the visible df.to_csv('austin.csv', ...) calls are commented out,
# so this presumably relies on a file from an earlier session — confirm.
df = pd.read_csv('austin.csv')

In [19]:
str(df['song_title'][0])

'AustinPowers-TheSpyWhoShaggedMe'

In [9]:
# Folder that holds the cropped MIDI files of this song.
midi_folder = './data/cropped/AustinPowers-TheSpyWhoShaggedMe'
# File names come from the metadata table; joining them with the folder gives
# one path per row, in row order.
midi_files = df['file_name']
midi_files_path = [os.path.join(midi_folder, file) for file in midi_files]

midi_files

0           bass_00.mid
1           bass_01.mid
2           bass_02.mid
3           bass_03.mid
4           bass_04.mid
             ...       
96     trumpet_3_06.mid
97     trumpet_3_07.mid
98     trumpet_3_09.mid
99     trumpet_3_10.mid
100    trumpet_3_11.mid
Name: file_name, Length: 101, dtype: object

In [10]:
df.head()

Unnamed: 0,song_title,file_name,instrument_name,start_position,end_position,num_bars,tempo,key
0,AustinPowers-TheSpyWhoShaggedMe,bass_00.mid,bass,1,8,8,150.0,B- major
1,AustinPowers-TheSpyWhoShaggedMe,bass_01.mid,bass,9,16,8,150.0,B- major
2,AustinPowers-TheSpyWhoShaggedMe,bass_02.mid,bass,17,24,8,150.0,B- major
3,AustinPowers-TheSpyWhoShaggedMe,bass_03.mid,bass,25,32,8,150.0,B- major
4,AustinPowers-TheSpyWhoShaggedMe,bass_04.mid,bass,33,40,8,150.0,B- major


In [None]:
# Flag crops that compare_midi_files2 scores as 0 against another crop of the
# same instrument. The matching file is stored in df['repeated'] and
# remembered so that it is not used as a search source itself.
current_instr = None
comparing_instr = None

current_file = None

duplicated = {}
duplicate_detected = []
df['repeated'] = [None] * len(df)

for i in range(len(df)):
    current_instr = df.loc[i, 'instrument_name']
    current_file = df.loc[i, 'file_name']
    print('comparing : {}'.format(current_file))

    # Already recorded as the duplicate of another crop: nothing to search.
    if current_file in duplicate_detected:
        continue

    for j in range(len(df)):
        comparing_instr = df.loc[j, 'instrument_name']

        # Only *other* crops of the *same* instrument are candidates.
        if i == j or current_instr != comparing_instr:
            continue

        similarity = compare_midi_files2(midi_files_path[i], midi_files_path[j])
        if similarity != 0:
            continue

        # Later matches overwrite earlier ones, so the last match wins.
        duplicated[midi_files[i]] = midi_files[j]
        duplicate_detected.append(midi_files[j])
        df.loc[i, 'repeated'] = midi_files[j]


In [15]:
len(duplicated)

13

In [17]:
# df.drop('same_with', axis=1, inplace=True)

In [21]:
# df.to_csv('austin.csv', index=False)

In [26]:
# Independent copy of df, taken before a later cell rewrites df['file_name'] in place.
dfs = df.copy()

In [30]:
dfs[dfs['instrument_name'] == 'bass']

Unnamed: 0,song_title,file_name,instrument_name,start_position,end_position,num_bars,tempo,key,repeated
0,AustinPowers-TheSpyWhoShaggedMe,bass_00.mid,bass,1,8,8,150.0,B- major,
1,AustinPowers-TheSpyWhoShaggedMe,bass_01.mid,bass,9,16,8,150.0,B- major,
2,AustinPowers-TheSpyWhoShaggedMe,bass_02.mid,bass,17,24,8,150.0,B- major,bass_06.mid
3,AustinPowers-TheSpyWhoShaggedMe,bass_03.mid,bass,25,32,8,150.0,B- major,bass_10.mid
4,AustinPowers-TheSpyWhoShaggedMe,bass_04.mid,bass,33,40,8,150.0,B- major,
5,AustinPowers-TheSpyWhoShaggedMe,bass_05.mid,bass,41,48,8,150.0,B- major,
6,AustinPowers-TheSpyWhoShaggedMe,bass_06.mid,bass,49,56,8,150.0,B- major,
7,AustinPowers-TheSpyWhoShaggedMe,bass_07.mid,bass,57,64,8,150.0,B- major,
8,AustinPowers-TheSpyWhoShaggedMe,bass_08.mid,bass,65,72,8,150.0,B- major,
9,AustinPowers-TheSpyWhoShaggedMe,bass_09.mid,bass,73,80,8,150.0,B- major,


In [32]:
# Point every row that was flagged as a duplicate at the file of the crop it
# repeats, so identical crops share one file name.
for i in range(len(df)):
    repeated_file = df.loc[i, 'repeated']
    # pd.isna covers None (freshly created column) as well as NaN (column
    # reloaded from CSV), which an `is not None` check lets through.
    if pd.isna(repeated_file):
        continue
    target_file = df.loc[i, 'file_name']
    target_idx = df[df['file_name'] == repeated_file].index
    df.loc[target_idx, 'file_name'] = target_file
    

In [33]:
df[df['instrument_name'] == 'bass']

Unnamed: 0,song_title,file_name,instrument_name,start_position,end_position,num_bars,tempo,key,repeated
0,AustinPowers-TheSpyWhoShaggedMe,bass_00.mid,bass,1,8,8,150.0,B- major,
1,AustinPowers-TheSpyWhoShaggedMe,bass_01.mid,bass,9,16,8,150.0,B- major,
2,AustinPowers-TheSpyWhoShaggedMe,bass_02.mid,bass,17,24,8,150.0,B- major,bass_06.mid
3,AustinPowers-TheSpyWhoShaggedMe,bass_03.mid,bass,25,32,8,150.0,B- major,bass_10.mid
4,AustinPowers-TheSpyWhoShaggedMe,bass_04.mid,bass,33,40,8,150.0,B- major,
5,AustinPowers-TheSpyWhoShaggedMe,bass_05.mid,bass,41,48,8,150.0,B- major,
6,AustinPowers-TheSpyWhoShaggedMe,bass_02.mid,bass,49,56,8,150.0,B- major,
7,AustinPowers-TheSpyWhoShaggedMe,bass_07.mid,bass,57,64,8,150.0,B- major,
8,AustinPowers-TheSpyWhoShaggedMe,bass_08.mid,bass,65,72,8,150.0,B- major,
9,AustinPowers-TheSpyWhoShaggedMe,bass_09.mid,bass,73,80,8,150.0,B- major,


In [34]:
# Rebinds midi_folder (an earlier cell pointed it at the cropped-output folder)
# to the folder that is globbed for source .mid files below.
midi_folder = './data'

In [36]:
import glob

In [37]:
glob.glob(f'{midi_folder}/*.mid')

['./data\\AllThatJazz.mid',
 './data\\AustinPowers-TheSpyWhoShaggedMe.mid',
 './data\\Avengers.mid',
 './data\\Braveheart.mid']