In [1]:
# concatenate input ABC files.
# Convert O'Neill's ABC corpus to MIDI:
import pandas as pd

from abc_ingest import create_midi_corpus_from_abc
from utils import concatenate_abc_files

# NOTE(review): the leading '//' looks like a typo for '/' (later cells use a single
# leading slash) — left unchanged here; confirm before normalising.
in_path = '//Users/dannydiamond/NUIG/Polifonia/oneills1850/abc/'
oneills_concatenated = concatenate_abc_files(in_path, outfile_name='oneills1850')
target = 'oneills1850.abc'

# Open the concatenated ABC file and prepend the "%%MIDI beataccents" command to
# standardise accents/bar structures across time signatures.
midi_header = "%%MIDI beataccents\n\n"
# in_path already ends with '/', so strip it before joining to avoid a doubled separator.
target_path = f"{in_path.rstrip('/')}/{target}"

with open(target_path, 'r+') as abc_file:
    content = abc_file.readlines()
    # Insert the directive only when it is not already the first line, so that
    # re-running this cell on an already-processed file does not stack duplicate headers.
    if not content or content[0].strip() != midi_header.strip():
        content.insert(0, midi_header)
        # Rewind and rewrite in place; the new content is longer than the old,
        # so no truncate() is needed.
        abc_file.seek(0)
        abc_file.writelines(content)


In [2]:
# Convert the concatenated ABC file (in_path / target, set in the previous cell) to MIDI.
# Per the cell output this runs the abc2MIDI command-line tool; results presumably land in out_path.
out_path= "/Users/dannydiamond/NUIG/Polifonia/oneills1850/midi"
test = create_midi_corpus_from_abc(in_path, target, out_path)

# TODO(review): Why does this output 1822 MIDI files from 2018 ABC tunes?

Running abc2MIDI command-line tool...
exit status: 0


In [3]:
# Read O'Neills' MIDI corpus to m21

from corpus_processing_tools import Corpus, Tune

in_path = '/Users/dannydiamond/NUIG/Polifonia/oneills1850/midi'
in_file = 'CherishtheLadies.mid'

# End-to-end ingest check on a single MIDI file:
# load the tune, show its title, run root extraction and accent filtering,
# then build the note-level feature sequence and preview its first rows.
test_tune = Tune(f"{in_path}/{in_file}")
print(test_tune.title)

test_tune.extract_root()
test_tune.filter_score_accents()
test_tune.convert_music21_streams_to_feature_sequences(level='note')

print("Note-level data:")
print(test_tune.feat_seq.head())



Setting up lookup table for root assignment:
  note names  midi num  root num
0          C        60         0
1   C# or D-        61         1
2          D        62         2
3   D# or E-        63         3
4          E        64         4 



Setting up Music21 root detection lookup table:
  note name  pitch class
0         C          0.0
1        C#          1.0
2        D-          1.0
3         D          2.0
4        D#          3.0 


CherishtheLadies
Note-level data:
   midi_note_num  diatonic_note_num  abs_chromatic_pitch_class  onset  \
0             69                 34                          9   0.00   
1             71                 35                         11   0.50   
2             73                 36                          1   0.67   
3             74                 37                          2   1.00   
4             66                 32                          6   2.00   

   duration  velocity  
0       0.5       105  
1       0.5        80  
2     

In [4]:
from corpus_processing_tools import Corpus, Tune
# Setup MIDI corpus and read all files to music21 streams.
# NOTE: in_path is re-pointed at the 'test_midi' directory here, so the cells that follow
# operate on that directory's files (9, per the progress bar) rather than the full 'midi' folder.
in_path = '/Users/dannydiamond/NUIG/Polifonia/oneills1850/test_midi'
corpus = Corpus(in_path)

Reading MIDI files to Music21 streams: 100%|██████████| 9/9 [00:00<00:00, 80.18it/s]


In [5]:
# Convert scores to feature sequences at note-level:
# Note: due to ABC formatting in this corpus, filtration at accent level is not possible via MIDI velocity.
# Instead, it can be achieved later via the method below with the 'level' arg set to 'accent':
corpus.convert_scores_to_feat_seqs(level='note')

Calculating note-level feature sequences from music21 scores: 100%|██████████| 9/9 [00:00<00:00, 145.28it/s]


In [6]:
# Print sample feature sequence:
# n indexes into corpus.tunes; prints that tune's title and up to the first 50 rows
# of its note-level feature sequence.
n=5
print(corpus.tunes[n].title)
print(corpus.tunes[n].feat_seq.head(50))

ABlastofWind
    midi_note_num  diatonic_note_num  abs_chromatic_pitch_class  onset  \
0              74                 37                          2      0   
1              74                 37                          2      3   
2              69                 34                          9      4   
3              67                 33                          7      5   
4              66                 32                          6      6   
5              64                 31                          4      7   
6              62                 30                          2      8   
7              74                 37                          2      9   
8              74                 37                          2     11   
9              74                 37                          2     12   
10             69                 34                          9     13   
11             66                 32                          6     14   
12             67        

In [7]:
# Read chromatic and diatonic root notes for the tunes in the corpus:
corpus.extract_roots()

Extracting chromatic and diatonic roots: 100%|██████████| 9/9 [00:00<00:00, 2065.71it/s]


In [8]:
# Add chromatic interval, relative pitch and relative pitch class data to each feature sequence.
# Per the column listing printed by the following cell, these calls add the columns
# 'chromatic_interval', 'relative_chromatic_pitch' and 'relative_chromatic_pitch_class'.
corpus.calc_chromatic_intervals()
corpus.calc_relative_chromatic_pitch_seqs()
corpus.calc_relative_chromatic_pitch_class_seqs()

Calculating chromatic interval sequences: 100%|██████████| 9/9 [00:00<00:00, 488.31it/s]
Calculating relative pitch sequences: 100%|██████████| 9/9 [00:00<00:00, 1221.37it/s]
Calculating relative pitch class sequences: 100%|██████████| 9/9 [00:00<00:00, 875.90it/s]


In [9]:
# Sample output: title, column names and column count of one tune's feature sequence
# (used to confirm which columns the preceding step added).
n=5
print(corpus.tunes[n].title)
# print(corpus.tunes[n].feat_seq.head(50))
print(corpus.tunes[n].feat_seq.columns)
print(len(corpus.tunes[n].feat_seq.columns))

ABlastofWind
Index(['midi_note_num', 'diatonic_note_num', 'abs_chromatic_pitch_class',
       'onset', 'duration', 'velocity', 'chromatic_interval',
       'relative_chromatic_pitch', 'relative_chromatic_pitch_class'],
      dtype='object')
9


In [10]:
# Calculate diatonic equivalents of the chromatic features above.
# Per the column listing printed by the following cell, these calls add 'diatonic_interval',
# 'abs_diatonic_pitch_class', 'relative_diatonic_pitch' and 'relative_diatonic_pitch_class'.
corpus.calc_diatonic_intervals()
corpus.calc_abs_diatonic_pitch_class_seqs()
corpus.calc_rel_diatonic_pitch_seqs()
corpus.calc_rel_diatonic_pitch_class_seqs()


Calculating diatonic interval sequences: 100%|██████████| 9/9 [00:00<00:00, 803.49it/s]
Calculating (absolute) diatonic pitch class sequences: 100%|██████████| 9/9 [00:00<00:00, 1660.82it/s]
Calculating relative diatonic pitch sequences: 100%|██████████| 9/9 [00:00<00:00, 1084.45it/s]
Calculating (relative) diatonic pitch class sequences: 100%|██████████| 9/9 [00:00<00:00, 956.85it/s]


In [11]:
# Sample output: title, column names and column count of one tune's feature sequence
# after the diatonic features have been added.
n=5
print(corpus.tunes[n].title)
# print(corpus.tunes[n].feat_seq.head(50))
print(corpus.tunes[n].feat_seq.columns)
print(len(corpus.tunes[n].feat_seq.columns))

ABlastofWind
Index(['midi_note_num', 'diatonic_note_num', 'abs_chromatic_pitch_class',
       'onset', 'duration', 'velocity', 'chromatic_interval',
       'relative_chromatic_pitch', 'relative_chromatic_pitch_class',
       'diatonic_interval', 'abs_diatonic_pitch_class',
       'relative_diatonic_pitch', 'relative_diatonic_pitch_class'],
      dtype='object')
13


In [12]:
# Add bar count to the feature sequences, then strip anacruses (pick-up measures).
# NOTE(review): bar numbers are added before anacruses are removed — presumably the
# stripping step relies on them; confirm before reordering.
corpus.add_bar_numbers()
corpus.strip_anacruses()

Adding bar numbers to feature sequence data: 100%|██████████| 9/9 [00:00<00:00, 1020.29it/s]
Removing anacruses (pick-up measures): 100%|██████████| 9/9 [00:00<00:00, 310.34it/s]


In [13]:
# Sample output after bar numbering / anacrusis removal: title, up to the first 50 rows,
# column names and column count for one tune (index 7 here, unlike the n=5 used elsewhere).
n=7
print(corpus.tunes[n].title)
print(corpus.tunes[n].feat_seq.head(50))
print(corpus.tunes[n].feat_seq.columns)
print(len(corpus.tunes[n].feat_seq.columns))

ABunchofHaws
    midi_note_num  diatonic_note_num  abs_chromatic_pitch_class  onset  \
0              74                 37                          2    0.0   
1              71                 35                         11    1.0   
2              67                 33                          7    2.0   
3              67                 33                          7    3.0   
4              71                 35                         11    4.0   
5              67                 33                          7    5.0   
6              69                 34                          9    6.0   
7              67                 33                          7    7.0   
8              64                 31                          4    8.0   
9              74                 37                          2    9.0   
10             71                 35                         11   10.0   
11             67                 33                          7   11.0   
12             67        

In [14]:
# Duration-weight the specific sequences we may need for Seán's alg.
# Per the sample output in the next cell, each selected feature appears in the result with a
# '_duration_weighted' suffix, indexed by 'eighth_note'.
# TODO: Don't re-title the cols
# TODO: [above] rename: 'scale degrees'
# Names of the feature-sequence columns to duration-weight:
input_seqs = [
    'bar_count',
    'midi_note_num',
    'diatonic_note_num',
    'chromatic_interval',
    'relative_chromatic_pitch',
    'relative_chromatic_pitch_class',
    'diatonic_interval',
    'abs_diatonic_pitch_class',
    'relative_diatonic_pitch',
    'relative_diatonic_pitch_class'
]

corpus.calc_duration_weighted_feat_seqs(level='note',features=input_seqs)

# TODO: Group by bar

Calculating duration-weighted feature sequences: 100%|██████████| 9/9 [00:00<00:00, 80.88it/s]


In [15]:
# Sample output: title and up to the first 50 rows of one tune's duration-weighted table.
n=5
print(corpus.tunes[n].title)
# print(corpus.tunes[n].feat_seq.head(50))
print(corpus.tunes[n].duration_weighted.head(50))

ABlastofWind
             bar_count_duration_weighted  midi_note_num_duration_weighted  \
eighth_note                                                                 
0                                      1                               74   
1                                      1                               74   
2                                      1                               74   
3                                      1                               74   
4                                      1                               69   
5                                      1                               67   
6                                      1                               66   
7                                      1                               64   
8                                      1                               62   
9                                      2                               74   
10                                     2                       

In [16]:
import pandas as pd

# Pick one tune from the corpus (this rebinds test_tune, previously the single-file Tune
# loaded earlier in the notebook).
test_tune = corpus.tunes[5]

# Arrange the chosen duration-weighted sequence by bar; the displayed result has one
# column per bar ('bar 1', 'bar 2', ...).
test_tune.group_by_bars(target_seq='relative_diatonic_pitch_class_duration_weighted')



   bar 1  bar 2  bar 3  bar 4  bar 5  bar 6  bar 7  bar 8  bar 9  bar 10  \
0      7      7      4      6      7      7      4      6      7       7   
1      7      7      5      5      7      7      5      5     10      10   
2      7      7      6      6      7      7      6      6     10      10   
3      7      7      7      1      7      7      7      1      9       9   
4      4      4      4      2      4      4      4      2      7       7   

   bar 11  bar 12  bar 13  bar 14  bar 15  bar 16  
0       7       6       7       7      11     6.0  
1      10       5      10      10       9     5.0  
2      10       6      10      10       7     6.0  
3       9       1       9       9      10     1.0  
4       7       2       7       7       8     2.0  


In [17]:
# Compare the bars of test_tune against each other. Judging by the (truncated) output,
# the result is a dict keyed by bar name ('bar 1', ...) whose values are boolean tables
# with one column per bar.
test_tune.calculate_boolean_inter_bar_similarity()


{'bar 1':    bar 1  bar 2  bar 3  bar 4  bar 5  bar 6  bar 7  bar 8  bar 9  bar 10  \
 0   True   True  False  False   True   True  False  False   True    True   
 1   True   True  False  False   True   True  False  False  False   False   
 2   True   True  False  False   True   True  False  False  False   False   
 3   True   True   True  False   True   True   True  False  False   False   
 4   True   True   True  False   True   True   True  False  False   False   
 5   True  False   True  False   True  False   True  False  False   False   
 6   True  False   True  False   True  False   True  False  False   False   
 7   True   True  False   True   True   True  False   True  False   False   
 8   True  False  False  False   True  False  False  False  False   False   
 
    bar 11  bar 12  bar 13  bar 14  bar 15  bar 16  
 0    True   False    True    True   False   False  
 1   False   False   False   False   False   False  
 2   False   False   False   False    True   False  
 3   Fa

In [18]:
# Summarise the boolean bar comparison. Per the printed output, each per-bar table gains a
# 'similarity' row — apparently the fraction of matching positions in each column (TODO confirm).
test_tune.sum_boolean_inter_bar_similarity()

Similarity results for bar 1:
            bar 1  bar 2  bar 3  bar 4  bar 5  bar 6  bar 7  bar 8  bar 9  \
0             1.0  1.000  0.000  0.000    1.0  1.000  0.000  0.000  1.000   
1             1.0  1.000  0.000  0.000    1.0  1.000  0.000  0.000  0.000   
2             1.0  1.000  0.000  0.000    1.0  1.000  0.000  0.000  0.000   
3             1.0  1.000  1.000  0.000    1.0  1.000  1.000  0.000  0.000   
4             1.0  1.000  1.000  0.000    1.0  1.000  1.000  0.000  0.000   
5             1.0  0.000  1.000  0.000    1.0  0.000  1.000  0.000  0.000   
6             1.0  0.000  1.000  0.000    1.0  0.000  1.000  0.000  0.000   
7             1.0  1.000  0.000  1.000    1.0  1.000  0.000  1.000  0.000   
8             1.0  0.000  0.000  0.000    1.0  0.000  0.000  0.000  0.000   
similarity    1.0  0.667  0.444  0.111    1.0  0.667  0.444  0.111  0.111   

            bar 10  bar 11  bar 12  bar 13  bar 14  bar 15  bar 16  
0            1.000   1.000   0.000   1.000   1.000   

In [19]:



#
# results = dict(zip(bool_df.columns, ['']*len(bool_df.columns)))
#
# for i in bool_df.columns:
#     if .8 < bool_df[i].sum() / len(bool_df[i]) <= 1:
#         results[i] = 'a'
#     if .5 < (bool_df[i].sum() / len(bool_df[i])) < 1:
#         results[i] = 'a*'
#
# indices = []
# for key, val in results.items():
#     if val.endswith('*'):
#         indices.append(key)
#
# # TODO: Test elif block
# check_cols_equality = [bool_df[i] for i in indices]
# for i, col in enumerate(check_cols_equality):
#     if col[i] == col[i+1]:
#         for key, val in results.items():
#             if val.endswith('*'):
#                 new_val = val.replace('*', '1')
#                 results[key] = new_val
#     elif col[i] != col[i+1]:
#         for key, val in results.items():
#             if val.endswith('*'):
#                 new_val = val.replace('*', f'{i+1}')
#                 results[key] = new_val
#
#
#
# print(results)
#
# print(indices)







In [20]:
# compare all bars vs each other:

import numpy as np
# from scipy.spatial.distance import pdist, squareform
#
# distances = pdist(test_output.values.T, metric='hamming')
# dist_matrix = squareform(distances)
# bar_similarity = pd.DataFrame(1-dist_matrix, columns=test_output.columns, index=test_output.columns)
#
# bar_similarity = bar_similarity.round(decimals=4)
#
# # print(bar_similarity)
#
#
# output=pd.Series(data=None, dtype=str)

# row1 = bar_similarity.iloc[0:1,]
# col1 = bar_similarity.iloc[ :, 1]
# output_df.iloc[0:1,] = row1.apply(lambda x: 'a' if x > .8 else ('a1' if .5 < x < .8 else None))
# print(output_df)
# # print(row1)

# abstracted = pd.DataFrame(data=None, columns=bar_similarity.columns, index=bar_similarity.index)
# # assign relationships for all cols in df:
# letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
# for i, letter in enumerate(letters):
#     col = bar_similarity.iloc[ :, i]
#     # per Sean's rules:
#     # col = col.apply(lambda x: letter if x > .8 else None)
#     col = col.apply(lambda x: letter if x > .8 else (f"{letter}1" if .5 < x < .8 else None))
#     abstracted.iloc[ :, i] = col
#
# print(bar_similarity)
# print(abstracted)
# for i, letter in enumerate(letters):
#     res.loc[res['bar 1'].str.len() < 2 | res['bar 1'] == None , 'bar 1'] = res[f'bar {i+1}']
# print(res)


# remove all null vals:
# col_slice = col.dropna()
# col2_slice = col2.dropna()
# # print(col_slice)
# # print(col2_slice)
# # output = col2.apply(lambda x: col if col != 0 else x)
# output = np.where(col.str.startswith('a'), col, col2)
# print(output)
# TODO: Run above only for col 1
# TODO: copy non-null A vals to B OR concat/join
# Run for col B
# Etc for full df

# TODO Use this matrix to populate output per Seán's alg:


# Sean's rules:

# for each col in df:
# for i in range(len(bar_similarity)):
#     col = bar_similarity.iloc[ :, i]
#     # insert 'a' and 'a1' per Sean's rules:
#     col = col.apply(lambda x: 'a' if x > .8 else ('a1' if .5 < x < .8 else x))
#
#     # save a dict of all indexes and vals inserted:
#     vals_out = {}
#     print('\n\n')
#     for idx, val in enumerate(col):
#         if type(val) is str:
#             print(f"bar {idx+1}: {val}")
#
#             vals_out[idx] = val
#
#
#     try:
#         next_col = bar_similarity.iloc[ :, i+1]
#     except IndexError:
#         pass
#
#     next_col.update(vals_out)
#
#     print(next_col)

    # bar_similarity.iloc[ :, i+1] = bar_similarity.apply(lambda x: x.iloc[ :, i] if x.iloc[ :, i].isinstance() else x['col1'], axis=1)






# df.loc[[0,3],'Z'] = 3

# # Possibly useful:
# df['ticker'] = 'na'
# df.loc[df.index <= n/4, 'ticker'] = '$'
# df.loc[(n/4 < df.index) & (df.index <= n/2), 'ticker'] = '$$'
# df.loc[(n/2 < df.index) & (df.index <= n*3/4), 'ticker'] = '$$$'
# df.loc[df.index > n*3/4, 'ticker'] = '$$$$'

# bar_similarity.loc[(bar_similarity['bar 1'] > .8, 'bar 1')] = 'a'

# for col in bar_similarity.columns:
#     bar_similarity[col] = np.where((bar_similarity[col] > .5) | (bar_similarity[col] < .7999), col, f"{bar_similarity[col]}1")

# TODO Use this matrix to populate output per Seán's alg:


# we need to fully evaluate matches in col A (via lambda above?), copy the results to col B, rerun lambda etc...
# When col data is all string type, stop iteration
# Each iteration over a column must be informed by the results of the previous run, otherwise we get additional assignments.
# Once this is done we need to translate to Seán's syntax, of course!
# Should we also re-name bars/cols, at least 1-8?




In [21]:
# results = dict(zip(bar_similarity.index, [None]* (len(bar_similarity))))
# print(results)

In [22]:
# for l in letters:
#     for row in bar_similarity.iterrows():
#         print(row[0])
#         row_res = []
#         for val in row[1]:
#             if val == 1:
#                 row_res.append(l)
#             elif 0.5 < val < 0.8:
#                 row_res.append(l)
#
#
#         print(row_res)

