Testing 'final' DD version of methodology:

This version involves:
-- Pairwise sequence alignment between two scale degree sequences
-- Extraction of alignment subsequences, which are filtered by frequency and length.
-- These are then passed into a modified LCSS function; its output is counted and filtered
-- The outputs are processed via a maximal frequent pattern function
-- We are left with a small number of (ideally) musically significant subsequences

Beat strength weighting of pattern frequency counts was tested
but not retained in this version as it boosted prominence of very short patterns at the expense of all others.
This is not necessarily discarded but will need to be approached differently if it is to be of use.

In [1]:
# Step 1: Read input data:

import music21 as m21

# input 1: path of the score file to analyse. Both sequences compared in this
# notebook are taken from this one file (its first two parts, see below).
score_path = '~/facets-search-engine/data/Beethoven9thOdeToJoy.xml'
#score_path = '~/FONN_private/thesession_xml_annotated_subset/LordMcDonalds507.xml'
raw_score = m21.converter.parse(score_path)
# Show how many parts were parsed; the indexing below needs at least two.
print(len(raw_score.parts))
#score = raw_score.flat
# `score` is the first sequence, `compared_score` the one it is aligned against.
score = raw_score.parts[0]
compared_score = raw_score.parts[1]

# input 2 (disabled): alternative set-up that reads `compared_score` from a
# second file instead of from the second part of input 1.
#score2_path = '~/FONN_private/thesession_xml_annotated_subset/LordMcDonalds13433.xml'
#score2_raw = m21.converter.parse(score2_path)
#print(len(score2_raw.parts))
#compared_score = score2_raw.flat

2


In [2]:
# Step 2: Get diatonic scale degree sequence from both docs


def _collect_note_features(notes):
    """Return (diatonic note numbers, offsets, beat strengths) for ``notes``.

    Single notes contribute their own pitch and chords contribute only their
    root. Elements that are neither are skipped, so the three returned lists
    always have the same length and line up index by index.
    """
    note_numbers, offsets, strengths = [], [], []
    for thisnote in notes:
        if thisnote.isNote:
            pitch = thisnote.pitch
        elif thisnote.isChord:
            # if it is a chord, only take the root (this is rare in our corpora)
            pitch = thisnote.root()
        else:
            continue
        note_numbers.append(float(pitch.diatonicNoteNum))
        offsets.append(thisnote.offset)
        strengths.append(thisnote.beatStrength)
    # TODO: if it is a rest take the scale degree of the previous note
    return note_numbers, offsets, strengths


def _to_scale_degrees(note_numbers, root):
    """Map diatonic note numbers to scale degrees 1-7 relative to ``root``."""
    return [int((note_num - root) % 7) + 1 for note_num in note_numbers]


# get key / root information:
# use the first explicit Key found in the part, else fall back to key analysis
roots = [key.tonic for key in score.recurse().getElementsByClass(m21.key.Key)]
diatonic_root = roots[0].diatonicNoteNum if roots else score.analyze('key').tonic.diatonicNoteNum
print(diatonic_root)

# get diatonic scale degree seq:
noteonset = []  # not filled in this cell; name kept in case other cells refer to it
diatonic_note_number, noteoffset, beatstrength = _collect_note_features(score.recurse().notes)
diatonic_scale_degree = _to_scale_degrees(diatonic_note_number, diatonic_root)

# get key / root information for second score:
# (the tonic must come from roots2, the keys of the second score itself)
roots2 = [key.tonic for key in compared_score.recurse().getElementsByClass(m21.key.Key)]
diatonic_root2 = roots2[0].diatonicNoteNum if roots2 else compared_score.analyze('key').tonic.diatonicNoteNum

# Extracting diatonic scale degree in second score
score2_data = compared_score.recurse().notes
diatonic_note_num2, noteoffset2, beatstrength2 = _collect_note_features(score2_data)
# degrees of the second score are taken relative to its own root, so the two
# sequences stay comparable even when the scores are in different keys
diatonic_scale_degree2 = _to_scale_degrees(diatonic_note_num2, diatonic_root2)


# transform the list of diatonic scale degrees into string format for alignment
string1 = "".join(str(degree) for degree in diatonic_scale_degree)
string2 = "".join(str(degree) for degree in diatonic_scale_degree2)

print(string2)

30
3455432123322345543212321123123431234321253334554321232112312343123432125333455432123211551671443211765516714432176545434567123455671576714432176545334567123455671576714432176545356711167141437617655432116714114321711121765434535515721657123555536716432716714114321721117654345355157216571235555367164327167141143217211176543451432511115555111511115555115155515551555362551111555511513215551555155536255111155551151321712115671712354323451456712344567123467126712234565754534567123567123451367123456234567124571237123171234565661353161724742726677724542721361631377112467642435717535223346121646461216464444444455671176765436652522556771223423425544712536653212345644225531225241412241414141575755552555555555131313131551133551133551133551111116666666655422171224465243177633244216264567221716422113217433351232211715433217764444271155555555555335555351153553223153351511535532122117552231552343155234325555125311533345355554323355112355211252535152231253435152343125343525234321525553512533333445

In [3]:
# Smith-Waterman alignment between the two input sequences
from swalign_local import  *
# Scores handed to NucleotideScoringMatrix; presumably the reward for an equal
# pair of symbols and the penalty for an unequal pair -- confirm in swalign_local.
# NOTE: the name `match` is rebound to a regex search result by the later
# validation cells, so re-run this cell before relying on its value again.
match = 2
mismatch = -2
scoring = NucleotideScoringMatrix(match, mismatch)
sw = LocalAlignment(scoring, gap_extension_penalty = -2)
# gap_penalty = -2 might largely reduce the matching sequence(only get part of the score)
# gap_extension_penalty to be experimented.
# Align the two scale degree strings built in the previous cell.
alignment = sw.align(string1, string2)
# The next cell indexes this result with [0] and [1] to obtain the gapped
# alignment string of each input.
alignment_strings = alignment.dump()

Query:    1 34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115--5167144321176551--671443-2--17-65-4-543-4-56712345-567-157671443-2---176545334-5671-23-45----567157671-4-432-1-76----5453567111671-41--43761765--5--432--1---1671--4-1-1----432-1----71-1-1-217654---345--355---1572-165712355-5-5-3-6-71-6--432-7-1671-41-1-432-17----21--1--1-7-654345-3-5-5-1-57-2-1-65-7--12355-5---53671643271-6714--11-4-3-2172-1-11-7-654345-1432-51-1-115-5551115111155-55115--155515-5-5-1-555362551----1-11555511-51-3-215--5-5-15-551-5-5-53-6255111----155551-15-1321-71--2115-6-7---171235432345145-671-2-3--4-456712346-71-2-67122--3----4-56575453456712-356-71-2345136-71-2--3456--234567-124-5-7-12371231712345-6--56-6135316-17-24-74-272-66777245427-2-1-3-6-163--1-3-7-7-1-124676424357175-352233461216464612164644444-4--4455671--17676543665252-2---55-677---1-223423425----544-71-25-36653212345-6-----44-2-2--5-5-312-2-5-2-414122414-1--414-15757555525555-5-5------5--5-513-1-3-1-3-1-3

In [4]:
# reformat output of above cell:
# take the alignment string of each input and cut it at every '-' character.
# Consecutive '-' characters produce empty strings in the resulting lists.
fullseq1 = alignment_strings[0]
print(fullseq1)
pat1 = fullseq1.split('-')

# same treatment for the second input
fullseq2 = alignment_strings[1]
print(fullseq2)
pat2 = fullseq2.split('-')

34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115--5167144321176551--671443-2--17-65-4-543-4-56712345-567-157671443-2---176545334-5671-23-45----567157671-4-432-1-76----5453567111671-41--43761765--5--432--1---1671--4-1-1----432-1----71-1-1-217654---345--355---1572-165712355-5-5-3-6-71-6--432-7-1671-41-1-432-17----21--1--1-7-654345-3-5-5-1-57-2-1-65-7--12355-5---53671643271-6714--11-4-3-2172-1-11-7-654345-1432-51-1-115-5551115111155-55115--155515-5-5-1-555362551----1-11555511-51-3-215--5-5-15-551-5-5-53-6255111----155551-15-1321-71--2115-6-7---171235432345145-671-2-3--4-456712346-71-2-67122--3----4-56575453456712-356-71-2345136-71-2--3456--234567-124-5-7-12371231712345-6--56-6135316-17-24-74-272-66777245427-2-1-3-6-163--1-3-7-7-1-124676424357175-352233461216464612164644444-4--4455671--17676543665252-2---55-677---1-223423425----544-71-25-36653212345-6-----44-2-2--5-5-312-2-5-2-414122414-1--414-15757555525555-5-5------5--5-513-1-3-1-3-1-315-5113-355-

In [5]:
# Keep only the alignment segments that are at least 4 symbols long (i.e. 3 or
# more intervals) and pool the survivors from both inputs into a single list.

pattern1 = []
pattern2 = []
# For every kept segment, remember each position at which it starts
# (a segment that appears only once maps to a one-element list).
startposinseq1_pat = {}
startposinseq2_pat = {}

pos = 0
for pattern in pat1:
    if len(pattern) >= 4:
        pattern1.append(pattern)
        startposinseq1_pat.setdefault(pattern, []).append(pos)
    # advance by the segment length only; the '-' separators are not counted
    pos += len(pattern)

pos = 0
for pattern in pat2:
    if len(pattern) >= 4:
        pattern2.append(pattern)
        startposinseq2_pat.setdefault(pattern, []).append(pos)
    pos += len(pattern)

allthepatterns = pattern1 + pattern2
# number of segments of at least 3 intervals across both scores
len(allthepatterns)

144

In [6]:
from collections import Counter
# Count how often each segment occurs across both alignment strings. The count
# is built from pattern1 + pattern2 (the same list the previous cell stored in
# allthepatterns), not from allthepatterns itself: this cell overwrites that
# name with the de-duplicated list, so re-running it would otherwise collapse
# every count to 1.
allthepattern_count = dict(Counter(pattern1 + pattern2))
# eliminate the repeated strings; dict.fromkeys keeps first-seen order, so the
# ordering of every later result is reproducible across kernel restarts
# (list(set(...)) depends on per-process string hash randomisation).
allthepatterns = list(dict.fromkeys(pattern1 + pattern2))

In [7]:
# For every pair of distinct patterns where one contains the other, tally:
#   issubstring[kid]        - number of patterns that contain `kid`
#   count_as_substring[kid] - occurrences of `kid` inside those parents,
#                             weighted by how often each parent was seen
#   hassubstring[parent]    - number of patterns contained in `parent`
#   count_as_parent[parent] - same tally as hassubstring (one per contained pattern)
issubstring = {}
hassubstring = {}
count_as_substring = {}
count_as_parent = {}

for i in range(len(allthepatterns)):
    for j in range(i + 1, len(allthepatterns)):
        first, second = allthepatterns[i], allthepatterns[j]

        # work out which of the two (if any) is contained in the other
        if first in second:
            kid, parent = first, second
        elif second in first:
            kid, parent = second, first
        else:
            continue

        times = parent.count(kid)

        issubstring[kid] = issubstring.get(kid, 0) + 1
        count_as_substring[kid] = (
            count_as_substring.get(kid, 0) + times * allthepattern_count[parent]
        )

        hassubstring[parent] = hassubstring.get(parent, 0) + 1
        count_as_parent[parent] = count_as_parent.get(parent, 0) + 1

In [8]:
# Copy the substring / parent tallies into plain dicts (no ordering is applied)
# and report how many patterns act as a parent and how many as a substring
sortis = dict(issubstring)
sorthas = dict(hassubstring)
print(len(sorthas), len(sortis), sep="\n")

54
34


In [10]:
# Refined count: for every pattern involved in a containment relation, add its
# own occurrence count to its parent tally and/or its substring tally.
refined_combined = {}

# patterns that contain at least one other pattern
for item in sorthas:
    total = allthepattern_count[item] + count_as_parent[item]
    if item in sortis:
        # it is also contained in a longer pattern
        total += count_as_substring[item]
    refined_combined[item] = total

# patterns that are only ever contained in others
for item in sortis:
    if item in refined_combined:
        # already handled by the loop above
        continue
    refined_combined[item] = count_as_substring[item] + allthepattern_count[item]
    # otherwise is already counted

In [11]:
# Display the refined combined count of every pattern as a plain dict
dict(Counter(refined_combined))

{'2174333512322': 2,
 '151151511321165444327': 3,
 '5534523432671223125133451511235432312356712365421': 5,
 '234522275676': 2,
 '1765671234': 2,
 '311233217': 2,
 '33543223125535712115': 2,
 '552323432712771251645511235555321755551123534522111714433551171211555435123362531334511535432': 6,
 '14555543421': 2,
 '55551123666732541565432143312371134547123555547121235555471212345555434217215555555511236667325615654321': 7,
 '7123433242172221713243345': 2,
 '7123553225234345123554321231671231765432117433321231451234314345234326712171253665334512355432125': 6,
 '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115345': 6,
 '512531153': 2,
 '56717656712': 2,
 '123543231236123654': 2,
 '36653212345': 2,
 '12355432125671': 3,
 '5167144321176551': 4,
 '5452343267122312513345': 2,
 '4321236': 3,
 '543322351734512354321236': 3,
 '172765671327156712': 2,
 '12134335176': 2,
 '567117214': 2,
 '6221765434567123456711245567712': 7,
 '5453567111671': 3,
 '4455671': 2,

In [23]:
# Keep the patterns that are at least 4 notes long and whose refined combined
# count is greater than 2
finalpattern = []
for i, combined_count in refined_combined.items():
    # TODO: Add similar threshold to remove very long patterns
    # This should be calculated on a piece-by-piece basis: the number of (DW) notes per bar.
    if len(i) >= 4 and combined_count > 2:
        finalpattern.append(i)
print(finalpattern)

['151151511321165444327', '5534523432671223125133451511235432312356712365421', '552323432712771251645511235555321755551123534522111714433551171211555435123362531334511535432', '55551123666732541565432143312371134547123555547121235555471212345555434217215555555511236667325615654321', '7123553225234345123554321231671231765432117433321231451234314345234326712171253665334512355432125', '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115345', '12355432125671', '5167144321176551', '4321236', '543322351734512354321236', '6221765434567123456711245567712', '5453567111671', '56575453456712', '56712345', '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115', '1151511321511512115123762511123', '117712332151151511511512115123762', '234567', '157671443', '671443', '3351', '1321', '5113', '67122', '5671', '5112', '1236', '23321', '2115', '1153', '5321', '12355', '455554', '2172', '1671', '6714', '23123431', '654345', '3456',

In [24]:
# for thisone in finalpattern:
#     print(thisone, ":", combined[thisone], "times.")

In [25]:
# Pair every pattern that passed the filter with its refined combined count
finalpattern_counted = {kept: refined_combined[kept] for kept in finalpattern}

dict(finalpattern_counted)

{'151151511321165444327': 3,
 '5534523432671223125133451511235432312356712365421': 5,
 '552323432712771251645511235555321755551123534522111714433551171211555435123362531334511535432': 6,
 '55551123666732541565432143312371134547123555547121235555471212345555434217215555555511236667325615654321': 7,
 '7123553225234345123554321231671231765432117433321231451234314345234326712171253665334512355432125': 6,
 '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115345': 6,
 '12355432125671': 3,
 '5167144321176551': 4,
 '4321236': 3,
 '543322351734512354321236': 3,
 '6221765434567123456711245567712': 7,
 '5453567111671': 3,
 '56575453456712': 3,
 '56712345': 3,
 '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115': 5,
 '1151511321511512115123762511123': 4,
 '117712332151151511511512115123762': 3,
 '234567': 3,
 '157671443': 3,
 '671443': 4,
 '3351': 3,
 '1321': 3,
 '5113': 5,
 '67122': 3,
 '5671': 17,
 '5112': 7,
 '1236': 

In [26]:
# find repeated patterns within the subsequences output above

def reduce_pattern_length(seq, min_length=3, max_length=15):
    """Return the longest substring of ``seq`` that occurs at least twice.

    Only substrings of ``min_length`` to ``max_length`` characters (inclusive)
    are considered. Occurrences are counted with ``str.count``, i.e. without
    overlap. Length takes priority over frequency: if a length-4 substring is
    repeated 4 times and a length-7 substring is repeated twice, the length-7
    one is returned. Among candidates of the same length the most frequent one
    wins, and on a tie the one that starts earliest in ``seq``.

    Parameters
    ----------
    seq : str
        Sequence to search.
    min_length : int, optional
        Shortest substring length to try (default 3).
    max_length : int, optional
        Longest substring length to try (default 15).

    Returns
    -------
    str
        The selected substring, or ``""`` when no substring in the length
        range occurs more than once.
    """
    best_performance = ""
    best_count = 0
    seen = set()
    # a substring that occurs twice without overlap cannot be longer than half
    # of seq, so longer lengths never produce a result
    longest = min(max_length, len(seq) // 2)
    for length in range(min_length, longest + 1):
        # the very last start position is not visited: a repeated substring
        # always has an earlier occurrence, so it is still found
        for start in range(len(seq) - length):
            candidate = seq[start:start + length]
            if candidate in seen:
                # already evaluated; counting it again cannot change the result
                continue
            seen.add(candidate)
            occurrences = seq.count(candidate)
            if occurrences < 2:
                continue
            longer = len(candidate) > len(best_performance)
            same_length = len(candidate) == len(best_performance)
            if longer or (same_length and occurrences > best_count):
                best_performance = candidate
                best_count = occurrences

    return best_performance

# reducedfinalpat: every pattern after the reduction step, i.e. either the
#                  extracted repeated substring or the untouched original
# reducedpat:      only the extracted substrings; these need extra care later
#                  for the combined count
reducedfinalpat = []
reducedpat = []
for item in finalpattern:
    kept = item  # default: keep the original pattern
    if len(item) > 10:
        # the extracted pattern is long: look for the longest repeated pattern within it
        frequent_substr = reduce_pattern_length(item)

        # only swap it in when it covers a sizeable share of the original
        if len(frequent_substr) > 0.3*len(item):
            print("extracted", frequent_substr, "from", item)
            reducedpat.append(frequent_substr)
            kept = frequent_substr
    reducedfinalpat.append(kept)

print(reducedfinalpat)

['151151511321165444327', '5534523432671223125133451511235432312356712365421', '552323432712771251645511235555321755551123534522111714433551171211555435123362531334511535432', '55551123666732541565432143312371134547123555547121235555471212345555434217215555555511236667325615654321', '7123553225234345123554321231671231765432117433321231451234314345234326712171253665334512355432125', '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115345', '12355432125671', '5167144321176551', '4321236', '543322351734512354321236', '6221765434567123456711245567712', '5453567111671', '56575453456712', '56712345', '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115', '1151511321511512115123762511123', '117712332151151511511512115123762', '234567', '157671443', '671443', '3351', '1321', '5113', '67122', '5671', '5112', '1236', '23321', '2115', '1153', '5321', '12355', '455554', '2172', '1671', '6714', '23123431', '654345', '3456',

In [27]:
# compare final patterns against original sequences to make sure they do appear there as contiguous blocks--
# this is not guaranteed, as the alignment patterns can allow gaps in one or both of the inputs.
# This step essentially filters out gapped patterns


import re

# Collect the patterns that occur as one contiguous block in string1
groundtruth1 = []
for pattern in reducedfinalpat: #patterns_to_test
    match = re.search(pattern, string1)
    if match is None:
        print("The pattern does not match.")
        continue
    # report where the first occurrence starts and ends
    print("Match at index % s, % s" % match.span())
    groundtruth1.append(pattern)

Match at index 556, 577
Match at index 212, 261
Match at index 1049, 1142
Match at index 859, 963
Match at index 283, 380
Match at index 0, 92
Match at index 369, 383
The pattern does not match.
Match at index 94, 101
Match at index 105, 129
Match at index 620, 651
The pattern does not match.
The pattern does not match.
Match at index 630, 638
Match at index 0, 89
Match at index 523, 554
Match at index 480, 513
Match at index 634, 640
The pattern does not match.
The pattern does not match.
Match at index 1332, 1336
Match at index 529, 533
Match at index 527, 531
Match at index 160, 165
Match at index 250, 254
Match at index 175, 179
Match at index 97, 101
Match at index 471, 476
Match at index 85, 89
Match at index 86, 90
Match at index 1078, 1082
Match at index 270, 275
Match at index 842, 848
Match at index 850, 854
Match at index 140, 144
The pattern does not match.
Match at index 26, 34
Match at index 625, 631
Match at index 628, 632
Match at index 21, 26


In [28]:

# Same check against string2. Patterns found here are added to groundtruth1
# unless the list already holds them, so a pattern that occurs in both
# sequences is listed only once.
for pattern in reducedfinalpat: #patterns_to_test:
    match = re.search(pattern, string2)
    if match is not None:
        print("Match at index % s, % s" % (match.start(), match.end()))
        if pattern not in groundtruth1:
            groundtruth1.append(pattern)
    else:
        print("The pattern does not match.")

The pattern does not match.
The pattern does not match.
The pattern does not match.
The pattern does not match.
The pattern does not match.
The pattern does not match.
The pattern does not match.
Match at index 89, 105
The pattern does not match.
The pattern does not match.
The pattern does not match.
Match at index 175, 188
Match at index 466, 480
Match at index 121, 129
Match at index 0, 89
The pattern does not match.
The pattern does not match.
Match at index 499, 505
Match at index 132, 141
Match at index 91, 97
Match at index 804, 808
Match at index 383, 387
Match at index 719, 723
Match at index 459, 464
Match at index 121, 125
Match at index 930, 934
The pattern does not match.
The pattern does not match.
Match at index 85, 89
Match at index 852, 856
Match at index 656, 660
Match at index 238, 243
Match at index 1000, 1006
Match at index 263, 267
Match at index 90, 94
Match at index 91, 95
Match at index 26, 34
Match at index 221, 227
Match at index 119, 123
Match at index 21, 2

In [29]:
# Display the patterns confirmed as contiguous blocks in string1 and/or string2
groundtruth1

['151151511321165444327',
 '5534523432671223125133451511235432312356712365421',
 '552323432712771251645511235555321755551123534522111714433551171211555435123362531334511535432',
 '55551123666732541565432143312371134547123555547121235555471212345555434217215555555511236667325615654321',
 '7123553225234345123554321231671231765432117433321231451234314345234326712171253665334512355432125',
 '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115345',
 '12355432125671',
 '4321236',
 '543322351734512354321236',
 '6221765434567123456711245567712',
 '56712345',
 '34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115',
 '1151511321511512115123762511123',
 '117712332151151511511512115123762',
 '234567',
 '3351',
 '1321',
 '5113',
 '67122',
 '5671',
 '5112',
 '1236',
 '23321',
 '2115',
 '1153',
 '5321',
 '12355',
 '455554',
 '2172',
 '1671',
 '23123431',
 '654345',
 '3456',
 '23211',
 '5167144321176551',
 '5453567111671',
 '56

In [31]:
# Total number of (non-overlapping) occurrences of each validated pattern
# across the two scale degree strings
occu_in_strings = {}
for pattern in groundtruth1:
    occu_in_strings[pattern] = sum(sequence.count(pattern) for sequence in (string1, string2))

res = dict(Counter(occu_in_strings))

# report only the patterns that occur more than once overall
for k, v in res.items():
    if v <= 1:
        continue
    print(f"Pattern: {k} | count: {v}")

Pattern: 12355432125671 | count: 2
Pattern: 4321236 | count: 2
Pattern: 56712345 | count: 9
Pattern: 34554321233223455432123211231234312343212533345543212321123123431234321253334554321232115 | count: 2
Pattern: 234567 | count: 3
Pattern: 3351 | count: 4
Pattern: 1321 | count: 7
Pattern: 5113 | count: 8
Pattern: 67122 | count: 3
Pattern: 5671 | count: 39
Pattern: 5112 | count: 8
Pattern: 1236 | count: 9
Pattern: 23321 | count: 6
Pattern: 2115 | count: 13
Pattern: 1153 | count: 6
Pattern: 5321 | count: 4
Pattern: 12355 | count: 14
Pattern: 455554 | count: 3
Pattern: 2172 | count: 10
Pattern: 1671 | count: 12
Pattern: 23123431 | count: 6
Pattern: 654345 | count: 4
Pattern: 3456 | count: 22
Pattern: 23211 | count: 7
Pattern: 157671443 | count: 2
Pattern: 671443 | count: 4
Pattern: 6714 | count: 9
