Testing 'final' DD version of methodology:

This version involves:
-- Pairwise sequence alignment between two scale degree sequences
-- Extraction of alignment subsequences, which are filtered by frequency and length.
-- These are then passed to a modified LCSS function; its output is counted and filtered
-- The outputs are processed via a maximal frequent pattern function
-- We are left with a small number of (ideally) musically significant subsequences

Beat-strength weighting of pattern frequency counts was tested
but not retained in this version, as it boosted the prominence of very short patterns at the expense of all others.
The idea has not been discarded, but it will need to be approached differently if it is to be of use.

In [1150]:
# Step 1: Read input data:
import music21 as m21
import re
from collections import Counter
from swalign_local import *
import csv
import glob

# Path of the score to analyse. The commented-out lines are alternative test pieces;
# enable exactly one assignment at a time.
score_path = "jkupddtest/bachmono/wtc2f20.krn"
#score_path = "jkupddtest/beethovenmono/sonata01-3.krn"
#score_path = "jkupddtest/mozartmono/sonata04-2.krn"
#score_path = "jkupddtest/gibbonsmono/silverswan.krn"
#score_path = "jkupddtest/chopinmono/mazurka24-4.krn"
#score_path = '~/facets-search-engine/data/Beethoven9thOdeToJoy.xml'

# Parse the file with music21 and record how many parts it contains
# (each part is treated as one voice in the cells below).
raw_score = m21.converter.parse(score_path)
num_of_parts = len(raw_score.parts)

In [1151]:
num_of_parts

3

In [1152]:
# Gather every part of the parsed score; each part is handled as one voice.
voices = [part for part in raw_score.parts]

In [1189]:
# Step 2: Get diatonic scale degree sequence from all voices
def get_diatonic_scale_degree(voice, use_scale_degree=False):
    """Encode one voice as a string of single-digit diatonic symbols.

    Every note contributes one character in '1'..'7'. For a chord only its
    root (``chord.root()``) is encoded. Rests are skipped.
    TODO: if it is a rest take the scale degree of the previous note.

    Parameters:
        voice: a music21 stream (e.g. one part of a parsed score).
        use_scale_degree: when False (default, the encoding used by the rest
            of this notebook) each symbol is ``diatonicNoteNum % 7 + 1``, i.e.
            the diatonic step independent of the key. When True the symbol is
            the scale degree relative to the tonic of the first key found in
            the voice, or of ``voice.analyze('key')`` if no key is notated.

    Returns:
        (string, noteoffset, midi): the encoded string and two lists parallel
        to it, holding each encoded note's onset and MIDI pitch number.
    """
    diatonic_root = None
    if use_scale_degree:
        # The key is only looked up when it is actually needed: key analysis
        # is costly and the default encoding never uses the tonic.
        roots = [key.tonic for key in voice.recurse().getElementsByClass(m21.key.Key)]
        tonic = roots[0] if roots else voice.analyze('key').tonic
        diatonic_root = tonic.diatonicNoteNum

    symbols = []
    noteoffset = []
    midi = []
    for thisnote in voice.recurse().notes:
        if thisnote.isNote:
            pitch = thisnote.pitch
        elif thisnote.isChord:
            # if it is a chord, only take the root
            pitch = thisnote.root()
        else:
            continue

        diatonic = pitch.diatonicNoteNum
        if use_scale_degree:
            symbol = int((diatonic - diatonic_root) % 7) + 1
        else:
            symbol = int(diatonic % 7 + 1)
        symbols.append(str(symbol))

        # thisnote.offset is relative to the note's own container (typically its
        # measure), so summing those values does not give a position in the piece.
        # Ask music21 for the offset of the note within the whole voice instead.
        noteoffset.append(thisnote.getOffsetInHierarchy(voice))
        midi.append(pitch.midi)

    # one character per note, ready for sequence alignment
    return "".join(symbols), noteoffset, midi


In [1190]:
# Encode each voice and keep the three per-voice results in parallel lists.
strings, noteoffsets, midis = [], [], []
for voice in voices:
    string, noteoffset, midi = get_diatonic_scale_degree(voice)
    strings += [string]
    noteoffsets += [noteoffset]
    midis += [midi]
    # show the length of the encoding, then the encoding itself
    print(len(string), string, sep="\n")
    #print(midi)

211
4256314275316427122234543212176576543455432123321767117126475566567123456762375164275345654324567176543212176543221147773351361143217246532123654751264753536427531642212345671765432143217654765432175456656771467
243
762375164275316476776543543212332176711765456656771232176671217655554567654333456543262573513536273112212321551432172176567765434554321233234316445676543513256314276543572165456217123323146247773513663167176543231767432176532176543412321235612
306
425631427534432172176567765434554321233234456765433456543234543455136147564567653654715364455456473142753425735144567654375364777123217676543217765435432123321767117654566567122123455456711712344345677671123217656717654566543234432171221234554321233217671171223454321123432171234567765432563142746543454347


In [1155]:
# Smith-Waterman alignment between the two input sequences
def swm_alignment(string1, string2):
    """Locally align two encoded voices and score their similarity.

    Uses NucleotideScoringMatrix and LocalAlignment from swalign_local with a
    match score of 2 and a mismatch score of -2.

    Returns:
        (alignment_strings, simi_score). alignment_strings is whatever
        ``alignment.dump()`` returns; callers index it with [0] and [1], so it
        is presumably the pair of aligned sequences (verify against
        swalign_local). simi_score is matches / (matches + mismatches), or 0.0
        when the alignment contains no compared positions at all.
    """
    match = 2
    mismatch = -2
    scoring = NucleotideScoringMatrix(match, mismatch)
    sw = LocalAlignment(scoring)

    alignment = sw.align(string1, string2)
    alignment_strings = alignment.dump()

    # Guard against an alignment with neither matches nor mismatches, which
    # would otherwise raise ZeroDivisionError instead of reporting "dissimilar".
    compared = alignment.matches + alignment.mismatches
    simi_score = alignment.matches / compared if compared else 0.0

    return alignment_strings, simi_score


In [1156]:
def filter_and_gather_patterns(allthepatterns, fullseq):
    """Append the gap-free runs of an aligned sequence to the pattern pool.

    fullseq is split on '-'; every piece longer than 3 characters is appended
    to allthepatterns, which is modified in place and also returned. The
    number of pieces produced by the split is printed.
    """
    segments = fullseq.split('-')
    print("num of pat in this seq:", len(segments))
    # pieces of fewer than 4 symbols are dropped
    allthepatterns.extend(segment for segment in segments if len(segment) > 3)
    return allthepatterns

In [1157]:
# Align every unordered pair of voices and pool the gap-free runs
# found in both aligned sequences of each pair.
allthepatterns = []

for i in range(len(voices) - 1):
    for j in range(i + 1, len(voices)):
        alignment_strings, simi_score = swm_alignment(strings[i], strings[j])
        if simi_score < 0.25:
            # two very different voices are unlikely to yield meaningful patterns
            print("ignoring voice pair:", i, j)
            continue
        # both aligned sequences of the pair contribute patterns
        fullseq1, fullseq2 = alignment_strings[0], alignment_strings[1]
        for fullseq in (fullseq1, fullseq2):
            allthepatterns = filter_and_gather_patterns(allthepatterns, fullseq)
        

Query:  54 5427-72453275-3--332357--53277723-532757---53773473-75------5274466467322227--53---27327545--7---75421---------2442164611612745-2235753273475317425---75327473253235517245--53-5125--67-7--777345557-467--2753275--7-421227542167542164344576467 239
           | || | || | | |  || |||  ||| .|.| ||.|||   || .|..| ||      ||||||||||  |..|  ||   ||| | ||  |   ||..|         | .|.||     || | || .| .||  |  |  ||   ||||||||.||||||||  |  .| || |  .| |  ..||...|| |.|  |||||||  | ||   ||.|  ||||   |.| .||||
Ref  :   5 5-2737-45-2-5237233-35717532-1753153175711753-235532757227235274466467--23572453165273-7-45157131753713565317532-32764-----27-5322-27-727--7--1--2562275327473153235517--51643751-5153737152673323572457542753275327642---7532--7542---324-66467 206

num of pat in this seq: 55
num of pat in this seq: 39
Query:  74 7457275275426527446646742---2647413-6562-36446724--27-6425-71422245----754212164276427542542767227---6---43--466431613-3--13467-1612456467244245-----7----227--2357-757135

In [1158]:
# Count how often each extracted pattern occurs across all alignments.
allthepattern_count = dict(Counter(allthepatterns))

# Eliminate the repeated strings. The keys of the count table are already
# unique and keep first-seen order, whereas list(set(...)) orders strings
# differently from one interpreter run to the next (string hashing is
# randomised), which made all downstream output order non-reproducible.
allthepatterns = list(allthepattern_count)

In [1159]:
def count_substringandparents(allthepatterns, allthepattern_count):
    """Tabulate substring containment between every pair of patterns.

    For each unordered pair, the contained pattern is the "kid" and the
    containing pattern is the "parent" (if the first is inside the second,
    that direction wins; otherwise the reverse is tried).

    Returns four dicts:
        issubstring        kid -> number of parents it was found in
        hassubstring       parent -> number of kids found in it
        count_as_substring kid -> sum over parents of
                           parent.count(kid) * allthepattern_count[parent]
        count_as_parent    parent -> number of kids found in it
    """
    issubstring = {}
    hassubstring = {}
    count_as_substring = {}
    count_as_parent = {}

    total = len(allthepatterns)
    for i in range(total - 1):
        first = allthepatterns[i]
        for j in range(i + 1, total):
            second = allthepatterns[j]

            # work out which of the two (if any) contains the other
            if first in second:
                kid, parent = first, second
            elif second in first:
                kid, parent = second, first
            else:
                continue

            # str.count: non-overlapping occurrences of kid inside parent
            times = parent.count(kid)
            weighted = times * allthepattern_count[parent]

            issubstring[kid] = issubstring.get(kid, 0) + 1
            count_as_substring[kid] = count_as_substring.get(kid, 0) + weighted

            hassubstring[parent] = hassubstring.get(parent, 0) + 1
            count_as_parent[parent] = count_as_parent.get(parent, 0) + 1

    return issubstring, hassubstring, count_as_substring, count_as_parent


In [1160]:
# Build the containment tables for the de-duplicated pattern list, weighting
# substring hits by the original occurrence counts.
issubstring, hassubstring, count_as_substring, count_as_parent = count_substringandparents(allthepatterns, allthepattern_count)

In [1161]:
# Take plain-dict copies of the containment tables (no sorting happens here),
# then report how many patterns contain another pattern and how many are
# contained in another pattern.
sortis = dict(issubstring)
sorthas = dict(hassubstring)
print(len(sorthas), len(sortis), sep="\n")

37
25


In [1162]:
# This counts the times each pattern appeared after original segmentation plus the times they are in other patterns,
# plus the times other patterns showed up in them.

def combined_count(sorthas, sortis, count_as_parent, count_as_substring, allthepattern_count):
    """Merge the containment tallies into one score per pattern.

    A pattern's score is its own occurrence count, plus count_as_parent if it
    contains other patterns, plus count_as_substring if it is contained in
    other patterns. Patterns from sorthas come first in the result, followed
    by the patterns that appear only in sortis.
    """
    refined_combined = {}

    # patterns that contain at least one other pattern
    for item in sorthas:
        score = count_as_parent[item]
        if item in sortis:
            # ...and that are themselves contained in another pattern
            score = score + count_as_substring[item]
        refined_combined[item] = score + allthepattern_count[item]

    # patterns that are only ever contained in others
    for item in sortis:
        if item not in refined_combined:
            refined_combined[item] = count_as_substring[item] + allthepattern_count[item]

    return refined_combined

# One combined score per pattern that is involved in at least one containment relation.
combined = combined_count(sorthas, sortis, count_as_parent, count_as_substring, allthepattern_count)

In [1163]:
dict(Counter(combined))

{'35753277723532757537734': 4,
 '5427672': 3,
 '754212164276427542542767227': 5,
 '542753275': 6,
 '5153737152673323572457542753275327642': 11,
 '327545775421244216461161224575427': 6,
 '35717532': 2,
 '6424664675': 3,
 '7457275275426527446646742': 4,
 '235274466467': 4,
 '7375527446646732': 3,
 '5274466467322227': 3,
 '235532757227235274466467': 5,
 '66467': 8,
 '515153737152673323572457': 4,
 '23572453165273': 4,
 '2235753273475317425': 3,
 '42457227235775': 2,
 '332357': 4,
 '23572': 5,
 '421227542167542164344576467': 3,
 '753275723275457': 5,
 '713575327572327545775421244216461161': 8,
 '2753275': 6,
 '3276427532754232': 7,
 '356531753232': 2,
 '75327473253235517245': 3,
 '76427532': 4,
 '45157131753713565317532': 3,
 '27532': 11,
 '6275327': 3,
 '7277125622753274': 3,
 '2461221642753264746276467': 3,
 '2562275327473153235517': 4,
 '75421': 8,
 '7542124421646': 5,
 '724532753332': 2,
 '53277723': 2,
 '7532': 20,
 '532757': 5,
 '5427': 6,
 '6646': 8,
 '2357': 8,
 '7542': 13,
 '32754

In [1164]:
def filter_combined_patterns(combined, min_length=3, count_threshold=1):
    """Keep the patterns that are long enough and scored high enough.

    Parameters:
        combined: dict mapping pattern string -> combined score.
        min_length: patterns with fewer symbols than this are discarded.
            The default of 3 is the value the code has always used (the old
            inline comment said "4", which did not match the test).
        count_threshold: a pattern is kept only if its score is strictly
            greater than this value (default 1, i.e. it was seen or contained
            more than once).

    Returns:
        A new dict with the surviving patterns, in the order of `combined`.
        No top-N cut is applied.
    """
    return {
        pattern: score
        for pattern, score in combined.items()
        if len(pattern) >= min_length and score > count_threshold
    }


In [1165]:
# filter out the short ones
# NOTE: despite the variable name, filter_combined_patterns applies no top-20 cut,
# so this holds every pattern that passes the length and score filters.
top20patterns_combined = filter_combined_patterns(combined)

In [1166]:
print(dict(Counter(top20patterns_combined)))

{'35753277723532757537734': 4, '5427672': 3, '754212164276427542542767227': 5, '542753275': 6, '5153737152673323572457542753275327642': 11, '327545775421244216461161224575427': 6, '35717532': 2, '6424664675': 3, '7457275275426527446646742': 4, '235274466467': 4, '7375527446646732': 3, '5274466467322227': 3, '235532757227235274466467': 5, '66467': 8, '515153737152673323572457': 4, '23572453165273': 4, '2235753273475317425': 3, '42457227235775': 2, '332357': 4, '23572': 5, '421227542167542164344576467': 3, '753275723275457': 5, '713575327572327545775421244216461161': 8, '2753275': 6, '3276427532754232': 7, '356531753232': 2, '75327473253235517245': 3, '76427532': 4, '45157131753713565317532': 3, '27532': 11, '6275327': 3, '7277125622753274': 3, '2461221642753264746276467': 3, '2562275327473153235517': 4, '75421': 8, '7542124421646': 5, '724532753332': 2, '53277723': 2, '7532': 20, '532757': 5, '5427': 6, '6646': 8, '2357': 8, '7542': 13, '3275457': 4, '32764': 3, '5323551': 3, '451571317

In [1167]:
def reduce_pattern_length(seq):
    """Return the longest repeated substring of seq within a length window.

    Only substrings whose length lies between int(0.3*len(seq)) and
    int(len(seq)*0.5) (inclusive) are considered, and a substring qualifies
    only if seq.count() finds it more than once (non-overlapping count).
    Longer always beats shorter: a length-7 substring repeated twice wins over
    a length-4 substring repeated four times. Among equally long candidates
    the most frequent wins, and the earliest one wins a remaining tie.
    Returns "" when nothing qualifies.
    """
    shortest = int(0.3*len(seq))
    longest = int(len(seq)*0.5)

    # walk from the longest allowed length downwards; the first length that
    # has any repeated substring decides the answer
    for length in range(longest, shortest - 1, -1):
        winner = ""
        winner_count = 0
        for start in range(len(seq) - length):
            piece = seq[start:start + length]
            occurrences = seq.count(piece)
            if occurrences > 1 and occurrences > winner_count:
                winner = piece
                winner_count = occurrences
        if winner_count:
            return winner

    return ""

def check_reduction(item):
    """Try to shorten a pattern to its longest repeated substring.

    Returns the substring found by reduce_pattern_length when it is longer
    than 4 symbols and longer than 30% of item (and prints what was
    extracted); returns None otherwise.
    """
    frequent_substr = reduce_pattern_length(item)
    # reject results that are too short in absolute or relative terms
    if len(frequent_substr) <= 4 or len(frequent_substr) <= 0.3*len(item):
        return None
    print("extracted", frequent_substr, "from", item)
    return frequent_substr

In [1168]:
def reduce_length_of_each_pattern(onlyassubstring, top20patterns):
    """Shorten over-long patterns to their longest repeated substring.

    A pattern is a candidate for reduction when it has more than 11 symbols
    and is not in onlyassubstring. If the first reduction succeeds but the
    result still has more than 23 symbols, a second reduction is attempted
    and used when it succeeds. Patterns that are not candidates, or whose
    reduction fails, are kept unchanged.

    Returns:
        (reducedfinalpat, beforereduction): the resulting list of patterns, in
        input order, and a dict mapping each reduced pattern to the original
        pattern it was extracted from.
    """
    beforereduction = {}
    reducedfinalpat = []

    for item in top20patterns:
        shortened = None
        if len(item) > 11 and item not in onlyassubstring:
            shortened = check_reduction(item)
            if shortened is not None and len(shortened) > 23:
                # still long: try once more on the reduced pattern
                second_pass = check_reduction(shortened)
                if second_pass is not None:
                    shortened = second_pass

        if shortened is None:
            # not a candidate, or the reduction failed: keep the original
            reducedfinalpat.append(item)
        else:
            reducedfinalpat.append(shortened)
            beforereduction[shortened] = item

    return reducedfinalpat, beforereduction

In [1169]:
# Patterns that are contained in other patterns but contain none themselves.
onlyassubstring = set(issubstring) - set(hassubstring)
# The keys of a dict are already unique, so they are passed on in their existing
# order. Routing them through set() only scrambled the order, and did so
# differently on every interpreter run because string hashing is randomised.
reducedfinalpat, beforereduction = reduce_length_of_each_pattern(onlyassubstring, list(top20patterns_combined))

In [1170]:
def find_pattern_pos(string, pattern):
    """Locate every non-overlapping regex match of pattern in string.

    pattern is handed to re.finditer as-is, so it is interpreted as a regular
    expression. Returns (starts, ends, count): the start indices, the
    exclusive end indices, and the number of matches.
    """
    matches = list(re.finditer(pattern, string))
    patpos_start = [found.start() for found in matches]
    patpos_end = [found.end() for found in matches]
    return patpos_start, patpos_end, len(matches)

In [1171]:
def count_final_pattern(reducedfinalpat, strings):
    """Count each candidate pattern in all voices and keep the repeated ones.

    Reads the module-level lists `noteoffsets` and `midis` (parallel to
    `strings`) to look up the onset and MIDI pitch at which every occurrence
    starts.

    Parameters:
        reducedfinalpat: candidate pattern strings. Duplicates are processed
            once only; counting a repeated entry again would double its total
            and let a pattern that occurs a single time pass the filter.
        strings: the encoded voices to search.

    Returns:
        (refined_finalpat, count_pattern, startonsets, startpitches,
        endonsets, endpitches). refined_finalpat holds the patterns found at
        least twice in total, count_pattern maps every candidate to its total
        count, and startonsets/startpitches list the start of every
        occurrence found. endonsets and endpitches are always empty: the end
        lookup is not implemented.
    """
    count_pattern = {}
    refined_finalpat = []
    startonsets = []
    startpitches = []
    endonsets = []
    endpitches = []

    # dict.fromkeys removes duplicates while keeping the original order
    for pattern in dict.fromkeys(reducedfinalpat):
        total = 0
        for stringnum, string in enumerate(strings):
            # find the pattern in each voice
            patpos_start, patpos_end, count = find_pattern_pos(string, pattern)
            total += count
            if count != 0:
                print("Pattern", pattern, "appears in voice no.", stringnum+1, ", first pos starts at ", patpos_start[0], "end at", patpos_end[0])
            for startpos in patpos_start:
                startonsets.append(noteoffsets[stringnum][startpos])
                startpitches.append(midis[stringnum][startpos])

        # recorded even when `strings` is empty, so the lookup below cannot fail
        count_pattern[pattern] = total
        if total >= 2:
            # get rid of the ones that actually appeared less than 2 times in total
            refined_finalpat.append(pattern)

    return refined_finalpat, count_pattern, startonsets, startpitches, endonsets, endpitches


In [1172]:
# Search the original voice encodings for every candidate pattern; keep those found at least twice.
refined_finalpat, count_pattern, startonsets, startpitches, endonsets, endpitches = count_final_pattern(reducedfinalpat, strings)


Pattern 53277723 appears in voice no. 2 , first pos starts at  72 end at 80
Pattern 35753277723532757537734 appears in voice no. 2 , first pos starts at  69 end at 92
Pattern 5427 appears in voice no. 1 , first pos starts at  175 end at 179
Pattern 5427 appears in voice no. 2 , first pos starts at  53 end at 57
Pattern 5427 appears in voice no. 3 , first pos starts at  17 end at 21
Pattern 6424664675 appears in voice no. 3 , first pos starts at  250 end at 260
Pattern 35717532 appears in voice no. 1 , first pos starts at  19 end at 27
Pattern 66467 appears in voice no. 1 , first pos starts at  62 end at 67
Pattern 66467 appears in voice no. 2 , first pos starts at  101 end at 106
Pattern 66467 appears in voice no. 3 , first pos starts at  91 end at 96
Pattern 5427672 appears in voice no. 3 , first pos starts at  17 end at 24
Pattern 2357 appears in voice no. 1 , first pos starts at  67 end at 71
Pattern 2357 appears in voice no. 2 , first pos starts at  68 end at 72
Pattern 2357 appear

In [1173]:
# Test old
# Disabled experiment: the code is wrapped in a bare string literal, so it is not executed.
# It read only the first row of each CSV in mozartmono/A, storing column 0 in
# groundtruths and column 1 in groundtruthm.
"""
groundtruths = []
groundtruthm = []
filenames = glob.glob("jkupddtest/mozartmono/A/*.csv")
for filename in filenames:
    with open(filename, 'r') as file:
        csvreader = csv.reader(file)
        for row in csvreader:
            groundtruths.append(float(row[0]))
            groundtruthm.append(float(row[1]))
            break
            #only want to know the first row for now
"""

'\ngroundtruths = []\ngroundtruthm = []\nfilenames = glob.glob("jkupddtest/mozartmono/A/*.csv")\nfor filename in filenames:\n    with open(filename, \'r\') as file:\n        csvreader = csv.reader(file)\n        for row in csvreader:\n            groundtruths.append(float(row[0]))\n            groundtruthm.append(float(row[1]))\n            break\n            #only want to know the first row for now\n'

In [1174]:
#groundtruths

In [1175]:
#groundtruthm

In [1176]:
# Disabled experiment (bare string literal, not executed): compared the start onset and
# start pitch of every detected occurrence with the first-row ground-truth values.
# NOTE: count_final_pattern leaves endonsets and endpitches empty, so the
# endonsets[countnum] / endpitches[countnum] lookups would fail if this were re-enabled as is.
"""
countnum = 0
for startonset in startonsets:
    startpitch = startpitches[countnum]
    endonset = endonsets[countnum] 
    endpitch = endpitches[countnum]
    countnum2 = 0
    for gts in groundtruths:
        gtm = groundtruthm[countnum2]
        if startonset == gts and startpitch == gtm:
            print("there is a match:", startonset, startpitch)
            #print("check the end of this pattern:")
            #print("endonset:", endonset)
            #print("endpitch:", endpitch)
        countnum2+=1
    countnum+=1
"""

'\ncountnum = 0\nfor startonset in startonsets:\n    startpitch = startpitches[countnum]\n    endonset = endonsets[countnum] \n    endpitch = endpitches[countnum]\n    countnum2 = 0\n    for gts in groundtruths:\n        gtm = groundtruthm[countnum2]\n        if startonset == gts and startpitch == gtm:\n            print("there is a match:", startonset, startpitch)\n            #print("check the end of this pattern:")\n            #print("endonset:", endonset)\n            #print("endpitch:", endpitch)\n        countnum2+=1\n    countnum+=1\n'

In [1186]:
# Get patterns
# Read every ground-truth occurrence CSV in bachmono/C and turn the second column of
# each file into a digit string, one digit per row, collected in groundtruthpatterns.

groundtruthpatterns = []
print(glob.glob("jkupddtest/bachmono/*"))
filenames = glob.glob("jkupddtest/bachmono/C/*.csv")
for filename in filenames:
    print(filename)
    with open(filename, 'r') as file:
        csvreader = csv.reader(file)
        # gts: column 0 of every row, gtm: column 1 of every row (both as floats)
        gts = []
        gtm = []
        pattern_gt = []
        for row in csvreader:
            gts.append(float(row[0]))
            gtm.append(float(row[1]))
        # NOTE(review): the disabled comparison cell above matches column 1 against MIDI
        # pitches, so gtm presumably holds MIDI numbers. The voices are encoded from
        # diatonicNoteNum % 7 + 1 in get_diatonic_scale_degree; applying % 7 + 1 to MIDI
        # numbers is a different encoding, so these strings may not be comparable with
        # the voice strings - confirm the CSV schema.
        for i in gtm:
            pattern_gt.append(int(i%7+1))
        pattern_string = ""
        interval_string = ""
        for i in pattern_gt:
            pattern_string += str(i)
        # disabled: interval-based encoding of the same pattern (string literal, not executed)
        """
        for i in range(1, len(pattern_gt)):
            interval = pattern_gt[i] - pattern_gt[i-1]
            interval_string += str(interval)
        print(interval_string)
        """
        print(pattern_string)
        groundtruthpatterns.append(pattern_string)


['jkupddtest/bachmono/A', 'jkupddtest/bachmono/B', 'jkupddtest/bachmono/C', 'jkupddtest/bachmono/wtc2f20.krn']
jkupddtest/bachmono/C/occ5.csv
125677
jkupddtest/bachmono/C/occ1.csv
5623644
jkupddtest/bachmono/C/occ2.csv
5623644
jkupddtest/bachmono/C/occ6.csv
773455
jkupddtest/bachmono/C/occ4.csv
12562
jkupddtest/bachmono/C/occ3.csv
773473


In [1187]:
set(groundtruthpatterns)

{'12562', '125677', '5623644', '773455', '773473'}

In [1188]:
# Report the ground-truth patterns that occur at most once in the encoded voices
# (occurrences are summed over all voices, counted without overlaps).
for pattern in groundtruthpatterns:
    count = sum(string.count(pattern) for string in strings)
    if count == 0:
        print("pattern", pattern, "not in original score")
    elif count == 1:
        print("pattern", pattern, "only appeared once")

pattern 125677 only appeared once
pattern 773455 only appeared once
pattern 12562 only appeared once
pattern 773473 only appeared once


In [1184]:
# A ground-truth pattern counts as found only when it equals a discovered pattern exactly.
for i in groundtruthpatterns:
    if i not in refined_finalpat:
        continue
    print("found a match:", i)

In [1185]:
refined_finalpat

['5427',
 '66467',
 '5427672',
 '2357',
 '2753275',
 '32764',
 '7532',
 '5323551',
 '7542',
 '3275457',
 '75421',
 '76427532',
 '532757',
 '332357',
 '23572',
 '7542124421646',
 '6646',
 '27532']