In [2]:
import music21 as m21
import re
from collections import Counter
from swalign_local import *

In [3]:
# Path of the score to analyse (hard-coded; edit it to point at another file).
score = '~/facets-search-engine/data/Beethoven9thOdeToJoy.xml'
#score = '~/jupyternotebooks/bwv1046a.mei'
# Parse the file with music21; the last expression displays the number of parts.
full_score = m21.converter.parse(score)
len(full_score.parts)

2

In [4]:
# Take the first part (voice) of the score as the first sequence of the alignment.
m21_score = full_score.parts[0]

In [5]:
# Take the second part (voice) of the score as the second sequence of the alignment.
compared_score = full_score.parts[1]

In [6]:
# Only parts[0] and parts[1] are used in this notebook; warn when the score
# contains further parts that are therefore left out of the comparison.
if len(full_score.parts) > 2:
    print("More than 2 voices! We should compare all the voices.")

In [7]:
def get_intervals(m21_score):
    """Encode the melodic motion of a music21 stream as diatonic-interval letters.

    Iterates over ``m21_score.recurse().notes`` and, for every pair of
    consecutive pitched elements, emits one letter describing the diatonic
    interval between them. A chord is represented by its root pitch.

    Elements that are rests, or that are neither a note nor a chord (and so
    expose no pitch to measure an interval from), are skipped entirely.

    Parameters
    ----------
    m21_score :
        Object exposing ``recurse().notes`` (for instance a music21 part).

    Returns
    -------
    tuple
        ``(diatonic_intervals, noteoffset, beatstrength)``:

        * ``diatonic_intervals`` - list of one-letter codes, one per pair of
          consecutive kept elements ('A'..'F' descending 7th..2nd,
          'G'..'L' ascending 2nd..7th, 'M' no diatonic change);
        * ``noteoffset`` - the ``offset`` attribute of every kept element;
        * ``beatstrength`` - the ``beatStrength`` of every kept element.

        The two latter lists have one entry per kept element, hence one more
        entry than ``diatonic_intervals`` as soon as one element is kept.

    Raises
    ------
    KeyError
        If music21 names an interval whose last word is not a key of
        ``dict_wordtonum``.
    """
    # Last word of music21's "directedSimpleNiceName" -> generic interval number.
    # NOTE(review): the gap below is measured in diatonic steps, so a name
    # ending in "Unison" can only reach this table for a non-zero diatonic gap
    # (presumably octave leaps reduced to a simple interval); it is then
    # encoded as a second. Confirm that this is the intended behaviour.
    dict_wordtonum = {"Unison": '2', "Second": '2', "Third": '3', "Fourth": '4',
                      "Fifth": '5', "Sixth": '6', "Seventh": '7'}

    # One letter per directed generic interval, so that a melody becomes a
    # plain string with exactly one character per interval.
    dict_encode_dia_intervals = {"7D": 'A', '6D': 'B', '5D': 'C', '4D': 'D', '3D': 'E', '2D': 'F',
                                 '2A': 'G', '3A': 'H', '4A': 'I', '5A': 'J', '6A': 'K', '7A': 'L', '0A': 'M'}

    def _reference_pitch(element):
        # A note is measured from its own pitch, a chord from its root.
        return element.pitch if element.isNote else element.root()

    noteoffset = []
    beatstrength = []
    diatonic_intervals = []
    previous_note = None

    for thisnote in m21_score.recurse().notes:
        # Rests carry no pitch.
        if thisnote.isRest:
            continue
        # Neither a note nor a chord: there is no pitch to compare. Without
        # this guard the interval computation below would run on an unbound
        # (or stale) gap value.
        if not (thisnote.isNote or thisnote.isChord):
            continue

        noteoffset.append(thisnote.offset)
        beatstrength.append(thisnote.beatStrength)

        if previous_note is None:
            # First pitched element: nothing to measure an interval from yet.
            previous_note = thisnote
            continue

        # Signed distance in diatonic steps (not semitones).
        gap = (_reference_pitch(thisnote).diatonicNoteNum
               - _reference_pitch(previous_note).diatonicNoteNum)

        if gap != 0:
            direction = 'A' if gap > 0 else 'D'

            # Interval endpoints: chord roots stand in for chords.
            startnote = previous_note.root() if previous_note.isChord else previous_note
            endnote = thisnote.root() if thisnote.isChord else thisnote

            # Example of "directedSimpleNiceName": "Ascending Perfect Fourth";
            # only its last word (the generic size) is used here.
            m21_interval_directed = m21.interval.Interval(
                noteStart=startnote, noteEnd=endnote).directedSimpleNiceName
            m21_generic = dict_wordtonum[m21_interval_directed.split(" ")[-1]]

            # e.g. "2A", "3D", then its one-letter code.
            encode_interval = dict_encode_dia_intervals[m21_generic + direction]
        else:
            # Same diatonic step as the previous element.
            encode_interval = dict_encode_dia_intervals['0A']

        diatonic_intervals.append(encode_interval)
        previous_note = thisnote

    return diatonic_intervals, noteoffset, beatstrength


In [8]:
def to_string(list_intervals):
    """Concatenate the string form of every item and return the result.

    The length of the resulting string is printed as a side effect.
    """
    string = "".join(str(item) for item in list_intervals)
    print(len(string))
    return string

In [9]:
def swm_alignment(string1, string2):
    """Locally align two encoded interval strings and return the dumped alignment.

    Builds a ``NucleotideScoringMatrix`` with a match score of 2 and a
    mismatch score of -2, aligns ``string1`` against ``string2`` with a
    ``LocalAlignment`` using its default gap settings, and returns whatever
    the alignment's ``dump()`` method returns.
    """
    match_score, mismatch_score = 2, -2
    aligner = LocalAlignment(NucleotideScoringMatrix(match_score, mismatch_score))
    return aligner.align(string1, string2).dump()


In [10]:
def get_patterns_from_alignment(alignment_strings):
    """Cut two gapped alignment strings into candidate patterns.

    Each of ``alignment_strings[0]`` and ``alignment_strings[1]`` is split on
    the gap character ``'-'``; every fragment of at least 3 characters is kept
    as a candidate pattern.

    Parameters
    ----------
    alignment_strings :
        Indexable holding the two gapped strings at positions 0 and 1.

    Returns
    -------
    tuple
        ``(startposinseq1_pat, startposinseq2_pat, allthepatterns,
        allthepattern_count)``:

        * the two dictionaries map each kept fragment to the list of its
          start positions in the corresponding string, counted in non-gap
          characters from the beginning of that string (an index, not a
          musical offset);
        * ``allthepatterns`` lists the distinct fragments of both strings in
          order of first appearance, so the result is reproducible from run
          to run;
        * ``allthepattern_count`` maps each fragment to the number of times
          it was produced by the split, both strings together.
    """
    def _collect(aligned_sequence):
        # Return the kept fragments of one string and their start positions.
        kept = []
        start_positions = {}
        pos = 0
        for fragment in aligned_sequence.split('-'):
            if len(fragment) > 2:
                # Fragments shorter than 3 intervals are not candidates.
                kept.append(fragment)
                start_positions.setdefault(fragment, []).append(pos)
            # Gaps yield empty fragments, so only real symbols advance pos.
            pos += len(fragment)
        return kept, start_positions

    pattern1, startposinseq1_pat = _collect(alignment_strings[0])
    pattern2, startposinseq2_pat = _collect(alignment_strings[1])

    allthepattern_count = dict(Counter(pattern1 + pattern2))

    # Distinct patterns in first-seen order (a set would give an order that
    # changes with string hash randomisation).
    allthepatterns = list(allthepattern_count)

    return startposinseq1_pat, startposinseq2_pat, allthepatterns, allthepattern_count


In [11]:
def count_substringandparents(allthepatterns, allthepattern_count):
    """Record the substring relations between every pair of patterns.

    For each unordered pair of entries of ``allthepatterns`` in which one
    string (the "kid") is contained in the other (the "parent"):

    * ``issubstring[kid]`` counts the parents of the kid;
    * ``count_as_substring[kid]`` accumulates ``parent.count(kid)`` multiplied
      by ``allthepattern_count[parent]``;
    * ``hassubstring[parent]`` and ``count_as_parent[parent]`` both count the
      kids of the parent.

    Returns ``(issubstring, hassubstring, count_as_substring, count_as_parent)``.
    """
    issubstring, hassubstring = {}, {}
    count_as_substring, count_as_parent = {}, {}

    total = len(allthepatterns)
    for i in range(total - 1):
        first = allthepatterns[i]
        for j in range(i + 1, total):
            second = allthepatterns[j]

            # Decide which of the two strings, if any, contains the other.
            if first in second:
                kid, parent = first, second
            elif second in first:
                kid, parent = second, first
            else:
                continue

            weight = parent.count(kid) * allthepattern_count[parent]

            issubstring[kid] = issubstring.get(kid, 0) + 1
            count_as_substring[kid] = count_as_substring.get(kid, 0) + weight

            hassubstring[parent] = hassubstring.get(parent, 0) + 1
            count_as_parent[parent] = count_as_parent.get(parent, 0) + 1

    return issubstring, hassubstring, count_as_substring, count_as_parent

In [12]:
def combined_count(sorthas, sortis, count_as_parent, count_as_substring, allthepattern_count):
    """Merge the three counters into a single score per pattern.

    The score of a pattern is the number of times it came out of the original
    segmentation (``allthepattern_count``), plus ``count_as_substring`` when
    the pattern is a key of ``sortis`` (it is contained in other patterns),
    plus ``count_as_parent`` when it is a key of ``sorthas`` (other patterns
    are contained in it).

    Returns a dictionary holding every key of ``sorthas`` followed by the keys
    of ``sortis`` that were not already present.
    """
    refined_combined = {}

    # Patterns that contain other patterns; they may be contained themselves.
    for item in sorthas:
        score = count_as_parent[item]
        if item in sortis:
            score = score + count_as_substring[item]
        refined_combined[item] = score + allthepattern_count[item]

    # Patterns that are only contained in others (not scored above).
    for item in sortis:
        if item in refined_combined:
            continue
        refined_combined[item] = count_as_substring[item] + allthepattern_count[item]

    return refined_combined


In [14]:
def show_result(finalpattern):
    """Print ``finalpattern`` as a plain dictionary built from a ``Counter`` of it."""
    tally = Counter(finalpattern)
    print(dict(tally))

In [15]:
def filter_combined_patterns(combined, min_length=3, max_patterns=20):
    """Keep the patterns worth reporting and cap how many are returned.

    Parameters
    ----------
    combined : dict
        Maps each pattern (a string with one character per interval) to its
        combined score.
    min_length : int, optional
        Patterns with fewer characters (intervals) than this are discarded.
        Defaults to 3.
    max_patterns : int, optional
        Maximum number of patterns returned. Defaults to 20.

    Returns
    -------
    dict
        The patterns of ``combined`` that are long enough and whose score is
        greater than 1. If more than ``max_patterns`` remain, only the
        ``max_patterns`` highest-scoring ones are returned, ordered by
        ascending score; otherwise the order of ``combined`` is kept.
    """
    finalpattern_combined = {
        pattern: score
        for pattern, score in combined.items()
        if len(pattern) >= min_length and score > 1
    }

    if len(finalpattern_combined) > max_patterns:
        # Stable ascending sort: the best-scored entries end up last, and
        # entries with equal scores keep their original relative order.
        ranked = sorted(finalpattern_combined.items(), key=lambda entry: entry[1])
        # Slice from an explicit start index so that max_patterns == 0
        # yields an empty result (ranked[-0:] would return everything).
        finalpattern_combined = dict(ranked[len(ranked) - max_patterns:])

    return finalpattern_combined


In [16]:
def reduce_pattern_length(seq):
    """Return the longest substring of ``seq`` that occurs more than once in it.

    Candidate lengths run from ``int(0.3 * len(seq))`` to
    ``int(0.5 * len(seq))`` inclusive, and occurrences are counted with
    ``str.count``. Length comes first: a substring of length 7 repeated twice
    is preferred over one of length 4 repeated four times. Among candidates
    of equal length, the one with the highest count wins, and the first one
    met is kept on a tie. An empty string is returned when nothing repeats.
    """
    best = ""
    occurrences = {}
    shortest = int(0.3 * len(seq))
    longest = int(len(seq) * 0.5)

    for size in range(shortest, longest + 1):
        for begin in range(len(seq) - size):
            piece = seq[begin:begin + size]
            occurrences[piece] = seq.count(piece)
            if occurrences[piece] <= 1:
                # Not repeated: cannot be a pattern.
                continue
            longer = len(piece) > len(best)
            if longer or (len(piece) == len(best) and occurrences[piece] > occurrences[best]):
                best = piece

    return best

def check_reduction(item):
    """Try to shorten ``item`` to its longest repeated substring.

    Returns the substring found by ``reduce_pattern_length`` when it has more
    than 4 characters and is longer than 30% of ``item`` (a message is printed
    in that case); returns ``None`` otherwise.
    """
    frequent_substr = reduce_pattern_length(item)
    long_enough = len(frequent_substr) > 4 and len(frequent_substr) > 0.3*len(item)
    if not long_enough:
        return None
    # The pattern was successfully reduced.
    print("extracted", frequent_substr, "from", item)
    return frequent_substr

In [17]:
def reduce_length_of_each_pattern(onlyassubstring, top20patterns):
    """Shorten the long patterns to their longest repeated substring.

    A pattern is a candidate for reduction when it has more than 11 characters
    and is not in ``onlyassubstring``. A reduced pattern that is still longer
    than 23 characters is reduced a second time, keeping the result of the
    first reduction if the second attempt fails. Patterns that are not
    candidates, or that cannot be reduced, are kept unchanged.

    Returns ``(reducedfinalpat, beforereduction)``: the list of resulting
    patterns in input order, and a dictionary mapping each reduced pattern to
    the original pattern it was extracted from.
    """
    reducedfinalpat = []
    beforereduction = {}

    for item in top20patterns:
        # Short patterns and pure substrings are kept as they are.
        if len(item) <= 11 or item in onlyassubstring:
            reducedfinalpat.append(item)
            continue

        shorter = check_reduction(item)
        if shorter is None:
            # The reduction failed: keep the original.
            reducedfinalpat.append(item)
            continue

        if len(shorter) > 23:
            # Still quite long: try once more, falling back to the first result.
            second_attempt = check_reduction(shorter)
            if second_attempt is not None:
                shorter = second_attempt

        reducedfinalpat.append(shorter)
        beforereduction[shorter] = item

    return reducedfinalpat, beforereduction

In [18]:
# find patterns everytime they appear in the sequence 1 and 2, and check their beat strength

def find_pattern_beat_weighed(beatstrength, string, pattern):
    """Count the matches of ``pattern`` in ``string`` and weigh them by beat strength.

    ``pattern`` is searched with ``re.finditer``. Each match contributes a
    weight that depends on ``beatstrength`` at the start index of the match:
    1.5 when it equals 1.0, 1 when it is at least 0.5, and the beat strength
    itself otherwise.

    Returns ``(weighedscore, count)``: the sum of the weights and the number
    of matches.
    """
    weighedscore = 0
    count = 0
    for hit in re.finditer(pattern, string):
        count += 1
        strength = beatstrength[hit.start()]
        if strength == 1.0:
            # The pattern starts on the strongest beat.
            bonus = 1.5
        elif strength >= 0.5:
            bonus = 1
        else:
            bonus = strength
        weighedscore += bonus
    return weighedscore, count


In [19]:
def count_final_pattern_beat_weighed(reducedfinalpat, string1, string2, beatstrength1, beatstrength2):
    """Score every pattern over both sequences and drop the rare ones.

    For each pattern of ``reducedfinalpat`` the beat-weighed score and the
    number of matches are computed in ``string1`` and in ``string2`` with
    ``find_pattern_beat_weighed`` and added together.

    Returns ``(refined_finalpat, weighed_score, count_pattern)``: the patterns
    matched at least twice in total, and two dictionaries giving the summed
    weighed score and the summed match count of every input pattern.
    """
    weighed_score = {}
    count_pattern = {}
    refined_finalpat = []

    for pattern in reducedfinalpat:
        score_first, hits_first = find_pattern_beat_weighed(beatstrength1, string1, pattern)
        score_second, hits_second = find_pattern_beat_weighed(beatstrength2, string2, pattern)

        weighed_score[pattern] = score_first + score_second
        count_pattern[pattern] = hits_first + hits_second

        # Keep only the patterns found at least twice over both sequences.
        if count_pattern[pattern] >= 2:
            refined_finalpat.append(pattern)

    return refined_finalpat, weighed_score, count_pattern


In [20]:
def decode_results(finalpat, weighed_score, count_pattern, combined, beforereduction):
    """Translate the letter-encoded patterns back to interval names.

    Every letter of a pattern is replaced by its two-character interval name
    (for instance ``'G'`` becomes ``'2A'``). The weighed score and occurrence
    count of each pattern are re-keyed by the decoded pattern. The combined
    score is read from ``combined`` directly or, for a pattern that is not a
    key of ``combined``, through the original pattern recorded in
    ``beforereduction``.

    Returns ``(decoded_patterns, decoded_patterns_combined,
    decoded_patterns_weighed, decoded_patterns_count)``.
    """
    letter_to_interval = {'A': '7D', 'B': '6D', 'C': '5D', 'D': '4D', 'E': '3D', 'F': '2D',
                          'G': '2A', 'H': '3A', 'I': '4A', 'J': '5A', 'K': '6A', 'L': '7A', 'M': '0A'}

    decoded_patterns = []
    decoded_patterns_combined = {}
    decoded_patterns_weighed = {}
    decoded_patterns_count = {}

    for pattern in finalpat:
        readable = "".join(letter_to_interval[letter] for letter in pattern)

        decoded_patterns_weighed[readable] = weighed_score[pattern]
        decoded_patterns_count[readable] = count_pattern[pattern]

        # A reduced pattern is scored through the pattern it was cut from.
        source = pattern if pattern in combined else beforereduction[pattern]
        decoded_patterns_combined[readable] = combined[source]

        decoded_patterns.append(readable)

    return decoded_patterns, decoded_patterns_combined, decoded_patterns_weighed, decoded_patterns_count


In [21]:
def decode_strings(string1, string2):
    """Decode two letter-encoded interval strings.

    Every letter is replaced by its two-character interval name (for instance
    ``'A'`` becomes ``'7D'``). Returns the two decoded strings as a tuple.
    """
    letter_to_interval = {'A': '7D', 'B': '6D', 'C': '5D', 'D': '4D', 'E': '3D', 'F': '2D',
                          'G': '2A', 'H': '3A', 'I': '4A', 'J': '5A', 'K': '6A', 'L': '7A', 'M': '0A'}

    decode_string1 = "".join(letter_to_interval[letter] for letter in string1)
    decode_string2 = "".join(letter_to_interval[letter] for letter in string2)

    return decode_string1, decode_string2

In [22]:
def process_two_voices(m21_score, compared_score):
    """Run the whole pattern-extraction pipeline on two voices.

    Steps, in order: encode both voices as interval strings, align the two
    strings, cut the alignment into candidate patterns, count the substring
    relations between candidates, combine the counts into one score, filter
    and shorten the candidates, weigh them by beat strength, and decode them.
    Intermediate and final results are printed along the way.

    Returns ``(decoded_patterns, decoded_patterns_combined,
    decoded_patterns_weighed, decoded_patterns_count)`` as produced by
    ``decode_results``.
    """
    # Diatonic intervals and beat strengths of both voices.
    intervals_first, _offsets_first, beatstrength1 = get_intervals(m21_score)
    intervals_second, _offsets_second, beatstrength2 = get_intervals(compared_score)

    string1 = to_string(intervals_first)
    string2 = to_string(intervals_second)

    # Smith-Waterman alignment of the two strings.
    alignment_strings = swm_alignment(string1, string2)

    # The start positions of the patterns in each sequence are returned too,
    # but only the unique candidate patterns and their counts are used here.
    _starts_first, _starts_second, allthepatterns, allthepattern_count = \
        get_patterns_from_alignment(alignment_strings)

    # Substring relations between all pairs of candidate patterns.
    issubstring, hassubstring, count_as_substring, count_as_parent = \
        count_substringandparents(allthepatterns, allthepattern_count)

    # Plain copies of the "is a substring" and "has a substring" counters.
    sortis = dict(issubstring)
    sorthas = dict(hassubstring)

    combined = combined_count(sorthas, sortis, count_as_parent, count_as_substring, allthepattern_count)

    # Drop the short or rare patterns and keep the best-scored ones.
    top20patterns_combined = filter_combined_patterns(combined)
    show_result(top20patterns_combined)

    # Patterns that are contained in others but contain none themselves.
    onlyassubstring = set(issubstring).difference(hassubstring)
    reducedfinalpat, beforereduction = reduce_length_of_each_pattern(
        onlyassubstring, list(top20patterns_combined))

    # Count the final patterns and weigh them with their beat strength.
    finalpat, weighed_score, count_pattern = count_final_pattern_beat_weighed(
        reducedfinalpat, string1, string2, beatstrength1, beatstrength2)
    show_result(finalpat)

    decoded_patterns, decoded_patterns_combined, decoded_patterns_weighed, decoded_patterns_count = \
        decode_results(finalpat, weighed_score, count_pattern, combined, beforereduction)

    decode_string1, decode_string2 = decode_strings(string1, string2)

    # Report every pattern with its scores and its first match in each
    # decoded string (positions are indices into the decoded strings).
    for pattern in decoded_patterns:
        print("Pattern:", pattern)
        print("importance score:", decoded_patterns_combined[pattern],
              "weighed score:", decoded_patterns_weighed[pattern],
              "occurrence:", decoded_patterns_count[pattern])

        match1 = re.search(pattern, decode_string1)
        if match1 is None:
            print("the pattern does not exist in string1.")
        else:
            print("first occurrence in string1: % s, % s" % (match1.start(), match1.end()))

        match2 = re.search(pattern, decode_string2)
        if match2 is None:
            print("the pattern does not exist in string2.")
        else:
            print("first occurrence in string2: % s, % s" % (match2.start(), match2.end()))

    return decoded_patterns, decoded_patterns_combined, decoded_patterns_weighed, decoded_patterns_count

In [23]:
# Run the full pipeline on the two parts selected above; intermediate results are printed.
decoded_patterns, decoded_patterns_combined, decoded_patterns_weighed, decoded_patterns_count = process_two_voices(m21_score, compared_score)

2487
2623
Query:    1 GGMFFFFGGMFMGGGMFFFFGGFFMGGEGGGFEGGGFFFGCKMMGGMFFFFGGFFMGGEGGGFEGGGFFFGCKMMGGMFFFFGGFFMJMCEGGIMFFFMFFFGCEGGIMFFF-F----F-------FFGFLGG--GAG----------GGGGMGGG--DBFG--G-----------I-M---------FFF---------FFF-F-------GEGGG-----GAG-GG----GGMG-G----GDBFG-GIM--LFFF-----------FF-F----GEHGG-GMFKG-GIDIFDFHFF-----------FM------------FFFLFKGG---IDGIFFFFGMMGF-F-F-F--FFGGEHFIJHHFE-FHGG-GBMGFEIGGEEFF---K--AK-GGIDG-IFFFF---------HFMMFF-F-F-F----GGEHFIJHHFE-FHGG-GBMGFEIGGEEFF---K--AK-GGIDG-IFFFF---------HFMMFF-F-F-F-----GGCIFF-C--I-F---G-FJFGFIM-GD---IFGFJFGFIMDIDMG---C--DMGCDGM-EIC--I-----F-IFGFJFGFIMDIHFFDMG---C--DMGCDGM-EIC--I----FIF-GFJF------G--F--I----M----------DIHF---F-F-GG--F--M-DGGGFGGGHFFFGGGCCGG-GGGGGFG-G--GG-GG-GBG--GG-DGGGMGGAGFHE----FG-EGG-----G------------GGGGBGGGGGGGC-H---I-G----------------G--G--------GGGGCGG--GG----G-GG-BGHGGGD-G-GGE-F---GGGGGGFGMHHHEEEHF-HHIDEEHDMG--MMHHG-FEEHF----HIHEDEHDMG-MGHHG--FEEHFHHGFEEHDMGM--GHHGFEEHEHHGFEEHEMMM----F----M--G--F------G---

274 322
274 323
274 324
274 325
274 326
274 327
274 328
274 329
274 330
274 331
274 332
274 333
275 276
275 277
275 278
275 279
275 280
275 281
275 282
275 283
275 284
275 285
275 286
275 287
275 288
275 289
275 290
275 291
275 292
275 293
275 294
275 295
275 296
275 297
275 298
275 299
275 300
275 301
275 302
275 303
275 304
275 305
275 306
275 307
275 308
275 309
275 310
275 311
275 312
275 313
275 314
275 315
275 316
275 317
275 318
275 319
275 320
275 321
275 322
275 323
275 324
275 325
275 326
275 327
275 328
275 329
275 330
275 331
275 332
275 333
276 277
276 278
276 279
276 280
276 281
276 282
276 283
276 284
276 285
276 286
276 287
276 288
276 289
276 290
276 291
276 292
276 293
276 294
276 295
276 296
276 297
276 298
276 299
276 300
276 301
276 302
276 303
276 304
276 305
276 306
276 307
276 308
276 309
276 310
276 311
276 312
276 313
276 314
276 315
276 316
276 317
276 318
276 319
276 320
276 321
276 322
276 323
276 324
276 325
276 326
276 327
276 328
276 329
276 330
276 331
