In [445]:
import pandas as pd
import numpy as np
from scipy import stats
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [446]:
# Chord-quality suffixes (the text that follows the root in a chord symbol),
# listed under the basic triad family each one is reduced to.
_suffixes_by_triad = {
    "minor": [
        "-7", "-", "-9", "-11", "-6", "-^9", "-^7", "-69", "-b6", "-#5",
    ],
    "major": [
        "7", "^7", "5", "add9", "6", "9", "^", "^9", "13", "69", "2",
        "7#9", "7b9", "11", "^13", "7b9b13", "13#11", "7#11", "7b13",
        "^7#11", "^9#11", "7b9#11", "9#11", "13b9", "7b9#9", "7#9#11",
        "7susadd3", "13#9",
        "",  # bare root with no suffix
    ],
    "sus": [
        "sus", "7sus", "9sus", "13sus", "7b13sus", "7b9sus",
    ],
    "dim": [
        "h7", "o7", "h", "o", "-7b5", "7b5", "7#9b5", "7b9b5", "9b5",
    ],
    "aug": [
        "7#5", "+", "7#9#5", "7alt", "^7#5", "7b9#5", "9#5",
    ],
}

# Flatten the groups into the suffix -> triad-family lookup used downstream.
quality_dict = {
    suffix: triad
    for triad, suffixes in _suffixes_by_triad.items()
    for suffix in suffixes
}

In [447]:
# Song collections to load. Forward slashes keep these relative paths valid on
# Windows as well as on macOS/Linux (a backslash is only a separator on Windows).
files = ['data/jazz1350.json', 'data/pop400.json']

# Build one frame per file from its top-level "songs" entry, then stack them
# under a fresh 0..n-1 index.
dfs = []
for file in files:
    # Explicit encoding: without it the platform default is used, which is not
    # UTF-8 on every system and can garble non-ASCII names.
    with open(file, encoding="utf-8") as json_file:
        data_json = json.load(json_file)
    dfs.append(pd.DataFrame(data_json["songs"]))
df = pd.concat(dfs, ignore_index=True)

In [448]:
# Show a bounded preview instead of rendering the whole song table.
df.head(10)

Unnamed: 0,bpm,compStyle,composer,key,music,repeats,style,title,transpose
0,160,Jazz-Medium Up Swing,Coltrane John,F,"{'measures': [['F^7', 'Ab7'], ['Db^7', 'E7'], ...",0,Medium Up Swing,26-2,5
1,140,Jazz-Bossa Nova,Corea Chick,E-,"{'measures': [['E-7'], ['E-7'], ['G-7'], ['G-7...",0,Bossa Nova,500 Miles High,7
2,100,Jazz-Medium Swing,Rowles Jimmy,A-,"{'measures': [['A-7'], ['Db^7'], ['Bh7'], ['E7...",0,Waltz,502 Blues,0
3,240,Jazz-Up Tempo Swing,Monk Thelonious,C,"{'measures': [['C', 'A-7'], ['D-7', 'G7'], ['C...",0,Up Tempo Swing,52nd Street Theme,0
4,100,Jazz-Medium Swing,Warren Earl,C,"{'measures': [['C9'], ['Eb-6'], ['C9'], ['Eb-6...",0,Medium Swing,9.20 Special,0
5,80,Jazz-Slow Swing,Mulligan Gerry,C,"{'measures': [['D-7', 'G7b9'], ['C^7'], ['C#-7...",0,Slow Swing,A Ballad,0
6,100,Jazz-Medium Swing,Kahn Donald,C,"{'measures': [['C^7'], ['Gb7#11'], ['F^7'], ['...",0,Medium Swing,A Beautiful Friendship,0
7,60,Jazz-Ballad Swing,Barnes-Cornelius,Bb,"{'measures': [['Bb6', 'Bo'], ['C-7'], ['F7', '...",0,Ballad,A Blossom Fell,10
8,100,Jazz-Medium Swing,Fain-Webster,F,"{'measures': [['G-7'], ['C7'], ['F^7'], ['F^7'...",0,Medium Swing,A Certain Smile,5
9,100,Jazz-Medium Swing,Jones Thad,Bb,"{'measures': [['Bb^7'], ['Eb-/Bb'], ['Bb^7'], ...",0,Waltz,A Child Is Born,10


In [449]:
def get_chordlist(music):
    """
    input: music dict object, containing "measures" element (an iterable of
           measures, each an iterable of chord symbols or None)
    output: returns python list of every chord in the song, in the order the
            measures list them, with None entries removed.
            Consecutive repeats are NOT removed here.
    """
    # flatten all measures into a single list, skipping None placeholders
    return [
        chord
        for measure in music["measures"]
        for chord in measure
        if chord is not None
    ]

In [450]:
def get_interval(startnote, endnote):
    """
    startnote and endnote are notes (like G or Eb or F#)
    returns: signed number of semitones from startnote to endnote, folded into
             the range -5..6 (e.g. C -> G is -5, a fourth down, rather than
             7 up; C -> F# is 6).
             Returns 0 when either note is not a recognised name (for example
             a "none" placeholder or a missing value).
    """
    notesdict = {
        "C": 0,
        "B#": 0,
        "C#": 1,
        "Db": 1,
        "D": 2,
        "D#": 3,
        "Eb": 3,
        "E": 4,
        "Fb": 4,
        "F": 5,
        "E#": 5,
        "F#": 6,
        "Gb": 6,
        "G": 7,
        "G#": 8,
        "Ab": 8,
        "A": 9,
        "A#": 10,
        "Bb": 10,
        "B": 11,
        "Cb": 11,
    }
    try:
        semitones_up = notesdict[endnote] - notesdict[startnote]
    except (KeyError, TypeError):
        # Unknown or unhashable note name: keep the best-effort "no movement"
        # answer, but let any other kind of error surface.
        return 0
    # Shift by 5 before the modulo so the result lands in -5..6 instead of 0..11.
    return (semitones_up + 5) % 12 - 5

In [451]:
# Sanity check: get_interval folds its result with (diff + 5) % 12 - 5, so a
# tritone stays +6 while C -> G (7 semitones up) is reported as -5.
print(get_interval("C", "F#"))
print(get_interval("C", "G"))

6
-5


In [452]:
# (rows, columns) of the combined song table built from the JSON files
df.shape

(1753, 9)

In [453]:
def get_chord_change(row):
    """
    Placeholder; `row` is never read.

    The try body is a bare string expression that cannot raise, so the
    except branch is unreachable and the function always returns None.
    NOTE(review): not called in the cells shown here (the chord_change column
    is built inline in the per-song loop) -- implement or delete.
    """
    try:
        ""
    except:
        return "none_0_none"

def _song_chord_table(song_index, music):
    """
    Build the chord-change table for a single song.

    input: song_index (the song's row label in df) and its music dict
    output: DataFrame with one row per chord whose root or triad quality
            differs from the chord before it, with columns song_index, chord,
            quality, inversion, root, triad_quality, prev_root,
            prev_triad_quality, chord_sequence, interval, chord_change
    raises: KeyError if a chord's quality suffix has no entry in quality_dict
            (including chords the pattern could not parse)
    """
    songdf = pd.DataFrame({
        "song_index": song_index,
        "chord": get_chordlist(music),
    })

    # Split each symbol into root (note letter plus optional b/#), quality
    # (everything up to an optional "/") and inversion (text after the "/").
    chord_info = songdf["chord"].str.extract("^([ABCDEFG][b#]?)([^/]*)?/?(.*)?", expand=True)
    songdf["quality"] = chord_info[1]
    songdf["inversion"] = chord_info[2]
    songdf["root"] = chord_info[0]

    # Report the offending chord symbols and song instead of failing with a
    # bare KeyError on a single suffix.
    unmapped = ~songdf["quality"].isin(list(quality_dict))
    if unmapped.any():
        bad_chords = sorted(set(songdf.loc[unmapped, "chord"].astype(str)))
        raise KeyError(f"song {song_index}: no triad quality for chord(s) {bad_chords}")
    songdf["triad_quality"] = songdf["quality"].map(quality_dict)

    # Previous chord's root and quality; the first chord of a song gets "none".
    songdf["prev_root"] = songdf["root"].shift(1, fill_value="none")
    songdf["prev_triad_quality"] = songdf["triad_quality"].shift(1, fill_value="none")

    # Keep only rows where the root or the triad quality changed from the row
    # before; a dropped row always equals its predecessor, so the prev_*
    # columns of the surviving rows remain correct.
    changed = (songdf["root"] != songdf["prev_root"]) | (
        songdf["triad_quality"] != songdf["prev_triad_quality"]
    )
    songdf = songdf[changed].copy().reset_index(drop=True)
    songdf["chord_sequence"] = songdf.index

    # get_interval returns 0 for the "none" placeholder on a song's first chord.
    # Explicit int64 keeps the column integral even for a song with no chords.
    songdf["interval"] = pd.Series(
        [get_interval(prev, cur) for prev, cur in zip(songdf["prev_root"], songdf["root"])],
        index=songdf.index,
        dtype="int64",
    )
    songdf["chord_change"] = (
        songdf["prev_triad_quality"] + "_" + songdf["interval"].astype(str) + "_" + songdf["triad_quality"]
    )
    return songdf


# One table per song, combined with a single concat at the end.
songdfs = [_song_chord_table(i, music) for i, music in df["music"].items()]
chordsdf = pd.concat(songdfs, ignore_index=True)

In [454]:
# Preview the combined chord table: one row per retained chord, with the
# previous chord's root/quality alongside it
chordsdf.head(30)

Unnamed: 0,song_index,chord,quality,inversion,root,triad_quality,prev_root,prev_triad_quality,chord_sequence,interval,chord_change
0,0,F^7,^7,,F,major,none,none,0,0,none_0_major
1,0,Ab7,7,,Ab,major,F,major,1,3,major_3_major
2,0,Db^7,^7,,Db,major,Ab,major,2,5,major_5_major
3,0,E7,7,,E,major,Db,major,3,3,major_3_major
4,0,A^7,^7,,A,major,E,major,4,5,major_5_major
5,0,C7,7,,C,major,A,major,5,3,major_3_major
6,0,C-7,-7,,C,minor,C,major,6,0,major_0_minor
7,0,F7,7,,F,major,C,minor,7,5,minor_5_major
8,0,Bb^7,^7,,Bb,major,F,major,8,5,major_5_major
9,0,Db7,7,,Db,major,Bb,major,9,3,major_3_major


In [456]:
# 15 most frequent transitions; labels read
# <previous triad quality>_<signed semitone interval>_<current triad quality>
chordsdf["chord_change"].value_counts().head(15)

major_5_major     14435
minor_5_major     10531
major_5_minor      5421
major_-5_major     4965
major_2_major      2923
major_-3_minor     2796
major_-1_major     2686
minor_5_minor      2454
major_2_minor      2450
major_-2_major     2165
dim_5_major        1996
major_-5_minor     1885
major_0_minor      1592
major_-1_minor     1531
minor_-4_major     1528
Name: chord_change, dtype: int64

In [461]:
# Song-level table: df without the music, transpose and repeats columns, plus
# the row label exposed as an explicit song_index column.
songs = df.drop(columns=["music", "transpose", "repeats"])
songs["song_index"] = songs.index

In [462]:
# Show a bounded preview instead of rendering the whole song table.
songs.head(10)

Unnamed: 0,bpm,compStyle,composer,key,style,title,song_index
0,160,Jazz-Medium Up Swing,Coltrane John,F,Medium Up Swing,26-2,0
1,140,Jazz-Bossa Nova,Corea Chick,E-,Bossa Nova,500 Miles High,1
2,100,Jazz-Medium Swing,Rowles Jimmy,A-,Waltz,502 Blues,2
3,240,Jazz-Up Tempo Swing,Monk Thelonious,C,Up Tempo Swing,52nd Street Theme,3
4,100,Jazz-Medium Swing,Warren Earl,C,Medium Swing,9.20 Special,4
5,80,Jazz-Slow Swing,Mulligan Gerry,C,Slow Swing,A Ballad,5
6,100,Jazz-Medium Swing,Kahn Donald,C,Medium Swing,A Beautiful Friendship,6
7,60,Jazz-Ballad Swing,Barnes-Cornelius,Bb,Ballad,A Blossom Fell,7
8,100,Jazz-Medium Swing,Fain-Webster,F,Medium Swing,A Certain Smile,8
9,100,Jazz-Medium Swing,Jones Thad,Bb,Waltz,A Child Is Born,9


In [464]:
# Write both tables out; song_index is the column they have in common.
# Forward slashes keep the relative paths valid on Windows and on macOS/Linux.
songs.to_csv("data/songs.csv", index=False)
chordsdf.to_csv("data/chords.csv", index=False)