In [3]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as spark_F

# Read the Chordonomicon CSV into a pandas DataFrame first; it is handed to
# Spark further down.
df = pd.read_csv(
    filepath_or_buffer="hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv"
)

# getOrCreate() returns the session that is already running when there is one,
# and only builds a new one from these options otherwise.
spark = (
    SparkSession.builder
    .appName("Chord Progression Prediction")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

# Convert the in-memory pandas frame into a Spark DataFrame.
chords_df = spark.createDataFrame(data=df)

  from .autonotebook import tqdm as notebook_tqdm
  df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv")
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 23:17:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
25/05/04 23:17:50 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
25/

In [4]:
# Keep three columns per row: the id, main_genre renamed to genre, and the
# chords string split on a single space into an array column.
flat_chords = chords_df.select(
    "id",
    spark_F.col("main_genre").alias("genre"),
    spark_F.split("chords", " ").alias("chord_arrays"),
)

In [5]:
from pyspark.sql.types import MapType, StringType, ArrayType

@spark_F.udf(returnType=MapType(StringType(), ArrayType(StringType())))
def split_progression_by_sections(chord_array):
    """Group chord tokens under the section marker that precedes them.

    A token that starts with ``<`` and ends with ``>`` opens a section named
    by the text between the brackets. Every following non-marker token is
    appended to that section until the next marker is seen.

    Args:
        chord_array: list of string tokens, or None when the source column
            is null for this row.

    Returns:
        dict mapping section name -> list of tokens collected under it.
        Empty when the input is None or contains no marker.

    Notes:
        * Tokens that appear before the first marker are discarded.
        * When the same marker occurs again, the section is restarted and the
          tokens collected under its earlier occurrence are replaced.
    """
    sections = {}
    if chord_array is None:
        # Spark passes None for null column values; iterating it would raise.
        return sections

    current_section = None
    for item in chord_array:
        if not item:
            # Skip null entries and the "" tokens that splitting on a single
            # space leaves behind for consecutive spaces; neither is a chord.
            continue
        if item.startswith('<') and item.endswith('>'):
            # Section marker: strip the brackets and start a fresh list.
            current_section = item[1:-1]
            sections[current_section] = []
        elif current_section is not None:
            sections[current_section].append(item)

    return sections

# Add the per-row {section name: chords} map computed by the UDF.
result_df = flat_chords.withColumn(
    "sections",
    split_progression_by_sections(spark_F.col("chord_arrays")),
)

# Exploding a map column yields one row per entry; the two-name alias labels
# the key column section_name and the value column chords.
sections_df = result_df.select(
    spark_F.col("genre"),
    spark_F.explode(spark_F.col("sections")).alias("section_name", "chords"),
)

# Only the genre and the chord list of each section are used from here on.
genre_chords_df = sections_df.select("genre", "chords")

In [6]:
# Preview the first five (genre, chords) rows without truncating the chord lists.
genre_chords_df.show(n=5, truncate=False)

25/05/04 23:18:47 WARN TaskSetManager: Stage 0 contains a task of very large size (38752 KiB). The maximum recommended task size is 1000 KiB.
25/05/04 23:18:52 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker
                                                                                

+-----+-------------------------------------------------------------+
|genre|chords                                                       |
+-----+-------------------------------------------------------------+
|pop  |[F, C, E7, Amin, C, F, C, G7, C, F, C, E7, Amin, C, F, G7, C]|
|pop  |[G, D, G, D, A, D, G, D, Fs7, Bmin, D, G, A7, D, G, A7, D]   |
|pop  |[F, C, F, C, G, C, F, C, E7, Amin, C, F, G7, C]              |
|pop  |[C]                                                          |
|pop  |[D]                                                          |
+-----+-------------------------------------------------------------+
only showing top 5 rows



In [7]:
# Drop rows whose genre is the string "NaN" and keep only chord arrays with
# five or more chords.
genre_chords_df = (
    genre_chords_df
    .where(spark_F.col("genre") != "NaN")
    .where(spark_F.size(spark_F.col("chords")) >= 5)
)

# Distinct genres of the rows that survive the filters above, so the genre
# vocabulary no longer contains "NaN" or genres without any kept sequence.
# This is a lazy DataFrame; defining it does not start a Spark job.
genres_list = genre_chords_df.select("genre").distinct()

# Pull genre and chords to the driver in ONE action. Two separate collect()
# calls are independent jobs: they evaluate the whole pipeline (including the
# Python UDF) twice and give no guarantee that row i of one result matches
# row i of the other.
genre_chord_rows = genre_chords_df.select("genre", "chords").collect()
sequences = [row["chords"] for row in genre_chord_rows]
genre_for_sequences = [row["genre"] for row in genre_chord_rows]

# Distinct genres, derived from the collected rows and sorted so that ids
# assigned by enumeration are the same on every run.
genres = sorted(set(genre_for_sequences))
# Distinct chords, likewise derived on the driver and sorted.
distinct_chords = sorted({chord for seq in sequences for chord in seq})


25/05/04 23:19:07 WARN TaskSetManager: Stage 1 contains a task of very large size (38752 KiB). The maximum recommended task size is 1000 KiB.
25/05/04 23:19:24 WARN TaskSetManager: Stage 2 contains a task of very large size (38752 KiB). The maximum recommended task size is 1000 KiB.
25/05/04 23:19:42 WARN TaskSetManager: Stage 3 contains a task of very large size (38752 KiB). The maximum recommended task size is 1000 KiB.
25/05/04 23:20:07 WARN TaskSetManager: Stage 6 contains a task of very large size (38752 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [8]:
# vocab list
# The special tokens come first, so PAD is id 0 and UNK is id 1.
special_tokens = ["PAD", "UNK"]
# Sort the chords so their ids do not depend on the order in which
# distinct_chords happened to be produced, and drop any chord that collides
# with a special token so every id maps to exactly one entry.
chord_vocab = special_tokens + sorted(set(distinct_chords) - set(special_tokens))
chord_to_id = {chord: i for i, chord in enumerate(chord_vocab)}
id_to_chord = dict(enumerate(chord_vocab))

# tokenizer
def tokenize_chords(chord_seq):
    """Translate a sequence of chord names into their integer ids.

    Looks each chord up in the module-level ``chord_to_id`` mapping; a chord
    that is not in the mapping is replaced by the id stored under "UNK".
    Returns a new list with one id per input chord, in the same order.
    """
    def to_id(chord):
        return chord_to_id.get(chord, chord_to_id["UNK"])

    return list(map(to_id, chord_seq))

# Convert every chord sequence into its list of chord ids.
tokenized_sequences = list(map(tokenize_chords, sequences))

# Forward and reverse lookup tables between genre names and integer ids;
# ids follow the position of each genre in `genres`.
genre_to_id = {name: idx for idx, name in enumerate(genres)}
id_to_genre = {idx: name for name, idx in genre_to_id.items()}

# Genre id of each sequence, in the same order as genre_for_sequences.
encoded_genres = list(map(genre_to_id.__getitem__, genre_for_sequences))

In [9]:
import pickle

# Write both directions of the chord vocabulary to a single pickle file.
with open("chord_mappings.pkl", mode="wb") as mappings_file:
    chord_mappings = {"chord_to_id": chord_to_id, "id_to_chord": id_to_chord}
    pickle.dump(chord_mappings, mappings_file)

In [None]:
# Build next-token pairs: the input is every token except the last, and the
# target is every token except the first, so target[i] follows input[i].
# Sequences with fewer than two tokens cannot form a pair and are skipped.
input_sequences = [tokens[:-1] for tokens in tokenized_sequences if len(tokens) >= 2]
target_sequences = [tokens[1:] for tokens in tokenized_sequences if len(tokens) >= 2]
# Apply the same length test to the genres so the three lists stay parallel.
genres_for_sequences = [
    genre
    for tokens, genre in zip(tokenized_sequences, genre_for_sequences)
    if len(tokens) >= 2
]

import pickle

# Bundle the three lists into one dict and pickle it to disk.
data_to_save = {}
data_to_save["input_sequences"] = input_sequences
data_to_save["target_sequences"] = target_sequences
data_to_save["genres_for_sequences"] = genres_for_sequences

with open("processed_sequences.pkl", mode="wb") as sequences_file:
    pickle.dump(data_to_save, sequences_file)