In [5]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import pickle as pkl

import os, glob

In [6]:
# Dataset size selector: the suffix is appended to the dataset directory name.
# AMOUNT = '-small'
# AMOUNT = '-medium'
AMOUNT = '' # entire dataset

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
root_path = '/home/tikim/code/midi-velocity-infer'
dataset_train_path = f'{root_path}/dataset/maestro-midi{AMOUNT}/train'
dataset_val_path = f'{root_path}/dataset/maestro-midi{AMOUNT}/validation'
dataset_test_path = f'{root_path}/dataset/maestro-midi{AMOUNT}/test'

# The working directory is no longer changed while loading; the name is kept
# so that any later code referring to `current_dir` still finds it.
current_dir = os.getcwd()

extension = 'csv'


def _list_csv_filenames(directory, ext):
    """Return the sorted base names of the ``*.<ext>`` files in ``directory``.

    Sorting makes the load order (and therefore the sample order of the
    resulting dataset) independent of the file system's listing order.

    Raises:
        FileNotFoundError: if ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise FileNotFoundError(f'Dataset directory not found: {directory}')
    # Escape the directory part so that only the file-name part is a pattern.
    pattern = os.path.join(glob.escape(directory), '*.{}'.format(ext))
    return sorted(os.path.basename(path) for path in glob.glob(pattern))


def _read_csv_files(directory, filenames):
    """Read every named CSV in ``directory`` into a list of DataFrames."""
    return [
        pd.read_csv(os.path.join(directory, name), index_col=None, header=0)
        for name in filenames
    ]


train_csv_filenames = _list_csv_filenames(dataset_train_path, extension)
csv_files_train = _read_csv_files(dataset_train_path, train_csv_filenames)
if not csv_files_train:
    raise ValueError(f'No .{extension} files found in {dataset_train_path}')
# One frame holding every training row; used to derive normalisation bounds.
dataset_entire_train = pd.concat(csv_files_train, axis=0, ignore_index=True)

test_csv_filenames = _list_csv_filenames(dataset_test_path, extension)
csv_files_test = _read_csv_files(dataset_test_path, test_csv_filenames)

val_csv_filenames = _list_csv_filenames(dataset_val_path, extension)
csv_files_val = _read_csv_files(dataset_val_path, val_csv_filenames)

# Column layout: model inputs and the prediction target.
columns_train = ['time_diff', 'note_num', 'length']
columns_label = ['velocity']

# Per-column extrema of the training inputs, computed on float32 values.
train_features = np.asarray(dataset_entire_train[columns_train], dtype=np.float32)
column_lows = train_features.min(axis=0)
column_highs = train_features.max(axis=0)

# Column 0 is 'time_diff', column 2 is 'length' (see columns_train).
train_time_diff_min, train_time_diff_max = column_lows[0], column_highs[0]
length_min, length_max = column_lows[2], column_highs[2]

# Note number and velocity use fixed bounds rather than observed ones.
note_num_min, note_num_max = 0, 127
velocity_min, velocity_max = 0, 127

# Release the large intermediates; only the bounds are needed from here on.
del train_features
dataset_entire_train = None


In [7]:
def divide_list(l, n, overlapping_window=0):
    """Yield consecutive windows of ``n`` items from the sequence ``l``.

    Successive windows start ``n - overlapping_window`` items apart, so each
    window shares ``overlapping_window`` items with the previous one.  After
    the last full window, one shorter trailing window is yielded only when
    it contains items that no full window has covered; it starts at the next
    stride position.  A sequence shorter than ``n`` is yielded whole, and an
    empty sequence yields nothing.

    Args:
        l: Any sliceable sequence supporting ``len()`` (list, ndarray, ...).
        n: Window length; must be positive.
        overlapping_window: Number of items shared by neighbouring windows;
            must be smaller than ``n``.

    Yields:
        Slices of ``l``; every slice has length ``n`` except possibly the last.

    Raises:
        ValueError: if ``n`` is not positive or the stride
            ``n - overlapping_window`` is not positive.  Because this is a
            generator, the error surfaces on first iteration.
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    step = n - overlapping_window
    if step <= 0:
        raise ValueError('overlapping_window must be smaller than n')

    total = len(l)
    start = 0
    covered = 0  # index one past the last item emitted inside a full window
    while start + n <= total:
        yield l[start:start + n]
        covered = start + n
        start += step

    # Emit a partial tail only if it carries items not yet seen in a full
    # window; with overlapping windows the tail is often already covered.
    if covered < total and start < total:
        yield l[start:]
                
SAMPLE_LENGTH = 4  # number of rows in every sample window


def pad_data(data, feature_num):
    """Zero-pad the last window of ``data`` up to ``SAMPLE_LENGTH`` rows.

    Only the final element is inspected.  The list is modified in place and
    also returned.

    Args:
        data: List of 2-D arrays of shape ``(rows, feature_num)``.
        feature_num: Number of columns in each window; used to shape the
            zero rows that are appended.

    Returns:
        The same list object, whose last element now has ``SAMPLE_LENGTH``
        rows.  An empty list is returned unchanged.
    """
    if not data:
        # Nothing to pad; indexing data[-1] would raise IndexError.
        return data

    missing_rows = SAMPLE_LENGTH - len(data[-1])
    if missing_rows != 0:
        zero_array = np.zeros((missing_rows, feature_num), dtype=np.float32)
        data[-1] = np.concatenate((data[-1], zero_array))
    return data

def make_dataset(csv_files, columns_train, columns_label):
    """Turn per-file DataFrames into normalised, windowed float32 arrays.

    Each frame is min-max normalised with the module-level bounds, cut into
    windows of ``SAMPLE_LENGTH`` rows that overlap by ``SAMPLE_LENGTH - 1``
    rows (via ``divide_list``), padded with ``pad_data`` and stacked.

    Args:
        csv_files: Iterable of DataFrames, one per CSV file.
        columns_train: Input column names.  Columns 0, 1 and 2 are normalised
            as time difference, note number and length respectively, so they
            must appear in that order.
        columns_label: Label column names; column 0 is normalised as velocity.

    Returns:
        ``(inputs, labels)`` with shapes
        ``(num_windows, SAMPLE_LENGTH, len(columns_train))`` and
        ``(num_windows, SAMPLE_LENGTH, len(columns_label))``.  Both have zero
        windows when ``csv_files`` holds no rows.
    """
    overlap = SAMPLE_LENGTH - 1
    input_width = len(columns_train)
    label_width = len(columns_label)

    # Seed with empty arrays so the result keeps its shape and dtype even
    # when there is nothing to stack.
    input_parts = [np.empty((0, SAMPLE_LENGTH, input_width), dtype=np.float32)]
    label_parts = [np.empty((0, SAMPLE_LENGTH, label_width), dtype=np.float32)]

    for df in csv_files:
        if len(df) == 0:
            # A file without rows contributes no windows.
            continue

        data_input_raw = np.array(df[columns_train], dtype=np.float32)
        data_label_raw = np.array(df[columns_label], dtype=np.float32)

        # normalize only the time difference
        data_input_raw[:, 0] = (data_input_raw[:, 0] - train_time_diff_min) / (train_time_diff_max - train_time_diff_min)
        # normalize only the note number
        data_input_raw[:, 1] = (data_input_raw[:, 1] - note_num_min) / (note_num_max - note_num_min)
        # normalize only the length
        data_input_raw[:, 2] = (data_input_raw[:, 2] - length_min) / (length_max - length_min)
        # normalize only the velocity
        data_label_raw[:, 0] = (data_label_raw[:, 0] - velocity_min) / (velocity_max - velocity_min)

        input_windows = pad_data(list(divide_list(data_input_raw, SAMPLE_LENGTH, overlap)), input_width)
        input_parts.append(np.array(input_windows, dtype=np.float32))

        label_windows = pad_data(list(divide_list(data_label_raw, SAMPLE_LENGTH, overlap)), label_width)
        label_parts.append(np.array(label_windows, dtype=np.float32))

    # Concatenate once at the end instead of re-copying the growing array
    # on every file.
    dataset_entire_input = np.concatenate(input_parts, axis=0)
    dataset_entire_label = np.concatenate(label_parts, axis=0)
    return dataset_entire_input, dataset_entire_label

# Build the (input, label) window arrays for the train, validation and test
# splits, in that order, all with the same column selection.
dataset_train_input, dataset_train_label = make_dataset(
    csv_files=csv_files_train,
    columns_train=columns_train,
    columns_label=columns_label,
)
dataset_val_input, dataset_val_label = make_dataset(
    csv_files=csv_files_val,
    columns_train=columns_train,
    columns_label=columns_label,
)
dataset_test_input, dataset_test_label = make_dataset(
    csv_files=csv_files_test,
    columns_train=columns_train,
    columns_label=columns_label,
)


In [8]:
# Bundle the windowed arrays together with the normalisation bounds so a
# consumer can undo the scaling without re-reading the CSV files.
dataset_bundle = {
    'dataset_train_input': dataset_train_input, 'dataset_train_label': dataset_train_label,
    'dataset_val_input': dataset_val_input, 'dataset_val_label': dataset_val_label,
    'dataset_test_input': dataset_test_input, 'dataset_test_label': dataset_test_label,
    'train_time_diff_min': train_time_diff_min, 'train_time_diff_max': train_time_diff_max,
    'note_num_min': note_num_min, 'note_num_max': note_num_max,
    'length_min': length_min, 'length_max': length_max,
    'velocity_min': velocity_min, 'velocity_max': velocity_max,
}

# The context manager closes (and therefore flushes) the file even if
# pickling fails part-way through.
with open('dataset.pkl', 'wb') as pickle_file:
    pkl.dump(dataset_bundle, pickle_file)