In [1]:
from model import Encoder, Decoder, Seq2Seq
from data_loader import *
import pandas as pd
import torch.optim.lr_scheduler as lr_scheduler
from torch import optim
import torch.nn.functional as F
import datetime
import pretty_midi
import glob
import librosa

In [2]:
import os
import matplotlib
import math
matplotlib.use('Agg')
# matplotlib.use("QtAgg")
import ffmpeg
#conda install -c conda-forge ffmpeg-python

import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, writers
plt.rcParams['animation.ffmpeg_path'] = '/home/ilc/anaconda3/bin/ffmpeg'#'/usr/bin/ffmpeg'

import numpy as np
import subprocess as sp
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip

from midi2audio import FluidSynth

from torch.autograd import Variable
from sklearn.model_selection import KFold

import itertools

In [3]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
dataset_name_path = f"./data_list_symbolic_cross_audio.txt"
dataloader = get_dataloader(dataset_name_path, batch_size=8) #[20, 512, 128], [20, 512, 102]
dataset = AudioMotionDataSet(dataset_name_path)

val_dataset_name_path = f"./data_list_symbolic_cross_audio.txt"
# val_dataloader = get_val_dataloader(val_dataset_name_path, batch_size=40) #[20, 512, 128], [20, 512, 102]

full_data_path = None
with open("./data_list_symbolic_cross_audio.txt", "r") as file:
    lines = [line.strip() for line in file]
    full_data_path = np.array(lines)

val_data_read = np.reshape(full_data_path, (11, 10))
# print(val_data_read)

learning_rate = 0.001#0.001

# input_size_encoder = 128 #129 #128
# input_size_decoder = 115 #102 #24
# output_size = 115#102 #24

# encoder_embedding_size = 300
# decoder_embedding_size = 300
enc_dropout = 0.5
dec_dropout = 0.
step = 0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

self.piece_count:  110
dataset_len:  11000
self.piece_count:  110
dataset_len:  11000
cuda:0


In [5]:
def reset_weights(model): # reset the weight every fold
    if isinstance(model, nn.LSTM) or isinstance(model, nn.Linear):
        model.reset_parameters()

In [6]:
class LSTM1(nn.Module):
    def __init__(self, output_dim, input_size, hidden_size, num_layers, seq_length):
        super(LSTM1, self).__init__()
        self.output_dim = output_dim #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True) #lstm
        self.fc_1 =  nn.Linear(hidden_size, output_dim) #fully connected to determine output dim

        self.relu = nn.ReLU()

    def forward(self,x):
        # h0, c0 no time information
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device) #hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device) #internal state
        # Propagate input through LSTM
        # x is MIDI => [44, 512, 128]

        # hn is final state, run over the sequence length
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
        # hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
        # print("output.shape", output.shape)
        # print("hn.shape", hn.shape)
        # out = self.relu(hn)
        out = self.fc_1(output) #final
        return out
 

In [7]:
# Define the model architecture
input_size = 156 #number of features
hidden_size = 1024 #number of features in hidden state
num_layers = 1 #number of stacked lstm layers
seq_len = 512
output_dim = 115 #number of output classes
batch_size_define = 20#128

# model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
# model = LSTM(embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights).to(device)
# model = LSTM1(output_dim, input_size, hidden_size, num_layers, seq_len).to(device) #our lstm class
# model.train()
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

num_epochs = 300#100 #10
k_folds = 11
cross_valid_results = {}
torch.manual_seed(42)

avg_loss_list = []
all_loss_list = []
val_loss_per_epoch_list = []

#TODO: important cross val record
val_time_loss_list = []
val_dim_loss_list = []
val_mse_loss_list = []
val_per_split_list = [] #just mse loss


In [None]:
def time_wise_loss_fn(preds, labels):
    '''
    calculate time-wise loss for motion (along the time axis)
    input: labels[batch, time, dimension(joint*xyz)]
    preds[batch, time , dimension(joint*xyz)]
    output: time loss
    '''
    # points_2 = ax.scatter(column(each_frame[30:32], 0), column(each_frame[30:32], 1), column(each_frame[30:32], 2), cmap='jet', marker='o', label='body joint', color = 'blue')
    # points_3 = ax.scatter(column(each_frame[32:34], 0), column(each_frame[32:34], 1), column(each_frame[32:34], 2), cmap='jet', marker='o', label='body joint', color = 'red')
    # [8, 9], [9, 11], [9, 10], [10, 11], [10, 12], [9, 12], [11, 12], #right hand
    # [13, 14], [14, 16], [14, 15], [16, 15], [14, 17], [16, 17], [15, 17], #left hand
    # [30, 31], [32, 33],  #instrument
    
    # print("preds.shape", preds.shape)
    # print("labels.shape", labels.shape)
    
    # print("preds[9*3:18*3]", preds[:, :, 9*3:19*3].shape)
    # print("labels[9*3:18*3]", labels[:, :, 9*3:19*3].shape)
    
    select_joint_preds = torch.cat((preds[:, :, 9*3:19*3], preds[:, :, 31*3:35*3]), 2)
    select_joint_labels = torch.cat((labels[:, :, 9*3:19*3], labels[:, :, 31*3:35*3]), 2)
    
    # print("select_joint_preds.shape", select_joint_preds.shape)
    # # print(select_joint_preds)
    # print("select_joint_labels.shape", select_joint_labels.shape)
    
    epsilon = 1e-7
    select_joint_preds = select_joint_preds + epsilon

    labels_transpose = torch.permute(select_joint_labels, (0, 2, 1))#tf.transpose(labels, [0, 2, 1]) # [b, 3, t]
    preds_transpose = torch.permute(select_joint_preds, (0, 2, 1))#tf.transpose(preds, [0, 2, 1]) # [b, 3, t]
    # print("labels_transpose.shape", labels_transpose.shape)
    # print("preds_transpose.shape", preds_transpose.shape)
    # print("labels_transpose[:, :, :, None].shape", labels_transpose[:, :, :, None].shape)
    # print("labels_transpose[:, :, None, :].shape", labels_transpose[:, :, None, :].shape)
    label_diff = labels_transpose[:, :, :, None] - labels_transpose [:, :, None, :] # [b, 3, t, t]
    
    preds_diff = preds_transpose[:, :, :, None] - preds_transpose [:, :, None, :] # [b, 3, t, t]
    # print(preds_diff.shape)
    time_loss = (preds_diff - label_diff)**2 # [b, 3, t, t]
    time_loss_value = time_loss.mean() #float()
    torch.cuda.empty_cache()

    return time_loss_value
    
def dim_wise_loss_fn(preds, labels):
    '''
    calculate dimension-wise loss for motion (along the dimension axis)
    input: labels[batch, time, dimension(joint*xyz)]
    preds[batch, time , dimension(joint*xyz)]
    output: dimension loss
    '''
    select_joint_preds = torch.cat((preds[:, :, 9*3:19*3], preds[:, :, 31*3:35*3]), 2)
    select_joint_labels = torch.cat((labels[:, :, 9*3:19*3], labels[:, :, 31*3:35*3]), 2)

    epsilon = 1e-7
    preds = preds + epsilon
    
    label_diff = select_joint_labels[:, :, :, None] - select_joint_labels[:, :, None, :] # [b, t, 3, 3]
    preds_diff = select_joint_preds[:, :, :, None] - select_joint_preds[:, :, None, :] # [b, t, 3, 3]
    dim_loss = (preds_diff - label_diff)**2 # [b, t, 3, 3]
    dim_loss_value = dim_loss.mean() #float()
    torch.cuda.empty_cache()
    
    return dim_loss_value

In [8]:
# def time_wise_loss_fn(preds, labels):
#     '''
#     calculate time-wise loss for motion (along the time axis)
#     input: labels[batch, time, dimension(joint*xyz)]
#     preds[batch, time , dimension(joint*xyz)]
#     output: time loss
#     '''
#     epsilon = 1e-7
#     preds = preds + epsilon

#     labels_transpose = torch.permute(labels, (0, 2, 1))#tf.transpose(labels, [0, 2, 1]) # [b, 3, t]
#     preds_transpose = torch.permute(preds, (0, 2, 1))#tf.transpose(preds, [0, 2, 1]) # [b, 3, t]
#     # print("labels_transpose.shape", labels_transpose.shape)
#     # print("preds_transpose.shape", preds_transpose.shape)
#     # print("labels_transpose[:, :, :, None].shape", labels_transpose[:, :, :, None].shape)
#     # print("labels_transpose[:, :, None, :].shape", labels_transpose[:, :, None, :].shape)
#     label_diff = labels_transpose[:, :, :, None] - labels_transpose [:, :, None, :] # [b, 3, t, t]
    
#     preds_diff = preds_transpose[:, :, :, None] - preds_transpose [:, :, None, :] # [b, 3, t, t]
#     # print(preds_diff.shape)
#     time_loss = (preds_diff - label_diff)**2 # [b, 3, t, t]
#     time_loss_value = time_loss.mean() #float()
#     torch.cuda.empty_cache()

#     return time_loss_value
    
# def dim_wise_loss_fn(preds, labels):
#     '''
#     calculate dimension-wise loss for motion (along the dimension axis)
#     input: labels[batch, time, dimension(joint*xyz)]
#     preds[batch, time , dimension(joint*xyz)]
#     output: dimension loss
#     '''
#     epsilon = 1e-7
#     preds = preds + epsilon
    
#     label_diff = labels[:, :, :, None] - labels[:, :, None, :] # [b, t, 3, 3]
#     preds_diff = preds[:, :, :, None] - preds[:, :, None, :] # [b, t, 3, 3]
#     dim_loss = (preds_diff - label_diff)**2 # [b, t, 3, 3]
#     dim_loss_value = dim_loss.mean() #float()
#     torch.cuda.empty_cache()
    
#     return dim_loss_value

In [9]:
def customized_mse_loss(output, target):
    # target = target.transpose(0, 1)

    # print("output.shape:", output.shape) #torch.Size([20, 513, 102])
    # print("target.shape:", target.shape) #torch.Size([20, 513, 102])

    w1_time = 0.3
    w2_dim = 0.3
    w3_mse = 0.4

    mse_loss = F.mse_loss(output, target)
    time_loss = time_wise_loss_fn(output, target)
    dim_loss = dim_wise_loss_fn(output, target)

    # print("time_loss:", time_loss)
    # print("dim_loss:", dim_loss)
    # print("mse_loss:", mse_loss)
    val_time_loss_list.append(time_loss.cpu().item())
    val_dim_loss_list.append(dim_loss.cpu().item())
    val_mse_loss_list.append(mse_loss.cpu().item())

    segment_loss = (w1_time * time_loss) + (w2_dim * dim_loss) + (w3_mse * mse_loss)
    torch.cuda.empty_cache()
    return  segment_loss

In [10]:
def evaluate_lstm_cross(model, split_count):
    model.eval()
    print('Validation')
    valid_running_loss = 0.0
    counter = 0
    # previous_output = torch.zeros(512, 102).to(device)
    
    outputs_save = []
    
    np.savetxt('./temp_path.txt', val_data_read[split_count], delimiter="\n", fmt="%s")
    # print(val_data_read[split_count])
    val_dataloader = get_audio_val_dataloader('./temp_path.txt', batch_size=11)
    
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(val_dataloader): #tqdm(enumerate(val_dataloader), total=len(val_dataloader))
            counter += 1

            inputs = inputs.to(device).float()
            targets = targets.to(device).float()
            # print("val inputs.shape:", inputs.shape)
            # print("val targets.shape:", targets.shape)
            outputs = model(inputs)
            # print("val outputs.shape:", outputs.shape)

            loss =  F.mse_loss(outputs, targets)
            valid_running_loss += loss.cpu().item()
            # previous_output = outputs
            outputs_save.append(np.asarray(outputs.cpu()))

    loc_dt = datetime.datetime.today()
    loc_dt_format = loc_dt.strftime("%Y-%m-%d_%H-%M-%S")
    if not os.path.exists("./output_eval/"):
        os.makedirs('./output_eval/')

    # print("counter", counter)
    # print("val_data_read[split_count][counter]", val_data_read[split_count][counter])
    val_file_name = val_data_read[split_count][counter].split('/')[2].split('.')[0]

    # print("val file_name:", val_file_name)
    # print("outputs_save length: ", len(outputs_save), ", element shape: " , outputs_save[0].shape)
    #                                            str(split_count)
    eval_output = open("./output_eval/[split_" + str(6) + "][audio_with_anno][total" + str(num_epochs) + "_hs" + str(hidden_size) +"]save_"+ str(loc_dt_format) + "_l1_loss_" + str(loss.cpu().item())+".pkl", 'wb')
    pickle.dump(np.asarray(outputs_save), eval_output)
    eval_output.close()
    
    # print("val counter:", counter)
    epoch_val_loss_f1 = valid_running_loss / counter
    val_per_split_list.append(epoch_val_loss_f1)
    print("split_count:", split_count, ", epoch_val_loss:", epoch_val_loss_f1)
    os.remove("./temp_path.txt")
    return #epoch_val_loss_f1
# model.train()

In [11]:
kf = KFold(n_splits=k_folds)
print(kf.get_n_splits(dataset))
KFold(n_splits=k_folds, random_state=None, shuffle=False)
# for i, (train_index, test_index) in enumerate(kf.split(X)):
#     #...TODO
split_count = 0
random_pick_fold = 6#random.randint(0, 10) #0~10
print("random: ", random_pick_fold)
# for fold,(train_idx, test_idx) in enumerate(kf.split(dataset)): #TODO: random pick 1 fold
for (train_idx, test_idx) in itertools.islice(kf.split(dataset), random_pick_fold, random_pick_fold+1):
    # print('------------fold no---------{}----------------------'.format(fold))
    print('------------fold no---------{}----------------------'.format(random_pick_fold))
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)
    print("train_idx:", train_idx[0], "~", train_idx[-1], " test_idx:", test_idx[0], "~", test_idx[-1])

    train_loader = DataLoader(
                        dataset,
                        num_workers=0,
                        pin_memory=False,
                        drop_last=False,
                        batch_size=batch_size_define, sampler=train_subsampler) #bs=40:4.49G, bs=128:14.65G

    val_loader = DataLoader(
                        dataset,
                        num_workers=0,
                        pin_memory=False,
                        drop_last=False,
                        batch_size=batch_size_define, sampler=val_subsampler)
    
    model = LSTM1(output_dim, input_size, hidden_size, num_layers, seq_len).to(device) #our lstm class
    model.apply(reset_weights)
    
    
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        print(f'Starting epoch {epoch+1}')
        losses = []
        loss = 0
        mean_loss = 0
        for i, (audio_batch, motion_batch) in enumerate(train_loader):
            model.train()
            
            audio_batch = audio_batch.to(device).float()
            motion_batch = motion_batch.to(device).float()

            optimizer.zero_grad()
            output = model(audio_batch) #midi_batch
            # print("train inputs.shape:", midi_batch.shape, torch.isnan(midi_batch).any())
            # # print(motion_batch)
            # print("train targets.shape:", motion_batch.shape, torch.isnan(motion_batch).any())
            # print("train outputs.shape:", output.shape, torch.isnan(output).any())

            # loss =  F.mse_loss(output, motion_batch)
            # loss = customized_mse_loss(output.cpu(), motion_batch.cpu())
            loss = customized_mse_loss(output, motion_batch)
            
            losses.append(loss.cpu().item()) #.cpu().item()
            all_loss_list.append(loss.cpu().item()) #.cpu().item()
            loss.backward()

            optimizer.step()

            # print(f"Epoch {epoch}, batch {i}: loss = {loss.cpu().item():.6f}") #.cpu().item()

        # print(losses, sum(losses), len(losses))
        mean_loss = sum(losses)/len(losses)
        # correct, total = 0, 0
        valid_running_loss = 0.0
        counter = 0
        with torch.no_grad():
            for i, (audio_test, motion_test) in enumerate(val_loader):
                
                inputs = audio_test.to(device).float()
                targets = motion_test.to(device).float()

                outputs = model(inputs)
                # print("val inputs.shape:", inputs.shape)
                # print("val targets.shape:", targets.shape)
                # print("val outputs.shape:", outputs.shape)

                val_loss =  customized_mse_loss(outputs, targets)
                valid_running_loss += val_loss.cpu().item() #.cpu().item()
                counter += 1
            
            epoch_val_loss = valid_running_loss / counter
            # print(f"Epoch {epoch}: val_loss = {epoch_val_loss:.6f}") #.cpu().item()

        avg_loss_list.append(mean_loss) #.cpu().item()
        val_loss_per_epoch_list.append(epoch_val_loss) #.cpu().item()

        cross_valid_results[0] = epoch_val_loss
        
        loc_dt = datetime.datetime.today()
        loc_dt_format = loc_dt.strftime("%Y-%m-%d_%H-%M-%S")
        if (epoch+1)%100 == 0:
            torch.save({
                'epoch':epoch,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict':optimizer.state_dict(),
                'loss':loss
            },  "./model_save/[audio_with_anno][total"+str(num_epochs)+ "_hs" + str(hidden_size) +"]LSTM_save_epoch_" + str(epoch)+ "_"+ str(loc_dt_format) + "_avg_loss_" + str(mean_loss) +".tar")

    # Print fold results
    print(f'K-FOLD CROSS VALIDATION RESULTS FOR {1} FOLDS')
    print('--------------------------------')
    sum_loss = 0.0
    for key, value in cross_valid_results.items():
        print(f'Fold loss {key}: {value}')
        sum_loss += value
    print(f'Average vaildation new loss: {sum_loss/len(cross_valid_results.items())}')
    
    # validation result save
    evaluate_lstm_cross(model, split_count)
    
    split_count += 1

11
random:  6
------------fold no---------6----------------------
train_idx: 0 ~ 10999  test_idx: 6000 ~ 6999


Starting epoch 1
Starting epoch 2
Starting epoch 3
Starting epoch 4
Starting epoch 5
Starting epoch 6
Starting epoch 7
Starting epoch 8
Starting epoch 9
Starting epoch 10
Starting epoch 11
Starting epoch 12
Starting epoch 13
Starting epoch 14
Starting epoch 15
Starting epoch 16
Starting epoch 17
Starting epoch 18
Starting epoch 19
Starting epoch 20
Starting epoch 21
Starting epoch 22
Starting epoch 23
Starting epoch 24
Starting epoch 25
Starting epoch 26
Starting epoch 27
Starting epoch 28
Starting epoch 29
Starting epoch 30
Starting epoch 31
Starting epoch 32
Starting epoch 33
Starting epoch 34
Starting epoch 35
Starting epoch 36
Starting epoch 37
Starting epoch 38
Starting epoch 39
Starting epoch 40
Starting epoch 41
Starting epoch 42
Starting epoch 43
Starting epoch 44
Starting epoch 45
Starting epoch 46
Starting epoch 47
Starting epoch 48
Starting epoch 49
Starting epoch 50
Starting epoch 51
Starting epoch 52
Starting epoch 53
Starting epoch 54
Starting epoch 55
Starting epoch 56
S

In [12]:
print(val_data_read[split_count])

['preprocessed_data_save_cross_aud/audio/vio01_Elgar_S1_T1_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Elgar_S1_T2_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Flower_S1_T1_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Flower_S1_T2_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mend_S1_T1_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mend_S1_T2_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mozart1_S1_T1_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mozart1_S1_T2_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mozart2_S1_T1_audio_data.pkl'
 'preprocessed_data_save_cross_aud/audio/vio01_Mozart2_S1_T2_audio_data.pkl']


In [13]:
print(loc_dt_format)
print(avg_loss_list)

2023-08-18_00-02-01
[0.022574301759898663, 0.021180258623510598, 0.020839990484714507, 0.020409348321706055, 0.020016615065932275, 0.0198870617069304, 0.019450549481064082, 0.019248797840625047, 0.019049027463048696, 0.018804145911335945, 0.018564312371611595, 0.01836824054569006, 0.018364610700309276, 0.018073319456726314, 0.01807970843911171, 0.01788466232419014, 0.01761753334030509, 0.01764287444502115, 0.01759130107164383, 0.017453278943151237, 0.01741305451244116, 0.017504273404926062, 0.01743151963725686, 0.017119019469618798, 0.01713726275190711, 0.017259640707820652, 0.01719920119345188, 0.01699799572825432, 0.017544726924598218, 0.017221683264523745, 0.01744527061879635, 0.01733361777961254, 0.01716467973664403, 0.016750868076086046, 0.016833669143915177, 0.016889546924829482, 0.016740041889995335, 0.016712463850528003, 0.01717237947806716, 0.016601429896056653, 0.016565701138973238, 0.016818792567402124, 0.01699169590473175, 0.016882372639328243, 0.0164340042129159, 0.0167584

In [14]:
print(val_loss_per_epoch_list)

[0.021298756062984467, 0.020573857106268405, 0.02026856142282486, 0.020617131255567075, 0.020115891113877297, 0.01919410962611437, 0.01929555121809244, 0.019033971309661866, 0.019028468400239945, 0.01865253648161888, 0.018609327659010886, 0.01811175514012575, 0.019116331584751605, 0.017741454474627973, 0.01824856188893318, 0.01761901331692934, 0.017547306597232817, 0.017348801307380198, 0.017990633338689804, 0.01707701237499714, 0.018065265774726866, 0.017024904541671278, 0.017062797397375106, 0.017436405465006827, 0.016586425423622132, 0.016738970212638378, 0.016700909212231635, 0.01708371753245592, 0.017221762388944625, 0.016514594592154028, 0.017774952247738837, 0.017545716650784016, 0.017161910980939867, 0.016487753115594386, 0.016614845924079417, 0.017591939598321914, 0.01684808924794197, 0.016310379542410373, 0.016558199487626553, 0.01675050626695156, 0.016746047526597977, 0.017143569014966487, 0.017348543509840966, 0.016665008388459683, 0.015998411938548087, 0.016436675876379012

In [15]:
# val_time_loss_list
# val_dim_loss_list
# val_mse_loss_list
plt.cla()
plt.clf()
print(len(val_time_loss_list))
val_time_loss_list_dataframe = pd.DataFrame(val_time_loss_list)
plt.plot(np.array(val_time_loss_list_dataframe.index), np.array(val_time_loss_list_dataframe[0]))
plt.savefig("avg_time_loss_training.jpg")
plt.show()

plt.cla()
plt.clf()
print(len(val_dim_loss_list))
val_dim_loss_list_dataframe = pd.DataFrame(val_dim_loss_list)
plt.plot(np.array(val_dim_loss_list_dataframe.index), np.array(val_dim_loss_list_dataframe[0]))
plt.savefig("avg_dim_loss_training.jpg")
plt.show()

plt.cla()
plt.clf()
print(len(val_mse_loss_list))
val_mse_loss_list_dataframe = pd.DataFrame(val_mse_loss_list)
plt.plot(np.array(val_mse_loss_list_dataframe.index), np.array(val_mse_loss_list_dataframe[0]))
plt.savefig("avg_mse_loss_training.jpg")
plt.show()

137500
137500
137500


In [16]:
plt.cla()
plt.clf()

In [17]:
print(len(avg_loss_list))
avg_loss_list_dataframe = pd.DataFrame(avg_loss_list)

100


In [18]:
avg_loss_list_dataframe

Unnamed: 0,0
0,0.022574
1,0.021180
2,0.020840
3,0.020409
4,0.020017
...,...
95,0.016328
96,0.016284
97,0.016791
98,0.016180


In [19]:
plt.plot(np.array(avg_loss_list_dataframe.index), np.array(avg_loss_list_dataframe[0]))
plt.savefig("avg_loss_training.jpg")
plt.show()

In [20]:
plt.cla()
plt.clf()

In [21]:
loss_list_dataframe = pd.DataFrame(all_loss_list)

In [22]:
plt.plot(np.array(loss_list_dataframe.index), np.array(loss_list_dataframe[0]))
plt.savefig("training_loss.jpg")
plt.show()

In [23]:
plt.cla()
plt.clf()

In [24]:
val_loss_per_epoch_list_dataframe = pd.DataFrame(val_loss_per_epoch_list)

In [25]:
plt.plot(np.array(val_loss_per_epoch_list_dataframe.index), np.array(val_loss_per_epoch_list_dataframe[0]))
plt.savefig("training_val_loss.jpg")
plt.show()

In [26]:
def predict(model, input, device):
    model.eval()
    with torch.no_grad():
        input = torch.as_tensor(input).to(torch.float32).to(device)
        # print(target.shape)
        # target = torch.as_tensor(target).to(torch.float32).to(device)
        # TODO: target should be <sos>, should not random
        outputs = model(input)
        return outputs.cpu().numpy()

In [27]:
def read_midi(filename, specific_fps):
    # Load the MIDI file
    midi_data = pretty_midi.PrettyMIDI(filename)

    piano_roll = midi_data.get_piano_roll(fs=specific_fps)  # 40fps #250fps
    piano_roll[piano_roll > 0] = 1

    return piano_roll

In [28]:
def audio_preprocess(audio_path, specific_fps):
    n_fft = 4096
    hop = int(44000/specific_fps)  # 1102.5 -> 40fps #882 -> 50fps
    y, sr = librosa.load(audio_path, sr=44000)  # 44000 for divide 40
    print("y.shape", y.shape)
    print("sample rate: ", sr)
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop, n_mfcc=13)
    y = np.where(y == 0, 1e-10, y)
    energy = np.log(librosa.feature.rms(
        y=y, frame_length=n_fft, hop_length=hop, center=True))
    mfcc_energy = np.vstack((mfcc, energy))
    mfcc_delta = librosa.feature.delta(mfcc_energy)

    sgram = librosa.stft(y, n_fft=n_fft, hop_length=hop)
    sgram_mag, _ = librosa.magphase(sgram)
    mel_scale_sgram = librosa.feature.melspectrogram(
        S=sgram_mag, sr=sr)

    print("mfcc_energy", mfcc_energy.shape)
    print("mfcc_delta", mfcc_delta.shape)
    print("mel_scale_sgram", mel_scale_sgram.shape)

    aud = np.vstack((mfcc_energy, mfcc_delta, mel_scale_sgram)).T

    print("hop:", hop)
    print("aud:", aud.shape)
    return aud

In [29]:
test_datapath = "./BWV1001/"
change_fps = 40
test_audio_path_list = glob.glob(test_datapath + "*.wav")
test_data_list = []
test_music_list = []
for test_audio in test_audio_path_list:
    str_name = test_audio
    print("str_name:", str_name)
    filename = str_name.split('/')[2]
    filecode = filename.split('.')[0]
    print("filecode: ",filecode)
    test_music_list.append(filecode)
    
    print(test_audio)
    # read_piano_roll = read_midi(test_midi, change_fps)
    read_audio = audio_preprocess(test_audio, change_fps)
    # read_audio_transpose = read_audio
    print(read_audio.shape)
    test_audio_len = read_audio.shape[0]
    test_data_list.append(read_audio)

str_name: ./BWV1001/vs1-1ada.wav
filecode:  vs1-1ada
./BWV1001/vs1-1ada.wav


y.shape (8988979,)
sample rate:  44000
mfcc_energy (14, 8172)
mfcc_delta (14, 8172)
mel_scale_sgram (128, 8172)
hop: 1100
aud: (8172, 156)
(8172, 156)
str_name: ./BWV1001/vs1-3sic.wav
filecode:  vs1-3sic
./BWV1001/vs1-3sic.wav
y.shape (7687297,)
sample rate:  44000
mfcc_energy (14, 6989)
mfcc_delta (14, 6989)
mel_scale_sgram (128, 6989)
hop: 1100
aud: (6989, 156)
(6989, 156)
str_name: ./BWV1001/vs1-2fug.wav
filecode:  vs1-2fug
./BWV1001/vs1-2fug.wav
y.shape (12687134,)
sample rate:  44000
mfcc_energy (14, 11534)
mfcc_delta (14, 11534)
mel_scale_sgram (128, 11534)
hop: 1100
aud: (11534, 156)
(11534, 156)
str_name: ./BWV1001/vs1-4prs.wav
filecode:  vs1-4prs
./BWV1001/vs1-4prs.wav
y.shape (8686945,)
sample rate:  44000
mfcc_energy (14, 7898)
mfcc_delta (14, 7898)
mel_scale_sgram (128, 7898)
hop: 1100
aud: (7898, 156)
(7898, 156)


In [30]:
def column(matrix, i):
    return [row[i] for row in matrix]

def test_render_animation(fps, output, azim, prediction, ground_truth=None):
    prediction_array = np.asarray(prediction)
    print(prediction_array.size)
    limit = len(prediction_array)
    print("limit", limit)
    size = 6#6
    fps = 40

    # Skeleton layout
    parents = [[0, 1], [1, 3], [3, 2], [0, 2],#head
                [8, 6], [6, 13], [13, 4], [4, 8],#shoulder
                [6, 4], [4, 5], [5, 7], [7, 6],#Upper torso
                [8, 18], [8, 20], [13, 21], [13, 19],
                [5, 20], [5, 21], [7, 18], [7, 19],
                [18, 19], [19, 21], [21, 20], [20, 18], #waist
                [18, 22], [20, 22], [22, 23], [22, 25], [23, 25], [24,23], [24, 25],  #right lag
                [21, 26], [19, 26], [26, 27], [26, 29], [27, 29], [28, 27], [28, 29], #left lag
                [8, 9], [9, 11], [9, 10], [10, 11], [10, 12], [9, 12], [11, 12], #right hand
                [13, 14], [14, 16], [14, 15], [16, 15], [14, 17], [16, 17], [15, 17], #left hand
                [31, 33], [30, 32], [30, 31], [32, 33], [31, 32], [30, 33] #instrument
                        ]
    # joints_right = [1, 2, 12, 13, 14]

    prediction_array[:, :, 2] += 0.1 #[:, :, 2]
    if ground_truth is not None:
        ground_truth[:, :, 2] += 0.1
        poses = {'Prediction': prediction_array,
                 'Ground_truth': ground_truth}
    else:
        poses = {'Prediction': prediction_array}
    

    fig = plt.figure()#(figsize=(size*len(poses), size))
    # ax_3d = []
    # lines_3d = []
    radius = 1#14 #3.7#
    # print(poses)
    for index, (title, data) in enumerate(poses.items()):
        ax = fig.add_subplot(1, len(poses), index + 1, projection='3d')
        ax.clear()
        print(data)
        ims = [] #每一 frame 都存
        for frame_index, each_frame in enumerate(data):
            # print("each_frame")
            # print(each_frame)
            ax.view_init(elev=15., azim=azim)
            ax.set_xlim3d([-radius/2, radius/2])
            ax.set_zlim3d([0, radius])
            ax.set_ylim3d([-radius/2, radius/2])
            ax.set_aspect('auto') #ax.set_aspect('equal')

            # print(title)
            points = ax.scatter(column(each_frame[:30], 0), column(each_frame[:30], 1), column(each_frame[:30], 2), cmap='jet', marker='o', label='body joint', color = 'black')
            points_2 = ax.scatter(column(each_frame[30:32], 0), column(each_frame[30:32], 1), column(each_frame[30:32], 2), cmap='jet', marker='o', label='body joint', color = 'blue')
            points_3 = ax.scatter(column(each_frame[32:34], 0), column(each_frame[32:34], 1), column(each_frame[32:34], 2), cmap='jet', marker='o', label='body joint', color = 'red')
            
            # ax.scatter(column(each_frame, 0), column(each_frame, 1), column(each_frame, 2), cmap='jet', marker='o', label='body joint')
            # ax.legend()
            # print("+++")
            
            parents = [[0, 1], [1, 3], [3, 2], [0, 2],#head
                        [8, 6], [6, 13], [13, 4], [4, 8],#shoulder
                        [6, 4], [4, 5], [5, 7], [7, 6],#Upper torso
                        [8, 18], [8, 20], [13, 21], [13, 19],
                        [5, 20], [5, 21], [7, 18], [7, 19],
                        [18, 19], [19, 21], [21, 20], [20, 18], #waist
                        [18, 22], [20, 22], [22, 23], [22, 25], [23, 25], [24,23], [24, 25],  #right lag
                        [21, 26], [19, 26], [26, 27], [26, 29], [27, 29], [28, 27], [28, 29], #left lag
                        [8, 9], [9, 11], [9, 10], [10, 11], [10, 12], [9, 12], [11, 12], #right hand
                        [13, 14], [14, 16], [14, 15], [16, 15], [14, 17], [16, 17], [15, 17], #left hand
                        [30, 31], [32, 33],  #instrument
                        # [31, 33], [30, 32], [30, 31], [32, 33], [31, 32], [30, 33] #instrument
                        ]
            lines = []
            # draw line
            
            # lines = [ax.plot([each_frame[vs][0], each_frame[ve][0]],
            #                  [each_frame[vs][1], each_frame[ve][1]],
            #                  [each_frame[vs][2], each_frame[ve][2]]) for (vs, ve) in parents]
            line_num = len(parents)
            for idx, each_line in enumerate(parents):
                vec_start = each_frame[each_line[0]]
                vec_end = each_frame[each_line[1]]
                # print(vec_start)
                # print(vec_end)
                line_color = "black"
                if idx == line_num-2:
                    line_color = "blue"
                if idx == line_num-1:
                    line_color = "red"
                # ax.plot([vec_start[0], vec_end[0]], [vec_start[1], vec_end[1]], [vec_start[2], vec_end[2]])
                
                temp, = ax.plot([vec_start[0], vec_end[0]], [vec_start[1], vec_end[1]], [vec_start[2], vec_end[2]], color=line_color)
                lines.append(temp)

            # ax.figure.savefig('./test_pic/pic' + str(frame_index) + '.png', dpi=100, bbox_inches = 'tight')

            # ims.append([points])
            # image_frame = [points].extend(lines)
            ims.append([points]+[points_2]+[points_3]+lines) #TODO: try extend

            # plt.cla()
            # print("+++")

    anim = matplotlib.animation.ArtistAnimation(fig, ims, interval=1000/fps)

    if output.endswith('.mp4'):
        FFwriter = matplotlib.animation.FFMpegWriter(fps=fps, extra_args=['-vcodec', 'libx264'])
        anim.save(output, writer=FFwriter)
    elif output.endswith('.gif'):
        anim.save(output, fps=fps, dpi=100, writer='imagemagick')
    else:
        raise ValueError('Unsupported output format (only .mp4 and .gif are supported)')

In [31]:
def plot(audio_path, plot_path, prediction, sample_time, fps, name=""): #audio_path, plot_path, 
    # render_animation(fps, output='new_temp.mp4', azim=75, prediction=prediction)
    test_render_animation(fps, output='new_temp_' + name + '.mp4', azim=75, prediction=prediction)

    # # #merge with wav
    input_video = ffmpeg.input('new_temp_' + name + '.mp4')
    fluid_syn = FluidSynth()
    fluid_syn.midi_to_audio(audio_path, './output' + name + '.wav')
    input_audio = ffmpeg.input('./output' + name + '.wav')
    # output = ffmpeg.output(video, audio, plot_path, vcodec='copy', acodec='aac', strict='experimental')
    ffmpeg.concat(input_video, input_audio, v=1, a=1).output(plot_path).run()
    # os.remove('new_temp_' + name + '.mp4')

In [32]:
model.eval()

full_prediction = pd.DataFrame()
num_count = 0
# read midi
# test_dataloader = get_dataloader(test_datapath, batch_size=1)
for test_batch in test_data_list:
    with torch.no_grad():
        # first_target = torch.zeros(test_batch.shape[0],112)
        # print(first_target.shape)
        test_input = test_batch[None, :]
        # test_target = first_target[None, :]
        print("test_input", test_input.shape)
        # print("test_target", test_target.shape)
        prediction = predict(model, test_input, device)
        
        # print(prediction.shape)
        
        prediction  = prediction[:, :, :102]
        print("prediction.shape", prediction.shape)
        
        # full_prediction.append(prediction)
        full_prediction = pd.DataFrame(prediction[0])
        print("full_prediction", full_prediction.shape)
        
        # prev_prediction = prediction[0][:-1][None, :]
        # print(prev_prediction.shape)
        
        Row_list_prediction =[]
        
        filecode = test_music_list[num_count]
    
        # Iterate over each row
        for index, rows in full_prediction.iterrows():
            #fill nan
            rows = rows.fillna(0)
            # Create list for the current row
            my_list = rows.values.tolist()
            # print(my_list)
            
            my_list_per3 = [my_list[i:i+3] for i in range(0, len(my_list), 3)]
            # append the list to the final list
            Row_list_prediction.append(my_list_per3)

        # print(len(Row_list_prediction), len(Row_list_prediction[0]),len(Row_list_prediction[0][0]))
        plot(test_datapath + test_music_list[num_count] + ".mid", "./video_" + filecode + "_test_predict.mp4", Row_list_prediction[:800], None, 40, filecode) #ow_list[0:900]
        # print("prediction.shape", prediction.shape)
        prediction_arr = np.array(Row_list_prediction)
        # formated_motion = prediction_format(full_prediction)
        # # # plot(formated_motion)
        # audio_path = test_music_list[num_count][0]
        # output_path = "test_output_" + filecode + ".mp4"
        # plot(formated_motion, audio_path, output_path, None, 10, filecode)
        num_count += 1

# model.train()

test_input (1, 8172, 156)
prediction.shape (1, 8172, 102)
full_prediction (8172, 102)
81600
limit 800
[[[ 0.07024748  0.13829876  1.08953187]
  [ 0.00320912  0.10406311  1.10779772]
  [ 0.10916515  0.06320114  1.08616326]
  ...
  [ 0.00599956  0.09332527  0.95309654]
  [-0.15569627  0.05236987  1.08529637]
  [ 0.08991085  0.2768048   0.79241619]]

 [[ 0.07078319  0.13274395  1.08814154]
  [ 0.01116113  0.09102634  1.10696743]
  [ 0.11871173  0.0642844   1.08729217]
  ...
  [ 0.01844802  0.08720319  0.95615945]
  [-0.13366577  0.03363332  1.06781969]
  [ 0.1143583   0.25749567  0.77368329]]

 [[ 0.06853681  0.14032692  1.09137473]
  [ 0.00629252  0.09998175  1.11210392]
  [ 0.11808024  0.06919579  1.09514359]
  ...
  [ 0.01587716  0.09266126  0.95754907]
  [-0.14618626  0.01775675  1.05718282]
  [ 0.1175925   0.27764222  0.80062178]]

 ...

 [[ 0.02798912  0.14885654  1.0945401 ]
  [-0.03520807  0.11679058  1.10939167]
  [ 0.07607566  0.08289334  1.10835061]
  ...
  [-0.01866166  0.1003

fluidsynth: panic: An error occurred while reading from stdin.


FluidSynth runtime version 2.1.1
Copyright (C) 2000-2020 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of E-mu Systems, Inc.

Rendering audio to file './outputvs1-1ada.wav'..


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/home/ilc/anaconda3/envs/sinica --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55

test_input (1, 6989, 156)
prediction.shape (1, 6989, 102)
full_prediction (6989, 102)
81600
limit 800
[[[ 6.64970726e-02  1.44645646e-01  1.09606991e+00]
  [ 4.73620743e-03  1.08050868e-01  1.11665711e+00]
  [ 1.12585939e-01  7.46731013e-02  1.09251008e+00]
  ...
  [ 8.57814401e-03  1.01595968e-01  9.52531612e-01]
  [-1.58674598e-01  5.84007502e-02  1.09456382e+00]
  [ 7.23238736e-02  2.53025889e-01  7.80236280e-01]]

 [[ 5.26834950e-02  1.71314448e-01  1.08458707e+00]
  [-6.92925602e-03  1.35885656e-01  1.11036346e+00]
  [ 9.99843925e-02  1.04730412e-01  1.09779129e+00]
  ...
  [-6.90653920e-03  1.11187860e-01  9.64657283e-01]
  [-1.67858273e-01  6.10936619e-02  1.10961053e+00]
  [ 6.03713654e-02  2.47765511e-01  7.66304588e-01]]

 [[ 7.02223182e-02  1.38029113e-01  1.09087596e+00]
  [ 7.52680004e-03  9.94205996e-02  1.11175761e+00]
  [ 1.18992642e-01  6.86630309e-02  1.09567831e+00]
  ...
  [ 1.96898729e-02  8.99086595e-02  9.61378670e-01]
  [-1.33494571e-01  3.29402536e-02  1.070893

fluidsynth: panic: An error occurred while reading from stdin.


FluidSynth runtime version 2.1.1
Copyright (C) 2000-2020 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of E-mu Systems, Inc.

Rendering audio to file './outputvs1-3sic.wav'..


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/home/ilc/anaconda3/envs/sinica --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55

test_input (1, 11534, 156)
prediction.shape (1, 11534, 102)
full_prediction (11534, 102)
81600
limit 800
[[[ 0.09275538  0.10923153  1.09746257]
  [ 0.01912919  0.07353182  1.10775874]
  [ 0.13199309  0.03939848  1.08087347]
  ...
  [ 0.02764072  0.08211461  0.94495521]
  [-0.03914905  0.16792393  1.07998822]
  [ 0.09403341  0.17051506  0.67577926]]

 [[ 0.0659423   0.1855974   1.08663604]
  [ 0.00332499  0.15101495  1.11131892]
  [ 0.1080711   0.12085122  1.08242402]
  ...
  [-0.01543661  0.1252639   0.97322849]
  [-0.09703211  0.20841785  1.11903915]
  [ 0.01574616  0.19986212  0.67343185]]

 [[ 0.04467207  0.19257402  1.09122226]
  [-0.01911042  0.16006075  1.11668108]
  [ 0.08896825  0.12563969  1.09408472]
  ...
  [-0.03348367  0.12808569  0.98153636]
  [-0.1299573   0.19854002  1.13870749]
  [-0.0048928   0.20966193  0.70726821]]

 ...

 [[ 0.07422368  0.12432808  1.0934864 ]
  [ 0.01115414  0.08570659  1.10973487]
  [ 0.1239897   0.0569272   1.09129939]
  ...
  [ 0.02848584  0.0

fluidsynth: panic: An error occurred while reading from stdin.


FluidSynth runtime version 2.1.1
Copyright (C) 2000-2020 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of E-mu Systems, Inc.

Rendering audio to file './outputvs1-2fug.wav'..


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/home/ilc/anaconda3/envs/sinica --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55

test_input (1, 7898, 156)
prediction.shape (1, 7898, 102)
full_prediction (7898, 102)
81600
limit 800
[[[ 7.01931790e-02  1.32361948e-01  1.09280787e+00]
  [ 9.21661872e-03  9.09868926e-02  1.10980246e+00]
  [ 1.21582948e-01  6.35351539e-02  1.08697019e+00]
  ...
  [ 1.42150410e-02  9.25019681e-02  9.60306919e-01]
  [-1.20856658e-01  9.49248821e-02  1.13092325e+00]
  [ 6.23735599e-02  2.04532966e-01  7.23854339e-01]]

 [[ 7.60195032e-02  1.33963510e-01  1.08706585e+00]
  [ 1.49858594e-02  9.52953398e-02  1.10783968e+00]
  [ 1.21314958e-01  6.52025118e-02  1.08245358e+00]
  ...
  [ 1.89691558e-02  8.89421552e-02  9.56016457e-01]
  [-1.01252452e-01  8.61087814e-02  1.08845983e+00]
  [ 1.01630740e-01  2.07988411e-01  7.07730925e-01]]

 [[ 7.77217001e-02  1.22461744e-01  1.08942018e+00]
  [ 1.42224971e-02  8.61178562e-02  1.11023972e+00]
  [ 1.24471441e-01  5.32897040e-02  1.09019098e+00]
  ...
  [ 2.62598656e-02  8.37194100e-02  9.55978072e-01]
  [-9.33095664e-02  8.91763419e-02  1.089244

fluidsynth: panic: An error occurred while reading from stdin.


FluidSynth runtime version 2.1.1
Copyright (C) 2000-2020 Peter Hanappe and others.
Distributed under the LGPL license.
SoundFont(R) is a registered trademark of E-mu Systems, Inc.

Rendering audio to file './outputvs1-4prs.wav'..


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/home/ilc/anaconda3/envs/sinica --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55

In [33]:
model.eval()

full_prediction = pd.DataFrame()
num_count = 0
# read midi
# test_dataloader = get_dataloader(test_datapath, batch_size=1)
for test_batch in test_data_list:
    with torch.no_grad():
        first_target = torch.zeros(test_batch.shape[0],115)
        print(first_target.shape)
        test_input = test_batch[None, :]
        test_target = first_target[None, :]
        print("test_input", test_input.shape)
        print("test_target", test_target.shape)
        prediction = predict(model, test_input, device)
        
        # print(prediction.shape)
        
        prediction  = prediction[:, :, :102]
        print("prediction.shape", prediction.shape)
        
        # full_prediction.append(prediction)
        full_prediction = pd.DataFrame(prediction[0])
        print("full_prediction", full_prediction.shape)
        
        # prev_prediction = prediction[0][:-1][None, :]
        # print(prev_prediction.shape)
        
        Row_list_prediction =[]
        
        filecode = test_music_list[num_count]
    
        # Iterate over each row
        for index, rows in full_prediction.iterrows():
            #fill nan
            rows = rows.fillna(0)
            # Create list for the current row
            my_list = rows.values.tolist()
            # print(my_list)
            
            my_list_per3 = [my_list[i:i+3] for i in range(0, len(my_list), 3)]
            # append the list to the final list
            Row_list_prediction.append(my_list_per3)

        prediction_arr = np.array(Row_list_prediction)
        if not os.path.exists('./output_prediction/[audio_with_anno]'+str(num_layers)+'LSTM_hidden'+str(hidden_size)+'_'+str(num_epochs)+'epoch/'):
            os.makedirs('./output_prediction/[audio_with_anno]'+str(num_layers)+'LSTM_hidden'+str(hidden_size)+'_'+str(num_epochs)+'epoch/')
        audio_data_output = open('./output_prediction/[audio_with_anno]'+str(num_layers)+'LSTM_hidden'+str(hidden_size)+'_'+str(num_epochs)+'epoch/prediction_'+
                                filecode +'.pkl', 'wb')
        pickle.dump(prediction_arr, audio_data_output)
        audio_data_output.close()
        
        num_count += 1

# model.train()

torch.Size([8172, 115])
test_input (1, 8172, 156)
test_target torch.Size([1, 8172, 115])
prediction.shape (1, 8172, 102)
full_prediction (8172, 102)
torch.Size([6989, 115])
test_input (1, 6989, 156)
test_target torch.Size([1, 6989, 115])
prediction.shape (1, 6989, 102)
full_prediction (6989, 102)
torch.Size([11534, 115])
test_input (1, 11534, 156)
test_target torch.Size([1, 11534, 115])
prediction.shape (1, 11534, 102)
full_prediction (11534, 102)
torch.Size([7898, 115])
test_input (1, 7898, 156)
test_target torch.Size([1, 7898, 115])
prediction.shape (1, 7898, 102)
full_prediction (7898, 102)


In [34]:
# final_val_loss = evaluate_lstm(model, val_dataloader)

In [35]:
# print(final_val_loss)