In [None]:
import os
import glob
from mmsdk import mmdatasdk
import numpy
import pandas
import h5py
import tqdm
import skimage.io
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import librosa

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# --- Timing constants --------------------------------------------------------
fr = 29.97            # video frame rate; frame index / fr gives a timestamp in seconds
ft = 1 / fr           # duration of a single video frame, in seconds
print(ft)
sr = 11025            # sample rate the audio is resampled to when it is loaded
spf = round(ft * sr)  # audio samples per video frame, rounded to the nearest integer
print(spf)

# --- Dataset locations -------------------------------------------------------
profile = "obama"
root = "/home/santiago/Data/deep_puppetry/{}".format(profile)
out_dir = "/home/santiago/Data/deep_puppetry/csd/{}".format(profile)

# Every entry of `videos` is later used as a sub-directory of `root`
# (root/<video>/processed/..., root/<video>/AlignFilter/...), so keep only
# directories; a stray file in `root` would otherwise crash the cells below.
videos = sorted(
    entry for entry in os.listdir(root)
    if os.path.isdir(os.path.join(root, entry))
)
print(videos)

# The deploy cells write their .csd files into out_dir; create it up front so a
# long extraction run does not fail at the very end on a fresh machine.
os.makedirs(out_dir, exist_ok=True)

In [None]:
def read_openface(video, start, end):
    """Collect the OpenFace rows of ``video`` whose timestamp lies in ``[start, end]``.

    The table is read from ``<root>/<video>/processed/<video>.csv`` using
    ``", "`` as the field separator.

    Args:
        video: name of the video directory (also the CSV base name).
        start: lower bound of the time window, inclusive.
        end: upper bound of the time window, inclusive.

    Returns:
        ``(intervals, features)`` as numpy arrays. Each interval is
        ``[previous, timestamp]`` where ``previous`` is the timestamp of the
        previously kept row (``start`` for the first one). Each feature row
        holds the CSV values from the sixth column onwards.
    """
    csv_path = os.path.join(root, video, "processed/{}.csv".format(video))
    table = pandas.read_csv(csv_path, sep=", ")

    spans, rows = [], []
    previous = start
    for _, record in table.iterrows():
        stamp = float(record["timestamp"])
        if not (start <= stamp <= end):
            continue
        spans.append([previous, stamp])
        # Drop the first five columns; only the remaining values are features.
        rows.append(record.values[5:])
        previous = stamp
    return numpy.array(spans), numpy.array(rows)

In [None]:
def read_frames(video, start, end):
    """Load the aligned face images of ``video`` that fall inside ``[start, end]``.

    Images are the ``*.bmp`` files in ``<root>/<video>/processed/<video>_aligned``.
    The frame number is parsed from the six characters preceding the ``.bmp``
    extension and looked up in ``<root>/<video>/processed/<video>.csv`` to
    obtain its timestamp and confidence.

    Args:
        video: name of the video directory.
        start: lower bound of the time window, inclusive.
        end: upper bound of the time window, inclusive.

    Returns:
        ``(intervals, features)`` as numpy arrays. A frame is kept when its
        timestamp is inside the window and its confidence is at least 0.9.
        Each feature row is the flattened image.

    Raises:
        IndexError: if an image's frame number has no row in the CSV.
    """
    openface_df = pandas.read_csv(os.path.join(root, video, "processed/{}.csv".format(video)), sep=", ")

    # Build the frame -> (timestamp, confidence) table once instead of scanning
    # the whole DataFrame twice for every image file. setdefault keeps the
    # first row for a frame number, matching the previous `.iloc[0]` lookup.
    frame_info = {}
    for frame_id, stamp, conf in zip(openface_df["frame"], openface_df["timestamp"], openface_df["confidence"]):
        frame_info.setdefault(frame_id, (float(stamp), float(conf)))

    intervals = []
    features = []
    t = start
    pattern = os.path.join(root, video, "processed/{}_aligned/*.bmp".format(video))
    for file in sorted(glob.glob(pattern)):
        frame = int(file[-10:-4])
        try:
            timestamp, confidence = frame_info[frame]
        except KeyError:
            raise IndexError(
                "frame {} of video {!r} has no row in the OpenFace CSV".format(frame, video)
            ) from None
        if start <= timestamp <= end and confidence >= 0.9:
            img = skimage.io.imread(file)
            intervals.append([t, timestamp])
            features.append(img.flatten())
        # `t` advances on every image, kept or not, so a kept frame's interval
        # starts at the timestamp of the image immediately before it.
        # NOTE(review): read_openface only advances on kept rows -- confirm the
        # difference is intended.
        t = timestamp
    return numpy.array(intervals), numpy.array(features)

In [None]:
def read_frames10(video, start, end):
    """Load every third aligned face image of ``video`` inside a time window.

    ``start`` and ``end`` (seconds) are converted to frame numbers with the
    global frame rate ``fr``. An image from
    ``<root>/<video>/processed/<video>_aligned/*.bmp`` is kept when its frame
    number is within those bounds (inclusive) and divisible by 3.

    Returns:
        ``(intervals, features)`` as numpy arrays. Each interval is
        ``[previous, frame / fr]`` where ``previous`` is the timestamp of the
        previously kept frame (the rounded start for the first one). Each
        feature row is the flattened image.
    """
    first_frame = round(start * fr)
    last_frame = round(end * fr)

    spans, pixels = [], []
    previous = first_frame / fr
    pattern = os.path.join(root, video, "processed/{}_aligned/*.bmp".format(video))
    for path in sorted(glob.glob(pattern)):
        # The frame number is the six characters right before ".bmp".
        index = int(path[-10:-4])
        if index < first_frame or index > last_frame or index % 3 != 0:
            continue
        stamp = index / fr
        spans.append([previous, stamp])
        pixels.append(skimage.io.imread(path).flatten())
        previous = stamp
    return numpy.array(spans), numpy.array(pixels)

In [None]:
def read_words(video):
    """Read the word alignment of ``video``.

    The data comes from group ``video`` of
    ``<root>/<video>/AlignFilter/<video>_words.hdf5``.

    Returns:
        ``(intervals, features)``: the group's ``"intervals"`` and
        ``"features"`` datasets, fully loaded into numpy arrays.
    """
    path = os.path.join(root, video, "AlignFilter/{}_words.hdf5".format(video))
    # Open read-only and inside a `with` block: the file is only read here, and
    # the handle must not stay open once the data has been copied into memory.
    with h5py.File(path, "r") as handle:
        data = handle[video]
        intervals = numpy.array(list(data["intervals"]))
        features = numpy.array(list(data["features"]))
    return intervals, features

In [None]:
def read_phones(video):
    """Read the phone alignment of ``video``.

    The data comes from group ``video`` of
    ``<root>/<video>/AlignFilter/<video>_phones.hdf5``.

    Returns:
        ``(intervals, features)``: the group's ``"intervals"`` and
        ``"features"`` datasets, fully loaded into numpy arrays.
    """
    path = os.path.join(root, video, "AlignFilter/{}_phones.hdf5".format(video))
    # Open read-only and inside a `with` block: the file is only read here, and
    # the handle must not stay open once the data has been copied into memory.
    with h5py.File(path, "r") as handle:
        data = handle[video]
        intervals = numpy.array(list(data["intervals"]))
        features = numpy.array(list(data["features"]))
    return intervals, features

In [None]:
def read_spectrograms(video, start, end):
    """Slice the cached spectrogram of ``video`` to the window ``[start, end]``.

    Loads ``<root>/<video>/spectrogram.npy`` and walks over the rows of its
    transpose. Row ``i`` is stamped ``i * ft`` and kept when that stamp lies
    inside the window (inclusive on both ends).

    Returns:
        ``(intervals, features)`` as numpy arrays. Each interval is
        ``[previous, stamp]`` where ``previous`` is the stamp of the previously
        kept row (``start`` for the first one). Each feature row is the real
        parts followed by the imaginary parts of the row, flattened.
    """
    cached = numpy.load(os.path.join(root, video, "spectrogram.npy"))

    spans, rows = [], []
    previous = start
    # NOTE(review): stamps use i * ft, while the STFT that produced the file
    # presumably advances by a whole number of samples per column -- confirm
    # that the resulting drift over long videos is acceptable.
    for index, column in enumerate(cached.T):
        stamp = index * ft
        if stamp < start or stamp > end:
            continue
        parts = numpy.stack([column.real, column.imag])
        spans.append([previous, stamp])
        rows.append(parts.flatten())
        previous = stamp
    return numpy.array(spans), numpy.array(rows)

In [None]:
# def read_spectrograms(video, start, end):
#     openface_df = pandas.read_csv(os.path.join(root, video, "processed/{}.csv".format(video)), sep=", ")
#     intervals = []
#     features = []
#     t = start
#     for file in sorted(glob.glob(os.path.join(root, video, "spectrograms/*.npy"))):
#         frame = int(file[-9:-4])
#         timestamp = float(openface_df.loc[openface_df["frame"] == frame].iloc[0]["timestamp"])
#         if timestamp >= start and timestamp <= end:
#             arr = numpy.load(file)
#             intervals.append([t, timestamp])
#             features.append(arr.flatten())
#             t = timestamp
#     return numpy.array(intervals), numpy.array(features)

In [None]:
# Collect the word alignment of every video and deploy it as one
# computational sequence named "<profile>_words".
word_data = {}
for video in tqdm.tqdm(videos):
    word_intervals, word_features = read_words(video)
    word_data[video] = {"intervals": word_intervals, "features": word_features}
words = mmdatasdk.computational_sequence("{}_words".format(profile))
words.setData(word_data, root)
# Format only the file-name template. Formatting the joined path would also run
# str.format over out_dir and break on any brace it contains; this now matches
# the other deploy cells of this notebook.
words.deploy(os.path.join(out_dir, "{}_words.csd".format(profile)))

In [None]:
# Collect the phone alignment of every video and deploy it as one
# computational sequence named "<profile>_phones".
phone_data = {}
for video in tqdm.tqdm(videos):
    phone_intervals, phone_features = read_phones(video)
    phone_data[video] = {"intervals": phone_intervals, "features": phone_features}
phones = mmdatasdk.computational_sequence("{}_phones".format(profile))
phones.setData(phone_data, root)
# Format only the file-name template. Formatting the joined path would also run
# str.format over out_dir and break on any brace it contains; this now matches
# the other deploy cells of this notebook.
phones.deploy(os.path.join(out_dir, "{}_phones.csd".format(profile)))

In [None]:
# Extract the OpenFace features of every video, restricted to the time span
# covered by its phone alignment, and deploy them as "<profile>_openface".
openface_data = {}
for video in tqdm.tqdm(videos):
    # Read the phone intervals read-only and close the file right away; the
    # rows are copied into memory by list(), so nothing depends on the handle.
    with h5py.File(os.path.join(root, video, "AlignFilter/{}_phones.hdf5".format(video)), "r") as phones_file:
        phone_intervals = list(phones_file[video]["intervals"])
    start = phone_intervals[0][0]   # start of the first phone interval
    end = phone_intervals[-1][1]    # end of the last phone interval
    openface_intervals, openface_features = read_openface(video, start, end)
    openface_data[video] = {"intervals": openface_intervals, "features": openface_features}
openface = mmdatasdk.computational_sequence("{}_openface".format(profile))
openface.setData(openface_data, root)
openface.deploy(os.path.join(out_dir, "{}_openface.csd".format(profile)))

In [None]:
# Extract the aligned face frames of every video, restricted to the time span
# covered by its phone alignment, and deploy them as "<profile>_frames".
frame_data = {}
for video in tqdm.tqdm(videos):
    # Read the phone intervals read-only and close the file right away; the
    # rows are copied into memory by list(), so nothing depends on the handle.
    with h5py.File(os.path.join(root, video, "AlignFilter/{}_phones.hdf5".format(video)), "r") as phones_file:
        phone_intervals = list(phones_file[video]["intervals"])
    start = phone_intervals[0][0]   # start of the first phone interval
    end = phone_intervals[-1][1]    # end of the last phone interval
    frame_intervals, frame_features = read_frames(video, start, end)
    frame_data[video] = {"intervals": frame_intervals, "features": frame_features}
frames = mmdatasdk.computational_sequence("{}_frames".format(profile))
frames.setData(frame_data, root)
frames.deploy(os.path.join(out_dir, "{}_frames.csd".format(profile)))

In [None]:
# Extract every third aligned face frame of every video, restricted to the time
# span covered by its phone alignment, and deploy them as "<profile>_frames10".
frame10_data = {}
for video in tqdm.tqdm(videos):
    # Read the phone intervals read-only and close the file right away; the
    # rows are copied into memory by list(), so nothing depends on the handle.
    with h5py.File(os.path.join(root, video, "AlignFilter/{}_phones.hdf5".format(video)), "r") as phones_file:
        phone_intervals = list(phones_file[video]["intervals"])
    start = phone_intervals[0][0]   # start of the first phone interval
    end = phone_intervals[-1][1]    # end of the last phone interval
    frame10_intervals, frame10_features = read_frames10(video, start, end)
    frame10_data[video] = {"intervals": frame10_intervals, "features": frame10_features}
frames10 = mmdatasdk.computational_sequence("{}_frames10".format(profile))
frames10.setData(frame10_data, root)
frames10.deploy(os.path.join(out_dir, "{}_frames10.csd".format(profile)))

In [None]:
# Slice the cached spectrogram of every video to the time span covered by its
# phone alignment and deploy the result as "<profile>_spectrograms".
# NOTE: read_spectrograms loads <root>/<video>/spectrogram.npy, which is written
# by the "create spectrograms" cell further down this notebook -- run that cell
# first on a fresh dataset.
spectrogram_data = {}
for video in tqdm.tqdm(videos):
    # Read the phone intervals read-only and close the file right away; the
    # rows are copied into memory by list(), so nothing depends on the handle.
    with h5py.File(os.path.join(root, video, "AlignFilter/{}_phones.hdf5".format(video)), "r") as phones_file:
        phone_intervals = list(phones_file[video]["intervals"])
    start = phone_intervals[0][0]   # start of the first phone interval
    end = phone_intervals[-1][1]    # end of the last phone interval
    spectrogram_intervals, spectrogram_features = read_spectrograms(video, start, end)
    spectrogram_data[video] = {"intervals": spectrogram_intervals, "features": spectrogram_features}
spectrograms = mmdatasdk.computational_sequence("{}_spectrograms".format(profile))
spectrograms.setData(spectrogram_data, root)
spectrograms.deploy(os.path.join(out_dir, "{}_spectrograms.csd".format(profile)))

In [None]:
# Compute the STFT of every video's audio track and cache it next to the video
# as spectrogram.npy (the file read_spectrograms loads).
for video in tqdm.tqdm(videos):
    video_dir = os.path.join(root, video)
    # Load as mono float32, resampled to the configured rate `sr`.
    samples, rate = librosa.core.load(
        os.path.join(video_dir, "{}.wav".format(video)),
        sr=sr,
        mono=True,
        dtype=numpy.float32,
    )
    assert rate == sr
    # Each STFT segment spans two video frames' worth of samples (2 * spf).
    frequencies, times, spectrogram = signal.stft(samples, fs=rate, nperseg=spf*2)
    numpy.save(os.path.join(video_dir, "spectrogram.npy"), spectrogram)

In [None]:
# # create spectrograms
# for video in tqdm.tqdm(videos):
# #     d = os.path.join(root, video, "spectrograms")
# #     if not os.path.exists(d):
# #         os.mkdir(d)
# #     rate, samples = wavfile.read(os.path.join(root, video, "{}.wav".format(video)))
#     samples, rate = librosa.core.load(os.path.join(root, video, "{}.wav".format(video)), sr=sr, mono=True, dtype=numpy.float32)
#     assert rate == sr
# #     assert len(samples.shape) == 1
# #     openface_df = pandas.read_csv(os.path.join(root, video, "processed/{}.csv".format(video)), sep=", ")
# #     frames = []
#     spectrograms = []
#     remainder = len(samples) % spf
#     if remainder > 0:
#         samples = samples[:-remainder]
#     chunks = len(samples) // spf
#     for chunk in numpy.split(samples, chunks):
#         frequencies, times, spectrogram = signal.stft(chunk, fs=rate, nperseg=spf//4)
#         spectrogram = numpy.stack([spectrogram.real, spectrogram.imag])
#     #     assert spectrogram.shape == (2, 62, 8)
#         assert spectrogram.shape == (2, 47, 9)
#         spectrograms.append(spectrogram)
#     spectrograms = numpy.stack(spectrograms)
#     numpy.save(os.path.join(root, video, "spectrograms.npy"), spectrograms)
# #         print(spectrogram.shape)
# #     for i, row in openface_df.iterrows():
# #         t = float(row["timestamp"])
# #         f = int(row["frame"])
# #         frequencies, times, spectrogram = signal.stft(samples[round(rate*t):round(rate*(t+ft))], fs=rate, nperseg=nps)
# #         spectrogram = numpy.stack([spectrogram.real, spectrogram.imag])
# #         if spectrogram.shape == (2, 62, 8):
# #             frames.append(f)
# #             spectrograms.append(spectrogram)
# #         else:
# #             print("Skipped", video, f)
# #     spectrograms = numpy.stack(spectrograms)
# #     assert spectrograms.dtype == numpy.float32
# #     spectrograms = skimage.exposure.rescale_intensity(spectrograms)
# #     for f, spectrogram in zip(frames, spectrograms):
# #         numpy.save(os.path.join(d, "frame_{:05d}.npy".format(f)), spectrogram)
# #         skimage.io.imsave(os.path.join(d, "frame_{:05d}.tif".format(f)), spectrogram)

In [None]:
# END

In [None]:
# # test spectrograms
# for video in tqdm.tqdm(videos):
#     d = os.path.join(root, video, "spectrograms")
#     samples = []
#     for f in os.listdir(d):
#         f = os.path.join(d, f)
#         spectrogram = numpy.load(f)
#         Zxx = numpy.zeros_like(spectrogram[0], dtype=numpy.complex64)
#         Zxx.real = spectrogram[0]
#         Zxx.imag = spectrogram[1]
#         t, x = signal.istft(Zxx, fs=sr, nperseg=nps)
#         samples.append(x)
#     samples = numpy.concatenate(samples, axis=-1)
#     wavfile.write(os.path.join(d, "test.wav"), sr, samples)

In [None]:
# samples, rate = librosa.core.load("/home/santiago/Data/deep_puppetry/obama/0SaVqB0w718/0SaVqB0w718.wav", sr=sr, mono=True, dtype=numpy.float32)
# assert rate == sr
# assert len(samples.shape) == 1
# # openface_df = pandas.read_csv(os.path.join(root, "0SaVqB0w718", "processed/0SaVqB0w718.csv"), sep=", ")
# # frames = []
# spectrograms = []
# remainder = len(samples) % spf
# samples = samples[:-remainder]
# chunks = len(samples) // spf
# for i, chunk in enumerate(numpy.split(samples, chunks)):
#     frequencies, times, spectrogram = signal.stft(samples[i*spf:(i+1)*spf], fs=rate, nperseg=spf//4)
#     spectrogram = numpy.stack([spectrogram.real, spectrogram.imag])
# #     assert spectrogram.shape == (2, 62, 8)
#     spectrograms.append(spectrogram)
#     print(spectrogram.shape)
# #     if spectrogram.shape == (2, 62, 8):
# #         frames.append(f)
# #         spectrograms.append(spectrogram)
# #     else:
# #         print("Skipped", i)
# # for i, row in openface_df.iterrows():
# #     t = float(row["timestamp"])
# #     f = int(row["frame"])
# #     frequencies, times, spectrogram = signal.stft(samples[round(rate*t):round(rate*(t+ft))], fs=rate, nperseg=nps)
# #     spectrogram = numpy.stack([spectrogram.real, spectrogram.imag])
# # #     print(spectrogram)
# #     if spectrogram.shape == (2, 62, 8) and spectrogram.dtype == numpy.float32:
# #         frames.append(f)
# #         spectrograms.append(spectrogram)
# #     else:
# #         print("Skipped", f)

In [None]:
# recon = []
# for i, spectrogram in enumerate(spectrograms):
# #     print(spectrogram.dtype)
#     Zxx = numpy.zeros_like(spectrogram[0], dtype=numpy.complex64)
#     Zxx.real = spectrogram[0]
#     Zxx.imag = spectrogram[1]
#     t, x = signal.istft(Zxx, fs=rate, nperseg=spf//4)
# #     print(x.shape)
# #     print(x.dtype)
#     recon.append(x)
# #     samples.append(x.astype(numpy.float32))
# recon = numpy.concatenate(recon, axis=-1)
# print(recon.dtype)
# # IPython.display.Audio(data=samples, rate=sr)
# wavfile.write("test.wav", sr, recon)

In [None]:
# identity = "trump"
# dset = "phones"
# f = h5py.File("/home/santiago/Downloads/deep_puppetry/csd/{}/{}_{}.csd".format(identity, identity, dset), "r+")
# f["{}_{}".format(identity, dset)] = f["{}".format(identity)]
# del f["{}".format(identity)]
# f["{}_{}".format(identity, dset)]["metadata"]["root name"][...] = numpy.array(["{}_{}".format(identity, dset)], dtype=object)
# f.close()

In [None]:
# root_dir = "/home/santiago/Data/deep_puppetry/obama/"
# videos = dict.fromkeys(os.listdir(root_dir))
# for video in tqdm.tqdm(videos.keys()):
#     videos[video] = {}
#     openface = pandas.read_csv(os.path.join(root_dir, video, "processed/{}.csv".format(video)), sep=", ", index_col=False)
#     videos[video]["openface"] = openface[openface["frame"] % 3 == 0]
#     frames = sorted(glob.glob(os.path.join(root_dir, video, "processed/{}_aligned/*.bmp".format(video))))
#     videos[video]["frames"] = list(filter(lambda x: int(x[-10:-4]) in videos[video]["openface"]["frame"].values, frames))

In [None]:
# import matplotlib.pyplot as plt
# from scipy import signal
# from scipy.io import wavfile
# import librosa

In [None]:
# samples, rate = librosa.load("/home/santiago/Data/deep_puppetry/obama/0SaVqB0w718/0SaVqB0w718.wav", sr=sr)

In [None]:
# rate

In [None]:
# len(samples[round(rate*25.000):round(rate*25.033)])

In [None]:
# frequencies, times, spectrogram = signal.spectrogram(samples[round(rate*30):round(rate*30.033)], rate, nperseg=30)

In [None]:
# spectrogram.shape

In [None]:
# skimage.io.imshow(spectrogram)

In [None]:
# plt.pcolormesh(times, frequencies, spectrogram)
# plt.imshow(spectrogram)
# plt.ylabel('Frequency [Hz]')
# plt.xlabel('Time [sec]')
# plt.show()

In [None]:
# df = pandas.read_csv("/home/santiago/Data/deep_puppetry/obama/0SaVqB0w718/processed/0SaVqB0w718.csv", sep=", ")
# df

In [None]:
# for video in videos:
#     rate, samples = wavfile.read(os.path.join(root, video, "{}.wav".format(video)))
#     print(rate, len(samples))
#     frequencies, times, spectrogram = signal.spectrogram(samples[round(rate*200):round(rate*(200.0+ft))], rate, nperseg=50)
#     print(frequencies)
#     print(spectrogram.shape)
#     skimage.io.imshow(spectrogram)
#     break

In [None]:
# for i, row in df.iterrows():
#     print(row["frame"])

In [None]:
# import IPython