<a href="https://colab.research.google.com/github/reshalfahsi/AI-Cover-Song/blob/master/AICoverSong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## AI Cover Song

Run every cell from top to bottom: **Runtime → Run all** (Ctrl + F9).

In [None]:
#@title Prerequisite
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


# Install the packages and command-line tools used by the later cells
# (pydub, yt_dlp, demucs, so-vits-svc-fork) and upgrade pip, wheel and ipython.
# NOTE(review): `pip install ffmpeg` installs the PyPI distribution named
# "ffmpeg"; confirm that this is the package the later `import ffmpeg` expects.
!pip install pydub
!pip install yt_dlp
!pip install ffmpeg
!python3 -m pip install -U demucs
!python -m pip install -U pip wheel
%pip install -U ipython
%pip install -U so-vits-svc-fork


# Reset the Drive folder used by the later cells: create it if missing, delete
# it recursively, then create it again empty.
# NOTE(review): because of the `rm -rf`, anything already stored in
# drive/MyDrive/so-vits-svc-fork is removed every time this cell runs. The
# paths are relative, so they resolve against the current working directory
# (presumably /content - confirm).
!mkdir -p drive/MyDrive/so-vits-svc-fork
!rm -rf drive/MyDrive/so-vits-svc-fork
!mkdir drive/MyDrive/so-vits-svc-fork


# Working folders: downloaded song audio, raw speaker clips, and the dataset.
!mkdir -p youtubeaudio
!mkdir -p dataset_raw
!mkdir -p dataset

In [None]:
#@title Download Youtube WAV
from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import sys


def download_from_url(url, output_dir):
    """Download the best available audio stream of ``url`` and extract it as WAV.

    Parameters
    ----------
    url : str
        Address of the video to download.
    output_dir : str
        Despite its name, this value is handed to yt-dlp as the output
        template (``outtmpl``), i.e. it is the target path of the download
        rather than a directory.
    """
    # Post-processing step that asks yt-dlp to extract the audio as WAV.
    wav_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }
    options = dict(
        format='bestaudio/best',
        postprocessors=[wav_extractor],
        outtmpl=output_dir,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

In [None]:
#@title Separate Vocal and Instrument/Noise using Demucs
import subprocess


def separate_vocal_and_instrument(audio_input):
    """Run the ``demucs`` command in two-stem (vocals) mode on one audio file.

    The captured standard output of the command is printed once it finishes.
    A non-zero exit status is reported with an extra printed line instead of
    being silently ignored; no exception is raised for it.

    Parameters
    ----------
    audio_input : str or path-like
        Path of the audio file to separate. It is passed to the command as a
        single argument, so paths containing whitespace are handled correctly.
    """
    # Build the argument vector directly: splitting a formatted command string
    # on whitespace would break any path that contains a space.
    command = ["demucs", "--two-stems=vocals", str(audio_input)]
    result = subprocess.run(command, stdout=subprocess.PIPE)
    print(result.stdout.decode())
    if result.returncode != 0:
        print(f"demucs exited with status {result.returncode}")

In [None]:
#@title Song

# URL of the song to cover and the folder its audio is downloaded into.
SONG_URL = "https://www.youtube.com/watch?v=cQGfLDnmWS8&pp=ygULYXNtYWxpYnJhc2k%3D" # @param {type:"string"}
SONG_PATH = 'youtubeaudio'
# Download the song's audio. The target is given without an extension; the
# separation call below expects the result at /content/youtubeaudio/audio.wav.
download_from_url(
    url=SONG_URL,
    output_dir=f"{SONG_PATH}/audio",
    )
# Change into the song folder before running the separation, then return to
# /content. Later cells read the stems from
# /content/youtubeaudio/separated/htdemucs/audio/.
%cd -q $SONG_PATH
separate_vocal_and_instrument(audio_input=f"/content/{SONG_PATH}/audio.wav")
%cd /content

In [None]:
#@title Split The Audio Dataset (Speaker) into Smaller Duration Before Training

# Speaker label (used as the sub-folder name under dataset_raw/) and the URL of
# the recording that supplies the speaker's voice.
SPEAKER_NAME = "ItsukiNakano" # @param {type:"string"}
DATASET_URL = "https://www.youtube.com/watch?v=oR7hJx7h0WQ" # @param{type:"string"}
DATASET_PATH = 'dataset'
# Download the speaker recording. The target is given without an extension; the
# separation call below expects the result at /content/dataset/audio.wav.
download_from_url(
    url=DATASET_URL,
    output_dir=f"{DATASET_PATH}/audio",
    )
# Change into the dataset folder before running the separation, then return to
# /content. The splitting code further down in this cell reads
# /content/dataset/separated/htdemucs/audio/vocals.wav.
%cd -q $DATASET_PATH
separate_vocal_and_instrument(audio_input=f"/content/{DATASET_PATH}/audio.wav")
%cd /content

# Folder (relative to the current directory) that receives the split clips.
DATASET_RAW = f'dataset_raw/{SPEAKER_NAME}'

%mkdir -p $DATASET_RAW


from scipy.io import wavfile
import os
import numpy as np
import argparse
from tqdm import tqdm
import json

from datetime import datetime, timedelta

# Utility functions


def GetTime(video_seconds):
    """Format a position in seconds as an ``HH:MM:SS.001`` timestamp string.

    A negative position yields the integer ``0`` rather than a string. For any
    other value the hour, minute and second fields are zero-padded to two
    digits and the fixed suffix ``.001`` is appended; the fractional part of
    the input is not reflected in the result.
    """
    # Guard clause: negative positions are reported as the integer 0.
    if video_seconds < 0:
        return 0

    moment = datetime(1, 1, 1) + timedelta(seconds=float(video_seconds))
    clock_fields = (moment.hour, moment.minute, moment.second)
    padded = [str(field).zfill(2) for field in clock_fields]
    return ":".join(padded) + str(".001")


def GetTotalTime(video_seconds):
    """Format a duration in seconds as ``H:M:S`` without zero padding."""
    moment = datetime(1, 1, 1) + timedelta(seconds=float(video_seconds))
    return ":".join(
        str(field) for field in (moment.hour, moment.minute, moment.second)
    )


def windows(signal, window_size, step_size):
    """Yield successive fixed-length slices of ``signal``.

    Windows start at offsets ``0, step_size, 2 * step_size, ...`` and each
    holds exactly ``window_size`` items. Iteration stops at the first window
    that would extend past the end of ``signal``; a window that ends exactly
    on the last item is complete and is yielded.

    Parameters
    ----------
    signal : sequence
        Any object supporting ``len()`` and slicing.
    window_size : int
        Number of items per window.
    step_size : int
        Distance between the starts of consecutive windows.

    Raises
    ------
    AttributeError
        If ``window_size`` or ``step_size`` is not exactly of type ``int``.
        As this is a generator, the error surfaces on first iteration.
    """
    if type(window_size) is not int:
        raise AttributeError("Window size must be an integer.")
    if type(step_size) is not int:
        raise AttributeError("Step size must be an integer.")
    signal_length = len(signal)
    for i_start in range(0, signal_length, step_size):
        i_end = i_start + window_size
        # Strictly greater: i_end == signal_length is still a full window.
        if i_end > signal_length:
            break
        yield signal[i_start:i_end]


def energy(samples):
    """Return the sum of the squared values of ``samples`` divided by ``len(samples)``.

    The sum runs over every element, while the divisor is the length of the
    first axis only.
    """
    squared = np.power(samples, 2.0)
    return squared.sum() / float(len(samples))


def rising_edges(binary_signal):
    """Yield each index at which ``binary_signal`` turns from falsy to truthy.

    The value before the first item is treated as falsy, so a truthy first
    item produces index 0.
    """
    was_high = 0
    for position, is_high in enumerate(binary_signal):
        if is_high and not was_high:
            yield position
        was_high = is_high


"""
Last Acceptable Values

min_silence_length = 0.3
silence_threshold = 1e-3
step_duration = 0.03/10

"""
# Change the arguments and the input file here
input_file = "/content/dataset/separated/htdemucs/audio/vocals.wav"
output_dir = f"/content/dataset_raw/{SPEAKER_NAME}"
min_silence_length = 0.6  # Length [seconds] of the analysis window; a split needs a quiet stretch at least this long.
silence_threshold = 1e-4  # The energy level (between 0.0 and 1.0) below which the signal is regarded as silent.
step_duration = (
    0.03 / 10
)  # The amount of time to step forward in the input file after calculating energy. Smaller value = slower, but more accurate silence detection. Larger value = faster, but might miss some split opportunities. Set to None to use (min_silence_length / 10.).


input_filename = input_file
window_duration = min_silence_length
if step_duration is None:
    step_duration = window_duration / 10.0

output_filename_prefix = os.path.splitext(os.path.basename(input_filename))[0]
dry_run = False

# Make sure the destination exists before any clip or the JSON index is written.
os.makedirs(output_dir, exist_ok=True)

print(
    "Splitting {} where energy is below {}% for longer than {}s.".format(
        input_filename, silence_threshold * 100.0, window_duration
    )
)

# Read and split the file

sample_rate, samples = wavfile.read(filename=input_filename, mmap=True)

# Full-scale amplitude, used to normalise window energies into the 0..1 range.
# np.iinfo only accepts integer dtypes; floating-point WAV data is nominally
# within [-1.0, 1.0], so its full scale is 1.0.
if np.issubdtype(samples.dtype, np.floating):
    max_amplitude = 1.0
else:
    max_amplitude = np.iinfo(samples.dtype).max
print(max_amplitude)

max_energy = energy([max_amplitude])
print(max_energy)

window_size = int(window_duration * sample_rate)
step_size = int(step_duration * sample_rate)

signal_windows = windows(signal=samples, window_size=window_size, step_size=step_size)

# Everything below is a lazy generator pipeline; nothing is computed until
# cut_samples is built.
window_energy = (
    energy(w) / max_energy
    for w in tqdm(signal_windows, total=int(len(samples) / float(step_size)))
)

# Despite its name, each flag is True when the window is ABOVE the threshold
# (i.e. not silent), so a rising edge marks where sound resumes after silence.
window_silence = (e > silence_threshold for e in window_energy)

cut_times = (r * step_duration for r in rising_edges(window_silence))

# This is the step that takes long, since we force the generators to run.
print("Finding silences...")
cut_samples = [int(t * sample_rate) for t in cut_times]
# Sentinel end index for the last segment.
# NOTE(review): slicing with -1 as the stop excludes the very last sample, and
# any audio before the first cut is never written - confirm this is intended.
cut_samples.append(-1)

# (segment number, start sample, stop sample) for every pair of adjacent cuts.
cut_ranges = [
    (i, cut_samples[i], cut_samples[i + 1]) for i in range(len(cut_samples) - 1)
]

# Start/end timestamp strings per segment, keyed by the segment number.
video_sub = {
    str(i): [
        str(GetTime(((cut_samples[i]) / sample_rate))),
        str(GetTime(((cut_samples[i + 1]) / sample_rate))),
    ]
    for i in range(len(cut_samples) - 1)
}

for i, start, stop in tqdm(cut_ranges):
    output_file_path = "{}_{:03d}.wav".format(
        os.path.join(output_dir, output_filename_prefix), i
    )
    if not dry_run:
        print("Writing file {}".format(output_file_path))
        wavfile.write(
            filename=output_file_path, rate=sample_rate, data=samples[start:stop]
        )
    else:
        print("Not writing file {}".format(output_file_path))

# Build the path with os.path.join: a hard-coded backslash is not a separator
# on Linux/Colab and would create a file NEXT TO output_dir whose name contains
# a literal backslash.
json_path = os.path.join(output_dir, output_filename_prefix + ".json")
with open(json_path, "w") as output:
    json.dump(video_sub, output)


In [None]:
#@title Automatic preprocessing
# Run the `svc` command-line tool's `pre-resample` and `pre-config` steps, in
# that order, with their default options.
# NOTE(review): presumably these read dataset_raw/ and produce
# configs/44k/config.json (the file copied by the next cell) - confirm against
# the so-vits-svc-fork documentation.
!svc pre-resample
!svc pre-config

In [None]:
#@title Copy configs file
# Copy configs/44k/config.json into the Drive folder. Both paths are relative
# to the current working directory.
# NOTE(review): the inference cell reads its config from
# drive/MyDrive/so-vits-svc-fork/logs/44k/config.json, not from this copy -
# confirm which file is meant to be used.
!cp configs/44k/config.json drive/MyDrive/so-vits-svc-fork

In [None]:
# Value passed to `svc pre-hubert` through its -fm option (presumably the F0
# estimation method - confirm against the so-vits-svc-fork documentation).
F0_METHOD = "dio" #@param ["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]
!svc pre-hubert -fm {F0_METHOD}

In [None]:
#@title Train
# Load the TensorBoard notebook extension and point it at the same Drive folder
# that is passed to `svc train` as --model-path, then start training.
%load_ext tensorboard
%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k
!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k

In [None]:
#@title Inference
# AUDIO is written WITHOUT its ".wav" extension; the extension is appended in
# the `svc infer` command at the end of this cell.
from IPython.display import Audio

# Vocal stem of the downloaded song, plus the model checkpoint and config file
# handed to `svc infer` via -m and -c.
AUDIO = "/content/youtubeaudio/separated/htdemucs/audio/vocals"
MODEL = "/content/drive/MyDrive/so-vits-svc-fork/logs/44k/G_10000.pth" #@param {type:"string"}
CONFIG = "/content/drive/MyDrive/so-vits-svc-fork/logs/44k/config.json"
#@markdown Change According to Your Voice Tone. 12 = 1 Octave | -12 = -1 Octave
PITCH = 0 #@param {type:"integer"}

# The next cell expects the converted vocal at .../audio/vocals.out.wav.
!svc infer {AUDIO}.wav -c {CONFIG} -m {MODEL} -na -t {PITCH}

In [None]:
#@title Combine Vocal and Instrument (Song Cover)
from pydub import AudioSegment

# Converted vocal track and the instrumental stem of the original song.
VOCAL = "/content/youtubeaudio/separated/htdemucs/audio/vocals.out.wav"
INSTRUMENT = "/content/youtubeaudio/separated/htdemucs/audio/no_vocals.wav"

sound1 = AudioSegment.from_file(VOCAL)
sound2 = AudioSegment.from_file(INSTRUMENT)

# overlay() returns audio exactly as long as the segment it is called on, so
# use the longer track as the base; otherwise the tail of the longer track
# would be cut off. len() of an AudioSegment is its duration in milliseconds.
if len(sound1) >= len(sound2):
    combined = sound1.overlay(sound2)
else:
    combined = sound2.overlay(sound1)

combined.export("/content/FinalCover.wav", format='wav')
print("Saved to /content/FinalCover.wav")