<div align='center'>
    <h1>Spectrogram Generator</h1>
</div>

- Use `resample_audio()` function to downsample audio samples.
- Use `remove_silence()` function to remove silent parts from audio. `librosa.effects.trim()` can be used too.
- If you don't need the above two functions, pass **orig_audio** directly to `librosa.stft()`, like this:

```
librosa.stft(orig_audio, n_fft=256)
```
- Also, try increasing or decreasing the **thresh_amp** parameter of the `remove_silence()` function and check whether the voice is getting clipped.
- If you run the code on Colab or Linux, replace the backslashes (`\`) in the paths with forward slashes (`/`) and remove the letter `r` written before the path string. For example:

```
ROOT_PATH = '/home/khadija/Desktop/Speech Enhancement'
```
- ***NOTE : Maintain the directory structure provided below to execute the program without any error.***
```
Speech Enhancement
├───Code
│   └───spectrogram_generator.ipynb
├───Test
│   ├───Clean
│   └───Noisy
├───Train
│   ├───Clean
│   └───Noisy
└───Valid
    ├───Clean
    └───Noisy
```

In [2]:
import os
import librosa
import numpy as np
import librosa.display
import matplotlib.pyplot as plt

**Define the root path that contains the directories Test, Train and Valid.**

In [None]:
ROOT_PATH = r'C:\Users\sabbir\Desktop\Speech Enhancement'

# One folder per dataset split, each located directly under ROOT_PATH
TRAIN_PATH, VALID_PATH, TEST_PATH = (
    os.path.join(ROOT_PATH, split_name) for split_name in ('Train', 'Valid', 'Test')
)

In [None]:
# Creating a root directory to save all images
SPECTROGRAM_PATH = os.path.join(ROOT_PATH, 'Generated Spectrograms')

# Directories in the root path (also fails early if ROOT_PATH itself is missing)
root_path_dirs = os.listdir(ROOT_PATH)

# Ask the filesystem whether the folder exists instead of comparing names from
# the listing: a name comparison is case-sensitive, so on a case-insensitive
# filesystem it can miss an existing folder and os.mkdir() would then raise.
if not os.path.isdir(SPECTROGRAM_PATH):
    os.mkdir(SPECTROGRAM_PATH)

In [None]:
# RESAMPLING AUDIO 
def resample_audio(orig_audio_clip, orig_sr, target_sr):
    '''
    Converts an audio clip from its original sampling rate to a new one.

        Parameters:
            orig_audio_clip (numpy.ndarray): Audio samples to convert
            orig_sr (int): Sampling rate of orig_audio_clip
            target_sr (int): Sampling rate wanted for the output

        Returns:
            (numpy.ndarray, int): The clip returned by librosa.resample(), paired with target_sr
    '''
    # The rate is handed back next to the samples so callers can unpack both at once
    return (
        librosa.resample(orig_audio_clip, orig_sr=orig_sr, target_sr=target_sr),
        target_sr,
    )

In [None]:
def remove_silence(audio_data, sampling_rate, thresh_amp=0.03):
    '''
    Removes silent parts from audio using maximum amplitude.

    The audio is scaled by its peak absolute amplitude, cut into consecutive
    25 ms frames, and every frame whose largest absolute sample does not exceed
    thresh_amp is dropped. Samples after the last complete frame are discarded.
    The returned samples are the peak-normalized ones, not the raw input values.

        Parameters:
            audio_data (numpy.ndarray): A 1-D audio sample as numpy array
            sampling_rate (int): Sampling rate of the audio sample
            thresh_amp (float): Threshold level (relative to the peak) to clip silent parts

        Returns:
            audio_without_silence (numpy.ndarray): Audio sample without silent parts
                (empty if the input is empty, entirely zero, or shorter than one frame)
    '''
    samples = np.asarray(audio_data, dtype=float)

    frame_duration = 0.025
    frame_size = int(np.round(frame_duration * sampling_rate))

    # Nothing can be framed from empty input or a zero-length frame
    if samples.size == 0 or frame_size < 1:
        return np.zeros(0)

    # Normalize by the peak *absolute* amplitude; the signed maximum would be
    # wrong (or even negative) when the loudest sample is a negative one.
    peak = np.max(np.abs(samples))
    if peak == 0:
        # An all-zero clip contains only silence
        return np.zeros(0)
    audio_normalized = samples / peak

    # Cut the signal into complete frames. Reshaping covers every frame from
    # the first to the last complete one, so none is left out of the check.
    number_of_frames = len(audio_normalized) // frame_size
    frames = audio_normalized[:number_of_frames * frame_size].reshape(number_of_frames, frame_size)

    # Remove silence based on maximum amplitude
    max_amplitude = np.max(np.abs(frames), axis=1)
    frames_without_silence = frames[max_amplitude > thresh_amp]
    audio_without_silence = frames_without_silence.flatten()

    return audio_without_silence

In [None]:
# DEFINING GLOBAL VARIABLES
TARGET_SR = 8000  # Target sampling rate

def generate_spectrogram(path):
    '''
    Generates and saves a spectrogram image for every audio file in the sub-directories of path.

    Each file is loaded at its native sampling rate, resampled to TARGET_SR,
    stripped of silent frames, transformed with an STFT and rendered in dB on a
    log-frequency axis. The image is saved as
    <SPECTROGRAM_PATH>/<last component of path>/<sub-directory>/<file name without extension>.png
    Output folders that already exist are reused, so the function can be re-run.

        Parameter:
            path (str): Path of the root directory
    '''
    n_fft = 256              # For fs=8000, n_fft=32ms
    hop_length = n_fft // 4  # The hop librosa.stft() uses by default, made explicit so the display matches

    # Take the last path component whether the path uses '\' or '/' separators
    root_dir_name = path.replace('\\', '/').rstrip('/').split('/')[-1]
    spectrogram_root_dir = os.path.join(SPECTROGRAM_PATH, root_dir_name)
    os.makedirs(spectrogram_root_dir, exist_ok=True)

    for sub_dir in os.listdir(path):
        source_sub_dir = os.path.join(path, sub_dir)
        if not os.path.isdir(source_sub_dir):
            # Stray files next to the class folders cannot be listed as directories
            continue

        spectrogram_sub_dir = os.path.join(spectrogram_root_dir, sub_dir)
        os.makedirs(spectrogram_sub_dir, exist_ok=True)

        for sample in os.listdir(source_sub_dir):
            sample_path = os.path.join(source_sub_dir, sample)
            # splitext removes only the final extension, so 'a.1.wav' and
            # 'a.2.wav' no longer collapse onto the same image name
            sample_name = os.path.splitext(sample)[0]
            img_name = os.path.join(spectrogram_sub_dir, sample_name) + '.png'
            orig_audio, orig_sr = librosa.load(sample_path, sr=None)

            # Resample audio to desired sampling rate
            resampled_audio, _ = resample_audio(orig_audio, orig_sr=orig_sr, target_sr=TARGET_SR)

            # Remove silence from audio
            audio_ws = remove_silence(resampled_audio, TARGET_SR)
            if audio_ws.size == 0:
                # There is nothing to transform; report it instead of failing mid-run
                print(f'Skipping {sample_path}: no audio left after silence removal')
                continue

            # Calculate STFT
            stft = librosa.stft(audio_ws, n_fft=n_fft, hop_length=hop_length)
            stft_db = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

            fig, ax = plt.subplots(figsize=(12, 8))
            try:
                # Turn off the ticks and axes labels
                ax.axis('off')
                # Pass the real sampling rate and hop: specshow otherwise assumes
                # sr=22050 and hop_length=512, which mis-scales the log-frequency axis
                librosa.display.specshow(
                    stft_db,
                    sr=TARGET_SR,
                    hop_length=hop_length,
                    y_axis='log',
                    x_axis='time',
                    ax=ax,
                )
                fig.savefig(img_name, bbox_inches='tight')
            finally:
                # Always release the figure, even if rendering or saving fails
                plt.close(fig)

In [None]:
# Generate spectrograms for training folder
# (one PNG per audio file found in the sub-folders of TRAIN_PATH, saved under SPECTROGRAM_PATH)
generate_spectrogram(TRAIN_PATH)

In [None]:
# Generate spectrograms for validation folder
# (one PNG per audio file found in the sub-folders of VALID_PATH, saved under SPECTROGRAM_PATH)
generate_spectrogram(VALID_PATH)

In [None]:
# Generate spectrograms for testing folder
# (one PNG per audio file found in the sub-folders of TEST_PATH, saved under SPECTROGRAM_PATH)
generate_spectrogram(TEST_PATH)

## Demo

In [None]:
# Load one clean training clip at its native sampling rate (sr=None is passed to librosa.load).
# The path is built from TRAIN_PATH so the demo follows whatever ROOT_PATH is set to.
x, fs = librosa.load(os.path.join(TRAIN_PATH, 'Clean', 'Clean_0.wav'), sr=None)

In [None]:
from IPython.display import Audio

In [None]:
# Play back the loaded clip at the sampling rate it was loaded with
Audio(x, rate=fs)

In [None]:
# Resample the demo clip to the same rate the spectrogram pipeline uses
res_x, n_fs = resample_audio(x, fs, TARGET_SR)

In [None]:
# Play back the resampled clip at its new sampling rate
Audio(res_x, rate=n_fs)

In [None]:
# Remove the silent parts of the resampled clip (thresh_amp is left at its default of 0.03)
x_ws = remove_silence(res_x, n_fs)

In [None]:
# Play back the clip after silence removal
Audio(x_ws, rate=n_fs)

In [None]:
# Waveform of the resampled clip, drawn on an explicitly created Axes
resampled_fig, resampled_ax = plt.subplots()
librosa.display.waveshow(res_x, sr=n_fs, alpha=0.5, label='Resampled Audio', ax=resampled_ax)
resampled_ax.set_title('Resampled Audio')
resampled_ax.set_xlabel('Time')
resampled_ax.set_ylabel('Amplitude')
plt.show()

In [None]:
# Waveform of the clip after silence removal, drawn on an explicitly created Axes
trimmed_fig, trimmed_ax = plt.subplots()
librosa.display.waveshow(x_ws, sr=n_fs, color='r', alpha=0.5, ax=trimmed_ax)
trimmed_ax.set_title('Audio Without Silent Parts')
trimmed_ax.set_xlabel('Time')
trimmed_ax.set_ylabel('Amplitude')
plt.show()