In [3]:
from scipy import signal
from scipy.io import wavfile
from scipy.ndimage import maximum_filter
import matplotlib.pyplot as plt
import numpy as np
import os
import librosa

In [24]:
class VoiceAnalysis:
    def __init__(self, filename, input_path, output_path):
        self.output_path = output_path
        self.filename = filename
        self.input_path = input_path
        if input_path.endswith(('.wav')):
            self.data, self.rate = librosa.load(input_path, sr=None)

    def plot_spectrogram(self):
        freq, time, spect= signal.spectrogram(self.data, self.rate, scaling='spectrum', window='hann')
        log_spec = np.log10(spect+(spect==0))
        plt.pcolormesh(time, freq, log_spec, shading='gouraud')
        plt.savefig(self.output_path+self.filename+"_spectr.png", dpi = 300)
        plt.clf()

    def find_features(self):
        stft = np.abs(librosa.stft(self.data))
        db = librosa.amplitude_to_db(stft, ref=np.max)
        freqs = librosa.fft_frequencies(sr=self.rate)
        means = np.mean(db, axis=1)

        id1 = np.argmax(means > -80)
        id2 = len(means) - np.argmax(means[::-1] > -80) - 1

        print(f"Min freq: {freqs[id1]}, Max freq: {freqs[id2]}")
    
        chroma = librosa.feature.chroma_stft(y=self.data, sr=self.rate)
        f = librosa.piptrack(y=self.data, sr=self.rate, S=chroma)[0]
        main_freq = np.argmax(f)
        print(f"Main tone: {main_freq}")

        self.rate, self.data = wavfile.read(self.input_path)
        freq, t, spect = signal.spectrogram(self.data, self.rate, window=('hann'))
        spect = np.log10(spect + 1)
        filter = maximum_filter(spect, size=(int(50 / (freq[1] - freq[0])), int(0.1 * len(t))))
        mask = (spect == filter)
        peak_vals = spect[mask]
        peak_freqs = freq[mask.any(axis=1)]

        top_ids = np.argsort(peak_vals)[-3:]
        formants = peak_freqs[top_ids]

        print("Three strongest formants: ")
        for f in formants:
            print(f, end=" ")
    

In [26]:
input_path = "./input/A.wav"
output_path = "./output/"
filename = "A"

agent = VoiceAnalysis(filename, input_path, output_path)
# agent.plot_spectrogram()
agent.find_features()

Min freq: 0.0, Max freq: 16242.1875
Main tone: 2398
Three strongest formants: 
4312.5 14250.0 3000.0 

In [27]:
input_path = "./input/I.wav"
output_path = "./output/"
filename = "I"

agent = VoiceAnalysis(filename, input_path, output_path)
# agent.plot_spectrogram()
agent.find_features()

Min freq: 0.0, Max freq: 16078.125
Main tone: 1286
Three strongest formants: 
7312.5 4125.0 5625.0 

In [28]:
input_path = "./input/Meow.wav"
output_path = "./output/"
filename = "Meow"

agent = VoiceAnalysis(filename, input_path, output_path)
# agent.plot_spectrogram()
agent.find_features()

Min freq: 0.0, Max freq: 16242.1875
Main tone: 1281
Three strongest formants: 
15375.0 9937.5 13500.0 