**Import Libraries**

In [98]:
pip install librosa resampy


Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.3


**Import Libraries**

In [6]:
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
import joblib
from scipy.spatial.distance import cosine



**Function to extract features from an audio file**

In [8]:
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    return mfccs_mean

**Load trained audio files**

In [9]:
trained_audio_files = ["/content/audio (2).wav", "/content/audio 4.wav"]
trained_labels = ["speaker1", "speaker2"]

**Extract features from trained audio files**

In [10]:
trained_features = np.array([extract_features(file) for file in trained_audio_files])

** Print the shape of trained_features**

In [11]:
print("Shape of trained_features:", trained_features.shape)

Shape of trained_features: (2, 13)


**Set a threshold for similarit**y

In [12]:
threshold = 0.7

**Function to predict speaker**

In [13]:
def predict_speaker(new_audio_file):
    # Extract features from the new audio file
    new_features = extract_features(new_audio_file)
    print("Shape of new_features before scaling:", new_features.shape)
    # Scale the features
    scaler = StandardScaler()
    trained_features_scaled = scaler.fit_transform(trained_features)
    print("Shape of trained_features_scaled:", trained_features_scaled.shape)
    new_features_scaled = scaler.transform(new_features.reshape(1, -1))
    print("Shape of new_features_scaled:", new_features_scaled.shape)
    # Predict the speaker
    predicted_speaker = None
    for i, speaker_features in enumerate(trained_features_scaled):
        similarity = 1 - cosine(new_features_scaled.flatten(), speaker_features.flatten())
        print("Similarity with", trained_labels[i], ":", similarity)
        if similarity > threshold:
            predicted_speaker = trained_labels[i]
            break
    return predicted_speaker

**Example usage**

In [14]:
if __name__ == "__main__":
    # Path to the new audio file for prediction
    new_audio_file = "/content/priya.wav"

    # Predict the speaker
    predicted_speaker = predict_speaker(new_audio_file)

    if predicted_speaker is not None:
        print("Predicted Speaker:", predicted_speaker)
    else:
        print("Unknown")

Shape of new_features before scaling: (13,)
Shape of trained_features_scaled: (2, 13)
Shape of new_features_scaled: (1, 13)
Similarity with speaker1 : -0.3899553418159485
Similarity with speaker2 : 0.3899553418159485
Unknown


In [15]:
if __name__ == "__main__":
    # Path to the new audio file for prediction
    new_audio_file = "/content/audio (2).wav"

    # Predict the speaker
    predicted_speaker = predict_speaker(new_audio_file)

    if predicted_speaker is not None:
        print("Predicted Speaker:", predicted_speaker)
    else:
        print("Unknown")

Shape of new_features before scaling: (13,)
Shape of trained_features_scaled: (2, 13)
Shape of new_features_scaled: (1, 13)
Similarity with speaker1 : 1
Predicted Speaker: speaker1
