In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [19]:
# Path to the directory containing the audio files
audio_files_dir = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/flac'

# Path to the protocol file
protocol_file_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt'


In [20]:
import pandas as pd

# Load the protocol file
protocol_df = pd.read_csv(protocol_file_path, delim_whitespace=True, header=None,
                          names=['SpeakerID', 'FileName', 'Env', 'Label', 'SpoofType'])

# Create a dictionary mapping file names to labels
label_dict = dict(zip(protocol_df['FileName'], protocol_df['SpoofType']))

# Print the first few entries to verify
print(protocol_df.head())


  SpeakerID      FileName Env Label SpoofType
0   LA_0079  LA_T_1138215   -     -  bonafide
1   LA_0079  LA_T_1271820   -     -  bonafide
2   LA_0079  LA_T_1272637   -     -  bonafide
3   LA_0079  LA_T_1276960   -     -  bonafide
4   LA_0079  LA_T_1341447   -     -  bonafide


In [None]:
import numpy as np

# Get unique file names and labels
file_names = protocol_df['FileName'].unique()
labels = protocol_df['SpoofType'].unique()

# Filter for each class
bonafide_files = protocol_df[protocol_df['SpoofType'] == 'bonafide']['FileName']
spoof_files = protocol_df[protocol_df['SpoofType'] == 'spoof']['FileName']

# Define sample size
sample_size = 2580  # Use all bonafide files and an equal number of spoof files

# Sample files from each class
sampled_bonafide_files = np.random.choice(bonafide_files, sample_size, replace=False)
sampled_spoof_files = np.random.choice(spoof_files, sample_size, replace=False)

# Combine into one list
sampled_files = np.concatenate([sampled_bonafide_files, sampled_spoof_files])

# Shuffle the files
np.random.shuffle(sampled_files)

print(f"Total sampled files: {len(sampled_files)}")



Total sampled files: 5160


In [None]:
num_bonafide = len(bonafide_files)
num_spoof = len(spoof_files)

print(f"Number of bonafide files: {num_bonafide}")
print(f"Number of spoof files: {num_spoof}")


Number of bonafide files: 2580
Number of spoof files: 22800


In [None]:
import os

# Path to the folder where you want to store sampled files
new_folder_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/ASVspoof2019_LA_Sampled'

# Create the new folder if it doesn't exist
if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)


In [None]:
import shutil

# Path to the original folder with the audio files
original_folder_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/flac'

# Move sampled files to the new folder
for file_name in sampled_files:
    original_file_path = f'{original_folder_path}/{file_name}.flac'
    new_file_path = f'{new_folder_path}/{file_name}.flac'
    if os.path.exists(original_file_path):
        shutil.copy(original_file_path, new_file_path)  # Use copy or move as needed
    else:
        print(f"File {original_file_path} not found.")


In [22]:
import pandas as pd

# Define the paths
new_folder_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/ASVspoof2019_LA_Sampled'

# Prepare lists for file names and labels
file_names = []
labels = []

# Append bonafide files
for file_name in sampled_bonafide_files:
    file_names.append(file_name)
    labels.append(0)  # Label for bonafide

# Append spoof files
for file_name in sampled_spoof_files:
    file_names.append(file_name)
    labels.append(1)  # Label for spoof

# Create a DataFrame
labels_df = pd.DataFrame({
    'FileName': file_names,
    'Label': labels
})

# Save to CSV
labels_file_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/labels.csv'
labels_df.to_csv(labels_file_path, index=False)

print(f"Labels file created at {labels_file_path}.")


Labels file created at /content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/labels.csv.


In [21]:
import pandas as pd
import librosa
import numpy as np

# Load labels file
labels_df = pd.read_csv('/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/labels.csv')
labels_dict = dict(zip(labels_df['FileName'], labels_df['Label']))

# Define path to the new folder with sampled files
new_folder_path = '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/ASVspoof2019_LA_Sampled'

# Function to extract MFCC features
def extract_features(file_path):
    audio, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfccs = np.mean(mfccs.T, axis=0)
    return mfccs

# Extract features for each sampled file
X = []
y = []

for file_name in labels_df['FileName']:
    file_path = f'{new_folder_path}/{file_name}.flac'
    try:
        features = extract_features(file_path)
        X.append(features)
        label = labels_dict.get(file_name)
        y.append(label)
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

X = np.array(X)
y = np.array(y)

print(f"Extracted features for {len(X)} files.")


Extracted features for 5160 files.


In [23]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training data size: {X_train.shape[0]}")
print(f"Testing data size: {X_test.shape[0]}")


Training data size: 4128
Testing data size: 1032


In [24]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

# Standardize features by scaling
scaler = StandardScaler()

# Create SVM model
svm_model = make_pipeline(scaler, SVC(kernel='linear', random_state=42))

# Train the model
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8042635658914729
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.86      0.82       522
           1       0.84      0.75      0.79       510

    accuracy                           0.80      1032
   macro avg       0.81      0.80      0.80      1032
weighted avg       0.81      0.80      0.80      1032



##To Improve accuracy further Feature Engineering and Selection.


Increase the number of features: Extract more audio features besides MFCCs, such as chroma features, spectral contrast, or tonnetz features.

In [25]:
def extract_features(new_folder_path):
    audio, sr = librosa.load(new_folder_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=audio, sr=sr)

    features = np.hstack([
        np.mean(mfccs.T, axis=0),
        np.mean(chroma.T, axis=0),
        np.mean(spectral_contrast.T, axis=0),
        np.mean(tonnetz.T, axis=0)
    ])
    return features


##Normalize Features:
Ensure that features are standardized.

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


##Hyperparameter Tuning:
Perform a Grid Search or Random Search to find the best hyperparameters for the SVM model.

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

svm_model = make_pipeline(StandardScaler(), SVC(random_state=42))
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.001, 0.01, 0.1, 1],
    'svc__kernel': ['linear', 'rbf']
}

grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)


Best parameters found: {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}


##Experiment with Different Kernels:

In [28]:
svm_model = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=42))


##Perform Cross-Validation:
Use cross-validation to evaluate the model's performance more robustly.

In [29]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(svm_model, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)
print("Mean CV score:", np.mean(scores))


Cross-validation scores: [0.92829457 0.92829457 0.92635659 0.93023256 0.92344961]
Mean CV score: 0.9273255813953488


##Handle Class Imbalance
Use oversampling or undersampling techniques if your dataset is imbalanced.

In [30]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

svm_model.fit(X_resampled, y_resampled)


 ## Model Evaluation


In [31]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9215116279069767
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.95      0.92       522
           1       0.94      0.90      0.92       510

    accuracy                           0.92      1032
   macro avg       0.92      0.92      0.92      1032
weighted avg       0.92      0.92      0.92      1032



## Save Model

In [32]:
import joblib

# Save the trained model to a file
joblib.dump(svm_model, '/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/svm_model.pkl')




['/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/svm_model.pkl']

## Load Model

In [33]:
import joblib
svm_model = joblib.load('/content/drive/MyDrive/ADD_ASV_DATA/LA/ASVspoof2019_LA_train/svm_model.pkl')
