<a href="https://colab.research.google.com/github/rahulkumarrathore/Deepfake-Audio-Detection-with-Statistical-Classifiers/blob/main/Deepfake_Audio_Detection_with_Statistical_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the DataFrames of the built datasets can be saved and loaded.
# IMPORTANT: mount the personal "1612" Gmail account's Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Drive folders that hold the cached feature DataFrames.
df_path = '/content/drive/MyDrive/AiAudio_PR_Project/AiAudio_Dataset'
# for_2sec: used to train the candidate models and select the best one.
for_2sec_df_path = f'{df_path}/for_2sec'
# for_2sec_rerec: used to retrain the selected best model for our scenario.
for_2sec_rerec_df_path = f'{df_path}/for_2sec_rerec'

for dataset_dir in (for_2sec_df_path, for_2sec_rerec_df_path):
    print("Path to dataset files:", dataset_dir)

# Getting data

In [None]:
import kagglehub

# Download the latest version of the Fake-or-Real dataset through kagglehub.
# `path` holds the value returned by dataset_download; later cells join sub-folders onto it.
path = kagglehub.dataset_download("mohammedabdeldayem/the-fake-or-real-dataset")


Downloading from https://www.kaggle.com/api/v1/datasets/download/mohammedabdeldayem/the-fake-or-real-dataset?dataset_version_number=2...


  4%|▎         | 608M/16.0G [00:21<08:43, 31.7MB/s]

In [None]:
import os
# Locations of the 2-second ("for-2sec") subset inside the downloaded dataset.
cnn_path = os.path.join(path, "for-2sec/for-2seconds")
for_2sec_path, for_2sec_test_path, for_2sec_valid_path = (
    os.path.join(path, f"for-2sec/for-2seconds/{split_name}")
    for split_name in ("training", "testing", "validation")
)

for shown_path in (path, for_2sec_path):
    print("Path to dataset files:", shown_path)

In [None]:
# Re-recorded FoR data (validation split), used for inference.
for_rerec_path_real, for_rerec_path_fake = (
    os.path.join(path, f"for-rerec/for-rerecorded/validation/{class_dir}")
    for class_dir in ("real", "fake")
)

print("Path to rerec real files:", for_rerec_path_real)
print("Path to rerec fake files:", for_rerec_path_fake)

#Feature Extraction

##Function

In [None]:
import os
import librosa
import numpy as np
import pandas as pd

def extract_features(file_path):
    """Summarise one audio file as a flat list of spectral statistics.

    The file is loaded with librosa at sr=16000 and seven feature matrices
    are computed: 13 MFCCs, their deltas, spectral centroid, spectral
    bandwidth, chroma (STFT), zero-crossing rate and RMS energy.

    Args:
        file_path: path of the audio file to load.

    Returns:
        A list built matrix by matrix, in the order above: first the mean of
        every row of the matrix (taken along axis 1), then the standard
        deviation of every row. All means of a matrix therefore come before
        all of its standard deviations.
    """
    signal, rate = librosa.load(file_path, sr=16000)

    mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)

    # Order matters: the column names are assigned positionally by the caller.
    feature_matrices = (
        mfcc_matrix,
        librosa.feature.delta(mfcc_matrix),
        librosa.feature.spectral_centroid(y=signal, sr=rate),
        librosa.feature.spectral_bandwidth(y=signal, sr=rate),
        librosa.feature.chroma_stft(y=signal, sr=rate),
        librosa.feature.zero_crossing_rate(signal),
        librosa.feature.rms(y=signal),
    )

    summary = []
    for matrix in feature_matrices:
        summary += list(np.mean(matrix, axis=1))
        summary += list(np.std(matrix, axis=1))
    return summary


def build_dataset(base_dir):
    """Extract features for every file under base_dir/real and base_dir/fake.

    Args:
        base_dir: directory that contains the 'real' and 'fake' sub-folders.

    Returns:
        A list of rows shaped [file path, *features, label], where label is
        0 for files under 'real' and 1 for files under 'fake'. A file whose
        feature extraction raises is reported with print() and left out.
    """
    rows = []
    for class_name, class_id in (('real', 0), ('fake', 1)):
        class_dir = os.path.join(base_dir, class_name)
        for entry in os.listdir(class_dir):
            fpath = os.path.join(class_dir, entry)
            try:
                rows.append([fpath, *extract_features(fpath), class_id])
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole build.
                print(f"Error with {fpath}: {e}")
    return rows


# Column names for the rows produced by build_dataset().
#
# extract_features() emits, for each feature matrix in turn, the means of ALL
# its rows followed by the standard deviations of ALL its rows
# (mfcc0_mean .. mfcc12_mean, mfcc0_std .. mfcc12_std, ...). The names must be
# generated in that same order. The previous interleaved naming
# (mfcc0_mean, mfcc0_std, mfcc1_mean, ...) had the right length but attached
# the wrong label to every mfcc / delta_mfcc / chroma column.
#
# NOTE: DataFrames pickled with the old labels keep them when reloaded; the
# values are unaffected, so `df.columns = df_cols` corrects the labels.
_feature_groups = [
    # (base name, number of rows in the matrix), in extract_features() order
    ('mfcc', 13),
    ('delta_mfcc', 13),
    ('spec_cent', 1),
    ('spec_bw', 1),
    ('chroma', 12),
    ('zcr', 1),
    ('rms', 1),
]

feature_names = []
for _base, _n_rows in _feature_groups:
    # Single-row features keep their bare name (e.g. 'zcr'), as before.
    _row_names = [_base] if _n_rows == 1 else [f'{_base}{i}' for i in range(_n_rows)]
    for _stat in ['mean', 'std']:
        feature_names.extend(f'{_row}_{_stat}' for _row in _row_names)

df_cols = ['filename'] + feature_names + ['label']




##for_2sec Dataset

**No need to run the cells below again; just load the pre-saved (unprocessed) DataFrames.**

###The training dataset

In [None]:
# Extract features from every file of the training split and put them in a
# DataFrame with the filename / feature / label columns defined above.
dataset = build_dataset(for_2sec_path)
df = pd.DataFrame(dataset, columns=df_cols)

In [None]:
# Preview the first rows of the training feature table.
df.head()

In [None]:
# Preview the last rows of the training feature table.
df.tail()

In [None]:
# (rows, columns) of the training feature table.
df.shape

In [None]:
# Show the file path the training feature table is pickled to in the next cell.
print(for_2sec_df_path+"/training_df.pkl")

In [None]:
# Cache the training features on Drive so the extraction does not have to be repeated.
df.to_pickle(f"{for_2sec_df_path}/training_df.pkl")

###The testing set

In [None]:
# Extract features from every file of the testing split, using the same column layout.
dataset_test = build_dataset(for_2sec_test_path)
df_test = pd.DataFrame(dataset_test, columns=df_cols)

In [None]:
# Preview the first rows of the testing feature table.
df_test.head()

In [None]:
# Preview the last rows of the testing feature table.
df_test.tail()

In [None]:
# (rows, columns) of the testing feature table.
df_test.shape

In [None]:
# Cache the testing features on Drive next to the training pickle.
df_test.to_pickle(f"{for_2sec_df_path}/testing_df.pkl")

### Loading Dataset


**for_2sec**

In [None]:
# Reload the cached feature tables from Drive (rebinds `df` and `df_test`).
# NOTE: unpickling can execute code; only load pickle files you created yourself.
df = pd.read_pickle(f"{for_2sec_df_path}/training_df.pkl")
df_test = pd.read_pickle(f"{for_2sec_df_path}/testing_df.pkl")

In [None]:
# List the column names of the loaded training table.
print(df.columns.tolist())

## Preprocessing

In [None]:
# Separate the targets (0 = real, 1 = fake) from the feature columns;
# 'filename' is an identifier, not a feature, so it is dropped as well.
y_train, y_test = df['label'], df_test['label']

X_train = df.drop(columns=['filename', 'label'])
X_test = df_test.drop(columns=['filename', 'label'])

In [None]:
# Scale the features because their ranges vary widely across columns.
# Statistical classifiers (like logistic regression, SVM, KNN, LDA) are sensitive to the scale (range) of feature values.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Important: use transform, not fit_transform

In [None]:
# Inspect the unscaled training features.
X_train

In [None]:
# Inspect the scaled training features for comparison with the cell above.
X_train_scaled

# Models Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

###Comparing 3 Statistical classifiers

In [None]:
# Baseline comparison: fit three classifiers on the scaled training features
# and print each one's score on the scaled test set.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(kernel='rbf', C=1, gamma='scale')
lr = LogisticRegression(max_iter=1000)

for tag, clf in (("RF:", rf), ("SVM:", svm), ("LR:", lr)):
    clf.fit(X_train_scaled, y_train)
    print(tag, clf.score(X_test_scaled, y_test))


In [None]:


# Per-class precision / recall / F1 on the test set for the two candidate models.
for heading, fitted_model in (("SVM Report:", svm), ("LogReg Report:", lr)):
    print(heading)
    print(classification_report(y_test, fitted_model.predict(X_test_scaled)))


**SVM** is chosen because LR is less stable.

###SVM

####Tuning & COMPARING svm hyperparameters

Re-searching for good parameters: cv=3 selected gamma='scale', while cv=5 gave gamma=0.01 (the result recorded below).

In [None]:

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the RBF-kernel SVM.
params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

# return_train_score=True is required here: GridSearchCV does not compute
# training scores by default, so cv_results_ had no 'mean_train_score' key and
# the last print below raised KeyError.
grid = GridSearchCV(SVC(), param_grid=params, cv=5, scoring='f1', verbose=1,
                    return_train_score=True)
grid.fit(X_train_scaled, y_train)
print("Best Params:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)
# The two arrays below hold one mean F1 value per parameter combination
# (same order as grid.cv_results_['params']), not a single best value.
print("Best Mean Test Score:", grid.cv_results_['mean_test_score'])
print("Best Mean Train Score:", grid.cv_results_['mean_train_score'])

**SVM**
- Best Params: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
- Best F1 Score: 0.9896433774231864


#### The SVM model on 86 features - gamma 0.01 & scale

In [None]:
# Refit the SVM with the best parameters from the grid search (this rebinds
# `svm`, replacing the baseline model) and print its score on the test set.
svm = SVC(kernel='rbf', C=10, gamma=0.01) #The best svm parameters from tuning and comparing
svm.fit(X_train_scaled, y_train)
print("SVM:", svm.score(X_test_scaled, y_test))

#### SVM model with reduced features - chroma removed (86-24)

**Removing chroma features**

In [None]:
#triming the features
df_reduced = df.loc[:, ~df.columns.str.startswith('chroma')]
df_test_reduced = df_test.loc[:, ~df_test.columns.str.startswith('chroma')]

print(df_reduced.columns.tolist())
print(df_reduced.shape)

print(df_test_reduced.columns.tolist())
print(df_test_reduced.shape)

Preprocessing the new reduced dataset

In [None]:
# Feature matrices for the chroma-free tables; the labels y_train / y_test are
# reused unchanged because only columns were removed.
X_reduced_train = df_reduced.drop(columns=['filename', 'label'])

X_reduced_test = df_test_reduced.drop(columns=['filename', 'label'])

In [None]:
# (rows, columns) of the reduced training feature matrix.
X_reduced_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# A separate scaler is fitted for the reduced feature set, on the training data only.
scaler2 = StandardScaler()
X_reduced_train_scaled = scaler2.fit_transform(X_reduced_train)
X_reduced_test_scaled = scaler2.transform(X_reduced_test)  # Important: use transform, not fit_transform

In [None]:
# Inspect the unscaled reduced training features.
X_reduced_train

In [None]:
# Inspect the scaled reduced training features.
X_reduced_train_scaled

Training and comparing

In [None]:
# Same SVM hyper-parameters as the full-feature model, trained without the
# chroma columns so the two test scores can be compared directly.
svm_reduced = SVC(kernel='rbf', C=10, gamma=0.01)
svm_reduced.fit(X_reduced_train_scaled, y_train)
print("SVM:", svm_reduced.score(X_reduced_test_scaled, y_test))

In [None]:
# Logistic regression on the reduced features, using the LR parameters
# recorded as best in this notebook (C=0.1, l1 penalty, liblinear, max_iter=500).
lr_reduced = LogisticRegression(C=0.1, max_iter=500, penalty='l1',solver='liblinear')
lr_reduced.fit(X_reduced_train_scaled, y_train)
print("LR:", lr_reduced.score(X_reduced_test_scaled, y_test))

**CONCLUSION**: Do NOT reduce the feature set, as it costs accuracy. Thus the reduced SVM model is not saved.


#### SVM model with reduced features - PCA applied ()

###LR

#### Tuning and choosing hyperparameters

In [None]:
# Search space for logistic regression, restricted to valid combinations:
#   * 'l1' and 'l2' are supported by both liblinear and saga;
#   * 'elasticnet' is supported only by saga and needs an explicit l1_ratio.
# The previous flat grid also paired elasticnet with liblinear and never set
# l1_ratio, so every elasticnet candidate failed to fit and was scored NaN.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],   # Regularization strength (inverse of lambda)
    'max_iter': [500, 1000],
}
params = [
    {**shared_grid, 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
    # l1_ratio=0.5 is an even l1/l2 mix; widen this list to tune the mix as well.
    {**shared_grid, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
]

# Fixed seed (same value as the random forest above): liblinear and saga
# shuffle the data, so the search is otherwise not repeatable.
log_reg = LogisticRegression(random_state=42)
grid = GridSearchCV(log_reg, param_grid=params, cv=5, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(X_train_scaled, y_train)

# Display best parameters and best score
print("Best Parameters:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)

# Best model (refitted on the whole training set by GridSearchCV)
best_lr = grid.best_estimator_


**LR**
- Best Parameters: {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
- Best F1 Score: 0.9098175641027335

### **LORIS** - ensembled LR & SVM

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [None]:



# LORIS: soft-voting ensemble of the tuned SVM and the tuned logistic regression.
#
# random_state is fixed on both base models. SVC(probability=True) calibrates
# its probabilities with an internal randomised cross-validation, so without a
# seed every refit in the loop below would vote with slightly different
# probabilities, and the comparison across weights would include that noise.
svm_loris = SVC(kernel='rbf', C=10, gamma=0.01, probability=True, random_state=42)
lr_loris  = LogisticRegression(C=0.1, max_iter=500, penalty='l1', solver='liblinear', random_state=42)

# For storing results
weights = np.arange(0.5, 1.0, 0.1)  # weight given to LR; the SVM gets 1 - weight
ai_f1_scores = []
human_f1_scores = []
macro_f1_scores = []
accuracies = []

for w in weights:
    svm_w = 1 - w
    lr_w  = w

    # VotingClassifier fits clones of the estimators, so svm_loris / lr_loris
    # themselves stay unfitted and can be reused on every iteration.
    ensemble = VotingClassifier(
        estimators=[('svm', svm_loris), ('lr', lr_loris)],
        voting='soft',
        weights=[svm_w, lr_w]
    )

    ensemble.fit(X_train_scaled, y_train)

    y_pred = ensemble.predict(X_test_scaled)

    # output_dict=True keys the per-class entries by the label as a string:
    # '0' = real (human), '1' = fake (AI).
    report = classification_report(y_test, y_pred, output_dict=True)

    human_f1 = report['0']['f1-score']
    ai_f1    = report['1']['f1-score']
    macro_f1 = report['macro avg']['f1-score']
    acc      = accuracy_score(y_test, y_pred)

    human_f1_scores.append(human_f1)
    ai_f1_scores.append(ai_f1)
    macro_f1_scores.append(macro_f1)
    accuracies.append(acc)

    print(f"LR weight={w:.1f} | AI F1={ai_f1:.3f} | Human F1={human_f1:.3f} | Macro F1={macro_f1:.3f}")



In [None]:

# -----------------------------
# Plot Performance vs LR Weight
# -----------------------------
# One line per metric collected in the weight sweep, all against the LR weight.
fig, ax = plt.subplots(figsize=(10, 6))

metric_curves = (
    (ai_f1_scores, 'AI Class F1-score'),
    (human_f1_scores, 'Human Class F1-score'),
    (macro_f1_scores, 'Macro F1-score'),
    (accuracies, 'Accuracy'),
)
for curve_values, legend_label in metric_curves:
    ax.plot(weights, curve_values, marker='o', label=legend_label)

ax.set_title("Performance vs Logistic Regression Weight")
ax.set_xlabel("Weight for Logistic Regression (LR)")
ax.set_ylabel("Score")
ax.grid(True)
ax.legend()
plt.show()

#MODEL


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [None]:
# Drive folder for the pickled models, and one file path per saved model.
model_path = '/content/drive/MyDrive/AiAudio_PR_Project/AiAudio_Model'
svm_m_path = f'{model_path}/svm'                  # SVM trained on all features
svm_reduced_m_path = f'{model_path}/svm_reduced'  # SVM trained without chroma features
lr_m_path = f'{model_path}/lr'                    # logistic regression on all features
lr_reduced_m_path = f'{model_path}/lr_reduced'    # logistic regression without chroma features
loris1_m_path = f'{model_path}/loris1'            # LORIS ensemble, variant 1
loris2_m_path = f'{model_path}/loris2'            # LORIS ensemble, variant 2

for caption, shown_path in (
    ("Path to SVM Model:", svm_m_path),
    ("Path to SVM Reduced Model:", svm_reduced_m_path),
    ("Path to LR Model:", lr_m_path),
):
    print(caption, shown_path)

### Saving Model


SVM

In [None]:
import pickle

# Pickle whatever `svm` currently refers to. When the notebook is run top to
# bottom, that is the tuned C=10, gamma=0.01 model, not the earlier baseline.
with open(svm_m_path, "wb") as f:
    pickle.dump(svm, f)

print(f"Model saved at: {svm_m_path}")


LR

In [None]:
import pickle

# Pickle `lr`, which is the baseline LogisticRegression(max_iter=1000) from
# the three-classifier comparison.
# NOTE(review): the tuned model from the grid search is `best_lr`; confirm
# which of the two was meant to be saved here.
with open(lr_m_path, "wb") as f:
    pickle.dump(lr, f)

print(f"Model saved at: {lr_m_path}")


SVM_REDUCED (NO CHROMA)

In [None]:
import pickle

# Pickle the SVM trained without the chroma features.
# NOTE(review): the conclusion in the feature-reduction section says the
# reduced SVM is not saved; confirm whether this cell should still be run.
with open(svm_reduced_m_path, "wb") as f:
    pickle.dump(svm_reduced, f)

print(f"Model saved at: {svm_reduced_m_path}")


LR_Reduced

In [None]:
import pickle

# Pickle the logistic regression trained without the chroma features.
with open(lr_reduced_m_path, "wb") as f:
    pickle.dump(lr_reduced, f)

print(f"Model saved at: {lr_reduced_m_path}")


Loris 1 and 2

In [None]:
import pickle

# Pickle the first LORIS ensemble.
# NOTE(review): `loris1` is not assigned anywhere in the visible notebook
# (the weight sweep only binds `ensemble`), so on a fresh kernel this cell
# raises NameError unless `loris1` is defined first — confirm its definition.
with open(loris1_m_path, "wb") as f:
    pickle.dump(loris1, f)

print(f"Model saved at: {loris1_m_path}")

In [None]:
import pickle

# Pickle the second LORIS ensemble.
# NOTE(review): `loris2` is not assigned anywhere in the visible notebook,
# so on a fresh kernel this cell raises NameError unless `loris2` is defined
# first — confirm its definition.
with open(loris2_m_path, "wb") as f:
    pickle.dump(loris2, f)

print(f"Model saved at: {loris2_m_path}")

###Loading Model


In [None]:
import pickle

# Restore the saved full-feature SVM (rebinds `svm`).
# NOTE: unpickling can execute code; only load model files you created yourself.
with open(svm_m_path, "rb") as f:
    svm = pickle.load(f)

In [None]:
import pickle

# Restore the saved chroma-free SVM (rebinds `svm_reduced`).
with open(svm_reduced_m_path, "rb") as f:
    svm_reduced = pickle.load(f)

In [None]:
import pickle

# Restore the saved chroma-free logistic regression (rebinds `lr_reduced`).
with open(lr_reduced_m_path, "rb") as f:
    lr_reduced = pickle.load(f)

In [None]:
import pickle

# Restore the saved first LORIS ensemble (binds `loris1`).
with open(loris1_m_path, "rb") as f:
    loris1 = pickle.load(f)

In [None]:
import pickle

# NOTE(review): this repeats the previous cell exactly (loris1 is loaded twice),
# while no cell in this section loads the model saved at lr_m_path — possibly
# this cell was meant to load `lr`; confirm.
with open(loris1_m_path, "rb") as f:
    loris1 = pickle.load(f)

In [None]:
import pickle

# Restore the saved second LORIS ensemble (binds `loris2`).
with open(loris2_m_path, "rb") as f:
    loris2 = pickle.load(f)