In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import opensmile
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Shared openSMILE extractor used by the loading code below:
# ComParE 2016 feature set, computed at the "Functionals" feature level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

In [None]:
def load_files(path):
    """Extract openSMILE features for every ``.wav`` file directly inside ``path``.

    Each matching file is passed to the module-level ``smile`` extractor via
    ``smile.process_file`` and the per-file frames are stacked row-wise.

    Parameters
    ----------
    path : str
        Directory to scan (non-recursive). Entries whose name does not end in
        ``.wav`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file results, in sorted filename
        order. An empty ``DataFrame`` when the directory has no ``.wav`` files.
    """
    # Sort so the row order does not depend on the filesystem's listing order.
    wav_names = sorted(name for name in os.listdir(path) if name.endswith('.wav'))

    frames = [smile.process_file(os.path.join(path, name)) for name in wav_names]

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end rather than re-copying the accumulated
    # frame for every file.
    return pd.concat(frames)

In [None]:
# Root directory that holds the `data/` folder. Defaults to the original
# location, but can be overridden with the DATA_BASE_PATH environment variable
# so the notebook runs on other machines without editing code.
# os.path.join(..., "") guarantees a trailing separator, which the
# `base_path + 'data/...'` string concatenations further down rely on.
base_path = os.path.join(os.environ.get("DATA_BASE_PATH", "/home/ubuntu/"), "")

In [None]:
# Features from the files under `real_path` get class label 0.
real_path = base_path + 'data/biden_wav_audio'
biden_df = load_files(real_path).assign(label=0)

# Features from the files under `fake_path` get class label 1.
fake_path = base_path + 'data/11LabsDeepFakes'
fake_biden_df = load_files(fake_path).assign(label=1)

In [None]:
# Stack both labelled frames, move the index levels into columns, then discard
# the 'file', 'start' and 'end' columns so they are not used as features.
df = (
    pd.concat([biden_df, fake_biden_df])
    .reset_index()
    .drop(columns=['file', 'start', 'end'])
)
df.head()

In [None]:
# Separate the class label (target) from the feature columns.
y = df['label'].copy()
X = df.drop(columns=['label']).copy()

# Hold out 10% of the rows for evaluation.
# - random_state pins the shuffle so a fresh "Restart & Run All" reproduces
#   the same split (and therefore the same accuracy curve).
# - stratify=y keeps the label proportions equal in the train and test parts,
#   which matters for such a small hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Standardize the features before PCA. PCA is variance-driven, so without
# scaling the leading components are dominated by whichever raw features have
# the largest numeric range. The scaler is fit on the training split only so no
# test-set statistics leak into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit PCA on the training split and project both splits with the same basis.
# random_state keeps the result reproducible if a randomized SVD solver is used.
pca = PCA(random_state=0)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Sweep over the number of leading principal components used as features:
# k = 1..99, capped at the number of components PCA actually produced.
# Slicing `[:, :k]` past the last column does not fail, it just returns the
# full matrix again, which would silently repeat identical results.
max_components = min(99, X_train_pca.shape[1])
components = np.arange(1, max_components + 1)

# Test-set accuracy for each value in `components`, in the same order.
acc_plot = []

for component in components:
    # max_iter is raised above the default of 100 so the solver is less likely
    # to stop before converging.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_pca[:, :component], y_train)
    pred = model.predict(X_test_pca[:, :component])
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    acc_plot.append(acc)

In [None]:
# Test accuracy as a function of how many principal components were used.
# accuracy_score returns a fraction in [0, 1]; scale it by 100 so the plotted
# values match the percent axis label. `acc_plot` itself is left unmodified.
fig, ax = plt.subplots()
sns.lineplot(x=components, y=np.asarray(acc_plot) * 100, ax=ax)
ax.set_xlabel('# of Components')
ax.set_ylabel('Acc %')
ax.set_title('real vs. fake Biden Accuracy')
plt.show()