In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import itertools
import json

In [2]:
# Locations of the per-model prediction CSVs (audio-feature CNNs plus the lyrics model)

mel_spec_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/mel-spec/mel-spec_test_large_with_aug_predictions.csv"
cqt_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/cqt/cqt_test_large_with_aug_predictions.csv"
mfcc_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/mfcc/mfcc_test_large_with_aug_predictions.csv"
plp_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/plp/plp_test_large_with_aug_predictions.csv"
chrm_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/chromagram/chromagram_test_large_with_aug_predictions.csv"
clean_lyrics_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/lyric_detection/large/clean_lyrics_test_large_predictions.csv"

# One DataFrame per model
df_mel = pd.read_csv(mel_spec_csv)
df_clean_lyrics = pd.read_csv(clean_lyrics_csv)
df_mfcc = pd.read_csv(mfcc_csv)
df_plp = pd.read_csv(plp_csv)
df_cqt = pd.read_csv(cqt_csv)
df_chrm = pd.read_csv(chrm_csv)

print(len(df_mel))

# Strip each model's own filename ending so that all frames share a common
# 'base_filename' key (presumably the track identifier - confirm against the CSVs).
for frame, ending_pattern in [
    (df_mel, r'-Mel_Spectrogram\.png$'),
    (df_clean_lyrics, r'_lyrics\.txt$'),
    (df_mfcc, r'-MFCC\.png$'),
    (df_plp, r'_plp\.png$'),
    (df_cqt, r'-CQT\.png$'),
    (df_chrm, r'-Chromagram\.png$'),
]:
    frame['base_filename'] = frame['filename'].str.replace(ending_pattern, '', regex=True)

# Row counts per source; adding a column above does not change them
print(len(df_mel))
print(len(df_plp))
print(len(df_clean_lyrics))

7995
7995
7995
2967


In [3]:
def rename_columns(df, suffix):
    """Return a renamed copy of ``df`` with ``suffix`` appended to every column name.

    The join key column 'base_filename' is left untouched so the result can
    still be merged on it. ``df`` itself is not modified.
    """
    new_names = {}
    for name in df.columns:
        if name == 'base_filename':
            continue
        new_names[name] = f"{name}{suffix}"
    return df.rename(columns=new_names)

# Add suffixes to avoid column name clashes.
# NOTE: these assignments overwrite the loaded frames, so re-running this cell
# without first re-running the loading cell appends each suffix a second time.
df_mel = rename_columns(df_mel, '_mel')
df_clean_lyrics = rename_columns(df_clean_lyrics, '_lyrics')
df_mfcc = rename_columns(df_mfcc, '_mfcc')
df_plp = rename_columns(df_plp, '_plp')
df_cqt = rename_columns(df_cqt, '_cqt')
df_chrm = rename_columns(df_chrm, '_chrm')

merged_df = df_mel.copy()
# Merge sequentially on 'base_filename'. Left joins keep every mel row; a row with
# no match in a given frame gets NaN in that frame's columns.
for df in [df_clean_lyrics, df_mfcc, df_plp, df_cqt,  df_chrm]:
    print(df.columns)  # Check before merge
    assert 'base_filename' in df.columns
    rows_before = len(merged_df.index)
    merged_df = pd.merge(merged_df, df, on='base_filename', how='left')
    # A left join adds rows whenever a key matches more than one right-hand row.
    # Fail loudly instead of silently counting some tracks several times downstream.
    assert len(merged_df.index) == rows_before, (
        f"merge grew the table from {rows_before} to {len(merged_df.index)} rows; "
        f"duplicate 'base_filename' values in the frame with columns {list(df.columns)}"
    )

print(len(merged_df.index))

Index(['filename_lyrics', 'prob_ai_lyrics', 'prob_human_lyrics',
       'true_label_lyrics', 'pred_label_lyrics', 'base_filename'],
      dtype='object')
Index(['filename_mfcc', 'prob_ai_mfcc', 'prob_human_mfcc', 'true_label_mfcc',
       'pred_label_mfcc', 'base_filename'],
      dtype='object')
Index(['filename_plp', 'prob_ai_plp', 'prob_human_plp', 'true_label_plp',
       'pred_label_plp', 'base_filename'],
      dtype='object')
Index(['filename_cqt', 'prob_ai_cqt', 'prob_human_cqt', 'true_label_cqt',
       'pred_label_cqt', 'base_filename'],
      dtype='object')
Index(['filename_chrm', 'prob_ai_chrm', 'prob_human_chrm', 'true_label_chrm',
       'pred_label_chrm', 'base_filename'],
      dtype='object')
7995


In [4]:
# Per-model probability columns to choose between.
# NOTE: these are the human-class probabilities (prob_human_*), not the AI-class ones.
human_prob_cols = [
    'prob_human_mel',
    'prob_human_lyrics',
    'prob_human_mfcc',
    'prob_human_plp',
    'prob_human_cqt',
    'prob_human_chrm'
]

missing_cols = [col for col in human_prob_cols if col not in merged_df.columns]
if missing_cols:
    print(f"Warning: Missing columns: {missing_cols}")
    # Filter to only existing columns
    human_prob_cols = [col for col in human_prob_cols if col in merged_df.columns]
    print(f"Using available columns: {human_prob_cols}")

# Without any probability column there is nothing to select from.
if not human_prob_cols:
    raise ValueError("merged_df contains none of the expected prob_human_* columns")

# Confidence = abs(prob_human - 0.5), i.e. the distance from the 0.5 threshold used below.
# A NaN probability stays NaN here and is skipped by idxmax.
confidences = (merged_df[human_prob_cols] - 0.5).abs()

# idxmax cannot choose a model for a row in which every probability is NaN.
rows_without_prediction = confidences.isna().all(axis=1)
if rows_without_prediction.any():
    raise ValueError(
        f"{int(rows_without_prediction.sum())} rows have no probability from any model"
    )

# Get the column name of the highest-confidence prediction
best_model_col = confidences.idxmax(axis=1)

def get_best_prob(row):
    """Return the probability in ``row`` taken from the column chosen for it in ``best_model_col``.

    ``row`` is one row of ``merged_df`` (as passed by ``DataFrame.apply(axis=1)``);
    ``row.name`` is its index label, which is used to look up the chosen column.
    """
    return row[best_model_col.loc[row.name]]

# NOTE: despite the 'prob_ai' in its name, this column holds the selected prob_human_*
# value. The name is kept because the evaluation cell reads it.
merged_df['final_prob_ai_confidence_based'] = merged_df.apply(get_best_prob, axis=1)
# Label is 1 when the selected human-class probability is at least 0.5
# (presumably label 1 encodes the human class - confirm against the training code).
merged_df['final_pred_label_confidence_based'] = (merged_df['final_prob_ai_confidence_based'] >= 0.5).astype(int)


In [5]:
# Ground truth is read from the mel frame's label column; the other models'
# true_label_* columns are not consulted here.
final_scores = merged_df['final_prob_ai_confidence_based']
y_true = merged_df['true_label_mel']
y_pred = merged_df['final_pred_label_confidence_based']

# Threshold-free ranking quality from the scores, hit rate from the hard labels
auc = roc_auc_score(y_true, final_scores)
accuracy = accuracy_score(y_true, y_pred)

print(f"\nFinal Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

# How often each model's probability was the one selected
selection_counts = best_model_col.value_counts()
print(f"\nModel selection frequency:")
print(selection_counts)



Final Results:
Accuracy: 0.9711
AUC: 0.9940

Model selection frequency:
prob_human_mel       4325
prob_human_mfcc      2360
prob_human_cqt       1202
prob_human_chrm       100
prob_human_lyrics       6
prob_human_plp          2
Name: count, dtype: int64
