In [23]:
# Smoke-test cell: prints a fixed string (presumably to confirm the kernel is responsive).
print("HELLO")

HELLO


In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import itertools
import json
from tqdm import tqdm

In [25]:
# Locations of the per-modality prediction CSVs.
mel_spec_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/mel-spec/mel-spec_test_large_with_aug_predictions.csv"
cqt_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/cqt/cqt_test_large_with_aug_predictions.csv"
mfcc_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/mfcc/mfcc_test_large_with_aug_predictions.csv"
plp_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/plp/plp_test_large_with_aug_predictions.csv"
chrm_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/chromagram/chromagram_test_large_with_aug_predictions.csv"
clean_lyrics_csv = "/vol/bitbucket/sg2121/fyp/aimusicdetector/lyric_detection/large/clean_lyrics_test_large_predictions.csv"

# Load each prediction file into its own DataFrame (one frame per modality).
df_mel, df_clean_lyrics, df_mfcc, df_plp, df_cqt, df_chrm = (
    pd.read_csv(csv_path)
    for csv_path in (mel_spec_csv, clean_lyrics_csv, mfcc_csv, plp_csv, cqt_csv, chrm_csv)
)

print(len(df_mel))

# Strip each modality's trailing filename suffix so that all frames share a
# common 'base_filename' key that they can later be joined on.
for frame, suffix_pattern in (
    (df_mel, r'-Mel_Spectrogram\.png$'),
    (df_clean_lyrics, r'_lyrics\.txt$'),
    (df_mfcc, r'-MFCC\.png$'),
    (df_plp, r'_plp\.png$'),
    (df_cqt, r'-CQT\.png$'),
    (df_chrm, r'-Chromagram\.png$'),
):
    frame['base_filename'] = frame['filename'].str.replace(suffix_pattern, '', regex=True)

# Row counts for a quick sanity check of the loaded frames.
for frame in (df_mel, df_plp, df_clean_lyrics):
    print(len(frame))

7995
7995
7995
2967


In [26]:
def rename_columns(df, suffix):
    """Return a renamed copy of ``df`` with ``suffix`` appended to its column names.

    The join key column 'base_filename' is left untouched; every other column
    ``col`` becomes ``f"{col}{suffix}"``. The input frame is not modified.
    """
    mapping = {}
    for name in df.columns:
        if name == 'base_filename':
            continue
        mapping[name] = f"{name}{suffix}"
    return df.rename(columns=mapping)

# Add suffixes to avoid column name clashes
df_mel = rename_columns(df_mel, '_mel')
df_clean_lyrics = rename_columns(df_clean_lyrics, '_lyrics')
df_mfcc = rename_columns(df_mfcc, '_mfcc')
df_plp = rename_columns(df_plp, '_plp')
df_cqt = rename_columns(df_cqt, '_cqt')
df_chrm = rename_columns(df_chrm, '_chrm')

# Start from the mel-spectrogram frame and left-join every other modality onto
# it, so rows without a match in another modality are kept (with NaN columns).
merged_df = df_mel.copy()
for df in [df_clean_lyrics, df_mfcc, df_plp, df_cqt, df_chrm]:
    print(df.columns)  # Check before merge
    assert 'base_filename' in df.columns
    rows_before = len(merged_df)
    merged_df = pd.merge(merged_df, df, on='base_filename', how='left')
    # A left merge duplicates left rows when the right frame repeats a key;
    # that would silently inflate the evaluation set, so fail loudly instead.
    assert len(merged_df) == rows_before, (
        f"Merge changed the row count from {rows_before} to {len(merged_df)}; "
        "the right-hand frame has duplicate 'base_filename' values"
    )

print(len(merged_df.index))

Index(['filename_lyrics', 'prob_ai_lyrics', 'prob_human_lyrics',
       'true_label_lyrics', 'pred_label_lyrics', 'base_filename'],
      dtype='object')
Index(['filename_mfcc', 'prob_ai_mfcc', 'prob_human_mfcc', 'true_label_mfcc',
       'pred_label_mfcc', 'base_filename'],
      dtype='object')
Index(['filename_plp', 'prob_ai_plp', 'prob_human_plp', 'true_label_plp',
       'pred_label_plp', 'base_filename'],
      dtype='object')
Index(['filename_cqt', 'prob_ai_cqt', 'prob_human_cqt', 'true_label_cqt',
       'pred_label_cqt', 'base_filename'],
      dtype='object')
Index(['filename_chrm', 'prob_ai_chrm', 'prob_human_chrm', 'true_label_chrm',
       'pred_label_chrm', 'base_filename'],
      dtype='object')
7995


In [27]:
def apply_weighted_ensemble(df, weights=None):
    """Combine per-model AI/human probabilities into one weighted prediction.

    Models are discovered from the columns named ``prob_ai_<key>``; the matching
    human probability is read from ``prob_human_<key>``. For every row, the
    weighted average is taken only over the models whose value is present
    (non-NaN), and the weights of those models are renormalised to sum to one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``prob_ai_*`` / ``prob_human_*`` columns. It is
        modified in place: the columns 'weighted_prob_ai',
        'weighted_prob_human' and 'final_pred_label' are added or overwritten.
    weights : dict, optional
        Mapping ``key -> weight`` that must cover every discovered model key
        and sum to 1. When omitted, every model gets the same weight.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    AssertionError
        If the number of AI and human columns differ, the weights do not sum
        to 1 (within 1e-6), or a discovered model key has no weight.

    Notes
    -----
    'final_pred_label' is 0 (AI) when the weighted AI probability is strictly
    greater than the weighted human probability and 1 (human) otherwise. Rows
    with no usable prediction get NaN probabilities and therefore label 1.
    """
    ai_prefix = 'prob_ai_'
    ai_cols = [col for col in df.columns if col.startswith(ai_prefix)]
    human_cols = [col for col in df.columns if col.startswith('prob_human_')]

    assert len(ai_cols) == len(human_cols), "Mismatch in number of AI and Human columns"

    # Strip only the leading prefix (str.replace would remove every occurrence).
    model_keys = [col[len(ai_prefix):] for col in ai_cols]

    # If no weights provided, use equal weighting
    if weights is None:
        weights = {key: 1 / len(model_keys) for key in model_keys}

    # Sanity check
    assert abs(sum(weights.values()) - 1.0) < 1e-6, "Weights must sum to 1"
    for key in model_keys:
        assert key in weights, f"Missing weight for model: {key}"

    n_rows = len(df)

    def _weighted_average(prob_prefix):
        """Return the per-row weighted mean of ``<prob_prefix>_<key>`` columns."""
        weighted_sum = np.zeros(n_rows, dtype=float)
        total_weight = np.zeros(n_rows, dtype=float)
        # Accumulate column by column, in model order, so each row sees the
        # same sequence of additions as a per-row Python loop would produce.
        for key in model_keys:
            col_name = f"{prob_prefix}_{key}"
            if col_name not in df.columns:
                # A missing column counts as "no prediction" for every row.
                continue
            values = df[col_name].to_numpy(dtype=float, na_value=np.nan)
            present = ~np.isnan(values)
            weighted_sum += np.where(present, values * weights[key], 0.0)
            total_weight += np.where(present, weights[key], 0.0)
        # Rows with no contributing weight yield NaN rather than dividing by 0.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where(total_weight > 0, weighted_sum / total_weight, np.nan)

    df['weighted_prob_ai'] = _weighted_average('prob_ai')
    df['weighted_prob_human'] = _weighted_average('prob_human')

    # Final prediction: 0 = AI, 1 = Human. Comparisons involving NaN are False,
    # so rows without any prediction fall through to label 1.
    with np.errstate(invalid='ignore'):
        ai_wins = df['weighted_prob_ai'].to_numpy() > df['weighted_prob_human'].to_numpy()
    df['final_pred_label'] = np.where(ai_wins, 0, 1)

    return df


In [28]:
# Columns shown when previewing the ensemble output.
preview_cols = ['base_filename', 'weighted_prob_ai', 'weighted_prob_human', 'final_pred_label']

# Option 1: equal weights across all models (no `weights` argument).
merged_df = apply_weighted_ensemble(merged_df)
print(merged_df[preview_cols].head())

# Option 2: hand-picked weights; these overwrite the equal-weight columns above.
custom_weights = {
    'mel': 0.2, 'lyrics': 0.2,
    'mfcc': 0.15, 'plp': 0.15, 'cqt': 0.15, 'chrm': 0.15,
}
merged_df = apply_weighted_ensemble(merged_df, weights=custom_weights)

# Preview results
print(merged_df[preview_cols].head())


                 base_filename  weighted_prob_ai  weighted_prob_human  \
0                        H279N          0.076384             0.923616   
1                       H8167N          0.057440             0.942560   
2            S4594RN_segment_1          0.959892             0.040108   
3   U524RN_segment_1_stretched          0.996961             0.003039   
4  U1301RN_segment_2_stretched          0.977369             0.022631   

   final_pred_label  
0                 1  
1                 1  
2                 0  
3                 0  
4                 0  
                 base_filename  weighted_prob_ai  weighted_prob_human  \
0                        H279N          0.071610             0.928390   
1                       H8167N          0.053850             0.946150   
2            S4594RN_segment_1          0.962399             0.037601   
3   U524RN_segment_1_stretched          0.997151             0.002849   
4  U1301RN_segment_2_stretched          0.978783             0.0

In [29]:
# Score the ensembled prediction against the ground truth; the label column
# from the mel-spectrogram frame is used as the reference.
y_true = merged_df['true_label_mel']
y_pred = merged_df['final_pred_label']

accuracy = accuracy_score(y_true, y_pred)
# average=None returns one value per class, ordered as given in `labels`.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=[0, 1], average=None
)

# Report overall accuracy followed by the per-class metrics.
print(f"Ensembled Model Accuracy: {accuracy:.4f}")
class_names = ('ai', 'human')
for i, label in enumerate(class_names):
    print(f"Precision for {label}: {precision[i]:.4f}")
    print(f"Recall for {label}: {recall[i]:.4f}")
    print(f"F1-score for {label}: {f1[i]:.4f}")

# Write a compact CSV with just the ensemble outputs, then the full frame.
ensembled_csv_file = "ensembled_predictions.csv"
export_columns = ['base_filename', 'weighted_prob_ai', 'weighted_prob_human', 'final_pred_label', 'true_label_mel']
merged_df[export_columns].to_csv(ensembled_csv_file, index=False)
merged_df.to_csv("full_csv.csv", index=False)

print(f"Ensembled results saved to {ensembled_csv_file}")

Ensembled Model Accuracy: 0.9704
Precision for ai: 0.9808
Recall for ai: 0.9630
F1-score for ai: 0.9718
Precision for human: 0.9590
Recall for human: 0.9787
F1-score for human: 0.9687
Ensembled results saved to ensembled_predictions.csv


In [30]:
# Define step size and base modalities
# `modalities` holds the keys that the weight dictionaries are built over.
modalities = ['mel', 'lyrics', 'mfcc', 'plp', 'cqt', 'chrm']
# NOTE(review): `step` is not read by the grid generation in this cell — the
# call to generate_weight_combinations passes its own literal step. Confirm
# which value is intended before relying on this constant.
step = 0.05

# Generate grid of weights summing to 1
def generate_weight_combinations(modalities, step=0.1):
    """Enumerate every weight assignment on a regular grid that sums to 1.

    Each weight is a non-negative multiple of ``step``. Instead of building the
    full Cartesian product and filtering it (which grows as
    ``(1/step + 1) ** len(modalities)``), the valid assignments are generated
    directly as integer compositions of ``round(1 / step)``, so only the
    combinations that are actually returned are ever created.

    Parameters
    ----------
    modalities : sequence of str
        Keys of the returned weight dictionaries.
    step : float, default 0.1
        Grid spacing. Must be positive and divide 1 evenly (within 1e-6).

    Returns
    -------
    list of dict
        One ``{modality: weight}`` dict per combination, in lexicographic
        order of the weight tuples. Empty when ``modalities`` is empty.

    Raises
    ------
    ValueError
        If ``step`` is not positive or does not evenly divide 1, in which case
        no grid point could sum to 1.
    """
    if not step > 0:
        raise ValueError(f"step must be positive, got {step!r}")
    n_steps = int(round(1.0 / step))
    if n_steps < 1 or abs(n_steps * step - 1.0) >= 1e-6:
        raise ValueError(f"step must evenly divide 1, got {step!r}")

    names = list(modalities)
    if not names:
        return []

    def _compositions(total, parts):
        """Yield all tuples of ``parts`` non-negative ints summing to ``total``."""
        if parts == 1:
            yield (total,)
            return
        for first in range(total + 1):
            for rest in _compositions(total - first, parts - 1):
                yield (first,) + rest

    # Working in integer units makes the sum-to-one condition exact; the float
    # weight is derived once per entry as units * step.
    return [
        {name: units * step for name, units in zip(names, combo)}
        for combo in _compositions(n_steps, len(names))
    ]

# Prepare grid
weight_combinations = generate_weight_combinations(modalities, step=0.1)

print(len(weight_combinations))
if not weight_combinations:
    raise ValueError("No weight combinations were generated; check `modalities` and the grid step")

# Prepare ground truth
y_true = merged_df['true_label_mel'].values  # Assumes 0 for AI, 1 for human

# NOTE(review): the weights are selected on the same rows they are scored on,
# so the metrics reported for the best weights are optimistic. Tune on a
# held-out validation split if an unbiased estimate is needed.

# Start below any attainable score so the first combination is always
# recorded and `best_weights` can never stay None.
best_score = -np.inf
best_weights = None
best_metrics = None

for weights in tqdm(weight_combinations, desc="Searching best weights"):

    # apply_weighted_ensemble writes its result columns into the frame it is
    # given, so work on a copy to leave `merged_df` untouched during the search.
    df_copy = merged_df.copy()
    df_copy = apply_weighted_ensemble(df_copy, weights)

    y_pred = df_copy['final_pred_label'].values
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

    # Metric being optimized: accuracy (assign `f1` here to optimize macro F1 instead).
    score = accuracy
    # Strict '>' keeps the first combination among ties.
    if score > best_score:
        best_score = score
        best_weights = weights
        best_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

# Show best weights and metrics
print("\n🎯 Best Ensemble Weights:")
for k, v in best_weights.items():
    print(f"{k}: {v:.2f}")
print("\n📊 Metrics for Best Weights:")
for k, v in best_metrics.items():
    print(f"{k.capitalize()}: {v:.4f}")

3003


Searching best weights: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3003/3003 [11:08<00:00,  4.49it/s]


🎯 Best Ensemble Weights:
mel: 0.30
lyrics: 0.40
mfcc: 0.20
plp: 0.00
cqt: 0.10
chrm: 0.00

📊 Metrics for Best Weights:
Accuracy: 0.9777
Precision: 0.9773
Recall: 0.9782
F1: 0.9777





In [31]:
# Persist the winning weights as indented JSON.
with open("best_ensemble_weights.json", "w") as weights_file:
    weights_file.write(json.dumps(best_weights, indent=4))

print("✅ Best weights saved to best_ensemble_weights.json")

# Recompute the ensemble columns on the main frame using the best weights.
merged_df = apply_weighted_ensemble(merged_df, weights=best_weights)

# Compact export: only the key prediction outputs plus the reference label.
ensembled_csv_file = "best_weights_ensembled_predictions.csv"
key_output_columns = ['base_filename', 'weighted_prob_ai', 'weighted_prob_human', 'final_pred_label', 'true_label_mel']
merged_df[key_output_columns].to_csv(ensembled_csv_file, index=False)

# Full export: every column of the merged frame.
merged_df.to_csv("full_ensembled_output.csv", index=False)

print(f"✅ Final ensembled predictions saved to {ensembled_csv_file}")
print(f"📁 Full data (with all features) saved to full_ensembled_output.csv")


✅ Best weights saved to best_ensemble_weights.json
✅ Final ensembled predictions saved to best_weights_ensembled_predictions.csv
📁 Full data (with all features) saved to full_ensembled_output.csv
