In [2]:
import pandas as pd

In [5]:
# Load the predictions from the two CSV files
#music_csv = "/data/sg2121/aimusicdetector/music_cnn/music_test_predictions.csv"
#lyric_csv = "/data/sg2121/aimusicdetector/lyric_detection/lyrics_test_predictions.csv"

mel_spec_csv = "/data/sg2121/aimusicdetector/music_cnn/large/mel-spec/mel-spec_test_large_predictions.csv"
cqt_csv = "/data/sg2121/aimusicdetector/music_cnn/large/cqt/cqt_test_large_predictions.csv"
mfcc_csv = "/data/sg2121/aimusicdetector/music_cnn/large/mfcc/mfcc_test_large_predictions.csv"
plp_csv = "/data/sg2121/aimusicdetector/music_cnn/large/plp/plp_test_large_predictions.csv"
chrm_csv = "/data/sg2121/aimusicdetector/music_cnn/large/chromagram/chromagram_test_large_predictions.csv"
clean_lyrics_csv = "/data/sg2121/aimusicdetector/lyric_detection/large/clean_lyrics_test_large_predictions.csv"

# Read the CSV files into pandas DataFrames
#df_music = pd.read_csv(music_csv)
#df_lyrics = pd.read_csv(lyric_csv)

df_mel = pd.read_csv(mel_spec_csv)
df_clean_lyrics = pd.read_csv(clean_lyrics_csv)
df_mfcc = pd.read_csv(mfcc_csv)
df_plp = pd.read_csv(plp_csv)
df_cqt = pd.read_csv(cqt_csv)
df_chrm = pd.read_csv(chrm_csv)

#df_music['base_filename'] = df_music['filename'].str.replace(r'-Mel_Spectrogram\.png$', '', regex=True)
#df_lyrics['base_filename'] = df_lyrics['filename'].str.replace(r'_lyrics\.txt$', '', regex=True)

df_mel['base_filename'] = df_mel['filename'].str.replace(r'-Mel_Spectrogram\.png$', '', regex=True)
df_clean_lyrics['base_filename'] = df_clean_lyrics['filename'].str.replace(r'_lyrics\.txt$', '', regex=True)
df_mfcc['base_filename'] = df_mfcc['filename'].str.replace(r'-MFCC\.png$', '', regex=True)
df_plp['base_filename'] = df_plp['filename'].str.replace(r'_PLP\.png$', '', regex=True)
df_cqt['base_filename'] = df_cqt['filename'].str.replace(r'-CQT\.png$', '', regex=True)
df_chrm['base_filename'] = df_chrm['filename'].str.replace(r'-Chromagram\.png$', '', regex=True)

df_mel.head()


Unnamed: 0,filename,prob_ai,prob_human,true_label,pred_label,base_filename
0,S5304RN_segment_1-Mel_Spectrogram.png,0.835666,0.164334,0,0,S5304RN_segment_1
1,S4865RN_segment_1-Mel_Spectrogram.png,0.968372,0.031628,0,0,S4865RN_segment_1
2,H2884N-Mel_Spectrogram.png,0.369995,0.630006,1,1,H2884N
3,S1553RN_segment_1-Mel_Spectrogram.png,0.995993,0.004007,0,0,S1553RN_segment_1
4,S171RN_segment_2-Mel_Spectrogram.png,0.99096,0.00904,0,0,S171RN_segment_2


In [8]:
def rename_columns(df, suffix):
    return df.rename(columns={col: f"{col}{suffix}" for col in df.columns if col != 'base_filename'})

# Add suffixes to avoid column name clashes
df_mel = rename_columns(df_mel, '_mel')
df_clean_lyrics = rename_columns(df_clean_lyrics, '_lyrics')
df_mfcc = rename_columns(df_mfcc, '_mfcc')
df_plp = rename_columns(df_plp, '_plp')
df_cqt = rename_columns(df_cqt, '_cqt')
df_chrm = rename_columns(df_chrm, '_chrm')

# Merge sequentially on 'base_filename'
merged_df = df_mel
for df in [df_clean_lyrics, df_mfcc, df_plp, df_cqt, df_chrm]:
    merged_df = pd.merge(merged_df, df, on='base_filename')

# Optional: view result
print(merged_df.head())


                            filename_mel  prob_ai_mel  prob_human_mel  \
0  S4865RN_segment_1-Mel_Spectrogram.png     0.968372        0.031628   
1   S171RN_segment_2-Mel_Spectrogram.png     0.990960        0.009040   
2  S3184RN_segment_1-Mel_Spectrogram.png     0.742062        0.257938   
3             H4723N-Mel_Spectrogram.png     0.210246        0.789754   
4    U116R_segment_1-Mel_Spectrogram.png     0.762164        0.237836   

   true_label_mel  pred_label_mel      base_filename  \
0               0               0  S4865RN_segment_1   
1               0               0   S171RN_segment_2   
2               0               0  S3184RN_segment_1   
3               1               1             H4723N   
4               0               0    U116R_segment_1   

                filename_lyrics  prob_ai_lyrics  prob_human_lyrics  \
0  S4865RN_segment_1_lyrics.txt        0.813849           0.186151   
1   S171RN_segment_2_lyrics.txt        0.998098           0.001902   
2  S3184RN_seg

In [9]:
def apply_weighted_ensemble(df, weights=None):
    # Find all prob columns for AI and Human
    ai_cols = [col for col in df.columns if col.startswith('prob_ai_')]
    human_cols = [col for col in df.columns if col.startswith('prob_human_')]
    
    assert len(ai_cols) == len(human_cols), "Mismatch in number of AI and Human columns"
    
    model_keys = [col.replace('prob_ai_', '') for col in ai_cols]
    
    # If no weights provided, use equal weighting
    if weights is None:
        weights = {key: 1 / len(model_keys) for key in model_keys}
    
    # Sanity check
    assert abs(sum(weights.values()) - 1.0) < 1e-6, "Weights must sum to 1"
    for key in model_keys:
        assert key in weights, f"Missing weight for model: {key}"
    
    # Weighted sum of probabilities
    df['weighted_prob_ai'] = sum(df[f'prob_ai_{key}'] * weights[key] for key in model_keys)
    df['weighted_prob_human'] = sum(df[f'prob_human_{key}'] * weights[key] for key in model_keys)
    
    # Final prediction: 0 = AI, 1 = Human
    df['final_pred_label'] = df.apply(
        lambda row: 0 if row['weighted_prob_ai'] > row['weighted_prob_human'] else 1,
        axis=1
    )
    
    return df


In [10]:
# Option 1: Use equal weights
merged_df = apply_weighted_ensemble(merged_df)

# Option 2: Use custom weights
custom_weights = {
    'mel': 0.2,
    'lyrics': 0.2,
    'mfcc': 0.15,
    'plp': 0.15,
    'cqt': 0.15,
    'chrm': 0.15
}
merged_df = apply_weighted_ensemble(merged_df, weights=custom_weights)

# Preview results
print(merged_df[['base_filename', 'weighted_prob_ai', 'weighted_prob_human', 'final_pred_label']].head())


       base_filename  weighted_prob_ai  weighted_prob_human  final_pred_label
0  S4865RN_segment_1          0.880186             0.119814                 0
1   S171RN_segment_2          0.969434             0.030566                 0
2  S3184RN_segment_1          0.855159             0.144841                 0
3             H4723N          0.346350             0.653650                 1
4    U116R_segment_1          0.807509             0.192491                 0


In [10]:

# Optional: Calculate accuracy, precision, recall, etc. based on the final prediction
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

y_true = merged_df['true_label_mel']
y_pred = merged_df['final_pred_label']

accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0, 1])

# Log the results
print(f"Ensembled Model Accuracy: {accuracy:.4f}")
for i, label in enumerate(['ai', 'human']):
    print(f"Precision for {label}: {precision[i]:.4f}")
    print(f"Recall for {label}: {recall[i]:.4f}")
    print(f"F1-score for {label}: {f1[i]:.4f}")

# Optionally save the ensembled results to a new CSV file
ensembled_csv_file = "ensembled_predictions.csv"
merged_df[['base_filename', 'weighted_prob_ai', 'weighted_prob_human', 'final_pred_label']].to_csv(ensembled_csv_file, index=False)

print(f"Ensembled results saved to {ensembled_csv_file}")

Ensembled Model Accuracy: 0.9219
Precision for ai: 1.0000
Recall for ai: 0.8276
F1-score for ai: 0.9057
Precision for human: 0.8750
Recall for human: 1.0000
F1-score for human: 0.9333
Ensembled results saved to ensembled_predictions.csv


In [None]:
# OLD CODE BELOW

In [7]:
#merged_df = pd.merge(df_music, df_lyrics, on='base_filename', suffixes=('_mel', '_lyrics'))
merged_df = pd.merge(df_mel, df_clean_lyrics, df_mfcc, df_plp, df_cqt, df_chrm, on='base_filename', suffixes=('_mel', '_lyrics', '_mfcc,', '_plp', '_cqt', '_chrm'))
merged_df.head()

TypeError: merge() got multiple values for argument 'on'

In [9]:
# Define the weights for each model
weight_model1 = 0.6 # music model
weight_model2 = 0.4 # lyric model

# Calculate the weighted averages of the probabilities for both AI and Human classes
merged_df['weighted_prob_ai'] = (merged_df['prob_ai_mel'] * weight_model1 +
                                  merged_df['prob_ai_lyrics'] * weight_model2)

merged_df['weighted_prob_human'] = (merged_df['prob_human_mel'] * weight_model1 +
                                     merged_df['prob_human_lyrics'] * weight_model2)

# Decide the final predicted class based on the weighted average probabilities
merged_df['final_pred_label'] = merged_df.apply(
    lambda row: 0 if row['weighted_prob_ai'] > row['weighted_prob_human'] else 1,
    axis=1
)
merged_df.head()

Unnamed: 0,filename_mel,prob_ai_mel,prob_human_mel,true_label_mel,pred_label_mel,base_filename,filename_lyrics,prob_ai_lyrics,prob_human_lyrics,true_label_lyrics,pred_label_lyrics,weighted_prob_ai,weighted_prob_human,final_pred_label
0,H199N-Mel_Spectrogram.png,0.247414,0.752586,1,1,H199N,H199N_lyrics.txt,0.138137,0.861863,1,1,0.203703,0.796297,1
1,H405N-Mel_Spectrogram.png,0.143636,0.856364,1,1,H405N,H405N_lyrics.txt,0.356109,0.643892,1,1,0.228625,0.771375,1
2,H396N-Mel_Spectrogram.png,0.194314,0.805686,1,1,H396N,H396N_lyrics.txt,0.021202,0.978798,1,1,0.125069,0.874931,1
3,U63RN_segment_2-Mel_Spectrogram.png,0.848963,0.151037,0,0,U63RN_segment_2,U63RN_segment_2_lyrics.txt,0.467536,0.532464,0,1,0.696392,0.303608,0
4,H361N-Mel_Spectrogram.png,0.042401,0.957599,1,1,H361N,H361N_lyrics.txt,0.345887,0.654113,1,1,0.163796,0.836204,1
