
## Task 3 Part d)

In [None]:
 # Part D: Audio Analysis
    # Extract per-anthem audio features from the MP3 files in ANTHEM_AUDIO_DIR,
    # then summarise, plot, and cluster them.
    audio_features = load_anthem_audio()
    # NOTE(review): analyze_anthem_audio returns None when audio_features is
    # empty, so audio_df may be None from here on.
    audio_df = analyze_anthem_audio(audio_features)
    
    # Part E: Multimodal Analysis
    # NOTE(review): processed_texts and tfidf_df are not defined in this cell;
    # presumably they come from earlier parts of the notebook -- confirm.
    # Also confirm analyze_multimodal_correlations tolerates audio_df being None.
    analyze_multimodal_correlations(processed_texts, audio_df, tfidf_df)
    

In [None]:
# Relative path of the directory that load_anthem_audio() searches
# (non-recursively) for "*.mp3" files; each file's base name is used as the
# country name of the anthem.
ANTHEM_AUDIO_DIR = "anthem_audio"     # Folder containing anthem audio files


In [None]:
#################################################################
# PART D: AUDIO ANALYSIS
#################################################################

def load_anthem_audio():
    """Extract summary audio features for every anthem MP3 in ANTHEM_AUDIO_DIR.

    Every ``*.mp3`` file directly inside ``ANTHEM_AUDIO_DIR`` is decoded with
    librosa at its native sample rate (``sr=None``) and reduced to a flat
    dict of scalar features:

    * ``tempo`` -- tempo estimate from ``librosa.beat.beat_track``
    * ``spectral_centroid_mean`` / ``spectral_centroid_std``
    * ``spectral_rolloff_mean`` / ``spectral_rolloff_std``
    * ``mfcc{k}_mean`` / ``mfcc{k}_std`` for each of the 13 MFCCs
    * ``chroma{k}`` -- time-averaged value of each chroma bin

    Files that cannot be loaded or analysed are reported on stdout and
    skipped, so one corrupt recording does not abort the whole run.

    Returns:
        dict: Maps country name (the file name without its extension) to
        that anthem's feature dict. Empty if no file could be processed.
    """
    n_mfcc = 13
    audio_features = {}

    # glob returns files in filesystem order; sort so the resulting feature
    # table (and anything derived from its row order) is reproducible.
    mp3_paths = sorted(glob.glob(os.path.join(ANTHEM_AUDIO_DIR, "*.mp3")))

    for file_path in mp3_paths:
        try:
            # splitext removes only the trailing extension, whereas
            # str.replace('.mp3', '') would also strip '.mp3' mid-name.
            country_name = os.path.splitext(os.path.basename(file_path))[0]

            # Load audio file at its native sample rate
            y, sr = librosa.load(file_path, sr=None)

            # Tempo (beat frame positions are not needed here)
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

            # Spectral features
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]

            # MFCCs and chroma features
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)

            features = {
                # Newer librosa versions return tempo as a 1-element array;
                # normalise to a plain float so the value stays numeric and
                # can be formatted with ':.2f' downstream.
                'tempo': float(np.atleast_1d(tempo)[0]),
                'spectral_centroid_mean': np.mean(spectral_centroids),
                'spectral_centroid_std': np.std(spectral_centroids),
                'spectral_rolloff_mean': np.mean(spectral_rolloff),
                'spectral_rolloff_std': np.std(spectral_rolloff),
            }

            # Per-coefficient MFCC statistics (one row per coefficient)
            for i, coefficient in enumerate(mfccs, start=1):
                features[f'mfcc{i}_mean'] = np.mean(coefficient)
                features[f'mfcc{i}_std'] = np.std(coefficient)

            # Time-averaged value of each chroma bin
            for i, chroma_mean in enumerate(np.mean(chroma, axis=1), start=1):
                features[f'chroma{i}'] = chroma_mean

            audio_features[country_name] = features

        except Exception as e:
            # Deliberately best-effort: report the failing file and continue.
            print(f"Error processing {file_path}: {e}")

    print(f"Loaded audio features for {len(audio_features)} anthems.")
    return audio_features

def analyze_anthem_audio(audio_features, n_clusters=5):
    """Summarise, visualise, and cluster the extracted anthem audio features.

    Steps performed (printed output goes to stdout, files go to RESULTS_DIR):

    1. Tempo statistics and the fastest / slowest anthem.
    2. ``anthem_audio_features.csv`` -- the raw feature table.
    3. ``audio_correlation.png`` -- heatmap of feature correlations.
    4. ``anthem_audio_pca.png`` -- 2-component PCA scatter of the z-scored
       features (skipped when fewer than 2 anthems are available).
    5. ``anthem_audio_clusters.csv`` -- KMeans cluster assignment per anthem.
    6. The five features with the largest absolute loading on each
       principal component.

    Args:
        audio_features: Dict mapping country name to a dict of scalar
            features (must contain a ``'tempo'`` entry), as returned by
            ``load_anthem_audio``.
        n_clusters: Requested number of KMeans clusters. It is capped at the
            number of anthems, because KMeans cannot fit more clusters than
            samples.

    Returns:
        pandas.DataFrame: The un-normalised feature table indexed by country
        name, or ``None`` when ``audio_features`` is empty.
    """
    print("\n--- AUDIO ANALYSIS ---")

    if not audio_features:
        print("No audio features available for analysis.")
        return None

    # Convert to DataFrame (one row per anthem, one column per feature)
    audio_df = pd.DataFrame.from_dict(audio_features, orient='index')
    n_anthems = len(audio_df)

    # Make sure the output directory exists before the first write.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    # Basic statistics
    print("\nAudio feature statistics:")
    print(f"Average tempo: {audio_df['tempo'].mean():.2f} BPM")
    print(f"Tempo range: {audio_df['tempo'].min():.2f} - {audio_df['tempo'].max():.2f} BPM")

    # Find fastest and slowest anthems
    fastest = audio_df['tempo'].idxmax()
    slowest = audio_df['tempo'].idxmin()
    print(f"Fastest anthem: {fastest} ({audio_df.loc[fastest, 'tempo']:.2f} BPM)")
    print(f"Slowest anthem: {slowest} ({audio_df.loc[slowest, 'tempo']:.2f} BPM)")

    # Save audio features
    audio_df.to_csv(os.path.join(RESULTS_DIR, 'anthem_audio_features.csv'))

    # Correlation heatmap of audio features
    plt.figure(figsize=(12, 10))
    correlation = audio_df.corr()
    sns.heatmap(correlation, annot=False, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Between Audio Features')
    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, 'audio_correlation.png'))

    # Z-score the features. Constant columns (and every column when there is
    # a single anthem) produce NaN, which is replaced by 0.
    audio_df_norm = (audio_df - audio_df.mean()) / audio_df.std()
    feature_matrix = audio_df_norm.fillna(0)

    # PCA on audio features. Two components need at least two samples and
    # two features; otherwise scikit-learn raises, so skip the step instead.
    pca = None
    if min(feature_matrix.shape) >= 2:
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(feature_matrix)

        # Create PCA plot
        plt.figure(figsize=(12, 8))
        plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7)

        # Add country labels
        for i, country in enumerate(audio_df.index):
            plt.annotate(country, (pca_result[i, 0], pca_result[i, 1]),
                         fontsize=8, alpha=0.8)

        plt.title('PCA of National Anthem Audio Features')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, 'anthem_audio_pca.png'))
    else:
        print("\nSkipping PCA: at least 2 anthems and 2 features are required.")

    # Clustering analysis. KMeans requires n_clusters <= n_samples.
    effective_clusters = min(n_clusters, n_anthems)
    if effective_clusters < n_clusters:
        print(f"\nOnly {n_anthems} anthem(s) available; "
              f"using {effective_clusters} cluster(s) instead of {n_clusters}.")
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    clusters = kmeans.fit_predict(feature_matrix)

    # Create a DataFrame with clustering results
    cluster_df = pd.DataFrame({
        'Country': audio_df.index,
        'Cluster': clusters
    })

    # Print clusters (at most five member names per cluster)
    print("\nAnthem audio clusters:")
    for cluster_id in range(effective_clusters):
        countries = cluster_df[cluster_df['Cluster'] == cluster_id]['Country'].tolist()
        print(f"Cluster {cluster_id}: {', '.join(countries[:5])}{'...' if len(countries) > 5 else ''}")

    # Save clustering results
    cluster_df.to_csv(os.path.join(RESULTS_DIR, 'anthem_audio_clusters.csv'), index=False)

    # Feature importance analysis: rank features by absolute PCA loading
    if pca is not None:
        print("\nMost influential audio features for each principal component:")
        feature_names = audio_df.columns
        for i, component in enumerate(pca.components_):
            sorted_idx = np.argsort(np.abs(component))[::-1]
            top_features = [(feature_names[idx], component[idx]) for idx in sorted_idx[:5]]
            print(f"PC{i+1}: {top_features}")

    return audio_df