# Install

In [None]:
!pip install music21 --quiet
!pip install pretty_midi --quiet
!pip install scikit-learn --quiet

# Import

In [None]:
import music21 as m21
import pretty_midi
import numpy as np
import os
import collections
import math
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from collections import OrderedDict

# Prediction

In [None]:
def detect_bpm_and_extract_features(filepath):
    """
    Parse a symbolic music file, determine its tempo, and return the basic
    data structures the feature extractors work on.

    The tempo is read from the first metronome mark in the score that carries
    a numeric value. If there is none, the original estimator call is
    attempted and, when that API is unavailable, a default of 120 BPM is used
    so the remaining features can still be extracted.

    Args:
        filepath (str): The path to the MIDI or MusicXML file.

    Returns:
        tuple: (detected_bpm, note_data_stream, total_duration_quarter_notes, full_score).
            If anything goes wrong the error is printed and
            (None, None, 0, None) is returned instead of raising.
    """
    # Last-resort tempo. 120 BPM is the tempo conventionally assumed for a
    # MIDI file that contains no tempo event.
    fallback_bpm = 120.0

    try:
        # 1. Parse the file using music21
        score = m21.converter.parse(filepath)

        # Flatten once and reuse the result for both the note stream and the
        # tempo lookup (the deprecated `.flat` property is no longer used).
        flat_score = score.flatten()
        note_data_stream = flat_score.notesAndRests

        print(f"Successfully parsed file: {os.path.basename(filepath)}")

        # --- 1.2 BPM Detection ---

        # A. Metadata extraction: first metronome mark with a numeric value.
        #    Marks whose `number` is None are skipped because they cannot be
        #    used (or formatted) as a BPM value.
        tempo_marks = flat_score.getElementsByClass(m21.tempo.MetronomeMark)
        detected_bpm = next(
            (mark.number for mark in tempo_marks if mark.number is not None),
            None,
        )

        if detected_bpm is not None:
            print(f"  -> BPM (from Metadata): {detected_bpm:.2f}")
        else:
            # B. No usable tempo mark.
            # NOTE(review): music21 does not appear to provide
            # `analysis.simple.getEstimatedTempo` -- confirm. The call is kept
            # for backward compatibility but guarded, so a missing API falls
            # back to the default tempo instead of aborting the analysis.
            try:
                estimated_tempo = m21.analysis.simple.getEstimatedTempo(score)
                detected_bpm = estimated_tempo.number
                print(f"  -> BPM (Estimated using simple analysis): {detected_bpm:.2f}")
            except AttributeError:
                detected_bpm = fallback_bpm
                print(f"  -> BPM (no tempo mark found, using default): {detected_bpm:.2f}")

        # --- Data Preparation for other Features ---

        # Count sounding events (notes and chords); rests are excluded.
        note_count = sum(1 for e in note_data_stream if e.isNote or e.isChord)

        # Total duration of the piece in quarter lengths (beats)
        total_duration_quarter_notes = score.duration.quarterLength

        print(f"  -> Total notes/chords extracted: {note_count}")

        return detected_bpm, note_data_stream, total_duration_quarter_notes, score

    except Exception as e:
        # Best-effort boundary: report the problem and hand back the sentinel
        # tuple that extract_all_features() checks for.
        print(f"An error occurred during analysis: {e}")
        return None, None, 0, None

# ----------------------------------------------------------------------
# STEP 2: HARMONIC AND PITCH FEATURES (Helper Functions)
# ----------------------------------------------------------------------

def get_pitch_features(note_data_stream):
    """
    Summarize the pitch content of a stream of notes, chords and rests.

    Every note contributes its MIDI pitch number and every chord contributes
    one number per chord tone; anything else is ignored.

    Returns:
        dict: 'average_pitch' (mean MIDI number) and 'pitch_range'
        (highest minus lowest MIDI number). Both are 0 when the stream has
        no pitched material.
    """
    midi_numbers = []

    for item in note_data_stream:
        if item.isNote:
            midi_numbers.append(item.pitch.midi)
            continue
        if item.isChord:
            midi_numbers.extend(tone.midi for tone in item.pitches)

    # Nothing pitched (empty stream or rests only): report neutral values.
    if len(midi_numbers) == 0:
        return {'average_pitch': 0, 'pitch_range': 0}

    lowest = np.min(midi_numbers)
    highest = np.max(midi_numbers)

    return {
        'average_pitch': np.mean(midi_numbers),
        'pitch_range': highest - lowest
    }

def get_harmonic_features(score):
    """
    Calculates tonality features from the score's key analysis.

    Args:
        score: Object exposing `analyze('key')`; the result must provide
            `correlationCoefficient` and `mode`.

    Returns:
        dict: 'key_stability' (the correlation coefficient reported by the
        key analysis) and 'mode_major' (1 when the detected mode is
        'major', 0 for minor or anything else).
    """
    # Tonal center (key detection)
    key_analysis = score.analyze('key')

    # The key's name is deliberately not returned: downstream ML code needs
    # numeric features, not strings.
    return {
        'key_stability': key_analysis.correlationCoefficient,
        'mode_major': 1 if key_analysis.mode == 'major' else 0,
    }

# ----------------------------------------------------------------------
# STEP 3: RHYTHMIC FEATURES (Helper Function)
# ----------------------------------------------------------------------

def get_rhythmic_features(note_data_stream, total_duration_quarter_notes):
    """
    Measure how dense and how rhythmically varied a stream is.

    Only notes and chords are considered; rests are skipped.

    Returns:
        dict: 'note_density' (notes/chords per quarter note, 0 when the total
        duration is not positive) and 'rhythm_variety' (Shannon entropy, in
        bits, of the distribution of note/chord durations). Both are 0 when
        the stream contains no notes or chords.
    """
    # Quarter-length duration of every sounding event, in stream order.
    sounding_lengths = [
        item.duration.quarterLength
        for item in note_data_stream
        if item.isNote or item.isChord
    ]

    if len(sounding_lengths) == 0:
        return {'note_density': 0, 'rhythm_variety': 0}

    event_total = len(sounding_lengths)

    # 1. Density: events per beat, guarding against a zero-length piece.
    if total_duration_quarter_notes > 0:
        density = event_total / total_duration_quarter_notes
    else:
        density = 0

    # 2. Variety: entropy over how often each distinct duration occurs.
    entropy = 0
    for occurrences in collections.Counter(sounding_lengths).values():
        share = occurrences / event_total
        entropy -= share * math.log2(share)

    return {
        'note_density': density,
        'rhythm_variety': entropy
    }


# ----------------------------------------------------------------------
# STEP 4: DATA CONSOLIDATION AND PREPARATION
# This function combines all feature extractions into one, clean dictionary.
# ----------------------------------------------------------------------

def extract_all_features(filepath):
    """
    Run the complete feature-extraction pipeline on one file.

    Args:
        filepath (str): The path to the MIDI file.

    Returns:
        dict: All numeric features merged into one dictionary ('bpm' plus
        the pitch, harmonic and rhythmic features), or None when the core
        data could not be obtained or the piece has zero duration.
    """
    # Core data: tempo, flattened note stream, length in quarter notes, score.
    core = detect_bpm_and_extract_features(filepath)
    tempo_bpm, notes_and_rests, length_in_quarters, parsed_score = core

    core_missing = (
        tempo_bpm is None
        or notes_and_rests is None
        or parsed_score is None
    )
    if core_missing or length_in_quarters == 0:
        print("Error: Core data extraction failed or file is empty.")
        return None

    # Compute each feature group, then merge them in a fixed order.
    pitch_part = get_pitch_features(notes_and_rests)
    harmonic_part = get_harmonic_features(parsed_score)
    rhythmic_part = get_rhythmic_features(notes_and_rests, length_in_quarters)

    combined = {'bpm': tempo_bpm}
    for feature_group in (pitch_part, harmonic_part, rhythmic_part):
        combined.update(feature_group)

    return combined


# ----------------------------------------------------------------------
# FINAL EXECUTION BLOCK (Updated to run all steps)
# ----------------------------------------------------------------------

# 1. <<< --- CHANGE THIS LINE FOR EVERY NEW FILE --- >>>
#    Point this at the MIDI file you uploaded and want to analyze.
midi_file_path = 'sound2.mid' # YOUR NEW FILE GOES HERE

# Nothing at that path: write a tiny C-major demo score to it so the
# pipeline below still has something to analyze.
if not os.path.exists(midi_file_path):
    print("Creating a mock MIDI file for demonstration...")
    mock_score = m21.stream.Score()
    mock_score.append(m21.key.Key('C', 'major'))
    mock_score.insert(m21.tempo.MetronomeMark(number=120))
    part = m21.stream.Part()
    for demo_element in (
        m21.note.Note('C4', quarterLength=1.0),
        m21.note.Note('E4', quarterLength=0.5),
        m21.note.Note('F4', quarterLength=0.5),
        m21.chord.Chord(['G4', 'C5'], quarterLength=2.0),
    ):
        part.append(demo_element)
    mock_score.append(part)
    mock_score.write('midi', fp=midi_file_path)
    print(f"Mock file '{midi_file_path}' created with BPM 120 (C Major).")


# Run the full feature extraction pipeline
final_features = extract_all_features(midi_file_path)

if final_features:
    # Remember which file these features came from.
    final_features['original_file'] = midi_file_path

    # 2. Publish the result under a global name; the recommendation code
    #    further down looks for LAST_ANALYZED_FEATURES in globals().
    global LAST_ANALYZED_FEATURES
    LAST_ANALYZED_FEATURES = final_features

    print("\n\n####################################################")
    print("FINAL CONSOLIDATED FEATURES (READY FOR ML):")
    for key, value in final_features.items():
        label = key.replace('_', ' ').title()
        if isinstance(value, float):
            # Floats are shown with four decimals.
            rendered = f"{value:.4f}"
        elif key == 'original_file':
            rendered = f"{value} (USED FOR ANALYSIS)"
        else:
            rendered = f"{value}"
        print(f"- {label:<20}: {rendered}")
    print("####################################################")

    print(f"\nFeatures for '{midi_file_path}' successfully stored globally.")
    print("Now run the 'Song Recommendation Engine' cell to get recommendations.")

In [None]:
# Reference library that recommend_songs() searches for nearest neighbours.
#
# Each entry is a 9-tuple:
#   (title, artist, bpm, average_pitch, pitch_range,
#    key_stability, mode_major, note_density, rhythm_variety)
#
# recommend_songs() uses positions 0-1 as metadata and slices positions 2+ as
# the feature vector, so the seven numbers must stay in exactly the order of
# its FEATURE_COLUMNS list.
# NOTE(review): the numbers look like hand-entered approximations rather than
# values measured from files -- confirm before relying on them.
REAL_SONG_LIBRARY = [
    # Classical/Chill Cluster (Low BPM, High Stability, Low Density)
    ("Clair de Lune", "Debussy", 60, 65.0, 35.0, 0.95, 0, 0.7, 1.2),
    ("Gymnopédie No. 1", "Satie", 80, 50.0, 20.0, 0.90, 0, 0.5, 1.0),
    ("The Four Seasons: Spring", "Vivaldi", 130, 70.0, 45.0, 0.88, 1, 2.8, 3.5),
    ("Canon in D", "Pachelbel", 60, 55.0, 25.0, 0.98, 1, 1.0, 1.5),
    ("Für Elise", "Beethoven", 130, 68.0, 38.0, 0.85, 0, 1.5, 2.2),

    # Pop/Rock Cluster (Mid BPM, Mid Range, Major Mode)
    ("Can't Stop The Feeling!", "Timberlake", 113, 56.0, 30.0, 0.70, 1, 2.5, 2.0),
    ("Imagine", "Lennon", 75, 52.0, 25.0, 0.80, 1, 1.0, 1.5),
    ("A Thousand Miles", "Carlton", 95, 60.0, 30.0, 0.78, 1, 1.8, 1.8),
    ("Happy", "Pharrell Williams", 160, 54.0, 28.0, 0.65, 1, 3.0, 2.5),
    ("Shape of You", "Ed Sheeran", 96, 50.0, 22.0, 0.72, 0, 2.2, 1.6),
    ("Blinding Lights", "The Weeknd", 171, 58.0, 35.0, 0.60, 0, 3.5, 3.0),

    # Jazz/Electronic/Fast Cluster (High BPM, High Variety/Density, Lower Stability)
    ("Take Five", "Dave Brubeck", 170, 60.5, 45.0, 0.50, 0, 3.5, 4.0),
    ("So What", "Miles Davis", 140, 65.0, 55.0, 0.55, 1, 3.0, 4.5),
    ("Rhapsody in Blue", "Gershwin", 100, 72.0, 50.0, 0.65, 1, 2.8, 3.8),
    ("Strobe", "deadmau5", 128, 55.0, 30.0, 0.58, 0, 3.2, 3.2),
    ("Levels", "Avicii", 126, 62.0, 40.0, 0.62, 1, 3.8, 3.5),
    ("Aerodynamic", "Daft Punk", 120, 68.0, 42.0, 0.53, 0, 4.0, 4.2),

    # World/Diverse Cluster
    ("Libertango", "Piazzolla", 120, 65.0, 35.0, 0.70, 0, 2.5, 3.0),
    ("Oye Como Va", "Santana", 128, 55.0, 30.0, 0.68, 1, 3.0, 3.5),
    ("The Girl from Ipanema", "Jobim", 115, 58.0, 25.0, 0.75, 1, 1.5, 2.0)
]

# --- Step 1: Define the Recommendation Model Function ---

def recommend_songs(input_features, k=5):
    """
    Find the songs in REAL_SONG_LIBRARY that are closest to the input song.

    Library features are standardized, the input vector is scaled with the
    same scaler, and the nearest library entries by Euclidean distance are
    returned.

    Args:
        input_features (dict): Must contain 'bpm', 'average_pitch',
            'pitch_range', 'key_stability', 'mode_major', 'note_density'
            and 'rhythm_variety'. An optional 'original_file' entry is used
            only for the log output.
        k (int): Maximum number of recommendations. Values larger than the
            library are clamped to the library size; values <= 0 (or an
            empty library) yield an empty list.

    Returns:
        list[dict]: Up to k dicts with 'title', 'artist' and
        'similarity_score' (0-100, higher means closer), best match first.

    Raises:
        KeyError: If a required feature is missing from input_features.
    """
    # Nothing requested: answer immediately instead of letting scikit-learn
    # reject n_neighbors <= 0.
    if k <= 0:
        return []

    # The filename is metadata only; it never enters the feature vector.
    song_name = input_features.get('original_file', 'The Analyzed Song')

    # 1. Feature columns (MUST match the order of the numbers in each
    #    REAL_SONG_LIBRARY tuple).
    FEATURE_COLUMNS = [
        'bpm', 'average_pitch', 'pitch_range',
        'key_stability', 'mode_major',
        'note_density', 'rhythm_variety'
    ]

    # Report every missing feature at once rather than failing on the first.
    missing = [col for col in FEATURE_COLUMNS if col not in input_features]
    if missing:
        raise KeyError(f"input_features is missing required keys: {missing}")

    # 2. Split the library into the feature matrix and (Title, Artist) metadata.
    X_library = np.array([song[2:] for song in REAL_SONG_LIBRARY], dtype=float)
    Y_metadata = [(song[0], song[1]) for song in REAL_SONG_LIBRARY]

    print(f"\n--- Starting Song Recommendation Engine (Step 5) for: {song_name} ---")
    print(f"--- Library Size: {len(Y_metadata)} real songs loaded ---")

    if not Y_metadata:
        return []

    # 3. Scale the data so no single feature (e.g. BPM) dominates the distance.
    scaler = StandardScaler()
    X_library_scaled = scaler.fit_transform(X_library)

    # 4. Scale the input song with the statistics learned from the library.
    input_vector = [input_features[col] for col in FEATURE_COLUMNS]
    X_input_scaled = scaler.transform([input_vector])

    # 5. Find nearest neighbors. kneighbors() cannot return more neighbors
    #    than there are fitted samples, so clamp the request.
    n_neighbors = min(k, len(Y_metadata))
    nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    nn_model.fit(X_library_scaled)
    distances, indices = nn_model.kneighbors(X_input_scaled)

    # 6. Turn (index, distance) pairs into recommendation records.
    recommendations = []
    for song_index, distance in zip(indices[0], distances[0]):
        title, artist = Y_metadata[song_index]

        # Simple 0-100 score: lower distance (closer match) means a higher
        # score. The factor 8 keeps scores in a readable range for this
        # library; the floor at 0 avoids negative percentages.
        similarity_score = max(0, 100 - (distance * 8))

        recommendations.append({
            'title': title,
            'artist': artist,
            'similarity_score': similarity_score
        })

    # Neighbors already arrive nearest-first; the sort makes the ordering
    # guarantee explicit.
    recommendations.sort(key=lambda x: x['similarity_score'], reverse=True)

    return recommendations

# ----------------------------------------------------------------------
# FINAL EXECUTION BLOCK
# ----------------------------------------------------------------------

# 1. DECIDE WHICH FEATURE SET TO RECOMMEND FOR
if 'LAST_ANALYZED_FEATURES' not in globals():
    # The feature extractor has not stored anything yet, so fall back to a
    # generic stand-in profile.
    print("WARNING: Global features not found. Using a default mock Pop/Rock song profile.")
    your_song_features = {
        'bpm': 100.0,
        'average_pitch': 60.0,
        'pitch_range': 25.0,
        'key_stability': 0.80,
        'mode_major': 1,
        'note_density': 1.5,
        'rhythm_variety': 2.0,
        'original_file': 'Default Pop/Rock Profile'
    }
    file_analyzed = your_song_features['original_file']
else:
    # Reuse whatever the feature extractor stored most recently.
    your_song_features = LAST_ANALYZED_FEATURES
    file_analyzed = your_song_features.get('original_file', 'The Last Analyzed Song')
    print(f"Successfully loaded features for: {file_analyzed}")


if your_song_features:
    # 2. Ask the recommendation engine for the five closest songs (k=5)
    top_recommendations = recommend_songs(your_song_features, k=5)

    print("\n####################################################")
    print(f"Recommendations for '{file_analyzed}':")
    print("####################################################")

    if not top_recommendations:
        print("Could not find any recommendations.")
    else:
        # Output format: Title by Artist (Score%)
        for rec in top_recommendations:
            print(f"- {rec['title']} by {rec['artist']} ({rec['similarity_score']:.0f}%)")