In [1]:
from moviepy.editor import *
from sqlalchemy import create_engine
from datetime import datetime
import pandas as pd
import os
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Your database connection
engine = create_engine('mysql+pymysql://root@localhost:3306/music_development')
data_path = '../data/'

# Get song data
sql = '''
SELECT * FROM songs
WHERE name LIKE "imagine"
'''
songs = pd.read_sql(sql, engine)
songId = songs['id'].iloc[0]

# Get lyrics
sql = f'SELECT * FROM lyrics WHERE song_id = {songId}'
lyrics = pd.read_sql(sql, engine)
lyrics_content = lyrics.content.iloc[0]

def detect_vocal_segments(audio_path, plot_analysis=False):
    """Use librosa to detect vocal segments and onsets - FIXED VERSION"""
    try:
        print("🎤 Analyzing audio for vocal segments...")
        
        # Load audio file
        y, sr = librosa.load(audio_path, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
        
        print(f"📊 Audio loaded: {duration:.2f}s, SR: {sr}Hz")
        
        # Extract features for vocal detection
        print("📊 Extracting audio features...")
        
        # Harmonic-percussive source separation
        y_harmonic, y_percussive = librosa.effects.hpss(y)
        
        # Detect onsets (places where vocals/instruments start)
        onset_frames = librosa.onset.onset_detect(
            y=y, 
            sr=sr, 
            hop_length=512, 
            backtrack=True,
            delta=0.1
        )
        onset_times = librosa.frames_to_time(onset_frames, sr=sr, hop_length=512)
        
        # Detect beats for rhythm analysis
        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr, hop_length=512)
        beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=512)
        
        # Vocal activity detection using spectral features
        stft = librosa.stft(y_harmonic)
        spectral_centroids = librosa.feature.spectral_centroid(y=y_harmonic, sr=sr)
        
        # FIX: Handle the shape of spectral_centroids properly
        print(f"📐 Spectral centroids shape: {spectral_centroids.shape}")
        
        # Flatten and normalize features
        spectral_flat = spectral_centroids.flatten()
        spectral_centroids_normalized = (spectral_flat - np.min(spectral_flat)) / (np.max(spectral_flat) - np.min(spectral_flat))
        
        # Create time array for features - FIXED
        times = librosa.times_like(spectral_centroids, sr=sr, hop_length=512)
        times_flat = times.flatten()
        
        print(f"📐 Times shape: {times.shape}, Normalized centroids shape: {spectral_centroids_normalized.shape}")
        
        # Detect vocal segments based on spectral features - FIXED
        vocal_segments = []
        current_segment = None
        vocal_threshold = 0.4  # Adjusted threshold
        
        # Ensure arrays are the same length
        min_length = min(len(times_flat), len(spectral_centroids_normalized))
        
        for i in range(min_length):
            time = times_flat[i]
            centroid = spectral_centroids_normalized[i]
            
            if centroid > vocal_threshold:
                if current_segment is None:
                    current_segment = {'start': time, 'end': time}
                else:
                    current_segment['end'] = time
            else:
                if current_segment is not None:
                    # Only keep segments longer than 0.5 seconds
                    if current_segment['end'] - current_segment['start'] > 0.5:
                        vocal_segments.append(current_segment)
                    current_segment = None
        
        # Add the last segment if exists
        if current_segment is not None and current_segment['end'] - current_segment['start'] > 0.5:
            vocal_segments.append(current_segment)
        
        # If no vocal segments detected, use a fallback approach
        if not vocal_segments:
            print("⚠️ No vocal segments detected, using onset-based segmentation")
            # Use onsets to create segments
            for i in range(len(onset_times) - 1):
                vocal_segments.append({
                    'start': onset_times[i],
                    'end': onset_times[i + 1]
                })
        
        # Plot analysis if requested
        if plot_analysis:
            plot_audio_analysis(y, sr, times_flat[:min_length], spectral_centroids_normalized[:min_length], onset_times, vocal_segments)
        
        print(f"✅ Detected {len(vocal_segments)} vocal segments")
        print(f"✅ Detected {len(onset_times)} musical onsets")
        print(f"🎵 Estimated tempo: {tempo:.1f} BPM")
        
        return {
            'duration': duration,
            'onset_times': onset_times,
            'beat_times': beat_times,
            'vocal_segments': vocal_segments,
            'tempo': tempo
        }
        
    except Exception as e:
        print(f"❌ Audio analysis error: {e}")
        import traceback
        traceback.print_exc()
        # Return basic analysis with duration only
        return {
            'duration': 185,  # Fallback duration for Imagine
            'onset_times': [],
            'beat_times': [],
            'vocal_segments': [{'start': 5, 'end': 180}],  # Assume most of song has vocals
            'tempo': 120
        }

def plot_audio_analysis(y, sr, times, spectral_centroids, onset_times, vocal_segments):
    """Plot audio analysis for debugging - FIXED"""
    plt.figure(figsize=(12, 8))
    
    # Plot waveform
    plt.subplot(3, 1, 1)
    librosa.display.waveshow(y, sr=sr, alpha=0.6)
    plt.title('Audio Waveform')
    plt.ylabel('Amplitude')
    
    # Plot spectral centroids
    plt.subplot(3, 1, 2)
    plt.plot(times, spectral_centroids, label='Spectral Centroid', color='r', linewidth=1)
    plt.axhline(y=0.4, color='g', linestyle='--', label='Vocal Threshold')
    plt.title('Spectral Centroid (Vocal Activity Indicator)')
    plt.ylabel('Normalized Value')
    plt.legend()
    
    # Plot onsets and vocal segments
    plt.subplot(3, 1, 3)
    for segment in vocal_segments:
        plt.axvspan(segment['start'], segment['end'], alpha=0.3, color='red', label='Vocal Segments' if segment == vocal_segments[0] else "")
    
    for onset in onset_times:
        plt.axvline(x=onset, color='blue', alpha=0.7, linestyle='--', label='Onsets' if onset == onset_times[0] else "")
    
    plt.title('Vocal Segments and Musical Onsets')
    plt.xlabel('Time (seconds)')
    plt.ylabel('Detection')
    plt.legend()
    
    plt.tight_layout()
    os.makedirs('../data', exist_ok=True)
    plt.savefig('../data/audio_analysis.png', dpi=150, bbox_inches='tight')
    print("📊 Analysis plot saved: ../data/audio_analysis.png")
    plt.close()

def assign_lyrics_to_segments(lyrics_text, audio_analysis):
    """Intelligently assign lyrics to vocal segments - IMPROVED"""
    lines = [line.strip() for line in lyrics_text.split('\n') if line.strip()]
    
    if not lines:
        return None
    
    vocal_segments = audio_analysis.get('vocal_segments', [])
    onset_times = audio_analysis.get('onset_times', [])
    duration = audio_analysis.get('duration', 180)
    
    print(f"📝 Assigning {len(lines)} lyrics lines to {len(vocal_segments)} vocal segments")
    
    # If we have vocal segments, distribute lyrics among them
    if vocal_segments:
        lyrics_with_timing = []
        total_vocal_duration = sum(seg['end'] - seg['start'] for seg in vocal_segments)
        
        # Estimate words per second in vocal sections
        total_words = sum(len(line.split()) for line in lines)
        words_per_second = total_words / total_vocal_duration if total_vocal_duration > 0 else 2
        
        current_line_idx = 0
        current_time = 0
        
        for segment in vocal_segments:
            seg_start = segment['start']
            seg_end = segment['end']
            seg_duration = seg_end - seg_start
            
            # Estimate how many lines fit in this segment based on word count
            lines_in_segment = []
            while current_line_idx < len(lines):
                line = lines[current_line_idx]
                word_count = len(line.split())
                estimated_duration = word_count / words_per_second
                
                # If adding this line doesn't exceed segment duration, add it
                if current_time + estimated_duration <= seg_duration:
                    lines_in_segment.append(line)
                    current_time += estimated_duration
                    current_line_idx += 1
                else:
                    break
            
            # Assign timing to lines in this segment
            if lines_in_segment:
                line_duration = seg_duration / len(lines_in_segment)
                for i, line in enumerate(lines_in_segment):
                    line_start = seg_start + (i * line_duration)
                    line_end = seg_start + ((i + 1) * line_duration)
                    
                    lyrics_with_timing.append({
                        'text': line,
                        'start_time': line_start,
                        'end_time': line_end
                    })
            
            current_time = 0  # Reset for next segment
        
        # Fill any remaining lines at the end using simple distribution
        remaining_lines = len(lines) - current_line_idx
        if remaining_lines > 0:
            print(f"⚠️ {remaining_lines} lines not assigned to vocal segments, using fallback timing")
            time_per_line = duration / len(lines)
            for i in range(current_line_idx, len(lines)):
                start_time = i * time_per_line
                end_time = (i + 1) * time_per_line
                lyrics_with_timing.append({
                    'text': lines[i],
                    'start_time': start_time,
                    'end_time': end_time
                })
            
    else:
        # Fallback: simple linear timing
        print("⚠️ No vocal segments, using linear timing")
        lyrics_with_timing = []
        time_per_line = duration / len(lines)
        for i, line in enumerate(lines):
            lyrics_with_timing.append({
                'text': line,
                'start_time': i * time_per_line,
                'end_time': (i + 1) * time_per_line
            })
    
    return lyrics_with_timing

def get_current_lyric(current_time, lyrics_with_timing):
    """Find which lyric should be displayed at current time"""
    for lyric in lyrics_with_timing:
        if lyric['start_time'] <= current_time < lyric['end_time']:
            return lyric['text']
    return None

def create_audio_synced_video(song_id=songId, plot_analysis=True):
    """Create video with audio-analysis-based lyrics synchronization - FIXED"""
    
    try:
        # Get song data
        query = f"""
        SELECT s.name as song_name, s.location as audio_file,
               l.content as lyrics, a.first_name, a.last_name
        FROM songs s 
        JOIN lyrics l ON s.id = l.song_id 
        JOIN artists a ON s.artist_id = a.id 
        WHERE s.id = {song_id}
        """
        
        df = pd.read_sql(query, engine)
        song_data = df.iloc[0]
        
        print(f"🎵 Creating AUDIO-SYNCED video for: {song_data['song_name']}")
        
        # Construct file paths
        audio_dir = os.path.join(r"C:\ruby\music\public\uploads\song\location", str(song_id))
        audio_path = os.path.join(audio_dir, song_data['audio_file'])
        background_image_path = os.path.join(audio_dir, "Folder.jpg")
        
        print(f"🔊 Audio: {os.path.basename(audio_path)}")
        print(f"🖼️ Background: {os.path.basename(background_image_path)}")
        
        if not os.path.exists(audio_path):
            print("❌ Audio file not found")
            return None
        
        # Perform audio analysis
        audio_analysis = detect_vocal_segments(audio_path, plot_analysis=plot_analysis)
        
        # Assign lyrics to timing segments
        lyrics_with_timing = assign_lyrics_to_segments(song_data['lyrics'], audio_analysis)
        
        if not lyrics_with_timing:
            print("❌ Could not assign lyrics timing")
            return None
        
        print(f"📝 Successfully assigned {len(lyrics_with_timing)} lyrics lines")
        
        # Load audio clip
        audio_clip = AudioFileClip(audio_path)
        duration = audio_analysis.get('duration', audio_clip.duration)
        
        print(f"⏱️ Full song duration: {duration:.1f}s ({duration/60:.1f} minutes)")
        
        # Video settings
        fps = 24
        width, height = 640, 480
        
        def make_synced_frame(t):
            try:
                # Load background
                if os.path.exists(background_image_path):
                    bg_image = Image.open(background_image_path)
                    bg_image = bg_image.resize((width, height), Image.Resampling.LANCZOS)
                    frame = np.array(bg_image)
                else:
                    frame = np.full((height, width, 3), [40, 40, 80], dtype=np.uint8)
                
                # Convert to PIL for text drawing
                pil_img = Image.fromarray(frame)
                draw = ImageDraw.Draw(pil_img)
                
                # Load font
                try:
                    font = ImageFont.truetype("arial.ttf", 32)
                except:
                    try:
                        font = ImageFont.truetype("C:/Windows/Fonts/arial.ttf", 32)
                    except:
                        font = ImageFont.load_default()
                
                # Get current lyric based on audio analysis
                current_line = get_current_lyric(t, lyrics_with_timing)
                
                if current_line:
                    # Calculate text position
                    try:
                        bbox = draw.textbbox((0, 0), current_line, font=font)
                    except AttributeError:
                        bbox = draw.textsize(current_line, font=font)
                        bbox = (0, 0, bbox[0], bbox[1])
                    
                    text_width = bbox[2] - bbox[0]
                    text_height = bbox[3] - bbox[1]
                    x = (width - text_width) // 2
                    y = (height - text_height) // 2
                    
                    # Semi-transparent background for text
                    padding = 10
                    draw.rectangle([
                        x - padding, y - padding,
                        x + text_width + padding, y + text_height + padding
                    ], fill=(0, 0, 0, 180))
                    
                    # Text with shadow for readability
                    shadow_color = (0, 0, 0)
                    text_color = (255, 255, 255)
                    
                    # Shadow
                    draw.text((x+2, y+2), current_line, font=font, fill=shadow_color)
                    # Main text
                    draw.text((x, y), current_line, font=font, fill=text_color)
                
                return np.array(pil_img)
                
            except Exception as e:
                print(f"❌ Frame error at {t:.1f}s: {e}")
                return np.zeros((height, width, 3), dtype=np.uint8)
        
        # Create video
        print("🎬 Creating audio-synced video frames...")
        video = VideoClip(make_synced_frame, duration=duration)
        video = video.set_audio(audio_clip)
        
        # Export
        output_dir = '../data/videos'
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"{song_data['song_name']}_audio_synced.mp4")
        
        print("📹 Exporting audio-synced video...")
        video.write_videofile(
            output_file, 
            fps=fps, 
            codec='libx264',
            audio_codec='aac',
            verbose=False,
            logger=None
        )
        
        print(f"✅ AUDIO-SYNCED video created: {output_file}")
        print(f"📊 File size: {os.path.getsize(output_file) / (1024*1024):.1f} MB")
        
        # Display timing information
        print("\n📋 Lyrics Timing Summary:")
        print("-" * 50)
        for i, lyric in enumerate(lyrics_with_timing):
            print(f"{i+1:2d}. {lyric['start_time']:5.1f}s - {lyric['end_time']:5.1f}s: {lyric['text'][:40]}{'...' if len(lyric['text']) > 40 else ''}")
        
        # Clean up
        video.close()
        audio_clip.close()
        
        return output_file
        
    except Exception as e:
        print(f"❌ Audio-synced video error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Install required packages if not already installed
def install_required_packages():
    """Install required audio analysis packages"""
    try:
        import librosa
        import matplotlib
    except ImportError:
        print("📦 Installing required audio analysis packages...")
        import subprocess
        import sys
        subprocess.check_call([sys.executable, "-m", "pip", "install", "librosa", "matplotlib"])
        print("✅ Packages installed successfully")

# Run the audio-synced version
if __name__ == "__main__":
    # Install required packages
    install_required_packages()
    
    print("=" * 70)
    print("🎬 CREATING AUDIO-ANALYSIS SYNCED VIDEO")
    print("=" * 70)
    
    result = create_audio_synced_video(song_id=songId, plot_analysis=True)
    
    if result:
        print(f"\n🎉 SUCCESS! Audio-synced video created: {result}")
        print("\n✨ Features:")
        print("   ✅ Automatic vocal segment detection")
        print("   ✅ Intro handling (no lyrics during instrumental intro)")
        print("   ✅ Beat and onset detection for better timing")
        print("   ✅ Spectral analysis for vocal activity")
    else:
        print("\n❌ Audio-synced video creation failed")

🎬 CREATING AUDIO-ANALYSIS SYNCED VIDEO
🎵 Creating AUDIO-SYNCED video for: Imagine
🔊 Audio: Imagine.mp3
🖼️ Background: Folder.jpg
🎤 Analyzing audio for vocal segments...
📊 Audio loaded: 185.17s, SR: 44100Hz
📊 Extracting audio features...
📐 Spectral centroids shape: (1, 15950)
📐 Times shape: (15950,), Normalized centroids shape: (15950,)
⚠️ No vocal segments detected, using onset-based segmentation
📊 Analysis plot saved: ../data/audio_analysis.png
✅ Detected 344 vocal segments
✅ Detected 345 musical onsets
❌ Audio analysis error: unsupported format string passed to numpy.ndarray.__format__
📝 Assigning 26 lyrics lines to 1 vocal segments
⚠️ 1 lines not assigned to vocal segments, using fallback timing
📝 Successfully assigned 26 lyrics lines
⏱️ Full song duration: 185.0s (3.1 minutes)
🎬 Creating audio-synced video frames...
📹 Exporting audio-synced video...


Traceback (most recent call last):
  File "C:\Users\PC1\AppData\Local\Temp\ipykernel_482592\1418798906.py", line 121, in detect_vocal_segments
    print(f"🎵 Estimated tempo: {tempo:.1f} BPM")
                                ^^^^^^^^^^^
TypeError: unsupported format string passed to numpy.ndarray.__format__


✅ AUDIO-SYNCED video created: ../data/videos\Imagine_audio_synced.mp4
📊 File size: 4.3 MB

📋 Lyrics Timing Summary:
--------------------------------------------------
 1.   5.0s -  12.0s: Imagine there's no heaven
 2.  12.0s -  19.0s: It's easy if you try
 3.  19.0s -  26.0s: No hell below us
 4.  26.0s -  33.0s: Above us only sky
 5.  33.0s -  40.0s: Imagine all the people
 6.  40.0s -  47.0s: Living for today
 7.  47.0s -  54.0s: Imagine there's no countries
 8.  54.0s -  61.0s: It isn't hard to do
 9.  61.0s -  68.0s: Nothing to kill or die for
10.  68.0s -  75.0s: And no religion, too
11.  75.0s -  82.0s: Imagine all the people
12.  82.0s -  89.0s: Living life in peace
13.  89.0s -  96.0s: You may say I'm a dreamer
14.  96.0s - 103.0s: But I'm not the only one
15. 103.0s - 110.0s: I hope someday you will join us
16. 110.0s - 117.0s: And the world will be as one
17. 117.0s - 124.0s: Imagine no possessions
18. 124.0s - 131.0s: I wonder if you can
19. 131.0s - 138.0s: No need for gree