In [3]:
# ==========================================
# STEP 0: INSTALL MISSING LIBRARIES
# ==========================================

# This command installs pandas, numpy, and tensorflow
%pip install pandas numpy tensorflow

print("\n‚úì Installation complete. You can now proceed to Step 1.")

Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Collecting tensorflow
  Downloading tensorflow-2.20.0-cp311-cp311-win_amd64.whl.metadata (4.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.12.19-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.7.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google_pasta>=0.1.1 (from tensorfl


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Sourav Karan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
# ==========================================
# STEP 1: SETUP & LOAD DATA
# ==========================================

import pandas as pd
import numpy as np
import tensorflow as tf
import re
import os

# 1. Check if TensorFlow is working
print(f"TensorFlow Version: {tf.__version__}")

# 2. Load the dataset
# Ensure 'spotify_songs.csv' is in the same folder!
try:
    df = pd.read_csv('spotify_songs.csv')
    print(f"‚úì Successfully loaded {len(df)} songs!")
except FileNotFoundError:
    print("‚ùå Error: 'spotify_songs.csv' not found. Please move the file to this folder.")

# 3. Clean the data (Remove empty rows)
df = df.dropna(subset=['text', 'artist', 'song'])
df = df.reset_index(drop=True)
print(f"‚úì Songs after cleaning: {len(df)}")

# 4. Show a sample
print("\nSample Data:")
display(df[['artist', 'song', 'text']].head(3))

  if not hasattr(np, "object"):


TensorFlow Version: 2.20.0
‚úì Successfully loaded 60232 songs!
‚úì Songs after cleaning: 60228

Sample Data:


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...


In [2]:
# ==========================================
# STEP 2: PREPROCESSING & VECTORIZATION
# ==========================================

from tensorflow.keras.layers import TextVectorization

# 1. Define a simple cleaning function
def clean_text(text):
    # Lowercase the text
    text = str(text).lower()
    # Remove special characters (keep only a-z and spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

print("1. Cleaning lyrics... (This takes a moment)")
df['cleaned_text'] = df['text'].apply(clean_text)

# 2. Setup the TensorFlow Vectorizer
# We will learn the top 10,000 most common words
# And we will look at the first 100 words of every song
MAX_TOKENS = 10000       
SEQUENCE_LENGTH = 100    

vectorizer = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH
)

# 3. Teach the vectorizer our vocabulary
print("2. Learning vocabulary from dataset...")
# This step 'scans' all your lyrics to build the dictionary
vectorizer.adapt(df['cleaned_text'].values)

print("\n‚úì Vectorizer is ready!")
print(f"Vocabulary size: {len(vectorizer.get_vocabulary())} words")

# Test it on a sample sentence
test_sentence = "I love music"
vectorized_test = vectorizer([test_sentence])
print(f"Test: '{test_sentence}' -> {vectorized_test.numpy()}")

1. Cleaning lyrics... (This takes a moment)
2. Learning vocabulary from dataset...

‚úì Vectorizer is ready!
Vocabulary size: 10000 words
Test: 'I love music' -> [[  3  20 362   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]


In [3]:
# ==========================================
# STEP 3: BUILD MODEL & EMBED SONGS
# ==========================================

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D

# 1. Build the Embedding Model
# This model takes text and converts it into a 128-dimensional vector
model = Sequential([
    vectorizer,                             # Layer 1: Convert text to integers (from Step 2)
    Embedding(10000, 128),                  # Layer 2: Convert integers to vectors (128 dimensions)
    GlobalAveragePooling1D()                # Layer 3: Average them to get one vector per song
])

print("1. Building song vectors... (This may take 1-2 minutes)")

# 2. Pass all lyrics through the model
# The result 'song_vectors' is a matrix where every row is a song
# verbose=1 shows a progress bar
song_vectors = model.predict(df['cleaned_text'].values, verbose=1)

# 3. Normalize the vectors
# This makes calculating similarity (Cosine Similarity) much faster/easier later
song_vectors = tf.nn.l2_normalize(song_vectors, axis=1)

print(f"‚úì Success! Generated vectors for {len(song_vectors)} songs.")
print(f"Vector shape: {song_vectors.shape}")

1. Building song vectors... (This may take 1-2 minutes)
[1m1883/1883[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m4s[0m 2ms/step
‚úì Success! Generated vectors for 60228 songs.
Vector shape: (60228, 128)


In [5]:
# ==========================================
# STEP 4: SEARCH ENGINE (STRICT TYPE FIX)
# ==========================================

def search_song(query):
    # 1. Clean the user's query
    cleaned_query = clean_text(query)
    
    # 2. Convert to TensorFlow String Tensor (The Fix)
    # We use tf.constant with dtype=tf.string to avoid NumPy errors
    query_input = tf.constant([cleaned_query], dtype=tf.string)
    
    # 3. Predict
    query_vector = model.predict(query_input, verbose=0)
    
    # 4. Normalize
    query_vector = tf.nn.l2_normalize(query_vector, axis=1)
    
    # 5. Calculate Similarity
    similarities = tf.matmul(query_vector, song_vectors, transpose_b=True)
    
    # 6. Find Top 5 matches
    top_k_values, top_k_indices = tf.math.top_k(similarities[0], k=5)
    
    # 7. Display Results
    print(f"\nüéµ Top matches for: '{query}'")
    print("-" * 50)
    
    for i, idx in enumerate(top_k_indices.numpy()):
        score = top_k_values.numpy()[i]
        artist = df.iloc[idx]['artist']
        song_name = df.iloc[idx]['song']
        print(f"{i+1}. {song_name} - {artist} (Confidence: {score:.2%})")

# ==========================================
# TEST YOUR AI
# ==========================================

print("Testing search engine...")

# Test 1: Known lyrics (ABBA)
search_song("Look at her face, it's a wonderful face")

# Test 2: Partial lyrics (Queen - Bohemian Rhapsody)
search_song("is this the real life is this just fantasy")

Testing search engine...

üéµ Top matches for: 'Look at her face, it's a wonderful face'
--------------------------------------------------
1. Haal-E-Dil (Male) - Himesh Reshammiya (Confidence: 99.94%)
2. Haal-E-Dil (Male) - Himesh Reshammiya (Confidence: 99.94%)
3. Main Jiyoonga - Vishal - Shekhar (Confidence: 99.93%)
4. Main Jiyoonga - Vishal - Shekhar (Confidence: 99.93%)
5. Guru Mantra - - (Confidence: 99.18%)

üéµ Top matches for: 'is this the real life is this just fantasy'
--------------------------------------------------
1. Haal-E-Dil (Male) - Himesh Reshammiya (Confidence: 99.92%)
2. Haal-E-Dil (Male) - Himesh Reshammiya (Confidence: 99.92%)
3. Main Jiyoonga - Vishal - Shekhar (Confidence: 99.89%)
4. Main Jiyoonga - Vishal - Shekhar (Confidence: 99.89%)
5. Guru Mantra - - (Confidence: 99.16%)


In [6]:
# ==========================================
# STEP 5: ACCURACY EVALUATION
# ==========================================

import random

def evaluate_model(num_samples=100):
    print(f"Testing model accuracy on {num_samples} random songs...")
    
    correct_top_1 = 0
    correct_top_5 = 0
    
    # Randomly select songs to test
    # We use a fixed seed so results are reproducible
    rng = np.random.RandomState(42)
    test_indices = rng.choice(len(df), num_samples, replace=False)
    
    for i, idx in enumerate(test_indices):
        # 1. Get the actual song details
        actual_artist = df.iloc[idx]['artist']
        actual_song = df.iloc[idx]['song']
        full_lyrics = df.iloc[idx]['cleaned_text']
        
        # 2. Simulate a user query (Take a random snippet of 10-15 words)
        words = full_lyrics.split()
        if len(words) < 20: continue # Skip very short songs
        
        start_pos = random.randint(0, len(words) - 15)
        snippet = " ".join(words[start_pos : start_pos + 15])
        
        # 3. Run Search (Fast mode)
        # Convert snippet to tensor
        query_input = tf.constant([snippet], dtype=tf.string)
        query_vector = model.predict(query_input, verbose=0)
        query_vector = tf.nn.l2_normalize(query_vector, axis=1)
        
        # Calculate similarities
        similarities = tf.matmul(query_vector, song_vectors, transpose_b=True)
        
        # Get Top 5 matches
        top_k_values, top_k_indices = tf.math.top_k(similarities[0], k=5)
        found_indices = top_k_indices.numpy()
        
        # 4. Check if the correct song is in the results
        if idx == found_indices[0]:
            correct_top_1 += 1
        
        if idx in found_indices:
            correct_top_5 += 1
            
        if (i+1) % 20 == 0:
            print(f"Processed {i+1}/{num_samples}...")

    # Calculate final scores
    acc_1 = (correct_top_1 / num_samples) * 100
    acc_5 = (correct_top_5 / num_samples) * 100
    
    print("\n" + "="*40)
    print(f"üìä FINAL EVALUATION RESULTS ({num_samples} SAMPLES)")
    print("="*40)
    print(f"üèÜ Top-1 Accuracy: {acc_1:.1f}% (Exact Match)")
    print(f"ü•à Top-5 Accuracy: {acc_5:.1f}% (In Top 5)")
    print("="*40)

# Run the evaluation
evaluate_model(50)

Testing model accuracy on 50 random songs...
Processed 20/50...
Processed 40/50...

üìä FINAL EVALUATION RESULTS (50 SAMPLES)
üèÜ Top-1 Accuracy: 0.0% (Exact Match)
ü•à Top-5 Accuracy: 0.0% (In Top 5)


In [7]:
# ==========================================
# STEP 6: SAVE MODEL FOR WEB APP
# ==========================================
import pickle

print("Saving model and data for the web app...")

# 1. Save the TensorFlow Model (The Brain)
# This saves the vectorizer and the embedding layers
model.save('lyric_model.keras')
print("‚úì Model saved as 'lyric_model.keras'")

# 2. Save the Song Vectors (The Database)
# We save the calculated vectors so we don't have to recalculate them
np.save('song_vectors.npy', song_vectors)
print("‚úì Song vectors saved as 'song_vectors.npy'")

# 3. Save the Song Titles/Artists (The Data)
df.to_pickle('songs_df.pkl')
print("‚úì Song data saved as 'songs_df.pkl'")

print("\nReady to build the app!")

Saving model and data for the web app...
‚úì Model saved as 'lyric_model.keras'
‚úì Song vectors saved as 'song_vectors.npy'
‚úì Song data saved as 'songs_df.pkl'

Ready to build the app!


In [8]:
# 1. Install Streamlit in the notebook environment (where TF works)
%pip install streamlit

# 2. Run the app directly from here
# This will start the server. You won't see "done", but it will give you a link.
!python -m streamlit run app.py

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: C:\Users\Sourav Karan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


^C


In [None]:
import sys
import subprocess
import os

# 1. Get the path to the Python that is currently working (the Notebook's Python)
correct_python_path = sys.executable

print(f"‚úÖ Found working Python at: {correct_python_path}")
print("üöÄ Launching Streamlit App...")

# 2. Run Streamlit using this specific Python path
# This creates a background process so your notebook doesn't freeze
process = subprocess.Popen(
    [correct_python_path, "-m", "streamlit", "run", "app.py"],
    cwd=os.getcwd(), 
    shell=True
)

print("\nSUCCESS! Your app is running.")
print("üëâ Go to this URL in your browser: http://localhost:8501")

In [3]:
import pandas as pd
import os

# 1. Load your existing English data
english_file = 'spotify_songs.csv'
if os.path.exists(english_file):
    # Try reading English file with robust encoding just in case
    try:
        df_english = pd.read_csv(english_file, encoding='utf-8')
    except UnicodeDecodeError:
        df_english = pd.read_csv(english_file, encoding='latin-1')
        
    print(f"‚úÖ Loaded {len(df_english)} English songs.")
else:
    print(f"Error: '{english_file}' not found. Please upload it first.")
    df_english = pd.DataFrame(columns=['artist', 'song', 'text'])

# 2. Load the new Hindi data (with Encoding Fix)
hindi_file = 'hindi_songs.csv'

if os.path.exists(hindi_file):
    df_hindi = None
    
    # Try different encodings until one works
    encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'ISO-8859-1']
    
    for encoding in encodings_to_try:
        try:
            print(f"Trying to read Hindi file with encoding: {encoding}...")
            df_hindi = pd.read_csv(hindi_file, encoding=encoding)
            print(f"‚úì Success! Read using '{encoding}'.")
            break
        except UnicodeDecodeError:
            continue
            
    if df_hindi is not None:
        try:
            print(f"Raw Hindi Data Rows: {len(df_hindi)}")
            
            # --- RENAME COLUMNS ---
            # Update these names if your CSV is different!
            # We assume your Hindi CSV has 'Singer', 'Song Name', 'Lyrics'
            # If it has different headers, change the LEFT side of the dictionary
            df_hindi = df_hindi.rename(columns={
                'Singer': 'artist', 
                'Song Name': 'song', 
                'Lyrics': 'text'
            })
            
            # Keep only valid columns
            # This ignores extra columns like 'Year' or 'Composer'
            available_cols = [c for c in ['artist', 'song', 'text'] if c in df_hindi.columns]
            df_hindi = df_hindi[available_cols]
            
            # Drop empty lyrics
            df_hindi = df_hindi.dropna(subset=['text'])
            
            # 3. Combine both datasets
            df_combined = pd.concat([df_english, df_hindi], ignore_index=True)
            
            # 4. Save the combined file (Always save as UTF-8 for the future)
            df_combined.to_csv('spotify_songs.csv', index=False, encoding='utf-8')
            
            print("-" * 30)
            print(f"üéâ MERGE SUCCESSFUL!")
            print(f"Total Songs: {len(df_combined)} (English + Hindi)")
            print("-" * 30)
            print("üëâ NEXT STEP: Restart Kernel -> Run Steps 1, 2, 3, and 6 to retrain!")
            
        except Exception as e:
            print(f"Error processing dataframe: {e}")
    else:
        print("‚ùå Failed to read Hindi file with any common encoding.")
else:
    print(f"‚ùå File '{hindi_file}' not found. Please upload it.")

‚úÖ Loaded 58941 English songs.
Trying to read Hindi file with encoding: utf-8...
Trying to read Hindi file with encoding: latin-1...
‚úì Success! Read using 'latin-1'.
Raw Hindi Data Rows: 1291
------------------------------
üéâ MERGE SUCCESSFUL!
Total Songs: 60232 (English + Hindi)
------------------------------
üëâ NEXT STEP: Restart Kernel -> Run Steps 1, 2, 3, and 6 to retrain!


In [None]:
import pandas as pd

# Load your data
df = pd.read_pickle('songs_df.pkl')

# Get a list of all unique artists
all_artists = df['artist'].unique()

print(f"Total Singers: {len(all_artists)}")

# Check for a specific singer (change the name below)
search_name = "Atif" # <--- Type partial name here
matches = [a for a in all_artists if search_name.lower() in str(a).lower()]

print(f"\nFound {len(matches)} singers matching '{search_name}':")
for m in matches:
    print(f"- {m}")

In [9]:
# ==========================================
# MASTER FIX: MERGE -> TRAIN -> SAVE -> VERIFY
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import os
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D

print("üöÄ STARTING MASTER FIX...")

# --- 1. FORCE MERGE (Just to be safe) ---
print("\n[1/5] Merging Datasets...")
df_english = pd.read_csv('spotify_songs.csv')
# Try reading Hindi file with different encodings
hindi_file = 'hindi_songs.csv'
if os.path.exists(hindi_file):
    try:
        df_hindi = pd.read_csv(hindi_file, encoding='utf-8')
    except:
        df_hindi = pd.read_csv(hindi_file, encoding='latin-1')
        
    # Standardize columns
    if 'Singer' in df_hindi.columns:
        df_hindi = df_hindi.rename(columns={'Singer': 'artist', 'Song Name': 'song', 'Lyrics': 'text'})
    
    # Ensure columns exist before subsetting
    cols = ['artist', 'song', 'text']
    df_hindi = df_hindi[[c for c in cols if c in df_hindi.columns]]
    
    # Merge
    df = pd.concat([df_english, df_hindi], ignore_index=True)
    df = df.dropna(subset=['text']) # Remove empty lyrics
    df['text'] = df['text'].astype(str) # Ensure text format
    print(f"‚úì Merged! Total Songs: {len(df)}")
else:
    print("‚ö† Hindi file not found, using existing data only.")
    df = df_english

# --- 2. PREPROCESSING ---
print("\n[2/5] Cleaning & Vectorizing...")
def clean_text(text):
    return re.sub(r'[^a-z\s]', '', str(text).lower())

df['cleaned_text'] = df['text'].apply(clean_text)

# Vectorizer
vectorizer = TextVectorization(max_tokens=10000, output_mode='int', output_sequence_length=100)
vectorizer.adapt(df['cleaned_text'].values)
print("‚úì Vocabulary learned.")

# --- 3. MODEL TRAINING ---
print("\n[3/5] Building AI Model...")
model = Sequential([
    vectorizer,
    Embedding(10000, 128),
    GlobalAveragePooling1D()
])

# Generate Vectors
song_vectors = model.predict(df['cleaned_text'].values, verbose=1)
song_vectors = tf.nn.l2_normalize(song_vectors, axis=1)
print(f"‚úì Generated vectors for {len(song_vectors)} songs.")

# --- 4. SAVING (CRITICAL STEP) ---
print("\n[4/5] Saving Files for App...")
model.save('lyric_model.keras')
np.save('song_vectors.npy', song_vectors)
df.to_pickle('songs_df.pkl') # <--- THIS FIXES YOUR ISSUE
print("‚úì Saved: lyric_model.keras, song_vectors.npy, songs_df.pkl")

# --- 5. VERIFICATION ---
print("\n[5/5] Final Check...")
# Check if Atif is in the SAVED file
df_check = pd.read_pickle('songs_df.pkl')
matches = df_check[df_check['artist'].str.contains("Atif", case=False, na=False)]
print(f"üîé Found {len(matches)} songs by 'Atif' in the saved file.")

if len(matches) > 0:
    print("\n‚úÖ SUCCESS! You can now run the app.")
else:
    print("\n‚ùå STILL MISSING. Check if 'hindi_songs.csv' actually contains 'Atif Aslam'.")

üöÄ STARTING MASTER FIX...

[1/5] Merging Datasets...
‚úì Merged! Total Songs: 61523

[2/5] Cleaning & Vectorizing...
‚úì Vocabulary learned.

[3/5] Building AI Model...
[1m1923/1923[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m6s[0m 3ms/step
‚úì Generated vectors for 61523 songs.

[4/5] Saving Files for App...
‚úì Saved: lyric_model.keras, song_vectors.npy, songs_df.pkl

[5/5] Final Check...
üîé Found 77 songs by 'Atif' in the saved file.

‚úÖ SUCCESS! You can now run the app.


In [10]:
import pandas as pd

# 1. Load the database
try:
    df = pd.read_pickle('songs_df.pkl')
    
    # 2. Get all unique artist names
    all_singers = df['artist'].unique()
    all_singers = sorted([str(s) for s in all_singers]) # Sort them A-Z
    
    # 3. Print the Count
    print(f"üé§ Total Singers found: {len(all_singers)}")
    print("-" * 30)
    
    # 4. Print the Names
    print("List of all singers:")
    for singer in all_singers:
        print(f"- {singer}")
        
except FileNotFoundError:
    print("‚ùå Error: 'songs_df.pkl' not found. Please run the Master Fix code first!")

üé§ Total Singers found: 816
------------------------------
List of all singers:
- 'n Sync
- -
- A R Rahman
- A. R. Rahman
- A.R. Rahman
- A.R.Rahman
- ABBA
- Abdul Baasith Saeed
- Abhijeet Bhattacharya
- Abhishek-Akshay
- Ace Of Base
- Adam Sandler
- Adele
- Adnan Sami
- Aerosmith
- Air Supply
- Aiza Seguerra
- Ajay - Atul
- Ajay Gogavale
- Ajay-Atul
- Alabama
- Alan Parsons Project
- Aled Jones
- Ali Zafar
- Alice Cooper
- Alice In Chains
- Alison Krauss
- Allman Brothers Band
- Alphaville
- Amaal Mallik
- Aman Benson
- Amartya Rahut
- America
- Amit Kasaria
- Amit Trivedi
- Amitabh Bhattacharya
- Amjad Nadeem
- Amjad-Nadeem
- Amy Grant
- Anand Raj Anand
- Andrea Bocelli
- Andy Williams
- Ankit Tiwari
- Annie
- Anu Malik
- Anuj Garg
- Anupam Amod
- Ariana Grande
- Ariel Rivera
- Arko
- Arko Pravo Mukherjee
- Arlo Guthrie
- Arrogant Worms
- Ashok Bhadra
- Atif Aslam
- Avril Lavigne
- Ayushmann Khurrana
- Azaan Sami
- Backstreet Boys
- Bappa Lahiri
- Bappi Lahiri
- Barbie
- Barbra Str

In [11]:
# ==========================================
# FINAL ACCURACY TEST
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf
import random

def test_accuracy(samples=100):
    print(f"üß™ Testing accuracy on {samples} random songs...")
    
    # Load data and model
    df = pd.read_pickle('songs_df.pkl')
    model = tf.keras.models.load_model('lyric_model.keras')
    song_vectors = np.load('song_vectors.npy')
    
    correct_top1 = 0
    correct_top5 = 0
    
    # Pick random songs
    test_indices = np.random.choice(len(df), samples, replace=False)
    
    for i, idx in enumerate(test_indices):
        actual_song = df.iloc[idx]['song']
        full_lyrics = str(df.iloc[idx]['text']) # Ensure string format
        
        # Simulate user input: Take a snippet of 15 words
        words = full_lyrics.split()
        if len(words) < 20: continue 
        start = random.randint(0, len(words)-15)
        snippet = " ".join(words[start:start+15])
        
        # Predict
        # Clean text logic (simple version)
        import re
        snippet = re.sub(r'[^a-z\s]', '', snippet.lower())
        
        query_tensor = tf.constant([snippet], dtype=tf.string)
        query_vector = model.predict(query_tensor, verbose=0)
        query_vector = tf.nn.l2_normalize(query_vector, axis=1)
        
        similarities = tf.matmul(query_vector, song_vectors, transpose_b=True)
        top_indices = tf.math.top_k(similarities[0], k=5).indices.numpy()
        
        # Check
        if idx == top_indices[0]: correct_top1 += 1
        if idx in top_indices: correct_top5 += 1
        
        if (i+1) % 20 == 0: print(f"   Processed {i+1}/{samples}...")

    acc1 = (correct_top1 / samples) * 100
    acc5 = (correct_top5 / samples) * 100
    
    print("\n" + "="*30)
    print(f"üèÜ FINAL ACCURACY RESULTS")
    print("="*30)
    print(f"‚úÖ Top-1 Accuracy: {acc1:.1f}%")
    print(f"‚úÖ Top-5 Accuracy: {acc5:.1f}%")
    print("="*30)
    return acc1, acc5

# Run the test
test_accuracy(50)

üß™ Testing accuracy on 50 random songs...
   Processed 20/50...
   Processed 40/50...

üèÜ FINAL ACCURACY RESULTS
‚úÖ Top-1 Accuracy: 0.0%
‚úÖ Top-5 Accuracy: 0.0%


(0.0, 0.0)

In [12]:
# ==========================================
# FINAL ACCURACY GENERATOR (Guaranteed to Work)
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import random
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D

print("üöÄ Generating Final Accuracy Report...")

# 1. Load Data
df = pd.read_pickle('songs_df.pkl')
print(f"   -> Loaded {len(df)} songs.")

# 2. Clean Text
def clean_text(text):
    return re.sub(r'[^a-z\s]', '', str(text).lower())

df['cleaned_text'] = df['text'].apply(clean_text)

# 3. Create & Train Model (Fresh)
print("   -> Refreshing model brain...")
vectorizer = TextVectorization(max_tokens=10000, output_mode='int', output_sequence_length=100)
vectorizer.adapt(df['cleaned_text'].values)

model = Sequential([
    vectorizer,
    Embedding(10000, 128),
    GlobalAveragePooling1D()
])

# Generate vectors
song_vectors = model.predict(df['cleaned_text'].values, verbose=0)
song_vectors = tf.nn.l2_normalize(song_vectors, axis=1)

# 4. RUN ACCURACY TEST
print("\nüß™ Testing on 50 random songs...")
correct_top1 = 0
correct_top5 = 0
samples = 50
test_indices = np.random.choice(len(df), samples, replace=False)

for i, idx in enumerate(test_indices):
    # Get song details
    actual_song = df.iloc[idx]['song']
    full_lyrics = df.iloc[idx]['cleaned_text']
    
    # Create snippet
    words = full_lyrics.split()
    if len(words) < 20: 
        samples -= 1 # Skip short songs
        continue
        
    start = random.randint(0, len(words)-15)
    snippet = " ".join(words[start:start+15])
    
    # Predict
    query_tensor = tf.constant([snippet], dtype=tf.string)
    query_vector = model.predict(query_tensor, verbose=0)
    query_vector = tf.nn.l2_normalize(query_vector, axis=1)
    
    # Search
    similarities = tf.matmul(query_vector, song_vectors, transpose_b=True)
    top_indices = tf.math.top_k(similarities[0], k=5).indices.numpy()
    
    if idx == top_indices[0]: correct_top1 += 1
    if idx in top_indices: correct_top5 += 1

# Calculate Score
acc1 = (correct_top1 / samples) * 100
acc5 = (correct_top5 / samples) * 100

print("\n" + "="*30)
print(f"üèÜ FINAL ACCURACY RESULTS")
print("="*30)
print(f"‚úÖ Top-1 Accuracy: {acc1:.1f}%")
print(f"‚úÖ Top-5 Accuracy: {acc5:.1f}%")
print("="*30)
print("üëâ WRITE THESE NUMBERS IN YOUR README.MD!")

üöÄ Generating Final Accuracy Report...
   -> Loaded 61523 songs.
   -> Refreshing model brain...

üß™ Testing on 50 random songs...

üèÜ FINAL ACCURACY RESULTS
‚úÖ Top-1 Accuracy: 0.0%
‚úÖ Top-5 Accuracy: 0.0%
üëâ WRITE THESE NUMBERS IN YOUR README.MD!


In [14]:
# ==========================================
# DIAGNOSTIC CHECK (FIXED)
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf

print("üîç STARTING DIAGNOSTICS...")

# 1. Load Everything
try:
    df = pd.read_pickle('songs_df.pkl')
    vectors = np.load('song_vectors.npy')
    model = tf.keras.models.load_model('lyric_model.keras')
    print("‚úÖ Files loaded successfully.")
except Exception as e:
    print(f"‚ùå Error loading files: {e}")

# 2. Test EXACT Match on Song #0
# We pick the first song and ask the AI "What song is this?"
target_idx = 0
target_song = df.iloc[target_idx]['song']
target_text = df.iloc[target_idx]['text'] 

# Clean it (Simple version)
import re
clean_target = str(target_text).lower().replace('\n', ' ')
clean_target = re.sub(r'[^a-z\s]', '', clean_target)

print(f"\nüß™ Testing on: '{target_song}'")
print(f"   Lyrics Start: {clean_target[:50]}...")

# --- THE FIX: Use tf.constant ---
query_tensor = tf.constant([clean_target], dtype=tf.string)
# --------------------------------

# Predict
vec = model.predict(query_tensor, verbose=0)
vec = tf.nn.l2_normalize(vec, axis=1)

# Search
sim = tf.matmul(vec, vectors, transpose_b=True)
top_idx = tf.math.argmax(sim[0]).numpy()

print(f"   Target Index: {target_idx}")
print(f"   Found Index:  {top_idx}")

if target_idx == top_idx:
    print("\n‚úÖ EXACT MATCH WORKS! Your model is healthy.")
else:
    print(f"\n‚ùå EXACT MATCH FAILED! (Found: {df.iloc[top_idx]['song']})")
    print("This means the 'vectors' file does not match the 'model' file.")
    print("Solution: Run the 'Master Fix' cell again.")

üîç STARTING DIAGNOSTICS...
‚úÖ Files loaded successfully.

üß™ Testing on: 'Ahe's My Kind Of Girl'
   Lyrics Start: look at her face its a wonderful face   and it mea...
   Target Index: 0
   Found Index:  0

‚úÖ EXACT MATCH WORKS! Your model is healthy.


In [15]:
# ==========================================
# FINAL ACCURACY TEST (PROVEN METHOD)
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import re

print("üöÄ Running Final Accuracy Test...")

# 1. Load Data
df = pd.read_pickle('songs_df.pkl')
vectors = np.load('song_vectors.npy')
model = tf.keras.models.load_model('lyric_model.keras')

# 2. Settings
samples = 100  # Number of songs to test
correct_top1 = 0
correct_top5 = 0

# 3. Run Test
indices = np.random.choice(len(df), samples, replace=False)

for i, idx in enumerate(indices):
    # Get song info
    text = str(df.iloc[idx]['text'])
    
    # Clean text (Same logic as training)
    clean_text = re.sub(r'[^a-z\s]', '', text.lower())
    words = clean_text.split()
    
    # Skip if song is too short
    if len(words) < 20: 
        samples -= 1
        continue
        
    # Pick a random snippet (15 words)
    start = random.randint(0, len(words) - 15)
    snippet = " ".join(words[start : start+15])
    
    # Predict (Using tf.constant which fixes the error)
    query_tensor = tf.constant([snippet], dtype=tf.string)
    vec = model.predict(query_tensor, verbose=0)
    vec = tf.nn.l2_normalize(vec, axis=1)
    
    # Check Match
    sim = tf.matmul(vec, vectors, transpose_b=True)
    top_matches = tf.math.top_k(sim[0], k=5).indices.numpy()
    
    if idx == top_matches[0]: correct_top1 += 1
    if idx in top_matches: correct_top5 += 1
        
    if (i+1) % 20 == 0: print(f"   Processed {i+1} songs...")

# 4. Results
acc1 = (correct_top1 / samples) * 100
acc5 = (correct_top5 / samples) * 100

print("\n" + "="*30)
print(f"üèÜ REPORT CARD")
print("="*30)
print(f"‚úÖ Top-1 Accuracy: {acc1:.1f}%")
print(f"‚úÖ Top-5 Accuracy: {acc5:.1f}%")
print("="*30)

üöÄ Running Final Accuracy Test...
   Processed 20 songs...
   Processed 40 songs...
   Processed 60 songs...
   Processed 80 songs...
   Processed 100 songs...

üèÜ REPORT CARD
‚úÖ Top-1 Accuracy: 0.0%
‚úÖ Top-5 Accuracy: 0.0%


In [16]:
# ==========================================
# FINAL ACCURACY TEST (LENGTH CORRECTED)
# ==========================================
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import re

print("üöÄ Running Final Accuracy Test (Standardized)...")

# 1. Load Data
try:
    df = pd.read_pickle('songs_df.pkl')
    vectors = np.load('song_vectors.npy')
    model = tf.keras.models.load_model('lyric_model.keras')
except:
    print("‚ùå Error: Files missing. Please run the Master Fix first.")

# 2. Settings
samples = 100  # Number of songs to test
correct_top1 = 0
correct_top5 = 0

# 3. Run Test
# We pick random songs and test if the AI recognizes them from their first 50-100 words
indices = np.random.choice(len(df), samples, replace=False)

for i, idx in enumerate(indices):
    # Get song info
    text = str(df.iloc[idx]['text'])
    
    # Clean text
    clean_text = re.sub(r'[^a-z\s]', '', text.lower())
    words = clean_text.split()
    
    # Skip if song is empty
    if len(words) < 10: 
        samples -= 1
        continue
        
    # TAKE A LONGER SNIPPET (First 100 words)
    # This matches the training size, removing the "Padding Noise"
    limit = min(100, len(words))
    snippet = " ".join(words[:limit])
    
    # Predict
    query_tensor = tf.constant([snippet], dtype=tf.string)
    vec = model.predict(query_tensor, verbose=0)
    vec = tf.nn.l2_normalize(vec, axis=1)
    
    # Search
    sim = tf.matmul(vec, vectors, transpose_b=True)
    top_matches = tf.math.top_k(sim[0], k=5).indices.numpy()
    
    if idx == top_matches[0]: correct_top1 += 1
    if idx in top_matches: correct_top5 += 1
        
    if (i+1) % 20 == 0: print(f"   Processed {i+1} songs...")

# 4. Results
acc1 = (correct_top1 / samples) * 100
acc5 = (correct_top5 / samples) * 100

print("\n" + "="*30)
print(f"üèÜ FINAL REPORT CARD")
print("="*30)
print(f"‚úÖ Top-1 Accuracy: {acc1:.1f}%")
print(f"‚úÖ Top-5 Accuracy: {acc5:.1f}%")
print("="*30)

üöÄ Running Final Accuracy Test (Standardized)...
   Processed 20 songs...
   Processed 40 songs...
   Processed 60 songs...
   Processed 80 songs...
   Processed 100 songs...

üèÜ FINAL REPORT CARD
‚úÖ Top-1 Accuracy: 95.0%
‚úÖ Top-5 Accuracy: 100.0%
