<a href="https://colab.research.google.com/github/skr-choco7/my_project/blob/main/Bollywood_Song_Recommendation_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#!/usr/bin/env python3
"""
Bollywood Music Recommender System
----------------------------------
A prototype engine demonstrating LLM-augmented recommendation logic.
It uses TF-IDF and Cosine Similarity to match natural language queries
with "enriched" song metadata (Navrasa, context, mood).
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Optional

# Mock Database - Simulating the output of an ingestion pipeline (e.g., GPT-4 tagging)
# In production, this would be fetched from a vector DB like Pinecone or Milvus.
#
# Record schema (one dict per song):
#   id           - integer identifier
#   title        - song title (used for display)
#   movie        - film the song is from (used for display)
#   primary_rasa - dominant rasa, written as "Name (English gloss)"
#   tags         - space-separated free-text descriptors
#   context      - short label for the listening situation
# BollywoodRecommender concatenates primary_rasa, tags and context into the
# text it vectorizes; title and movie are only printed in the results.
BOLLYWOOD_DB = [
    {
        "id": 1,
        "title": "Chaiyya Chaiyya",
        "movie": "Dil Se",
        "primary_rasa": "Veer (Energy/Heroism)",
        "tags": "sufi folk high energy train travel adventure shah rukh khan a r rahman enthusiastic dance spirituality",
        "context": "road trip"
    },
    {
        "id": 2,
        "title": "Tujhe Bhula Diya",
        "movie": "Anjaana Anjaani",
        "primary_rasa": "Karuna (Sadness/Compassion)",
        "tags": "heartbreak separation sad crying betrayal lonely high pitch emotional ending relationship memory",
        "context": "breakup"
    },
    {
        "id": 3,
        "title": "London Thumakda",
        "movie": "Queen",
        "primary_rasa": "Hasya (Joy/Laughter)",
        "tags": "wedding sangeet punjabi dance dhol celebration fun family loud energetic party",
        "context": "wedding"
    },
    {
        "id": 4,
        "title": "Kun Faya Kun",
        "movie": "Rockstar",
        "primary_rasa": "Shant (Peace)",
        "tags": "sufi spiritual mosque soul searching peace calm meditative ranbir kapoor a r rahman prayer",
        "context": "spiritual"
    },
    {
        "id": 5,
        "title": "Apna Time Aayega",
        "movie": "Gully Boy",
        "primary_rasa": "Veer (Heroism/Grit)",
        "tags": "rap hip hop motivation hustle struggle mumbai street anger determination power gym workout",
        "context": "motivation"
    },
    {
        "id": 6,
        "title": "Tum Hi Ho",
        "movie": "Aashiqui 2",
        "primary_rasa": "Shringar (Romance)",
        "tags": "romantic rain love ballad piano intense longing soulmate date night arijit singh",
        "context": "romantic"
    },
    {
        "id": 7,
        "title": "Senorita",
        "movie": "Zindagi Na Milegi Dobara",
        "primary_rasa": "Adbhuta (Wonder/Fun)",
        "tags": "spanish flamenco dance friends holiday road trip fun flirting playful upbeat",
        "context": "party"
    },
    {
        "id": 8,
        "title": "Chak De India",
        "movie": "Chak De! India",
        "primary_rasa": "Veer (Heroism)",
        "tags": "patriotic sports anthem victory india motivation win energy adrenaline team spirit",
        "context": "patriotic"
    }
]

class BollywoodRecommender:
    """
    TF-IDF / cosine-similarity recommender over enriched song metadata.

    On construction the song records are loaded into a DataFrame, their
    descriptive fields are merged into one text column ('sonic_dna'), and a
    TF-IDF model is fitted on that column. `recommend` then expands a
    free-text query with domain vocabulary, vectorizes it with the same
    model and prints the best-scoring songs.
    """

    # Metadata columns merged into the text that gets vectorized.
    _TEXT_FIELDS = ("primary_rasa", "tags", "context")

    # Domain knowledge map - simulates prompt engineering output
    # e.g. Prompt: "Convert user query to Bollywood tags"
    # Kept at class level so it is built once rather than on every query.
    _INTENT_MAP = {
        "drive": "road trip high energy energetic upbeat adventure sufi",
        "travel": "road trip high energy energetic upbeat adventure sufi",
        "sad": "Karuna heartbreak lonely separation emotional slow acoustic",
        "cry": "Karuna heartbreak lonely separation emotional slow acoustic",
        "broken": "Karuna heartbreak lonely separation emotional slow acoustic",
        "dance": "Hasya celebration upbeat loud dhol punjabi wedding",
        "party": "Hasya celebration upbeat loud dhol punjabi wedding",
        "peace": "Shant sufi spiritual meditative soft",
        "calm": "Shant sufi spiritual meditative soft",
        "gym": "Veer power motivation rap fast aggressive",
        "workout": "Veer power motivation rap fast aggressive",
        "pump": "Veer power motivation rap fast aggressive"
    }

    def __init__(self, data: List[Dict[str, Any]]):
        """
        Build the recommender from a list of song records.

        Args:
            data: One dict per song. The 'primary_rasa', 'tags' and 'context'
                fields feed the text model; 'title' and 'movie' are used when
                printing results.

        Raises:
            ValueError: If `data` contains no records.
        """
        self.df = pd.DataFrame(data)
        # Fail early with a clear message instead of a KeyError further down.
        if self.df.empty:
            raise ValueError("data must contain at least one song record")

        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = None

        # Pre-process data on init
        self._prepare_corpus()
        self._build_vector_space()

    def _prepare_corpus(self):
        """
        Combines metadata fields into a single 'sonic_dna' string for vectorization.
        This approximates the semantic density an LLM would generate.

        Missing values (None / NaN, e.g. a record that lacks one of the keys)
        are treated as empty text so a single incomplete record cannot turn
        the combined string into NaN and break vectorization.
        """
        text = self.df[list(self._TEXT_FIELDS)].fillna("").astype(str)
        # Join only the non-empty parts so blanks do not leave stray spaces.
        self.df['sonic_dna'] = text.apply(
            lambda row: " ".join(part for part in row if part), axis=1
        )

    def _build_vector_space(self):
        """
        Fits the TF-IDF model on the song corpus.
        TODO: Replace with OpenAI Embeddings API for semantic understanding in v2.
        """
        print(f"[*] Indexing {len(self.df)} tracks...")
        self.tfidf_matrix = self.vectorizer.fit_transform(self.df['sonic_dna'])
        print("[*] Vector space built successfully.")

    def _expand_query_intent(self, raw_query: str) -> str:
        """
        Mock implementation of an LLM query expansion step.
        Maps vague user intent to domain-specific Bollywood vocabulary.

        Keywords are matched as case-insensitive substrings of the query, so
        "crying" triggers the "cry" entry. Several keywords share the same
        expansion; each distinct expansion is appended at most once so that
        using two synonyms does not double the weight of those terms.

        Args:
            raw_query: The user's free-text request.

        Returns:
            The original query followed by the matched expansion terms, or
            the original query unchanged when no keyword matches.
        """
        query_lower = raw_query.lower()

        # Check for keywords and collect associated tags
        matched = [
            tags for key, tags in self._INTENT_MAP.items() if key in query_lower
        ]
        # dict.fromkeys drops duplicates while keeping first-seen order.
        expansion_terms = list(dict.fromkeys(matched))

        # If we found matches, append them to the original query
        # otherwise just return the original query (exact match attempt)
        if expansion_terms:
            return f"{raw_query} {' '.join(expansion_terms)}"
        return raw_query

    def recommend(self, user_query: str, top_k: int = 3, min_score: float = 0.1) -> None:
        """
        Retrieves top_k songs based on cosine similarity between the
        expanded user query and the song database, and prints them.

        Args:
            user_query: The user's free-text request.
            top_k: Maximum number of songs to show; must be at least 1.
            min_score: Songs scoring at or below this similarity are
                treated as noise and not shown.

        Raises:
            ValueError: If `top_k` is less than 1.
        """
        # A zero or negative top_k would make the [-top_k:] slice below select
        # the wrong elements (e.g. [-0:] is the whole array), so reject it.
        if top_k < 1:
            raise ValueError(f"top_k must be a positive integer, got {top_k!r}")

        print(f"\n--- Query: '{user_query}' ---")

        # 1. Expand Intent (LLM Simulation)
        expanded_query = self._expand_query_intent(user_query)
        # Only print if we actually added something meaningful
        if len(expanded_query) > len(user_query):
            print(f"DEBUG: Expanded intent to -> [{expanded_query[:50]}...]")

        # 2. Vectorize User Query
        query_vec = self.vectorizer.transform([expanded_query])

        # 3. Calculate Similarity Scores
        # Returns array of shape (1, n_songs)
        similarity_scores = cosine_similarity(query_vec, self.tfidf_matrix).flatten()

        # 4. Rank and Filter
        # argsort returns indices of sorted elements (ascending), so we take last top_k and reverse
        top_indices = similarity_scores.argsort()[-top_k:][::-1]

        # 5. Output
        hits_found = False
        print(f"Top {top_k} Recommendations:")

        for idx in top_indices:
            score = similarity_scores[idx]

            # Threshold to filter out irrelevant noise
            if score > min_score:
                hits_found = True
                song = self.df.iloc[idx]
                print(f" > [{score:.2f}] {song['title']} ({song['movie']})")
                print(f"   Context: {song['context']} | Rasa: {song['primary_rasa']}")

        if not hits_found:
            print("(!) No strong matches found. Try keywords like 'dance', 'sufi', or 'gym'.")


def main():
    """
    Demo entry point: build the recommender from the mock database and
    print recommendations for a few sample queries. If construction fails,
    the error is reported and nothing else runs.
    """
    # Sample prompts exercising the drive, heartbreak and gym intents
    demo_prompts = (
        "I'm going on a long drive with friends",
        "I feel broken and just want to cry alone",
        "Need something aggressive for the gym",
    )

    # Build the engine; bail out with a message if indexing fails
    try:
        recommender = BollywoodRecommender(BOLLYWOOD_DB)
    except Exception as exc:
        print(f"Error initializing recommender: {exc}")
        return

    for prompt in demo_prompts:
        recommender.recommend(prompt)

if __name__ == "__main__":
    main()

[*] Indexing 8 tracks...
[*] Vector space built successfully.

--- Query: 'I'm going on a long drive with friends' ---
DEBUG: Expanded intent to -> [I'm going on a long drive with friends road trip h...]
Top 3 Recommendations:
 > [0.46] Chaiyya Chaiyya (Dil Se)
   Context: road trip | Rasa: Veer (Energy/Heroism)
 > [0.33] Senorita (Zindagi Na Milegi Dobara)
   Context: party | Rasa: Adbhuta (Wonder/Fun)

--- Query: 'I feel broken and just want to cry alone' ---
DEBUG: Expanded intent to -> [I feel broken and just want to cry alone Karuna he...]
Top 3 Recommendations:
 > [0.56] Tujhe Bhula Diya (Anjaana Anjaani)
   Context: breakup | Rasa: Karuna (Sadness/Compassion)

--- Query: 'Need something aggressive for the gym' ---
DEBUG: Expanded intent to -> [Need something aggressive for the gym Veer power m...]
Top 3 Recommendations:
 > [0.58] Apna Time Aayega (Gully Boy)
   Context: motivation | Rasa: Veer (Heroism/Grit)
 > [0.16] Chak De India (Chak De! India)
   Context: patriotic | Rasa: 