In [None]:
!pip install langchain langchain-community sentence-transformers faiss-cpu pandas numpy
!pip install transformers huggingface_hub langchain_huggingface

In [1]:
# Install required libraries

import pandas as pd
import numpy as np
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import FakeListLLM
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.documents import Document
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
import os

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d undefinenull/million-song-dataset-spotify-lastfm

Dataset URL: https://www.kaggle.com/datasets/undefinenull/million-song-dataset-spotify-lastfm
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading million-song-dataset-spotify-lastfm.zip to /content
 97% 623M/639M [00:03<00:00, 141MB/s] 
100% 639M/639M [00:03<00:00, 204MB/s]


In [4]:
!unzip million-song-dataset-spotify-lastfm.zip

Archive:  million-song-dataset-spotify-lastfm.zip
  inflating: MP3-Example/Blues/Blues-TRACOHF128F1498509.mp3  
  inflating: MP3-Example/Blues/Blues-TRADWSG128F4259317.mp3  
  inflating: MP3-Example/Blues/Blues-TRAELUY128F93147BA.mp3  
  inflating: MP3-Example/Blues/Blues-TRAHVWB128F9308FB0.mp3  
  inflating: MP3-Example/Blues/Blues-TRAJGIO128F92E84BC.mp3  
  inflating: MP3-Example/Blues/Blues-TRAOPNA128F427E83F.mp3  
  inflating: MP3-Example/Blues/Blues-TRAQGFP128E078FAB6.mp3  
  inflating: MP3-Example/Blues/Blues-TRARJEK128F930B3AA.mp3  
  inflating: MP3-Example/Blues/Blues-TRATLTV128F92FC979.mp3  
  inflating: MP3-Example/Blues/Blues-TRAUJPR128F92CA3AF.mp3  
  inflating: MP3-Example/Blues/Blues-TRAUYOD128F42647FE.mp3  
  inflating: MP3-Example/Blues/Blues-TRAYHQV128F4266D63.mp3  
  inflating: MP3-Example/Blues/Blues-TRBVOHY128F92E6A11.mp3  
  inflating: MP3-Example/Blues/Blues-TRCPUIL128F931ACAF.mp3  
  inflating: MP3-Example/Blues/Blues-TRCQEDW128E078F467.mp3  
  inflating: MP3-Exa

In [3]:
music_hist=pd.read_csv('/content/Music Info.csv')

In [4]:
user_hist=pd.read_csv('/content/User Listening History.csv')

In [5]:
# Merge the dataframes on the 'track_id' column
df = pd.merge(user_hist, music_hist, on='track_id', how='left')

In [6]:
# Step 2: Preprocess the dataset
# Combine relevant metadata (title, artist, tags) into a single text field for RAG
df['metadata'] = df['name'].astype(str) + ' by ' + df['artist'].astype(str) + ' tags: ' + df['tags'].astype(str)
# Handle missing play counts (fill NaN with 0)
df['playcount'] = df['playcount'].fillna(0).astype(int)
# Use a subset for faster processing (e.g., 10,000 rows)
df_subset = df.sample(n=100000, random_state=42).reset_index(drop=True)

In [7]:
df_subset.sample(5)

Unnamed: 0,track_id,user_id,playcount,name,artist,spotify_preview_url,spotify_id,tags,genre,year,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,metadata
67357,TRKQXCC128F93501B5,bcf59ac82b70072e06655fe7d640aad7a2c83ef1,5,Sentimental Heart,She & Him,https://p.scdn.co/mp3-preview/9981a1e9bd75f821...,1LdOt2dCyVHK4mMjaBUfGV,"indie, female_vocalists, folk, indie_pop, mellow",,2008,...,-7.982,1,0.0261,0.896,1e-05,0.228,0.553,76.575,4,"Sentimental Heart by She & Him tags: indie, fe..."
25299,TRBXLFU128F427A494,0a015e4e2982b47516e3569f6cc8abffc824e69c,3,Creature Fear,Bon Iver,https://p.scdn.co/mp3-preview/e4c91622605aed3e...,1ybu2CSF26zyquwCWb6YQJ,"alternative, indie, folk, indie_rock, singer_s...",Folk,2007,...,-15.945,1,0.0577,0.932,0.00736,0.325,0.181,104.381,3,"Creature Fear by Bon Iver tags: alternative, i..."
91973,TRGGULV128F4295F6A,efdfc89e1cbb6e8054b30f0c9709e20f18f6b1e6,7,How Can I Keep From Singing?,Enya,https://p.scdn.co/mp3-preview/fb77937aca7b066e...,0nThdvlErIL2MEpbE7iKu0,"ambient, new_age",New Age,1991,...,-17.1,1,0.0388,0.948,0.0196,0.098,0.0398,128.86,4,How Can I Keep From Singing? by Enya tags: amb...
81563,TRUHUNN128F92E58B5,9146cb74dff1732680ae77be17ad8092b22c8ad8,4,Hard Sun,Eddie Vedder,https://p.scdn.co/mp3-preview/898489a36e3b5714...,0Zf5BJahvBQ6yLoCR6fPko,"rock, alternative, alternative_rock, folk, sin...",Rock,2007,...,-6.544,1,0.0331,0.0533,0.000328,0.325,0.643,143.395,4,"Hard Sun by Eddie Vedder tags: rock, alternati..."
97843,TRJRBOJ128F42383E9,eb5931b091c3070c6caaa884ab6a95c160dc7358,1,Kiksu,Kyau vs. Albert,https://p.scdn.co/mp3-preview/2e2d73a6b0643822...,0qXWzxcJk1Nyn1CRpgTDAp,trance,Electronic,2013,...,-7.32,1,0.0358,0.00627,0.58,0.0917,0.254,138.01,4,Kiksu by Kyau vs. Albert tags: trance


In [87]:
# Get the 'tags' for "Warrior's Dance"
tags = df_subset.loc[df_subset['name'] == "Out All Night", 'tags']

print(tags)

2180    ska
Name: tags, dtype: object


In [8]:
# Step 3: Load data into LangChain documents with explicit index metadata
documents = []
for idx, row in df_subset.iterrows():
    doc = Document(
        page_content=row['metadata'],
        metadata={'index': idx, 'track_id': row['track_id'], 'name': row['name'], 'artist': row['artist']}
    )
    documents.append(doc)

In [67]:
# Debugging: Verify document metadata
print("\nDebugging: Sample document metadata")
for doc in documents[:3]:
    print(f"- Content: {doc.page_content}, Metadata: {doc.metadata}")


Debugging: Sample document metadata
- Content: Warriors Of Time by Black Tide tags: metal, heavy_metal, thrash_metal, Metadata: {'index': 0, 'track_id': 'TRNTNZL128F4259856', 'name': 'Warriors Of Time', 'artist': 'Black Tide'}
- Content: Kilometros by Sin Bandera tags: classical, Metadata: {'index': 1, 'track_id': 'TRNBMJP128F93433D8', 'name': 'Kilometros', 'artist': 'Sin Bandera'}
- Content: A Mutiny by Red Sparowes tags: experimental, instrumental, post_rock, Metadata: {'index': 2, 'track_id': 'TRDYGUV12903CDAF1E', 'name': 'A Mutiny', 'artist': 'Red Sparowes'}


In [10]:
# Set Hugging Face API key (optional, if using Inference API)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "PLACE YOUR HUGGINGFACE TOKEN"

In [11]:
# Step 5: Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(documents , embeddings)

In [12]:
# Step 6: Set up retriever
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10, "lambda_mult": 0.5})  # Retrieve top 10 similar songs

In [13]:
# Step 7: Define a simple prompt template for RAG
prompt_template = """
You are a music recommendation assistant. Based on the song provided, recommend similar songs using the following metadata:
{context}

User input: {question}

Provide a list of recommended songs (title and artist) in a conversational tone. For each song, briefly explain why it was recommended (e.g., similar genre, artist, or tags). Limit to {num_recommendations} recommendations.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question", "num_recommendations"])




In [14]:
# Set Hugging Face API key (optional, if using Inference API)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Initialize Hugging Face LLM
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    pipline_kwargs={"temperature": 0.1, "max_new_tokens": 512}

)

model = ChatHuggingFace(
    llm=llm
)

                    pipline_kwargs was transferred to model_kwargs.
                    Please make sure that pipline_kwargs is what you intended.


In [15]:
# Step 9: Create the RAG chain
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough(),"num_recommendations": lambda x: 5}
    | prompt
    | model
)

In [16]:
# Step 9: Hybrid recommendation function
def recommend_songs(song_name, num_recommendations=5):
    # Retrieve documents
    retrieved_docs = retriever.invoke(song_name)

    # Debugging: Print retrieved documents
    print(f"\nDebugging: Retrieved {len(retrieved_docs)} documents for '{song_name}'")
    for doc in retrieved_docs:
        print(f"- Retrieved: {doc.page_content}, Metadata: {doc.metadata}")

    # Extract indices
    indices = [doc.metadata.get('index') for doc in retrieved_docs if doc.metadata.get('index') is not None]

    # Debugging: Check indices
    print(f"Debugging: Retrieved indices: {indices}")

    if not indices:
        print(f"No songs found similar to '{song_name}'. Try another song or check the dataset.")
        return "No recommendations available. Try another song or check the dataset."

    # Grab the subset rows by index (copy to avoid SettingWithCopy warnings)
    retrieved_songs = df_subset.iloc[indices][['name', 'artist', 'playcount', 'tags']].copy()

    # Collaborative signal: normalize play counts
    max_play_count = df_subset['playcount'].max()
    if max_play_count > 0:
        retrieved_songs['normalized_play_count'] = retrieved_songs['playcount'] / max_play_count
    else:
        retrieved_songs['normalized_play_count'] = 0.0

    # Content similarity proxy
    retrieved_songs['similarity_score'] = np.linspace(1.0, 0.5, len(retrieved_songs))

    # Hybrid score
    retrieved_songs['hybrid_score'] = (
        0.7 * retrieved_songs['similarity_score'] +
        0.3 * retrieved_songs['normalized_play_count']
    )

    # Rank & return top-N
    recommendations = (
        retrieved_songs
        .sort_values(by='hybrid_score', ascending=False)
        [['name', 'artist', 'tags']]
        .head(num_recommendations)
        .reset_index(drop=True)
    )

    # Format context for LLM
    context = "\n".join([f"{row['name']} by {row['artist']} (tags: {row['tags']})" for _, row in recommendations.iterrows()])

    # Get LLM-generated response
    llm_response = rag_chain.invoke(song_name)

    return llm_response.content

In [17]:
# Step 11: Test the recommendation system
song_input = "Sentimental Heart"  # Replace with any song name
response = recommend_songs(song_input, num_recommendations=5)



Debugging: Retrieved 10 documents for 'Sentimental Heart'
- Retrieved: Sentimental Heart by She & Him tags: indie, female_vocalists, folk, indie_pop, mellow, Metadata: {'index': 76239, 'track_id': 'TRKQXCC128F93501B5', 'name': 'Sentimental Heart', 'artist': 'She & Him'}
- Retrieved: Sentimental Heart by She & Him tags: indie, female_vocalists, folk, indie_pop, mellow, Metadata: {'index': 58085, 'track_id': 'TRKQXCC128F93501B5', 'name': 'Sentimental Heart', 'artist': 'She & Him'}
- Retrieved: Sentimental Heart by She & Him tags: indie, female_vocalists, folk, indie_pop, mellow, Metadata: {'index': 3441, 'track_id': 'TRKQXCC128F93501B5', 'name': 'Sentimental Heart', 'artist': 'She & Him'}
- Retrieved: Sentimental Heart by She & Him tags: indie, female_vocalists, folk, indie_pop, mellow, Metadata: {'index': 10663, 'track_id': 'TRKQXCC128F93501B5', 'name': 'Sentimental Heart', 'artist': 'She & Him'}
- Retrieved: Sentimental Heart by She & Him tags: indie, female_vocalists, folk, indie_pop

In [18]:
print(f"Recommendations for '{song_input}':")
print(response)

Recommendations for 'Sentimental Heart':
 Based on your interest in "Sentimental Heart" by She & Him, I'd like to recommend some similar songs for you.

1. ** "Stay Awhile" by She & Him** - I recommend this song because it's also by She & Him, so you'll enjoy the same indie, female vocalists, folk, and indie pop styles that you hear in "Sentimental Heart."

2. ** "In the Sun" by She & Him** - This is another great track from She & Him. It shares the same genre and tags as "Sentimental Heart," making it a perfect match for your taste.

3. ** "Holocene" by Bon Iver** - If you enjoy the mellow and indie-folk elements of "Sentimental Heart," you might appreciate this Bon Iver song. It's a bit more alternative, but it still maintains that soothing, heartfelt sound.

4. ** "The Ghost on the Shore" by The Head and the Heart** - This song is a great fit due to its indie, folk, and indie pop style. The Head and the Heart often incorporate beautiful female vocals, much like She & Him.

5. ** "He