In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Example data for two songs with missing values
import pandas as pd
import json

# Load data from the text file
with open('song_data.txt', 'r') as file:
    data = json.load(file)

# Create a DataFrame
df = pd.DataFrame(data)

# Replace missing values with "unknown" and ensure dtype compatibility
df = df.astype({
    'artist': 'object',
    'album': 'object',
    'genre': 'object',
    'Language': 'object'
})
df.fillna("unknown", inplace=True)

In [4]:
# Vectorize categorical features: artist, album, genre, language
tfidf_artist = TfidfVectorizer()
tfidf_album = TfidfVectorizer()
tfidf_genre = TfidfVectorizer()
tfidf_language = TfidfVectorizer()

# Fit and transform each feature individually
artist_matrix = tfidf_artist.fit_transform(df['artist'].astype(str))  # Ensure strings
album_matrix = tfidf_album.fit_transform(df['album'].astype(str))      # Ensure strings
genre_matrix = tfidf_genre.fit_transform(df['genre'].astype(str))      # Ensure strings
language_matrix = tfidf_language.fit_transform(df['Language'].astype(str))  # Ensure strings

# Combine all TF-IDF matrices into a single matrix
combined_features = np.hstack([artist_matrix.toarray(), 
                               album_matrix.toarray(),
                               genre_matrix.toarray(),
                               language_matrix.toarray()])


# Include numerical features such as duration and likesCount
numerical_features = df[['duration', 'likesCount']].fillna(0).to_numpy()

# Combine TF-IDF features with numerical features
combined_features = np.hstack([combined_features, numerical_features])

# Calculate cosine similarity between all pairs of songs
cosine_sim = cosine_similarity(combined_features, combined_features)

# Function to get recommendations based on song ID
def get_recommendations(song_id, cosine_sim, df):
    # Get index of the song
    idx = df.index[df['_id'] == song_id].tolist()[0]
    
    # Get pairwise similarity scores with all other songs
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the songs based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 10 recommendations (excluding itself)
    sim_scores = sim_scores[1:11]
    
    # Get the indices of recommended songs
    song_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar song IDs
    return df.iloc[song_indices]['_id'].tolist()

# Example usage: Get recommendations for a song ID
song_id = "2"  # Replace with actual song ID
recommendations = get_recommendations(song_id, cosine_sim, df)
print("Recommendations for song ID", song_id, ":")
print(recommendations)





Recommendations for song ID 1 :
['17', '166', '124', '144', '97', '30', '75', '164', '109', '137']
