In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load data from the text file
with open('songs_data.txt', 'r') as file:
    data = json.load(file)

# Create a DataFrame
df = pd.DataFrame(data)

# Replace missing values with "unknown" and ensure dtype compatibility
df = df.astype({
    'artist': 'object',
    'album': 'object',
    'genre': 'object',
    'Language': 'object'
})
df.fillna("unknown", inplace=True)

# Convert genre lists to a single string
df['genre'] = df['genre'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Vectorize categorical features: artist, album, genre, language
tfidf_artist = TfidfVectorizer()
tfidf_album = TfidfVectorizer()
tfidf_genre = TfidfVectorizer()
tfidf_language = TfidfVectorizer()

# Fit and transform each feature individually
artist_matrix = tfidf_artist.fit_transform(df['artist'])
album_matrix = tfidf_album.fit_transform(df['album'])
genre_matrix = tfidf_genre.fit_transform(df['genre'])
language_matrix = tfidf_language.fit_transform(df['Language'])

# Combine all feature matrices into a single matrix
combined_features = np.hstack([artist_matrix.toarray(), 
                               album_matrix.toarray(),
                               genre_matrix.toarray(),
                               language_matrix.toarray()])

# Calculate cosine similarity between all pairs of songs
cosine_sim = cosine_similarity(combined_features, combined_features)

# Function to get recommendations based on song ID
def get_recommendations(song_id, cosine_sim, df):
    # Get index of the song
    idx = df.index[df['_id'] == song_id].tolist()[0]
    
    # Get pairwise similarity scores with all other songs
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the songs based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 10 recommendations (excluding itself)
    sim_scores = sim_scores[1:11]
    
    # Get the indices of recommended songs
    song_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar song IDs
    return df.iloc[song_indices]['_id'].tolist()

# Example usage: Get recommendations for a song ID
song_id = "1"  # Replace with actual song ID
recommendations = get_recommendations(song_id, cosine_sim, df)
print("Recommendations for song ID", song_id, ":")
print(recommendations)


AttributeError: 'list' object has no attribute 'lower'