In [1]:
import numpy as np
import pandas as pd
import difflib
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import h5py



In [17]:

# Data Collection
musicArtist_data = pd.read_csv(r"C:\Users\parab\Desktop\secret\Portfolio projects\MUSIC RECOMMENDATION SYSTEM\artists.csv", encoding='latin1')
musicArtist_data.head()


   Index                                  mbid              artist_mb  \
0      0  cc197bad-dc9c-440d-a5b5-d52ba2e14234               Coldplay   
1      1  a74b1b7f-71a5-4011-9441-d0b5e4122711              Radiohead   
2      2  8bfac288-ccc5-448d-9573-c33ea2aa5c30  Red Hot Chili Peppers   
3      3  73e5e69d-3554-40d8-8516-00cb38737a1c                Rihanna   
4      4  b95ce3ff-3d05-4e87-9e01-c97b66af13d4                 Eminem   

           artist_lastfm      country_mb           country_lastfm  \
0               Coldplay  United Kingdom           United Kingdom   
1              Radiohead  United Kingdom           United Kingdom   
2  Red Hot Chili Peppers   United States            United States   
3                Rihanna   United States  Barbados; United States   
4                 Eminem   United States            United States   

                                             tags_mb  \
0  rock; pop; alternative rock; british; uk; brit...   
1  rock; electronic; alternative r

In [None]:
musicArtist_data.shape

In [5]:
# Data Filtering
musicArtists_data = musicArtist_data.iloc[0:15000].drop_duplicates(subset='artist_mb', keep='first')


In [6]:
# Clean tags columns
musicArtists_data['tags_mb'] = musicArtists_data['tags_mb'].str.replace(" ", "")
musicArtists_data['tags_lastfm'] = musicArtists_data['tags_lastfm'].str.replace(" ", "")


In [7]:

# Combine features
combined_features = musicArtists_data['country_mb'] + ' ' + musicArtists_data['tags_lastfm']


In [8]:
# Replace NaN values with an empty string
combined_features = combined_features.fillna('')


In [9]:
# Vectorizing the data
vectorizer = TfidfVectorizer()
feature_vector = vectorizer.fit_transform(combined_features)


In [None]:

# Cosine Similarity
similarity = cosine_similarity(feature_vector)
similarity[1]

In [11]:
# Save similarity matrix to HDF5 file
with h5py.File('similarityscores.h5', 'w') as hf:
    hf.create_dataset('similarity', data=similarity)


In [12]:
# Load similarity matrix from HDF5 file
with h5py.File('similarityscores.h5', 'r') as hf:
    similarity1 = hf['similarity'][:]


In [13]:

# Recommendation Function
def recommend(musicArtist):
    musicArtist_name = musicArtist
    list_of_artist_name = musicArtists_data['artist_mb'].tolist()
    find_close_match = difflib.get_close_matches(musicArtist_name, list_of_artist_name)

    if find_close_match:
        close_match = find_close_match[0]
        musicArtist_index = musicArtists_data[musicArtists_data['artist_mb'] == close_match].index[0]
        distances = similarity[musicArtist_index]
        musicArtists_list = sorted(enumerate(distances), key=lambda x: x[1], reverse=True)[1:30]

        recommended_artists = [musicArtists_data.iloc[i[0]].artist_mb for i in musicArtists_list]
        return recommended_artists
    else:
        print(f"No close match found for {musicArtist_name}")
        return []

# Example usage
recommended_artists = recommend('arctic monkeys')
print(recommended_artists)

['Razorlight', 'Dirty Pretty Things', 'Babyshambles', 'Kaiser Chiefs', 'Hardâ\x80\x90Fi', 'Editors', 'Oasis', 'Supergrass', 'MaxÃ¯mo Park', 'Blur', 'Bloc Party', 'Kasabian', 'Franz Ferdinand', 'Little Man Tate', 'Milburn', 'Graham Coxon', 'Art Brut', 'Athlete', 'British Sea Power', 'Stereophonics', 'Nine Black Alps', 'Doves', 'Starsailor', 'Idlewild', 'Elastica', 'Feeder', 'Placebo', 'Ash', 'Mystery Jets']
