In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

In [None]:
data = pd.read_csv("/Users/rohitsharma/Desktop/DM PROJECT/data.csv")
print(data.head())
data.info()
data.columns

In [None]:
# Group the data by 'year' and calculate the mean only for numeric columns
year_data = data.select_dtypes(include=['float64', 'int64']).groupby('year').mean().reset_index()
year_data.head()

In [None]:
def determine_column_counts(df):
    columns = df.columns
    columns = columns.drop('id')
    data_counts = pd.DataFrame(columns = ['column_name','column_type','num_rows','distinct_data', 'distinct_percent', 'missing_data', 'missing_percentage'])
    
    #For each column, calculate number and percentage of missing rows and distinct counts
    for col in columns :
        col_vals = df[col]
        total_data = len(col_vals)
        col_missing = sum(col_vals.isnull())
        missing_percent = round((col_missing / len(df[col])) * 100 , 3)
        col_distinct = (col_vals.nunique())
        distinct_percent = round((col_distinct / len(df[col])) * 100 , 3)

        data_counts.loc[len(data_counts)] = [col, col_vals.dtype, total_data, col_distinct, distinct_percent, col_missing, missing_percent]
        
    return(data_counts)

missing_data = determine_column_counts(data)
missing_data

There is no missing data present within the dataset, with 170653 rows. 

There are a few columns with only a couple distinct values - explicity and mode both only have two distinct values. 

Name, tempo, duration_ms have the highest percent of distinct data

In [None]:
# Setting up the aesthetics for the plots
sns.set(style="whitegrid")

# Plotting distributions of some key features
fig, axes = plt.subplots(3, 3, figsize=(15, 15))

# Valence distribution
sns.histplot(data['valence'], bins=30, kde=True, ax=axes[0, 0])
axes[0, 0].set_title('Distribution of Valence')

# Acousticness distribution
sns.histplot(data['acousticness'], bins=30, kde=True, ax=axes[0, 1])
axes[0, 1].set_title('Distribution of Acousticness')

# Danceability distribution
sns.histplot(data['danceability'], bins=30, kde=True, ax=axes[0,2])
axes[0,2].set_title('Distribution of Danceability')

# Energy distribution
sns.histplot(data['energy'], bins=30, kde=True, ax=axes[1, 0])
axes[1, 0].set_title('Distribution of Energy')

# Popularity distribution
sns.histplot(data['popularity'], bins=30, kde=True, ax=axes[1, 1])
axes[1, 1].set_title('Distribution of Popularity')

# Tempo distribution
sns.histplot(data['tempo'], bins=30, kde=True, ax=axes[1, 2])
axes[1, 2].set_title('Distribution of Tempo')


# Year distribution
sns.histplot(data['year'], bins=30, kde=True, ax=axes[2, 0])
axes[2, 0].set_title('Distribution of Songs By Year')

# Mode distribution
sns.histplot(data['year'], bins=30, kde=True, ax=axes[2, 1])
axes[2, 1].set_title('Distribution of Mode')

data['decade'] = data['year'].apply(lambda year : f'{(year//10)*10}s' )
sns.histplot(data['decade'], bins=30,  ax=axes[2, 2])
axes[2, 2].set_title('Distribution of Songs by Decade')

plt.tight_layout()
plt.show()


Distribution of Valence: Peaks in the mid-range, suggesting that most songs have moderate levels of musical positiveness.

Distribution of Acousticness: Shows a U-shaped distribution, indicating that songs are typically either highly acoustic or not at all.

Distribution of Danceability: Follows a bell-shaped curve centered around 0.5 to 0.7, implying most songs are moderately danceable.

Distribution of Energy: Also centers around a moderate level, suggesting that most songs have a balanced energy level.

Distribution of Popularity: Heavily skewed towards lower values, indicating that most songs have low popularity scores.

Distribution of Tempo: Shows a bell-shaped distribution with a peak around 120 BPM, suggesting that this is the most common tempo for songs.

Distribution of Songs by Year: Increases over time, particularly after 1960, indicating a growing number of songs being produced or recorded in databases.

Distribution of Mode: Shows a preference for one mode over the other, indicating a commonality in the modal structure of songs, with a majority likely being in a major key given the two distinct peaks.

Distribution of Songs by Decade: There's a noticeable increase in song counts beginning in the 1950s, with each subsequent decade having more songs than the last, peaking in the 2000s. The count for the 2010s appears slightly lower than the 2000s, but it's still significantly higher than in earlier decades. This could be due to various factors including the increase in music production, distribution channels, and changes in recording technologies.

In [None]:
# Set up the matplotlib figure
fig, axes = plt.subplots(2, 2, figsize=(15, 15))  # 3 Rows, 2 Columns

# Danceability vs. Valence
sns.scatterplot(ax=axes[0, 0], data=data, x='danceability', y='valence')
axes[0, 0].set_title('Danceability vs. Valence')
axes[0, 0].set_xlabel('Danceability')
axes[0, 0].set_ylabel('Valence')

# Year vs. Popularity
sns.lineplot(ax=axes[0, 1], data=data, x='year', y='popularity', ci=None)
axes[0, 1].set_title('Year vs. Popularity')
axes[0, 1].set_xlabel('Year')
axes[0, 1].set_ylabel('Popularity')

# Energy vs. Loudness
sns.scatterplot(ax=axes[1, 0], data=data, x='energy', y='loudness')
axes[1, 0].set_title('Energy vs. Loudness')
axes[1, 0].set_xlabel('Energy')
axes[1, 0].set_ylabel('Loudness')


# Speechiness vs. Explicit
sns.boxplot(ax=axes[1, 1], data=data, x='explicit', y='speechiness')
axes[1, 1].set_title('Speechiness by Explicit Content')
axes[1, 1].set_xlabel('Explicit Content')
axes[1, 1].set_ylabel('Speechiness')



# Adjust layout
plt.tight_layout()
plt.show()


Danceability vs. Valence: Shows no clear correlation between how danceable a song is and its positivity level. The data is spread evenly across all values.

Year vs. Popularity: Indicates a significant increase in song popularity from the 1960s to the present, with a particularly sharp rise in recent years.

Energy vs. Loudness: Demonstrates a positive relationship, with more energetic songs generally being louder.

Speechiness by Explicit Content: Reveals that explicit songs tend to have a higher range and median of speechiness compared to non-explicit songs.





In [None]:

# Now 'year' should be one of the columns, so we can use it in the plot
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence']
fig = px.line(year_data, x='year', y=sound_features, title='Trend of various sound features over decades')

# Show the plot
fig.show()


The chart indicates significant changes in the acousticness and instrumentalness of music over time, while other features like danceability, energy, liveness, and valence have shown more stability or minor fluctuations.






In [None]:
artists = pd.read_csv("/Users/rohitsharma/Desktop/DM PROJECT/data_by_artist.csv")
artists

In [None]:
top10_popular_artists = artists.nlargest(10, 'popularity')
top10_most_song_produced_artists = artists.nlargest(10, 'count')
print('Top 10 Artists that produced most songs:')
top10_most_song_produced_artists[['count','artists']].sort_values('count',ascending=False)

In [None]:

print('Top 10 Artists that had most popularity score:')
top10_popular_artists[['popularity','artists']].sort_values('popularity',ascending=False)

In [None]:
## GENRE ANALYSIS

In [None]:
data_w_genres = pd.read_csv('/Users/rohitsharma/Desktop/DM PROJECT/data_w_genres.csv')

print(data_w_genres.head())

print(data_w_genres.columns)

In [None]:
## Explore Genre Distribution

In [None]:
data_w_genres['genres'] = data_w_genres['genres'].apply(eval)  # Convert string representation of list to actual list
data_exploded_genres = data_w_genres.explode('genres')

genre_counts = data_exploded_genres['genres'].value_counts()

# Display the top 10 genres
print(genre_counts.head(10))

In [None]:
#Analyze Features by Genre

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# For simplicity, let's focus on the top 5 genres
top_genres = genre_counts.head(5).index.tolist()

# Filter data to include only the top genres
data_top_genres = data_exploded_genres[data_exploded_genres['genres'].isin(top_genres)]

# Visualize the distribution of musical features for the top genres
plt.figure(figsize=(20, 10))
for i, feature in enumerate(['acousticness', 'danceability', 'energy', 'tempo', 'valence'], 1):
    plt.subplot(2, 3, i)
    sns.boxplot(x='genres', y=feature, data=data_top_genres)
    plt.title(f'Distribution of {feature} by Genre')
plt.tight_layout()
plt.show()

In [None]:
# Select only numeric columns for the mean calculation, excluding non-numeric ones like 'artists'
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# Now group by 'year' and calculate the mean for these numeric columns only
data_by_year = numeric_data.groupby('year').mean().reset_index()

# Display the first few rows to verify
print(data_by_year.head())

In [None]:
data_by_year = data_by_year.reset_index()

# Use pd.melt to transform the DataFrame into a long-form DataFrame
data_long = pd.melt(data_by_year, id_vars=['year'], value_vars=['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence'])

# Plot the data using seaborn
plt.figure(figsize=(16, 8))  # Set the figure size
sns.lineplot(data=data_long, x='year', y='value', hue='variable')

# Customize the plot with titles, labels, etc.
plt.title('Musical Features Over Time')
plt.xlabel('Year')
plt.ylabel('Value')
plt.legend(title='Variable')
plt.show()

In [None]:
data['count'] = 1  # Temporary column to aid in counting
releases_by_year = data.groupby('year')['count'].sum().reset_index()

# Merge this with your data_by_year DataFrame
data_by_year = pd.merge(data_by_year, releases_by_year, on='year', how='left')

# Plotting features with popularity and number of releases
plt.figure(figsize=(15, 7))
sns.lineplot(x='year', y='popularity', data=data_by_year, label='Popularity', color='green')
ax2 = plt.twinx()
sns.lineplot(x='year', y='count', data=data_by_year, label='Number of Releases', color='red', ax=ax2)

plt.title('Popularity and Number of Releases Over Time')
ax2.set_ylabel('Number of Releases')
plt.legend(title='Metrics')
plt.show()


In [None]:
## Recommender System

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

In [2]:
data = pd.read_csv("/Users/rohitsharma/Desktop/DM PROJECT/data.csv")
data.columns
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [3]:
data_w_genres = pd.read_csv('/Users/rohitsharma/Desktop/DM PROJECT/data_w_genres.csv')
print(data_w_genres.columns)


Index(['genres', 'artists', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key', 'mode', 'count'],
      dtype='object')


In [4]:
data_by_genres = pd.read_csv('/Users/rohitsharma/Desktop/DM PROJECT/data_by_genres.csv')

print(data_by_genres.columns)


Index(['mode', 'genres', 'acousticness', 'danceability', 'duration_ms',
       'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
       'tempo', 'valence', 'popularity', 'key'],
      dtype='object')


In [5]:
data_by_year = pd.read_csv('/Users/rohitsharma/Desktop/DM PROJECT/data_by_year.csv')

print(data_by_year.columns)

Index(['mode', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'key'],
      dtype='object')


In [6]:
data_by_artist = pd.read_csv('/Users/rohitsharma/Desktop/DM PROJECT/data_by_artist.csv')

print(data_by_artist.columns)

Index(['mode', 'count', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'popularity', 'key'],
      dtype='object')


In [7]:
import ast
# Function to safely convert to list
def safe_convert_to_list(value):
    if isinstance(value, list):  # If it's already a list, just return it
        return value
    try:
        # Attempt to convert a string representation of a list into a list
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):  # If there's an error during conversion
        return []  # Return an empty list or some other default

# Apply this function to each entry in the 'artists' column
data['artists'] = data['artists'].apply(safe_convert_to_list)

# Explode the 'artists' list into separate rows
data_exploded = data.explode('artists')

# Merge with data_w_genres
merged_df = pd.merge(data_exploded, data_w_genres[['artists','genres']], left_on='artists', right_on='artists', how='left')

In [8]:
merged_df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo', 'genres'],
      dtype='object')

In [None]:
## RECOMMENDER SYSTEM

In [10]:
song_library = merged_df.sort_values(by=['popularity'], ascending=False).head(10000)
song_library.reset_index(inplace=True, drop=True)
song_library.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,genres
0,0.145,2020,0.401,Bad Bunny,0.731,205090,0.573,1,47EiUVwUp4C9fGccaPuUCS,5.2e-05,4,0.113,-10.059,0,Dakiti,100,2020-10-30,0.0544,109.928,"['latin', 'reggaeton', 'trap latino']"
1,0.145,2020,0.401,Jhay Cortez,0.731,205090,0.573,1,47EiUVwUp4C9fGccaPuUCS,5.2e-05,4,0.113,-10.059,0,Dakiti,100,2020-10-30,0.0544,109.928,"['latin', 'reggaeton', 'trap latino']"
2,0.756,2020,0.221,iann dior,0.7,140526,0.722,1,3tjFYV6RSFtuktYl3ZtYcq,0.0,7,0.272,-3.558,0,Mood (feat. iann dior),99,2020-07-24,0.0369,90.989,"['melodic rap', 'pop rap']"
3,0.756,2020,0.221,24kGoldn,0.7,140526,0.722,1,3tjFYV6RSFtuktYl3ZtYcq,0.0,7,0.272,-3.558,0,Mood (feat. iann dior),99,2020-07-24,0.0369,90.989,['cali rap']
4,0.737,2020,0.0112,BTS,0.746,199054,0.765,0,0t1kP63rueHleOhQkYSXFY,0.0,6,0.0936,-4.41,0,Dynamite,97,2020-08-28,0.0993,114.044,"['k-pop', 'k-pop boy group']"


In [11]:
# Create CountVectorizer object to transform text into vector
song_vectorizer = CountVectorizer()

# Fit the vectorizer on "genres" field of song_library DataFrame
song_vectorizer.fit(song_library['genres'])

In [12]:
genre_vectors = song_vectorizer.transform(song_library['genres']).toarray()


### PIPELINE

In [28]:
pip install spotipy

Collecting spotipy
  Obtaining dependency information for spotipy from https://files.pythonhosted.org/packages/b8/e8/4c099f9431ec9a86f576b344702cd4446d1ff7df09b172dc1951f25d58b1/spotipy-2.23.0-py3-none-any.whl.metadata
  Downloading spotipy-2.23.0-py3-none-any.whl.metadata (3.3 kB)
Collecting redis>=3.5.3 (from spotipy)
  Obtaining dependency information for redis>=3.5.3 from https://files.pythonhosted.org/packages/63/c9/7e8397d1eedaadcd2fbcbbd34b1373c08743ebb475a0afda7089df6bb646/redis-5.0.2-py3-none-any.whl.metadata
  Downloading redis-5.0.2-py3-none-any.whl.metadata (9.3 kB)
Collecting async-timeout>=4.0.3 (from redis>=3.5.3->spotipy)
  Obtaining dependency information for async-timeout>=4.0.3 from https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl.metadata
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Downloading red

In [29]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import clear_output
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import warnings
warnings.filterwarnings("ignore")

In [30]:
client_id='6c235f1045cf45c08208ee92dd45c146'
client_secret='6c54872c4cde4e24aa8241af7b142163'

In [31]:
credmanager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=credmanager)

In [32]:
artist_search = sp.search('Eminem', type='artist')['artists']['items'][0]
artist_search

{'external_urls': {'spotify': 'https://open.spotify.com/artist/7dGJo4pcD2V6oG8kP0tJRR'},
 'followers': {'href': None, 'total': 81324558},
 'genres': ['detroit hip hop', 'hip hop', 'rap'],
 'href': 'https://api.spotify.com/v1/artists/7dGJo4pcD2V6oG8kP0tJRR',
 'id': '7dGJo4pcD2V6oG8kP0tJRR',
 'images': [{'height': 640,
   'url': 'https://i.scdn.co/image/ab6761610000e5eba00b11c129b27a88fc72f36b',
   'width': 640},
  {'height': 320,
   'url': 'https://i.scdn.co/image/ab67616100005174a00b11c129b27a88fc72f36b',
   'width': 320},
  {'height': 160,
   'url': 'https://i.scdn.co/image/ab6761610000f178a00b11c129b27a88fc72f36b',
   'width': 160}],
 'name': 'Eminem',
 'popularity': 90,
 'type': 'artist',
 'uri': 'spotify:artist:7dGJo4pcD2V6oG8kP0tJRR'}

In [33]:
def art_features(spotify_search_result):
    """function to retrieve info about the artists"""
    result = {
        'artist_name': spotify_search_result.get('name', 'artist_name_not_available'),
        'artist_id': spotify_search_result.get('id', 'artist_id_not_available'),
        'artist_popularity': spotify_search_result.get('popularity', 0),
        'artist_first_genre': (spotify_search_result.get('genres', ['genre_not_available']) + ['genre_not_available'])[0],
        'artist_n_followers': spotify_search_result.get('followers', {}).get('total', 0)
    }
    return result

artist_features = art_features(artist_search)
artist_features

{'artist_name': 'Eminem',
 'artist_id': '7dGJo4pcD2V6oG8kP0tJRR',
 'artist_popularity': 90,
 'artist_first_genre': 'detroit hip hop',
 'artist_n_followers': 81324558}

In [34]:
artist_related_artists = sp.artist_related_artists(artist_features['artist_id'])['artists']

print('Artist has', len(artist_related_artists), 'related artists. The top 3 is given below. \n\n')

print(art_features(artist_related_artists[0]))
print('\n')
print(art_features(artist_related_artists[1]))
print('\n')
print(art_features(artist_related_artists[2]))

Artist has 20 related artists. The top 3 is given below. 


{'artist_name': 'Dr. Dre', 'artist_id': '6DPYiyq5kWVQS4RGwxzPC7', 'artist_popularity': 79, 'artist_first_genre': 'g funk', 'artist_n_followers': 11214305}


{'artist_name': '50 Cent', 'artist_id': '3q7HBObVc0L8jNeTe5Gofh', 'artist_popularity': 83, 'artist_first_genre': 'east coast hip hop', 'artist_n_followers': 13644360}


{'artist_name': 'Snoop Dogg', 'artist_id': '7hJcb9fa4alzcOq3EaNPoG', 'artist_popularity': 81, 'artist_first_genre': 'g funk', 'artist_n_followers': 11083256}


In [35]:
## Pipeline
import networkx as nx

def create_expanded_graph(artists_name_list, max_artists, expand_factor):
    """
    Creates a graph from a list of artist names and expands it by adding related artists.
    
    Parameters:
    - artists_name_list: List of artist names to search and add to the graph.
    - max_artists: Maximum number of artists to add to the graph, including related artists.
    
    Returns:
    - A NetworkX graph object with artists as nodes.
    """
    G = nx.Graph()  # Create an empty graph
    
    # Initial addition of artists from the provided list
    for name in artists_name_list:
        if len(G) >= max_artists*expand_factor:
            break  # Stop if we've reached the max artists limit
        
        search_result = sp.search(name, type='artist')['artists']['items']
        if search_result:
            this_artist = art_features(search_result[0])
            G.add_node(this_artist['artist_name'], **this_artist, related_found=False)
    
    # Expansion of the graph with related artists
    while True:
        current_size = len(G)
        for x in list(G):
            if G.nodes[x]['related_found'] or len(G) >= max_artists*expand_factor:
                continue  # Skip if related artists are found or max size is reached
            
            relateds = sp.artist_related_artists(G.nodes[x]['artist_id'])['artists']
            for r in relateds:
                rdict = art_features(r)
                rname = rdict['artist_name']
                
                if rname not in G:
                    G.add_node(rname, **rdict, related_found=False)
                    if len(G) >= max_artists*expand_factor:
                        break  # Stop if we've reached the max artists limit
                
                G.add_edge(x, rname)  # Add an edge between the artist and their related artist
            
            G.nodes[x]['related_found'] = True  # Mark the artist's related artists as found
        
        if current_size == len(G) or len(G) >= max_artists*expand_factor:
            break  # Stop if no new artists were added or max size is reached
    
    #print(f'Graph created with {len(G)} artists.')
    return G

In [36]:
def print_all_artists(G):
    """
    Prints all artists in the graph.

    Parameters:
    - G: NetworkX graph object with artists as nodes.
    """
    for artist in G.nodes:
        print(artist)

In [37]:
import math

def prune_artists_for_diversity(G, original_artists, max_artists, diversity_factor):
    """
    Prunes the graph to a list of artist names based on a diversity factor.
    
    Parameters:
    - G: The expanded graph of artists.
    - original_artists: List of original artist names that should not be included in the final list.
    - max_artists: Maximum number of artists to include in the final list.
    - diversity_factor: A value between 0 and 1 indicating the balance between including artists with low and high connectivity.
    
    Returns:
    - A list of artist names selected based on the specified diversity.
    """
    # Exclude original artists and get the degree of the remaining nodes
    remaining_nodes = [(node, G.degree(node)) for node in G.nodes if node not in original_artists]
    
    # Sort nodes based on their degree (connectivity)
    nodes_sorted_by_degree = sorted(remaining_nodes, key=lambda x: x[1])
    
    # Determine the number of artists to pick from the least and most connected nodes
    num_least_connected = math.floor(diversity_factor * max_artists)
    num_most_connected = max_artists - num_least_connected
    
    # Select artists based on the diversity factor
    selected_artists = [node[0] for node in nodes_sorted_by_degree[:num_least_connected]] + \
                       [node[0] for node in nodes_sorted_by_degree[-num_most_connected:]]
    
    return selected_artists

In [38]:
def select_artists(artists_name_list, max_artists, expand_factor, diversity_factor):
    G = create_expanded_graph(artists_name_list, max_artists, expand_factor)
    recommended_artists = prune_artists_for_diversity(G, artists_name_list, max_artists=max_artists, diversity_factor=diversity_factor)
    return recommended_artists

In [39]:
artists_name_list = [
    'Tate McRae', 'Harry Styles', 'Conan Gray', 'The Weeknd', 'Post Malone'
]
max_artists = 10
expand_factor = 15
diversity_factor = 0.6
recommended_artists = select_artists(artists_name_list, max_artists, expand_factor, diversity_factor)
print("Recommended Artists:", recommended_artists)

Recommended Artists: ['Anna Clendening', 'Louis Tomlinson', 'Niall Horan', 'Liam Payne', 'One Direction', 'ZAYN', 'Madison Beer', 'Alec Benjamin', 'Sabrina Carpenter', 'Gracie Abrams']


In [None]:
##

In [43]:
def song_recommender_by_recommended_artists(recommended_artists, song_library, genre_vectors):
    try:
        # Directly use recommended_artists for filtering
        artist_songs = song_library[song_library['artists'].apply(lambda x: any(artist in x for artist in recommended_artists))]
        
        if artist_songs.empty:
            print('No songs found for the recommended artists in the song library.')
            return

        # The rest of the function remains the same
        num_cols = ['year', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        avg_num_features = artist_songs[num_cols].mean().to_numpy().reshape(1, -1)

        artist_song_indices = artist_songs.index.tolist()
        avg_genre_vector = np.mean(genre_vectors[artist_song_indices], axis=0).reshape(1, -1)

        genre_similarities = cosine_similarity(avg_genre_vector, genre_vectors)
        num_similarities = cosine_similarity(avg_num_features, song_library[num_cols].to_numpy())

        avg_similarities = (genre_similarities + num_similarities) / 2
        sim_scores_indices = np.argsort(-avg_similarities[0])

        recommended_songs = song_library.iloc[sim_scores_indices].loc[~song_library.index.isin(artist_song_indices)][['name', 'artists', 'year']].head(5)

        return recommended_songs.reset_index(drop=True)
    except Exception as e:
        print(f'An error occurred: {e}')

In [54]:
artists_name_list = ['Tate McRae', 'Def Leppard', 'Opeth', 'The Weeknd', 'Post Malone']
max_artists = 10
expand_factor = 15
diversity_factor = 0.6
recommended_artists = select_artists(artists_name_list, max_artists, expand_factor, diversity_factor)
print("Recommended Artists:", recommended_artists)

Recommended Artists: ['boygenius', 'Alec Benjamin', 'Sasha Alex Sloan', 'Ruth B.', 'Dean Lewis', 'Poison', 'Lauren Spencer Smith', 'Nessa Barrett', 'Sabrina Carpenter', 'Madison Beer']


In [55]:
# Use the recommended artists list to get song recommendations
recommended_songs = song_recommender_by_recommended_artists(recommended_artists, song_library, genre_vectors)
print(recommended_songs)

                                  name        artists  year
0              The Code (feat. Polo G)       King Von  2020
1  No Se Me Quita (feat. Ricky Martin)         Maluma  2019
2             Solo Quédate En Silencio  Maite Perroni  2004
3                            Headlines          Drake  2011
4                      BROWN SKIN GIRL      SAINt JHN  2019


## EXTRA CODE (Song Recommender by Artist & Song - Song Recommender)

### Song Recommender by Artist

In [22]:
def song_recommender_by_artists(artist_list, song_library, genre_vectors):
    try:
        # Filter songs by the given list of artists
        artist_songs = song_library[song_library['artists'].apply(lambda x: any(artist in x for artist in artist_list))]
        
        if artist_songs.empty:
            print('No songs found for the given artists in the song library.')
            return

        # Average numerical features for the selected songs
        num_cols = ['year', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
        avg_num_features = artist_songs[num_cols].mean().to_numpy().reshape(1, -1)

        # Average genre vectors for the selected songs
        artist_song_indices = artist_songs.index.tolist()
        avg_genre_vector = np.mean(genre_vectors[artist_song_indices], axis=0).reshape(1, -1)

        # Calculate cosine similarity for genres and numerical features across the entire song library
        genre_similarities = cosine_similarity(avg_genre_vector, genre_vectors)
        num_similarities = cosine_similarity(avg_num_features, song_library[num_cols].to_numpy())

        # Average of genre and numerical similarities
        avg_similarities = (genre_similarities + num_similarities) / 2

        # Sort songs by similarity
        sim_scores_indices = np.argsort(-avg_similarities[0])

        # Create DataFrame for recommended songs, excluding the songs by the input artists
        recommended_songs = song_library.iloc[sim_scores_indices].loc[~song_library.index.isin(artist_song_indices)][['name', 'artists', 'year']].head(5)

        return recommended_songs.reset_index(drop=True)
    except Exception as e:
        print(f'An error occurred: {e}')

In [None]:
recommended_artists

In [40]:
artist_list = ["Def Leppard", "Opeth"] 
recommended_songs = song_recommender_by_artists(artist_list, song_library, genre_vectors)
print(recommended_songs)

No songs found for the given artists in the song library.
None


### SONG RECOMMENDER

In [17]:
import numpy as np

In [None]:
song_library = merged_df.sort_values(by=['popularity'], ascending=False).head(10000)
song_library.reset_index(inplace=True, drop=True)
song_library.head()

In [None]:
# Create CountVectorizer object to transform text into vector
song_vectorizer = CountVectorizer()

# Fit the vectorizer on "genres" field of song_library DataFrame
song_vectorizer.fit(song_library['genres'])

In [18]:
def song_recommender(song_name):
    try:
        # Numeric columns (audio features) in song_library DataFrame
        num_cols = ['year', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness',
                    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

        # Create vector from "genres" field (text data) for given song
        text_vec1 = song_vectorizer.transform(song_library[song_library['name']==str(song_name)]['genres']).toarray()

        # Create vector from numerical columns for given song
        num_vec1 = song_library[song_library['name']==str(song_name)][num_cols].to_numpy()

        # Initialise empty list to store similarity scores
        sim_scores=  []

        # For every song/track in song library, determine cosine similarity with given song
        for index, row in song_library.iterrows():
            name = row['name']

            # Create vector from "genres" field for other songs
            text_vec2 = song_vectorizer.transform(song_library[song_library['name']==name]['genres']).toarray()

            # Create vector from numerical columns for other songs
            num_vec2 = song_library[song_library['name']==name][num_cols].to_numpy()

            # Calculate cosine similarity using text vectors
            text_sim = cosine_similarity(text_vec1, text_vec2)[0][0]

            # Calculate cosine similarity using numerical vectors
            num_sim = cosine_similarity(num_vec1, num_vec2)[0][0]

            # Take average of both similarity scores and add to list of similarity scores
            sim = (text_sim + num_sim)/2
            sim_scores.append(sim)
        
        # Add new column containing similarity scores to song_library DataFrame
        song_library['similarity'] = sim_scores

        # Sort DataFrame based on "similarity" column
        song_library.sort_values(by=['similarity', 'popularity', 'year'], ascending=[False, False, False], inplace=True)

        # Create DataFrame "recommended_songs" containing 5 songs that are most similar to the given song and return this DataFrame
        recommended_songs = song_library[['name', 'artists', 'year']][2:7]
        return recommended_songs
    except:
        # If given song is not found in song library then display message
        print('{} not found in songs library.'.format(song_name))

In [26]:
song_recommender('Dakiti')

Unnamed: 0,name,artists,year
408,La Santa,Bad Bunny,2020
409,La Santa,Daddy Yankee,2020
3630,Mayores,Bad Bunny,2019
3631,Mayores,Becky G,2019
3759,Estamos Bien,Bad Bunny,2018
