# Reading the files 

We are going to use the Million Song Dataset, a freely-available collection of audio features and metadata for a million contemporary popular music tracks.

There are two files that are of interest to us. The first gives us information about listening activity: it contains the user ID, the song ID and the listen count. The second contains the song ID, the title of the song, the release, the artist name and the year. We need to merge these two DataFrames, and we'll use `song_id` as the key to do so.


# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix

ImportError: No module named pandas

In [2]:
#song_info = pd.read_csv('https://static.turi.com/datasets/millionsong/10000.txt',sep='\t',header=None)
#song_info.columns = ['user_id', 'song_id', 'listen_count']

#Read song  metadata
#song_actual =  pd.read_csv('https://static.turi.com/datasets/millionsong/song_data.csv')
#song_actual.drop_duplicates(['song_id'], inplace=True)

#Merge the two dataframes above to create input dataframe for recommender systems
#songs = pd.merge(song_info, song_actual, on="song_id", how="left")
from knn_recommender.recommender import Recommender

ImportError: No module named knn_recommender.recommender

In [None]:
#songs.to_csv('songs.csv',index=False)

In [None]:
# Load the listen records from the local songs.csv file.
# NOTE(review): presumably this is the merged table written by the
# commented-out `songs.to_csv('songs.csv', ...)` cell above - confirm.
df_songs=pd.read_csv('songs.csv')

In [None]:
# Preview the first rows of the loaded table.
df_songs.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df_songs.shape

In [None]:
# Report how many rows (observations) the table holds.
print(f'There are {len(df_songs)} observations in the dataset')

# Data cleaning: no missing values can be seen

In [None]:
# Count the missing entries in every column.
df_songs.isna().sum()

In [None]:
# Data type of each column.
df_songs.dtypes

In [None]:
# Number of distinct song titles (counted by `title`, not by `song_id`).
unique_songs = len(df_songs['title'].unique())
unique_songs

In [None]:
# Number of distinct artist names.
unique_artist = len(df_songs['artist_name'].unique())
unique_artist

In [None]:
# Number of distinct users.
unique_users = len(df_songs['user_id'].unique())
unique_users

In [None]:
# Rank titles by popularity. `count()` tallies the rows with a non-null
# listen_count per title, so the 'listen_count' column of the result holds the
# number of listen records, not the summed play count.
# Ties are broken alphabetically by title.
ten_pop_songs = (
    df_songs.groupby('title')['listen_count']
    .count()
    .reset_index()
    .sort_values(['listen_count', 'title'], ascending=[False, True])
)
# Each title's share of all listen records, as a percentage with two decimals.
ten_pop_songs['percentage'] = (
    ten_pop_songs['listen_count'] / ten_pop_songs['listen_count'].sum() * 100
).round(2)

In [None]:
# Keep only the ten highest-ranked titles.
ten_pop_songs = ten_pop_songs.head(10)
ten_pop_songs

In [None]:
# Plain Python lists for plotting: record counts and their matching titles.
counts = ten_pop_songs['listen_count'].to_list()
labels = ten_pop_songs['title'].to_list()

In [None]:
# Opens a new, empty figure; nothing is drawn on it in this cell.
# NOTE(review): the following cell opens its own figure, so this call looks
# redundant - confirm before removing.
plt.figure()

In [None]:
# Horizontal bar chart of the top titles: bar length is the number of listen
# records, one bar per title.
plt.figure(figsize=(10,7))
sns.barplot(x=counts,y=labels,palette='bright')
# Hide the left and bottom axis spines.
sns.despine(left=True,bottom=True)
# Alternative plain-matplotlib rendering, kept for reference:
#plt.bar(labels,counts,color='maroon',width=0.4)

In [None]:
# Ten most popular artists, ranked by the number of listen records (rows with a
# non-null listen_count) per artist; ties are broken alphabetically by name.
ten_pop_artist = (
    df_songs.groupby('artist_name')['listen_count']
    .count()
    .reset_index()
    .sort_values(['listen_count', 'artist_name'], ascending=[False, True])
    .head(10)
)
ten_pop_artist

In [None]:
# Plain Python lists for plotting: artist names and their matching record counts.
labels = ten_pop_artist['artist_name'].to_list()
counts = ten_pop_artist['listen_count'].to_list()

In [None]:
# Horizontal bar chart of the top artists: bar length is the number of listen
# records, one bar per artist.
plt.figure(figsize=(15,9))
sns.barplot(x=counts,y=labels,palette='dark')
# Hide the left and bottom axis spines.
sns.despine(left=True,bottom=True)

# Listen count by user


Other information that can be derived from the listen count feature is the maximum number of times a user has listened to the same song and, on average, how many times a user listens to the same song.

In [None]:
# Maximum number of times one user has listened to one song.
# Frequency table: how many records share each listen_count value. It is kept
# under its original name in case other cells use it.
listen_counts = df_songs.groupby('listen_count').size().to_frame('count')
# Read the maximum directly from the column instead of relying on the sorted
# group order of the frequency table and taking its last entry.
df_songs['listen_count'].max()

In [None]:
# Mean listen_count over all records.
df_songs['listen_count'].mean()

In [None]:
# Average number of times a user listens to the same song, rounded to the
# nearest integer for display.
print(f"on an average a user listens to {round(df_songs['listen_count'].mean())} times")

In [None]:
# Distribution of listen_count, drawn as a horizontal box plot on a wide figure.
plt.figure(figsize=(20,6))
# Alternative plain-matplotlib rendering, kept for reference:
#plt.boxplot(df_songs['listen_count'])
sns.boxplot(x='listen_count',data=df_songs)
# Hide the top and right axis spines (seaborn's defaults for despine).
sns.despine()

# Songs User listens on an average

In [None]:
# Per-user tally of listen records (rows with a non-null song_id), indexed by
# user_id.
song_user = df_songs.groupby('user_id')['song_id'].count()
print(f"A user listens to an average of {song_user.mean()} songs")

In [None]:
# Median, minimum and maximum number of songs per user.
print(f"{np.median(song_user)} songs, with minimum {np.min(song_user)} and maximum {np.max(song_user)} songs")

We can see that a user listens to 26 songs on average. Even the maximum number of songs listened to by a single user is only 711, while we have 9567 songs in our dataset.

So not every user listens to every song, and most of the values in the songs x users matrix are going to be zero. Thus, we’ll be dealing with extremely sparse data.

In [None]:
# Number of cells a full users x songs matrix would have, i.e. the number of
# records there would be if every user had listened to every song.
# NOTE(review): unique_songs counts distinct titles, while the feature matrix
# built later is indexed by song_id - confirm that the two counts agree.
values_matrix = unique_users * unique_songs
values_matrix

In [None]:
# Subtract the actual number of rows in the songs DataFrame from the size of
# the full matrix; what remains is the number of empty (zero) cells.
zero_values_matrix = values_matrix - len(df_songs)
zero_values_matrix

In [None]:
# Report how many cells of the users x songs matrix would be zero.
print(f"The matrix of users x songs has {zero_values_matrix} values that are zero")

Working with such a sparse matrix takes a lot of memory and resources, so we will keep only those users who have listened to more than 16 songs.

# Preparing the data to be used in the model

In [None]:
# Get the ids of users who have listened to more than 16 songs. The comparison
# is strict, so users with exactly 16 songs are excluded.
# NOTE(review): the name says "ten" but the threshold used is 16.
song_ten_id = song_user[song_user > 16].index.to_list()

In [None]:
# Restrict the dataset to records of the selected users (more than 16 songs
# each) and renumber the rows from zero.
df_song_id_more_ten = df_songs.loc[df_songs['user_id'].isin(song_ten_id)].reset_index(drop=True)
df_song_id_more_ten.head()

We now need to work with a SciPy sparse matrix to avoid overflow and wasted memory. For that purpose, we'll use the csr_matrix class from scipy.sparse.

In [None]:
# Convert the dataframe into a pivot table: one row per song_id, one column per
# user_id, cell value = listen_count; missing (song, user) pairs become 0.
# NOTE(review): DataFrame.pivot raises ValueError if a (song_id, user_id) pair
# occurs more than once - confirm the data has no duplicate pairs.
# NOTE(review): this table is dense, so the memory saving of the sparse format
# only applies after the conversion below.
df_songs_features = df_song_id_more_ten.pivot(index='song_id', columns='user_id', values='listen_count').fillna(0)

# Obtain a sparse (CSR) matrix from the table's values; its rows follow the
# order of df_songs_features.index.
mat_songs_features = csr_matrix(df_songs_features.values)

In [None]:
# Take a look at the song x user table (rows are song_ids, columns are user_ids).
df_songs_features.head()

Because the system works with song indices instead of titles, we'll build a dictionary that links each song title to its index.

In [None]:
# Lookup table with one row per song_id (first occurrence kept) and its title.
df_unique_songs = (
    df_songs[['song_id', 'title']]
    .drop_duplicates(subset=['song_id'])
    .reset_index(drop=True)
)

In [None]:
# Map each song title to its row position in df_songs_features (and therefore
# in mat_songs_features). Titles are looked up in the same order as the pivot
# table's song_id index; if two song_ids share a title, the later position wins.
decode_id_song = {
    title: row
    for row, title in enumerate(
        df_unique_songs.set_index('song_id')['title'].loc[df_songs_features.index]
    )
}

# Models And Recommendations

In [None]:
# Build the recommender on the sparse song x user matrix, handing it the
# title -> row-index mapping.
# NOTE(review): presumably a k-nearest-neighbours model (k=20, cosine distance,
# brute-force search) - confirm against knn_recommender.recommender.
model = Recommender(metric='cosine',algorithm='brute', k=20, data=mat_songs_features, decode_id_song=decode_id_song)

In [None]:
# Title of the song to request recommendations for.
song = 'I believe in miracles'

In [None]:
# Ask the model for 10 recommendations for the chosen title.
# NOTE(review): the return type is defined by Recommender.make_recommendation,
# which is not visible here.
new_recommendations = model.make_recommendation(new_song=song, n_recommendations=10)

In [None]:
# Show the query title followed by the recommendations returned for it.
print(f"The recommendations for {song} are:")
print(f"{new_recommendations}")