In [1]:
import warnings
# Silence FutureWarning explicitly, then install a blanket 'ignore' filter for every
# other warning category as well.
warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; installed below in place of ``warnings.warn``."""
    pass

# Rebinding warnings.warn is process-wide: from here on, every call that goes through
# warnings.warn(...) is a no-op, regardless of the filters configured above.
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
import math as mt
from scipy.sparse.linalg import * #used for matrix multiplication
from scipy.sparse.linalg import svds
from scipy.sparse import csc_matrix
from scipy.stats import skew, norm, probplot
import seaborn as sns
# Notebook-wide seaborn defaults: enlarged fonts and color-code support.
sns.set(style="ticks", color_codes=True, font_scale=1.5)
# Keep a handle on the active palette for later plotting cells.
color = sns.color_palette()
# Called after sns.set(style="ticks"), so 'darkgrid' is the last style applied.
sns.set_style('darkgrid')

In [2]:
class popularity_recommender_py():
    """Popularity-based recommender: every user receives the same top items.

    An item's score is the number of non-null ``user_id`` entries recorded for
    it in the training data. Items are ordered by descending score, ties broken
    by ascending item value, and ranked 1.0, 2.0, ... in that order.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    def create(self, train_data, user_id, item_id, n=10):
        """Score every item and store the ``n`` most popular ones.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction table holding at least the two named columns.
        user_id : str
            Name of the column identifying users.
        item_id : str
            Name of the column identifying items.
        n : int, optional
            Number of top items to keep (default 10).
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Count user entries per item. The counted column is renamed through
        # the *configured* user column name, so the model works whatever that
        # column is called (a literal 'user_id' only worked for that one name).
        scored = (
            train_data
            .groupby([item_id])
            .agg({user_id: 'count'})
            .reset_index()
            .rename(columns={user_id: 'score'})
        )

        # Most listened first; ties resolved by ascending item value
        ranked = scored.sort_values(['score', item_id], ascending=[False, True])

        # method='first' keeps ranks unique: tied scores are ranked in row order
        ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')

        # Store an independent frame rather than a view of the sorted table
        self.popularity_recommendations = ranked.head(n).copy()

    def recommend(self, user_id):
        """Return the stored top items with a leading ``user_id`` column.

        The stored recommendations are copied first, so repeated calls never
        modify the model's state.

        Parameters
        ----------
        user_id : object
            Value written into every row of the ``user_id`` column.

        Returns
        -------
        pandas.DataFrame
            Columns: ``user_id``, the item column, ``score``, ``Rank``.
        """
        user_recommendations = self.popularity_recommendations.copy()

        # Tag every row with the user the recommendations are generated for
        user_recommendations['user_id'] = user_id

        # Bring the user_id column to the front, keeping the others in order
        ordered_cols = ['user_id'] + [
            col for col in user_recommendations.columns if col != 'user_id'
        ]
        return user_recommendations[ordered_cols]
    

#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Item-similarity recommender based on the Jaccard overlap of listeners.

    Two items are similar when largely the same users interacted with both:
    ``similarity = |users(a) & users(b)| / |users(a) | users(b)|``. A candidate
    item's score for a user is the mean of its similarity to each item the
    user already has.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    def get_user_items(self, user):
        """Return the list of unique items found in the training rows of ``user``."""
        user_rows = self.train_data[self.train_data[self.user_id] == user]
        return list(user_rows[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users found in the training rows of ``item``."""
        item_rows = self.train_data[self.train_data[self.item_id] == item]
        return set(item_rows[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return the list of unique items present in the training data."""
        return list(self.train_data[self.item_id].unique())

    def _build_item_users_index(self):
        """Map every item to its set of unique users in one pass over the data."""
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id]
        return {item: set(users.unique()) for item, users in grouped}

    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Build the Jaccard similarity matrix between two item lists.

        Parameters
        ----------
        user_songs : list
            Items to compare from (one matrix row each).
        all_songs : list
            Items to compare against (one matrix column each).

        Returns
        -------
        numpy.matrix
            Shape ``(len(user_songs), len(all_songs))``; entry ``[j, i]`` is
            the Jaccard index between ``user_songs[j]`` and ``all_songs[i]``.
            Items with no users in the training data get similarity 0.
        """
        # One grouped pass replaces a full-table boolean filter per song
        item_users = self._build_item_users_index()
        user_songs_users = [item_users.get(song, set()) for song in user_songs]

        scores = np.zeros((len(user_songs), len(all_songs)), dtype=float)

        for i, song in enumerate(all_songs):
            users_i = item_users.get(song)
            if not users_i:
                # No listeners: intersection with anything is empty, score stays 0
                continue

            for j, users_j in enumerate(user_songs_users):
                n_common = len(users_i & users_j)
                if n_common:
                    # |A | B| = |A| + |B| - |A & B|, avoids building the union set
                    n_union = len(users_i) + len(users_j) - n_common
                    scores[j, i] = float(n_common) / float(n_union)

        # Returned as np.matrix to keep the type existing callers receive
        return np.matrix(scores)

    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs, top_n=10):
        """Turn a similarity matrix into a ranked recommendation table.

        Parameters
        ----------
        user : object
            Value written into the ``user_id`` column of the result.
        cooccurence_matrix : numpy.matrix or 2-D array-like
            Similarities with one row per entry of ``user_songs`` and one
            column per entry of ``all_songs``.
        all_songs : list
            Candidate items, aligned with the matrix columns.
        user_songs : list
            Items already known for the user; never recommended back.
        top_n : int, optional
            Maximum number of recommendations returned (default 10).

        Returns
        -------
        pandas.DataFrame or int
            Frame with columns ``user_id``, ``song``, ``score``, ``rank``, or
            ``-1`` when nothing can be recommended.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        scores_2d = np.asarray(cooccurence_matrix, dtype=float)
        records = []

        # With no rows (user has no songs) there is nothing to average; skipping
        # the computation avoids a 0/0 division that would only produce NaNs.
        if scores_2d.shape[0] > 0:
            # Mean similarity of every candidate to the user's songs
            user_sim_scores = (scores_2d.sum(axis=0) / float(scores_2d.shape[0])).tolist()

            # Highest score first; equal scores ordered by descending column index
            ranked = sorted(((score, idx) for idx, score in enumerate(user_sim_scores)), reverse=True)

            for score, idx in ranked:
                if len(records) >= top_n:
                    break
                song = all_songs[idx]
                if np.isnan(score) or song in user_songs:
                    continue
                records.append([user, song, score, len(records) + 1])

        # For the case where there are no recommendations
        if not records:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1

        return pd.DataFrame(records, columns=['user_id', 'song', 'score', 'rank'])

    def create(self, train_data, user_id, item_id):
        """Attach the training data and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def recommend(self, user, top_n=10):
        """Recommend up to ``top_n`` items for ``user`` based on the items they have.

        Returns the table produced by ``generate_top_recommendations`` (or
        ``-1`` when the user has no usable history).
        """
        # Get all unique songs for this user
        user_songs = self.get_user_items(user)

        print("No. of unique songs for the user: %d" % len(user_songs))

        # Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        # Similarity of every training song to each of the user's songs
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs, top_n)

    def get_similar_items(self, item_list, top_n=10):
        """Return up to ``top_n`` items most similar to those in ``item_list``.

        Works like ``recommend`` but starts from an explicit item list; the
        ``user_id`` column of the result holds an empty string.
        """
        user_songs = item_list

        # Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        # Similarity of every training song to each of the given items
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        user = ""
        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs, top_n)
    
    

In [3]:
# Song metadata table (read with pandas' default CSV settings, header row included).
track_metadata_df = pd.read_csv('./Dataset/song_data.csv')
# Play counts: tab-separated file without a header row, so column names are supplied here.
count_play_df = pd.read_csv('./Dataset/10000.txt', sep='\t', header=None, names=['user','song','play_count'])

# Compare the row count with the number of distinct song ids to expose repeated songs.
print('First see of track metadata:')
print('Number of rows:', track_metadata_df.shape[0])
print('Number of unique songs:', len(track_metadata_df.song_id.unique()))
display(track_metadata_df.head())
print('Note the problem with repeated track metadata. Let\'s see of counts play song by users:')
# Shape and first rows of the play-count table.
display(count_play_df.shape, count_play_df.head())

First see of track metadata:
Number of rows: 1000000
Number of unique songs: 999056


Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


Note the problem with repeated track metadata. Let's see of counts play song by users:


(2000000, 3)

Unnamed: 0,user,song,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [4]:
# Collapse repeated song_id rows into a single row per song. groupby(...).max()
# takes the maximum of every remaining column separately within each song_id group.
# NOTE(review): because each column is maximised on its own, a collapsed row can mix
# values coming from different duplicate rows (e.g. title from one, artist_name from
# another) - confirm this is acceptable for the duplicated songs.
unique_track_metadata_df = track_metadata_df.groupby('song_id').max().reset_index()

# After the collapse the row count and the number of distinct ids should match.
print('Number of rows after unique song Id treatment:', unique_track_metadata_df.shape[0])
print('Number of unique songs:', len(unique_track_metadata_df.song_id.unique()))
display(unique_track_metadata_df.head())

Number of rows after unique song Id treatment: 999056
Number of unique songs: 999056


Unnamed: 0,song_id,title,release,artist_name,year
0,SOAAABI12A8C13615F,Afro Jazziac,To Birdland And Hurry,Herbie Mann,2000
1,SOAAABT12AC46860F0,Herre Gud Ditt Dyre Namn Og Ære,Som Den Gyldne Sol Frembryter,Bergen Big Band,0
2,SOAAABX12A8C13FEB2,N.Y.C. Remix,Paris Can´t Wait,Guardner,0
3,SOAAACR12A58A79456,Irresistible,Wowie Zowie,Superchumbo,2002
4,SOAAACY12A58A79663,Untitled 1,Pine Cone Temples,Thuja,0


In [5]:
# Attach song metadata to every play-count row (left join keeps all play counts),
# expose the counter as 'listen_count' and drop the join key duplicated from the
# metadata side.
user_song_list_count = (
    pd.merge(
        count_play_df,
        unique_track_metadata_df,
        how='left',
        left_on='song',
        right_on='song_id',
    )
    .rename(columns={'play_count': 'listen_count'})
    .drop(columns='song_id')
)

In [9]:
def create_popularity_recommendation(train_data, user_id, item_id, n=10):
    """Return the ``n`` most popular items of ``train_data``.

    Popularity ('score') is the count of ``user_id`` entries per ``item_id``
    value. Rows are ordered by descending score, ties by ascending item value,
    and receive a unique 'Rank' (1.0, 2.0, ...) in that order.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Table holding at least the ``user_id`` and ``item_id`` columns.
    user_id : str
        Name of the column that is counted.
    item_id : str
        Name of the column that is grouped on.
    n : int, optional
        Number of rows to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns: ``item_id``, 'score', 'Rank'.
    """
    # Per-item count of user entries, exposed under the name 'score'
    listener_counts = (
        train_data
        .groupby([item_id])
        .agg({user_id: 'count'})
        .reset_index()
        .rename(columns={user_id: 'score'})
    )

    # Most popular first; equal scores fall back to ascending item order
    by_popularity = listener_counts.sort_values(
        by=['score', item_id], ascending=[False, True]
    )

    # method='first' breaks score ties by row order, so ranks stay unique
    by_popularity = by_popularity.assign(
        Rank=by_popularity['score'].rank(ascending=False, method='first')
    )

    return by_popularity.head(n)

In [10]:
# Rank song titles by the number of 'user' entries per title and request the top 15.
recommendations = create_popularity_recommendation(user_song_list_count,'user','title', 15)
display(recommendations)

Unnamed: 0,title,score,Rank
6837,Sehr kosmisch,8277,1.0
8726,Undo,7032,2.0
1965,Dog Days Are Over (Radio Edit),6949,3.0
9497,You're The One,6729,4.0
6499,Revelry,6145,5.0
6826,Secrets,5841,6.0
3438,Horn Concerto No. 4 in E flat K495: II. Romanc...,5385,7.0
2596,Fireflies,4795,8.0
3323,Hey_ Soul Sister,4758,9.0
8495,Tive Sim,4548,10.0


In [8]:
# Same ranking, grouped by artist instead of song title (top 10).
display(create_popularity_recommendation(user_song_list_count,'user','artist_name', 10)) 

Unnamed: 0,artist_name,score,Rank
649,Coldplay,29422,1.0
2850,The Black Keys,19862,2.0
1651,Kings Of Leon,18747,3.0
1107,Florence + The Machine,18112,4.0
1370,Jack Johnson,17801,5.0
2946,The Killers,16063,6.0
2374,Radiohead,14890,7.0
736,Daft Punk,14715,8.0
2073,Muse,14005,9.0
1554,Justin Bieber,13959,10.0


In [9]:
# Total number of plays in the whole dataset (vectorised Series.sum instead of the
# Python builtin sum, which iterates element by element).
total_play_count = user_song_list_count.listen_count.sum()

# Plays per song, keeping the 5,000 most played songs.
play_count = user_song_list_count[['song', 'listen_count']].groupby('song').sum().\
             sort_values(by='listen_count',ascending=False).head(5000)

# Share of all plays covered by that subset. The scalar is taken from the
# 'listen_count' column directly rather than calling float() on a one-element Series.
print('5,000 most popular songs represents {:3.2%} of total listen.'.format(
    play_count['listen_count'].sum() / total_play_count))

# play_count already holds only the top songs, so its index is the subset.
song_subset = list(play_count.index)

# Rows that belong to a song of the subset (mask computed once and reused).
in_song_subset = user_song_list_count.song.isin(song_subset)
user_subset = list(user_song_list_count.loc[in_song_subset, 'user'].unique())
user_song_list_count_sub = user_song_list_count[in_song_subset]
display(user_song_list_count_sub.head())

5,000 most popular songs represents 81.88% of total listen.


Unnamed: 0,user,song,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [11]:
# Item-similarity model over the popular-song subset, using song titles as the items.
is_model = item_similarity_recommender_py()
is_model.create(user_song_list_count_sub, 'user', 'title')
# Example user: the user of the row at position 7 of the subset.
user_id = list(user_song_list_count_sub.user)[7]
# Unique titles this user has in the subset.
user_items = is_model.get_user_items(user_id)

#Recommend songs for the user using personalized model
is_model.recommend(user_id)

No. of unique songs for the user: 33
no. of unique songs in the training set: 4867
Non zero values in cooccurence_matrix :120655


Unnamed: 0,user_id,song,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Quiet Houses,0.04471,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Meadowlarks,0.043836,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Heard Them Stirring,0.04274,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Tiger Mountain Peasant Song,0.041485,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sun It Rises,0.040973,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Your Protector,0.039942,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Oliver James,0.039287,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Great Indoors,0.036765,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,White Winter Hymnal,0.036345,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,If I Could,0.034576,10


In [12]:
import pickle



In [13]:
# NOTE: this is a freshly constructed instance - create() has not been called on it,
# so the pickled object carries no training data or recommendations (all attributes None).
popularity_model = popularity_recommender_py()  # Assuming an instance of the popularity_recommender_py class
# Binary mode is required for pickle; the with-block closes the file after the dump.
with open('music_rec/assets/popularity_recommender.pkl', 'wb') as file:
    pickle.dump(popularity_model, file)

In [14]:
# NOTE: this is a freshly constructed instance - create() has not been called on it,
# so the pickled object carries no training data (all attributes None).
item_similarity_model = item_similarity_recommender_py()  # Assuming an instance of the item_similarity_recommender_py class
# Binary mode is required for pickle; the with-block closes the file after the dump.
with open('music_rec/assets/item_similarity_recommender.pkl', 'wb') as file:
    pickle.dump(item_similarity_model, file)


In [15]:
import joblib

In [16]:

# Persist a new, untrained popularity model (create() is not called before the dump).
popularity_model = popularity_recommender_py() 
# joblib.dump returns the list of file names it wrote, shown as the cell output.
joblib.dump(popularity_model, 'music_rec/assets/popularity_model.joblib')

['music_rec/assets/popularity_model.joblib']

In [17]:
import joblib

# Load the popularity model back from the location it was dumped to
# ('music_rec/assets', the directory every other cell writes to and reads from).
# NOTE: joblib.load unpickles the file - only load files from a trusted source.
popularity_model = joblib.load('music_rec/assets/popularity_model.joblib')

# Persist a new, untrained item-similarity model in the same directory, which is
# where the later loading cell expects to find it.
item_similarity_model = item_similarity_recommender_py()
joblib.dump(item_similarity_model, 'music_rec/assets/item_similarity_model.joblib')

['music_rec/assets/item_similarity_model.joblib']

In [75]:
import pandas as pd
import joblib

# Load the dumped model
# NOTE: joblib.load unpickles the file - only load files from a trusted source.
item_similarity_model = joblib.load('music_rec/assets/item_similarity_model.joblib')

# Load the dataset.
# NOTE: this rebinds the notebook-level names track_metadata_df and count_play_df with
# column names that differ from the first loading cell: the metadata id column becomes
# 'song', and the play-count id column becomes 'song_id'.
track_metadata_df = pd.read_csv('./Dataset/song_data.csv')
# Rename 'song_id' to 'song'; the 'title': 'title' entry maps the column to itself and
# therefore leaves it unchanged.
track_metadata_df = track_metadata_df.rename(columns={'song_id': 'song', 'title': 'title'})

# Tab-separated play counts without a header row; column names are supplied here.
count_play_df = pd.read_csv('./Dataset/10000.txt', sep='\t', header=None, names=['user', 'song_id', 'play_count'])


# track_metadata_df = pd.read_csv('./Dataset/song_data.csv')
# #track_metadata_df = track_metadata_df.rename(columns={'song_id': 'song', 'title': 'title'})
# count_play_df = pd.read_csv('./Dataset/10000.txt', sep='\t', header=None, names=['user','song','play_count'])


# Function to get recommendations for a single user
def get_recommendations_for_user(user_id):
    """Return item-similarity recommendations for one user, with song details.

    Relies on the notebook-level ``item_similarity_model``, ``count_play_df``
    (columns 'user', 'song_id', 'play_count') and ``track_metadata_df``
    (song id stored in column 'song').

    Parameters
    ----------
    user_id : str
        Identifier of the user, as found in ``count_play_df['user']``.

    Returns
    -------
    pandas.DataFrame
        Columns 'score', 'rank', 'title', 'release', 'artist_name', 'year',
        one row per recommended song. Empty (same columns) when the model
        cannot recommend anything for this user.
    """
    result_columns = ['score', 'rank', 'title', 'release', 'artist_name', 'year']

    # Point the model at the play-count table and score songs for the user
    item_similarity_model.create(count_play_df, 'user', 'song_id')
    recommendations = item_similarity_model.recommend(user_id)

    # recommend() returns -1 instead of a DataFrame when nothing can be
    # recommended; merging that value would raise, so return an empty table.
    if not isinstance(recommendations, pd.DataFrame):
        return pd.DataFrame(columns=result_columns)

    # Metadata of the recommended songs only. The metadata table contains repeated
    # song ids, so keep a single row per song - otherwise the join below would
    # emit the same recommendation more than once.
    is_recommended = track_metadata_df['song'].isin(recommendations['song'])
    song_details = (
        track_metadata_df.loc[is_recommended, ['song', 'title', 'release', 'artist_name', 'year']]
        .drop_duplicates(subset='song')
    )

    # Map song IDs to song details; the left join keeps every recommendation in rank order
    recommendations_with_titles = pd.merge(recommendations, song_details, on='song', how='left')

    return recommendations_with_titles[result_columns].reset_index(drop=True)

# Specify the user ID for testing
# NOTE: this rebinds the notebook-level name user_id, which other cells also read.
user_id = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'

# Get recommendations for the user
recommended_songs = get_recommendations_for_user(user_id)

# Display the recommended songs as plain text, without the index column
print("Recommended Songs:")
print(recommended_songs.to_string(index=False))


No. of unique songs for the user: 45
no. of unique songs in the training set: 10000
Non zero values in cooccurence_matrix :266224
Recommended Songs:
   score  rank                       title           release  artist_name  year
0.034138     1                Quiet Houses       Fleet Foxes  Fleet Foxes  2008
0.033386     2                 Meadowlarks       Fleet Foxes  Fleet Foxes  2008
0.032704     3         Heard Them Stirring       Fleet Foxes  Fleet Foxes  2008
0.032118     4               Great Indoors  Room For Squares   John Mayer     0
0.031717     5 Tiger Mountain Peasant Song       Fleet Foxes  Fleet Foxes  2008
0.031205     6                Sun It Rises       Fleet Foxes  Fleet Foxes  2008
0.030370     7              Your Protector       Fleet Foxes  Fleet Foxes  2008
0.030211     8                Oliver James       Fleet Foxes  Fleet Foxes  2008
0.028503     9                       Belle In Between Dreams Jack Johnson  2005
0.028141    10                  If I Could In Betwe

In [20]:
# Songs of the popular subset that the current user played more than once.
# NOTE(review): user_id is whatever an earlier executed cell last assigned - confirm
# the intended user when cells are run out of order.
display(user_song_list_count_sub[(user_song_list_count_sub.user==user_id) & (user_song_list_count_sub.listen_count>1)])

Unnamed: 0,user,song,listen_count,title,release,artist_name,year
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
11,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOIZAZL12A6701C53B,5,I'll Be Missing You (Featuring Faith Evans & 1...,No Way Out,Puff Daddy,0
14,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOKRIMP12A6D4F5DA3,5,I?'m A Steady Rollin? Man,Diggin' Deeper Volume 7,Robert Johnson,0
16,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOMGIYR12AB0187973,6,Behind The Sea [Live In Chicago],Live In Chicago,Panic At The Disco,0
43,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOYHEPA12A8C13097F,8,Moonshine,Thicker Than Water,Jack Johnson,2003
