In [2]:
import pandas 
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.sparse

In [3]:
class popularity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None
        
    #Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id):
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        #Get a count of user_ids for each unique song as recommendation score
        train_data_grouped = train_data.groupby([self.item_id]).agg({self.user_id: 'count'}).reset_index()
        train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
    
        #Sort the songs based upon recommendation score
        train_data_sort = train_data_grouped.sort_values(['score', self.item_id], ascending = [0,1])
    
        #Generate a recommendation rank based upon score
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')
        
        #Get the top 10 recommendations
        self.popularity_recommendations = train_data_sort.head(10)

    #Use the popularity based recommender system model to
    #make recommendations
    def recommend(self, user_id):    
        user_recommendations = self.popularity_recommendations
        
        #Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id
    
        #Bring user_id column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_recommendations = user_recommendations[cols]
        
        return user_recommendations
    

#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None
        
    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())
        
        return user_items
        
    #Get unique users for a given item (song)
    def get_item_users(self, item):
        item_data = self.train_data[self.train_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())
            
        return item_users
        
    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        all_items = list(self.train_data[self.item_id].unique())
            
        return all_items
        
    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
            
        ####################################
        #Get users for all songs in user_songs.
        ####################################
        user_songs_users = []        
        for i in range(0, len(user_songs)):
            user_songs_users.append(self.get_item_users(user_songs[i]))
            
        ###############################################
        #Initialize the item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
           
        #############################################################
        #Calculate similarity between user songs and all unique songs
        #in the training data
        #############################################################
        for i in range(0,len(all_songs)):
            #Calculate unique listeners (users) of song (item) i
            songs_i_data = self.train_data[self.train_data[self.item_id] == all_songs[i]]
            users_i = set(songs_i_data[self.user_id].unique())
            
            for j in range(0,len(user_songs)):       
                    
                #Get unique listeners (users) of song (item) j
                users_j = user_songs_users[j]
                    
                #Calculate intersection of listeners of songs i and j
                users_intersection = users_i.intersection(users_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(users_intersection) != 0:
                    #Calculate union of listeners of songs i and j
                    users_union = users_i.union(users_j)
                    
                    cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                else:
                    cooccurence_matrix[j,i] = 0
                    
        
        return cooccurence_matrix

    
    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
        #Calculate a weighted average of the scores in cooccurence matrix for all user songs.
        user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()
 
        #Sort the indices of user_sim_scores based upon their value
        #Also maintain the corresponding score
        sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
        #Create a dataframe from the following
        columns = ['user_id', 'song', 'score', 'rank']
        #index = np.arange(1) # array of numbers for the number of samples
        df = pandas.DataFrame(columns=columns)
         
        #Fill the dataframe with top 10 item based recommendations
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
                df.loc[len(df)]=[user,all_songs[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df
 
    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self,user):
        
        ########################################
        #A. Get all unique songs for this user
        ########################################
        user_songs = usersongs    
            
        print("No. of unique songs for the user: %d" % len(user_songs))
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_songs = self.get_all_items_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
                
        return df_recommendations
    
    #Get similar items to given items
    def get_similar_items(self, item_list):
        
        user_songs = item_list
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_songs = self.get_all_items_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
         
        return df_recommendations











In [4]:
DfSong1=pandas.read_table("/content/drive/MyDrive/ChatBot/10000.txt",header=None)
DfSong1.columns=['user_id','song_id','listen_count']
print(len(DfSong1))
print(DfSong1.head())
DfSong1=DfSong1.iloc[:100000]
print(len(DfSong1))
print(DfSong1.head())


DfSong2= pandas.read_csv("/content/drive/MyDrive/ChatBot/song_data.csv")
DfSong=pandas.merge(DfSong1, DfSong2.drop_duplicates(['song_id']), on="song_id", how="left")

print(DfSong.head())

2000000
                                    user_id             song_id  listen_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1
100000
                                    user_id             song_id  listen_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1
                                    user_id      

In [5]:
GroupSong = DfSong.groupby(['title']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = GroupSong['listen_count'].sum()
GroupSong['percentage']  = GroupSong['listen_count'].div(grouped_sum)*100
GroupSong.sort_values(['listen_count', 'title'], ascending = [0,1])

users = DfSong['user_id'].unique()
print(len(users)) ## return 365 unique users
songs = DfSong['title'].unique()
print(len(songs)) ## return 5151 unique songs

3863
9507


In [6]:
train_data, test_data = train_test_split(DfSong, test_size = 0.20, random_state=0)

pm = popularity_recommender_py()
pm.create(train_data, 'user_id', 'title')#user the popularity model to make some prediction
user_id = users[5]
pm.recommend(user_id)

is_model = item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'title')

#Print the songs for the user in training data
user_id = users[5]
user_items = is_model.get_user_items(user_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

for user_item in user_items:
    print(user_item)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model


------------------------------------------------------------------------------------
Training data songs for the user userid: 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
The Real Slim Shady
16 Candles
Ghosts 'n' Stuff (Original Instrumental Mix)
Forgive Me
Just Lose It
Missing You
Without Me
Push It
Say My Name
My Dad's Gone Crazy
Speechless
Somebody To Love
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------


In [7]:
is_model.recommend(user_id)

NameError: ignored

In [8]:
def InData(music):
  listeTitle=DfSong2['title']
  ret = False
  print(listeTitle[0]==DfSong2['title'][0])
  theId=""
  for i in range(len(listeTitle)):
    if(music ==listeTitle[i] and ret==False):
      ret =True
      theId=DfSong2['song_id'][i]
  print(theId)
  print(len(DfSong2))
  for j in range(len(DfSong1)):
    #print(DfSong1['song_id'][j])
    if theId==DfSong1['song_id'][j]:
      print("c'est la fete")
  return ret

In [9]:
print(InData("Learn To Fly"))

True
SOACPCX12AB0186A0A
1000000
True


In [79]:
print(DfSong2[0:150])

                song_id                   title  \
0    SOQMMHC12AB0180CB8            Silent Night   
1    SOVFVAK12A8C1350D9             Tanssi vaan   
2    SOGTUKN12AB017F4F1       No One Could Ever   
3    SOBNYVR12A8C13558C           Si Vos Querés   
4    SOHSBXH12A8C13B0DF        Tangle Of Aspens   
..                  ...                     ...   
145  SOGWEOB12AB018A4D0   I Say A Little Prayer   
146  SOIHJKI12A8C13D474            Dawning Star   
147  SOFMVOA12AB018240A           Irreplaceable   
148  SOCELXR12AB017EB1B              AcroyearII   
149  SOBAHCX12AB0183F7A  And Then You Went Away   

                                               release         artist_name  \
0                                Monster Ballads X-Mas    Faster Pussy cat   
1                                          Karkuteillä    Karkkiautomaatti   
2                                               Butter      Hudson Mohawke   
3                                              De Culo         Yerba Brava 

In [10]:
train_data, test_data = train_test_split(DfSong, test_size = 0.20, random_state=0)

pm = popularity_recommender_py()
pm.create(train_data, 'user_id', 'title')#user the popularity model to make some prediction
user_id = users[5]
pm.recommend(user_id)

is_model = item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'title')

#Print the songs for the user in training data
user_id = "Utilisateur"
test="Rockin' In The Free World"
usersongs = [test]

onCherche=InData(test)
print(onCherche)
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

if onCherche==True:
  print("----------------------------------------------------------------------")
  print(test+" is in the list ! Recommendation process going on:")
  print("----------------------------------------------------------------------")

  print(is_model.recommend(user_id))

True
SOHWKYI12A8C136434
1000000
True
------------------------------------------------------------------------------------
Training data songs for the user userid: Utilisateur:
------------------------------------------------------------------------------------
----------------------------------------------------------------------
Rockin' In The Free World is in the list ! Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 1
no. of unique songs in the training set: 9415
Non zero values in cooccurence_matrix :0
       user_id                   song  score rank
0  Utilisateur                 Rocker    0.0    1
1  Utilisateur  Mercedes Benz - Remix    0.0    2
2  Utilisateur              Composure    0.0    3
3  Utilisateur    The Walking Wounded    0.0    4
4  Utilisateur           Out Of Exile    0.0    5
5  Utilisateur            Many Ghosts    0.0    6
6  Utilisateur            On Your Way    0.0    