In [None]:
# # # # Collaborative filtering (CF)[1] is the industry standard technique used in recommender systems.

# # # In basic CF, the rating of an item is estimated by aggregating either:

# # # The rating given to the item by “similar” users (user-based CF)
# # # The rating given to “similar” items by the user (item-based CF)
# # # The problem with these is that user-item matrices are large and sparse.

# # # Instead, the industry standard is to find latent features by reducing the matrix with matrix factorization[2].

# # # Specifically, the way this is done is through an algorithm known as Alternating Least Squares (ALS)[3].

# # # This approach is used by recommendation-centric companies like Netflix and Quora, in combination with other approaches.
# # # As far as I’m aware, ALS is still the bread and butter of real-world recommender systems.



# # When using a Matrix Factorization approach to implement a recommendation algorithm you decompose your large user/item matrix into 
# # lower dimensional user factors and item factors. In the most simple approach you can then estimate the user rating (or in general preference) by multiplying those factors according to the following equation:

# # r′ui=pTuqi  (1)

# # In order to learn those factors you need to minimize the following quadratic loss function:

# # argminq,p∑u,i(rui−pTuqi)2  (2)

# # Note that for simplicity I am omitting the possible biases in the first equation and the regularization in this second one.

# # In an SGD (Stochastic Gradient descent) approach, for each example in the dataset you compute the error  (rui−pTuqi)  and then you update the parameters by a factor in the opposite direction of the gradient.

# # Alternating Least Squares (ALS) represents a different approach to optimizing the loss function. The key insight is that you can turn the non-convex optimization problem in Equation (2) 
# # into an "easy" quadratic problem if you fix either  pu  or  qi . ALS fixes each one of those alternatively. When one is fixed, the other one is computed, and vice versa.

# # There are two main benefits of this approach. First, this is very easy to parallelize. Second, whenever dealing with implicit datasets, which are usually not sparse, 
# # SGD is not practical (users times items can easily be in the order of billions). ALS is a much more efficient optimization technique in these cases.




# collaborative filtering is not a suitable model to deal with cold start problem, in which it cannot draw any inference for users or items about which it has not yet gathered sufficient information.

In [None]:
# Matrix factorization approach
# matrix R -> rows - users | col - books | inside ratings for different books
# matrix P -> rows - users | col - latent variables like fantasy/thriller/biography | inside - ratings for different latent variables
# matrix Q -> rows - books | col - latent variables like fantasy/thriller/biography | inside - ratings books have for different latent variables
# R = P * R

In [None]:
# Recommendation systems are of two types 
# 1. Content based filtering (only based on the current users past data)
# 2. Collaborative Filtering (finding similar users   &    finding similar books)

In [None]:
# Do combined network, also try 1/0 - change loss and activation
# Hyperprameters - num of epochs, learning rate,num of layers, number of negative samples in train-4, test -100

In [None]:
import tensorflow as tf

tf.test.gpu_device_name()

''

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# /content/gdrive/My Drive/Kaggle is the path where kaggle.json is present in the Google Drive

In [None]:
#changing the working directory

%cd /content/gdrive/My Drive/Kaggle


/content/gdrive/My Drive/Kaggle


In [None]:
% pwd

#Check the present working directory using pwd command

'/content/gdrive/My Drive/Kaggle'

In [None]:
!ls

Apple-Twitter-Sentiment-DFE.csv    Recommendation_df_train.csv
apple-twitter-sentiment-texts.csv  sampleSubmission.csv
BitCoinPrice.csv		   sampleSubmission.csv.zip
Bitcoin_tweet_sentiment.csv	   sampleSubmission_v2.csv.zip
BTCUSD_1hr.csv			   saved_models
BTCUSD_1min.csv			   submission.csv
BTCUSD_day.csv			   test.csv
Credit_Card_Autoencoder_Model.h5   test_embed_df.csv
creditcard.csv			   test.tsv
decoder_layer.png		   train.csv
decoder.png			   train_embed_df.csv
encoder_layer.png		   train-sample.csv.zip
encoder.png			   train.tsv
kaggle.json			   train_v2.csv
Movie_Conversation.csv		   transformer.png
movie_conversations.txt		   tweetsfinal1.csv
movie_lines.txt			   tweetsfinal2.csv
product_descriptions.csv	   tweetsfinal3.csv
product_descriptions.csv.zip	   tweets_labelled.csv
Recommendation_df_test1.csv	   usersha1-artmbid-artname-plays.tsv
Recommendation_df_test.csv	   usersha1-profile.tsv
Recommendation_df_train1.csv


In [None]:
import pandas as pd

df = pd.read_csv('usersha1-artmbid-artname-plays.tsv', sep = '\t', header = None, names = ['Date', 'User', 'Artist', 'Plays'])
df

Unnamed: 0,Date,User,Artist,Plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
...,...,...,...,...
17535650,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10


In [None]:
df = df.drop(['Date'], axis = 1)

df

Unnamed: 0,User,Artist,Plays
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
...,...,...,...
17535650,7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10


In [None]:
df.isna().sum()

User      226137
Artist       204
Plays          0
dtype: int64

In [None]:
df = df.dropna()
df

Unnamed: 0,User,Artist,Plays
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
...,...,...,...
17535650,7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10


In [None]:
df.loc[df.Plays == 0]  # No rows with number of plays = 0

Unnamed: 0,User,Artist,Plays


In [None]:
# Delete rows with only 1 interactions

df_count = df.groupby(['User']).count()
df['Count']= df.groupby('User')['User'].transform('count')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,User,Artist,Plays,Count
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137,96
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,8567
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,924
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,1475
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,1377
...,...,...,...,...
17535650,7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12,444
17535651,9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11,12
17535652,e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11,569
17535653,f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10,28933


In [None]:
df = df[df['Count'] > 1]
df

Unnamed: 0,User,Artist,Plays,Count
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137,96
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,8567
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,924
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,1475
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,1377
...,...,...,...,...
17535650,7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12,444
17535651,9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11,12
17535652,e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11,569
17535653,f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10,28933


In [None]:
df['User'].nunique()

123220

In [None]:
df['Artist'].nunique()

148836

In [None]:
df['Plays'].nunique()

14122

In [None]:
df_count

Unnamed: 0_level_0,Artist,Plays
User,Unnamed: 1_level_1,Unnamed: 2_level_1
00010eb3-ebfe-4965-81ef-0ac64cd49fde,11,11
0001cd84-2a11-4699-8d6b-0abf969c5f06,68,68
0002260a-b298-48cc-9895-52c9425796b7,1,1
00026532-1fe3-45fb-a0df-34aec04a1319,3,3
00026d14-39c6-4f2d-b556-093233b5e714,5,5
...,...,...
fffed9ff-98c6-458a-8379-47e7fb4ba6ec,127,127
ffff01cd-0ae0-46c7-867b-d17d8d38cff8,8,8
ffff3742-4ae3-4e13-a29c-d4c164985a5b,107,107
ffff44bd-e5a5-4e87-8700-35481264e37d,7,7


In [None]:
df_count[df_count['Artist'] == 1]

Unnamed: 0_level_0,Artist,Plays
User,Unnamed: 1_level_1,Unnamed: 2_level_1
0002260a-b298-48cc-9895-52c9425796b7,1,1
00039b8a-3da6-4cb2-85e3-f93e30f43049,1,1
0004533f-77b7-468d-8657-40db6adec34f,1,1
00049e34-3853-4aa7-8c97-5f04a8adfd58,1,1
0004c7ff-2ffd-4ce0-b155-f9f9470b10b8,1,1
...,...,...
fff239d3-b78a-4c2c-9e05-d8b08857149b,1,1
fff2c064-930c-4c68-98b3-5c1d195d02be,1,1
fff4cb7d-55af-4f9a-9a18-3b2f8089efac,1,1
fff8a2b2-1395-41ac-a880-048bf2e41c9a,1,1


In [None]:
# Convert user and artists names into numerical IDs

df['Userid'] = df['User'].astype('category').cat.codes
df['Artistid'] = df['Artist'].astype('category').cat.codes

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,User,Artist,Plays,Count,Userid,Artistid
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137,96,28833,14680
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,8567,117013,33935
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,924,86545,83337
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,1475,29624,40863
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,1377,90432,66085
...,...,...,...,...,...,...
17535650,7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12,444,61723,136350
17535651,9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11,12,70446,28312
17535652,e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11,569,111701,75271
17535653,f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10,28933,118875,113224


In [None]:
# Create a lookup frame so that we can get the artists name in readable format later

item_lookup = df[['Artist', 'Artistid']].drop_duplicates()
item_lookup

Unnamed: 0,Artist,Artistid
0,betty blowtorch,14680
1,die Ärzte,33935
2,melissa etheridge,83337
3,elvenking,40863
4,juliette & the licks,66085
...,...,...
17531390,pablos eye,95632
17531520,"ergs, the",41878
17532347,dug dugs,38207
17533008,タイナカ サチ,146799


In [None]:
type(item_lookup['Artistid'][0])

numpy.int32

In [None]:
item_lookup['Artistid'] = item_lookup['Artistid'].astype(str)

In [None]:
type(item_lookup['Artistid'][0])

str

In [None]:
df = df[['Userid', 'Artistid', 'Plays']]
df

Unnamed: 0,Userid,Artistid,Plays
0,28833,14680,2137
1,117013,33935,1099
2,86545,83337,897
3,29624,40863,717
4,90432,66085,706
...,...,...,...
17535650,61723,136350,12
17535651,70446,28312,11
17535652,111701,75271,11
17535653,118875,113224,10


In [None]:
# # Create training and test sets

# # Few helper functions

# # Return a list of 0 for the first item and 1 for all others

import numpy as np

def mask_first1(x):
  result = np.ones_like(x)
  result[0] = 0
  return result

# # Splits our original data into one test and one training set. The test set is made up of one item for each user. This is our holdout item used to compute Top@K later.
# # The training set is the same as our original data but without any of the holdout items.

def train_test_split1(df):
  df_test = df.copy(deep=True)
  df_train = df.copy(deep=True)
  df_test = df_test.groupby(['Userid']).first()
  df_test['Userid'] = df_test.index
  df_test = df_test[['Userid', 'Artistid', 'Plays']]
  # del df_test.index.name
  mask = df.groupby(['Userid'])['Userid'].transform(mask_first1).astype(bool)
  df_train = df.loc[mask]

  return df_train, df_test

df_train1, df_test1 = train_test_split1(df)

In [None]:
# df_train.to_csv("/content/gdrive/My Drive/Kaggle/Recommendation_df_train.csv")
# df_test.to_csv("/content/gdrive/My Drive/Kaggle/Recommendation_df_test.csv")

In [None]:
df_train1.to_csv("/content/gdrive/My Drive/Kaggle/Recommendation_df_train1.csv")
df_test1.to_csv("/content/gdrive/My Drive/Kaggle/Recommendation_df_test1.csv")

NameError: ignored

In [None]:
# import pandas as pd
# df_train = pd.read_csv('Recommendation_df_train.csv')
# df_test = pd.read_csv('Recommendation_df_test.csv')

In [None]:
import pandas as pd
df_train = pd.read_csv('Recommendation_df_train1.csv')
df_test = pd.read_csv('Recommendation_df_test1.csv')

In [None]:
df_train

Unnamed: 0.1,Unnamed: 0,Userid,Artistid,Plays
0,199,6064,7723,203
1,205,22590,19205,144
2,206,39648,9081,143
3,213,59620,50849,109
4,228,53734,128498,77
...,...,...,...,...
17149200,17535650,61723,136350,12
17149201,17535651,70446,28312,11
17149202,17535652,111701,75271,11
17149203,17535653,118875,113224,10


In [None]:
df_test   # Are these the top plays of the user or any random plays - these are most recent played in the list which is by default the first/latest entry

Unnamed: 0,Userid,Userid.1,Artistid,Plays
0,0,0,71763,36
1,1,1,84842,14
2,2,2,32432,26
3,3,3,93215,131
4,4,4,92217,63
...,...,...,...,...
123215,123215,123215,67576,135
123216,123216,123216,13326,16
123217,123217,123217,107737,270
123218,123218,123218,33845,721


In [None]:
# Create list of all total unique users and items/artists

import numpy as np

users = list(np.sort(df_train['Userid'].unique()))
users[-1]

123219

In [None]:
temp = [df_train['Artistid'], df_test['Artistid']]
total_artistids = pd.concat(temp)

In [None]:
total_artistids

0           7723
1          19205
2           9081
3          50849
4         128498
           ...  
123215     67576
123216     13326
123217    107737
123218     33845
123219     71850
Name: Artistid, Length: 17272425, dtype: int64

In [None]:
artists = list(np.sort(total_artistids.unique()))
artists[-1]

148835

In [None]:
df_train['Artistid'].nunique()

146419

In [None]:
# Get the rows, columns and values for our matrix, taking this from df_train

rows = df_train['Userid'].astype(int)
rows

0             6064
1            22590
2            39648
3            59620
4            53734
             ...  
17149200     61723
17149201     70446
17149202    111701
17149203    118875
17149204     31313
Name: Userid, Length: 17149205, dtype: int64

In [None]:
userids = np.array(rows.tolist())    # Get all user ids and item ids

In [None]:
cols = df_train['Artistid'].astype(int)
cols

0             7723
1            19205
2             9081
3            50849
4           128498
             ...  
17149200    136350
17149201     28312
17149202     75271
17149203    113224
17149204    130616
Name: Artistid, Length: 17149205, dtype: int64

In [None]:
artistids = np.array(cols.tolist())   

In [None]:
values = list(df_train['Plays'])

In [None]:
userids

array([  6064,  22590,  39648, ..., 111701, 118875,  31313])

In [None]:
artistids

array([  7723,  19205,   9081, ...,  75271, 113224, 130616])

In [None]:
# Sample 100 negative interactions for each user in our test data
# Returns a pandas dataframe of 100 negative interactions for each user in df_test

def get_test_negatives(userids, artistids, artists, df_test):

  negatives_list = []
  negatives = []
  test_u = df_test['Userid'].values.tolist()
  test_a = df_test['Artistid'].values.tolist()

  test_pairs = list(zip(test_u, test_a))
  total_pairs = set(zip(userids, artistids))

  for (u, a) in test_pairs:
    negatives = []
    negatives.append((u, a))
    for t in range(100):
      j = np.random.randint(len(artists))
      while (u, j) in total_pairs:
        j = np.random.randint(len(artists))
      negatives.append((u, j))
    negatives_list.append(negatives)
  
  return np.array(negatives_list)

  # df_test_neg = pd.Dataframe(negative_list)
  # df_test_neg_shuffled = df_test_neg.sample(frac = 1)
  # return df_test_neg_shuffled

test_array = get_test_negatives(userids, artistids, artists, df_test)


In [None]:
test_array.shape

(123220, 101, 2)

In [None]:
test_array

array([[[     0,  71763],
        [     0,  49945],
        [     0, 121952],
        ...,
        [     0,  52539],
        [     0,  38579],
        [     0, 135129]],

       [[     1,  84842],
        [     1, 110352],
        [     1,  64156],
        ...,
        [     1, 133434],
        [     1,  20989],
        [     1,  23714]],

       [[     2,  32432],
        [     2,  79867],
        [     2, 109394],
        ...,
        [     2,   6366],
        [     2, 135128],
        [     2, 130421]],

       ...,

       [[123217, 107737],
        [123217, 115466],
        [123217, 106433],
        ...,
        [123217, 127441],
        [123217,  97408],
        [123217, 118904]],

       [[123218,  33845],
        [123218,  12588],
        [123218,   2808],
        ...,
        [123218,  80942],
        [123218,  60738],
        [123218, 138211]],

       [[123219,  71850],
        [123219,  88783],
        [123219,  87303],
        ...,
        [123219, 110954],
        [123219

In [None]:
num_neg = 4

def get_train_negatives(userids, artistids, values, artists, num_neg, df_train):

  total_3_pairs = set(zip(userids, artistids, values))
  total_2_pairs = set(zip(userids, artistids)) 

  train_negatives = []

  for (u, a, v) in total_3_pairs:
    
    train_negatives.append((u, a, v))
    for t in range(num_neg):
      j = np.random.randint(len(artists))
      while (u, j) in total_2_pairs:
        j = np.random.randint(len(artists))
      train_negatives.append((u, j, 0))

  df_train_neg = pd.DataFrame(train_negatives)
  df_train_neg_shuffled = df_train_neg.sample(frac = 1)
  return df_train_neg_shuffled

df_train = get_train_negatives(userids, artistids, values, artists, num_neg, df_train)

In [None]:
df_train

Unnamed: 0,0,1,2
9059738,52803,33695,0
27489745,94136,106464,10
4324672,37999,130005,0
20898217,111622,105054,0
142879,122868,142593,0
...,...,...,...
1179016,81337,142265,0
26694822,119153,63712,0
18648471,86042,120974,0
22412231,104925,105424,0


In [None]:
df_train= df_train.sample(frac = 1)
df_train

Unnamed: 0,0,1,2
4302255,95395,120161,276
7259924,28735,137855,0
25957708,34770,31033,0
779960,2592,22236,161
2868456,18601,87852,0
...,...,...,...
19711934,57423,65032,0
1785489,5437,47500,0
28967738,28864,56400,0
18254752,119452,53568,0


In [None]:
df_train = df_train.rename(columns = {0: 'Userid', 1: 'Artistid', 2: 'Plays'})
df_train

Unnamed: 0,Userid,Artistid,Plays
4302255,95395,120161,276
7259924,28735,137855,0
25957708,34770,31033,0
779960,2592,22236,161
2868456,18601,87852,0
...,...,...,...
19711934,57423,65032,0
1785489,5437,47500,0
28967738,28864,56400,0
18254752,119452,53568,0


In [None]:
df_train.columns

Index(['Userid', 'Artistid', 'Plays'], dtype='object')

In [None]:
len(users)

123220

In [None]:
max(users)

123219

In [None]:
len(artists)

148836

In [None]:
max(artists)

148835

In [None]:
artists[-10:]

[148826,
 148827,
 148828,
 148829,
 148830,
 148831,
 148832,
 148833,
 148834,
 148835]

In [None]:
artists.sort()

In [None]:
# Define my Multi layer perceptron neural network

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Input
from keras.models import Model

num_neg = 4
epochs = 10
batch_size = 256

input1 = Input(shape=(1,))
user_embedding = Embedding(len(users)+1, 1, input_length = 1)(input1)  # Original was 32 
user_embedding = Flatten()(user_embedding)

input2 = Input(shape=(1,))
artist_embedding = Embedding(len(artists)+1, 1, input_length = 1)(input2)
artist_embedding = Flatten()(artist_embedding)

concatenated = concatenate([user_embedding, artist_embedding])
print(concatenated)
dropout = Dropout(0.2)(concatenated)

layer_1 = Dense(64, activation='relu')(dropout)
batch_norm1 = BatchNormalization()(layer_1)
dropout1 = Dropout(0.2)(batch_norm1)

layer_2 = Dense(32, activation='relu')(dropout1)
batch_norm2 = BatchNormalization()(layer_2)
dropout2 = Dropout(0.2)(batch_norm2)

layer_3 = Dense(16, activation='relu')(dropout2)
layer_4 = Dense(8, activation='relu')(layer_3)

output = tf.keras.layers.Dense(1)(layer_4)  # I have changed function initializer, it was lecun_initializer in code
model = Model([input1, input2], output)

model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy'])   # Try binary also by keeping 1/0 as output

model.summary()

KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='concatenate/concat:0', description="created by layer 'concatenate'")
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 1)         123221      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None,

In [None]:
j = np.random.randn(100, 336)
j.shape

(100, 336)

In [None]:
df_train[['Userid']].shape

(29324520, 1)

In [None]:
df_train[['Userid']].shape

(29324520, 1)

In [None]:
df_train1 = np.array(df_train['Userid'].tolist(), dtype = 'float32')
df_train1

array([ 95395.,  28735.,  34770., ...,  28864., 119452.,  90293.],
      dtype=float32)

In [None]:
df_train2 = np.array(df_train['Artistid'].tolist(), dtype = 'float32')
df_train2

array([120161., 137855.,  31033., ...,  56400.,  53568.,  68817.],
      dtype=float32)

In [None]:
df_train3 = np.array(df_train['Plays'].tolist(), dtype = 'float32')
df_train3

array([276.,   0.,   0., ...,   0.,   0.,   0.], dtype=float32)

In [None]:
model.fit([df_train1, df_train2], df_train3, batch_size = batch_size, epochs = epochs, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f17f00da650>

In [None]:
# # Save the model

model.save('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model.h5')

from keras.models import load_model

loaded_model = load_model('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model.h5')

In [None]:
test_array.shape

(123220, 101, 2)

In [None]:
test_array_flat = test_array.reshape(-1,2)
test_array_flat

array([[     0,  71763],
       [     0,  49945],
       [     0, 121952],
       ...,
       [123219, 110954],
       [123219,  19228],
       [123219,  66616]])

In [None]:
test_only_u = test_array_flat[:,0]
print(test_only_u)
test_only_u = test_only_u.reshape(-1,1)
print(test_only_u.shape)
test_only_a = test_array_flat[:,1]
print(test_only_a)
test_only_a = test_only_a.reshape(-1,1)
print(test_only_a.shape)

[     0      0      0 ... 123219 123219 123219]
(12445220, 1)
[ 71763  49945 121952 ... 110954  19228  66616]
(12445220, 1)


In [None]:
predictions = model.predict([test_only_u, test_only_a])
predictions

array([[15.846638 ],
       [14.890669 ],
       [35.44056  ],
       ...,
       [14.6628475],
       [32.77674  ],
       [10.31     ]], dtype=float32)

In [None]:
predictions.shape

(12445220, 1)

In [None]:
test_only_u = test_only_u.reshape(-1)
test_only_a = test_only_a.reshape(-1)
predictions = predictions.reshape(-1)

In [None]:
c = np.dstack((test_only_u,test_only_a,predictions))
c

array([[[0.00000000e+00, 7.17630000e+04, 1.58466377e+01],
        [0.00000000e+00, 4.99450000e+04, 1.48906689e+01],
        [0.00000000e+00, 1.21952000e+05, 3.54405594e+01],
        ...,
        [1.23219000e+05, 1.10954000e+05, 1.46628475e+01],
        [1.23219000e+05, 1.92280000e+04, 3.27767410e+01],
        [1.23219000e+05, 6.66160000e+04, 1.03100004e+01]]])

In [None]:
import numpy as np
to_df = np.column_stack((test_only_u, test_only_a, predictions))
print(to_df.shape)
print(to_df)

(12445220, 3)
[[0.00000000e+00 7.17630000e+04 1.58466377e+01]
 [0.00000000e+00 4.99450000e+04 1.48906689e+01]
 [0.00000000e+00 1.21952000e+05 3.54405594e+01]
 ...
 [1.23219000e+05 1.10954000e+05 1.46628475e+01]
 [1.23219000e+05 1.92280000e+04 3.27767410e+01]
 [1.23219000e+05 6.66160000e+04 1.03100004e+01]]


In [3]:
to_df = to_df.tolist()
to_df

In [None]:
import pandas as pd

df_prediction = pd.DataFrame(to_df, columns =['User_inp', 'Artist_inp', 'Prediction'])
df_prediction

Unnamed: 0,User_inp,Artist_inp,Prediction
0,0.0,71763.0,15.846638
1,0.0,49945.0,14.890669
2,0.0,121952.0,35.440559
3,0.0,83915.0,31.961067
4,0.0,135134.0,24.889641
...,...,...,...
12445215,123219.0,36426.0,16.010546
12445216,123219.0,5048.0,16.734028
12445217,123219.0,110954.0,14.662848
12445218,123219.0,19228.0,32.776741


In [None]:
df_prediction = df_prediction.reset_index()
df_prediction

Unnamed: 0,index,User_inp,Artist_inp,Prediction
0,0,0.0,71763.0,15.846638
1,1,0.0,49945.0,14.890669
2,2,0.0,121952.0,35.440559
3,3,0.0,83915.0,31.961067
4,4,0.0,135134.0,24.889641
...,...,...,...,...
12445215,12445215,123219.0,36426.0,16.010546
12445216,12445216,123219.0,5048.0,16.734028
12445217,12445217,123219.0,110954.0,14.662848
12445218,12445218,123219.0,19228.0,32.776741


In [None]:
df_prediction_group = df_prediction.groupby(['User_inp'])['Prediction'].nlargest(10).reset_index()
df_prediction_group

Unnamed: 0,User_inp,level_1,Prediction
0,0.0,60,471.524597
1,0.0,88,151.845398
2,0.0,91,55.569191
3,0.0,59,54.635475
4,0.0,29,54.431252
...,...,...,...
1232195,123219.0,12445150,161.293121
1232196,123219.0,12445180,148.890564
1232197,123219.0,12445119,75.745598
1232198,123219.0,12445196,62.743561


In [None]:
df_prediction_group=df_prediction_group.rename(columns = {'level_1':'index'})
df_prediction_group

Unnamed: 0,User_inp,index,Prediction
0,0.0,60,471.524597
1,0.0,88,151.845398
2,0.0,91,55.569191
3,0.0,59,54.635475
4,0.0,29,54.431252
...,...,...,...
1232195,123219.0,12445150,161.293121
1232196,123219.0,12445180,148.890564
1232197,123219.0,12445119,75.745598
1232198,123219.0,12445196,62.743561


In [None]:
df_prediction_group = df_prediction_group.merge(df_prediction, on='index')
df_prediction_group

Unnamed: 0,User_inp_x,index,Prediction_x,User_inp_y,Artist_inp,Prediction_y
0,0.0,60,471.524597,0.0,86096.0,471.524597
1,0.0,88,151.845398,0.0,125683.0,151.845398
2,0.0,91,55.569191,0.0,113272.0,55.569191
3,0.0,59,54.635475,0.0,17243.0,54.635475
4,0.0,29,54.431252,0.0,33136.0,54.431252
...,...,...,...,...,...,...
1232195,123219.0,12445150,161.293121,123219.0,125679.0,161.293121
1232196,123219.0,12445180,148.890564,123219.0,103028.0,148.890564
1232197,123219.0,12445119,75.745598,123219.0,71850.0,75.745598
1232198,123219.0,12445196,62.743561,123219.0,4008.0,62.743561


In [None]:
# def fun(x, df_test):
#   idx = int(x['User_inp_y'])
#   if df_test.iloc[idx]['Artistid'] == x['Artist_inp']

# df_prediction_group['Result'] = df_prediction_group.groupby(['User_inp_x']).apply(fun, args = (df_test))
# df_prediction_group

In [None]:
df_test

Unnamed: 0,Userid,Userid.1,Artistid,Plays
0,0,0,71763,36
1,1,1,84842,14
2,2,2,32432,26
3,3,3,93215,131
4,4,4,92217,63
...,...,...,...,...
123215,123215,123215,67576,135
123216,123216,123216,13326,16
123217,123217,123217,107737,270
123218,123218,123218,33845,721


In [2]:
val = []
for index, row in df_prediction_group.iterrows():
  idx = int(row['User_inp_y'])
  print(idx)
  if df_test.iloc[idx]['Artistid'] == row['Artist_inp']:
    val.append(1)
  else:
    val.append(0)


In [None]:
df_prediction_group['Val'] = val
df_prediction_group

Unnamed: 0,User_inp_x,index,Prediction_x,User_inp_y,Artist_inp,Prediction_y,Val
0,0.0,60,471.524597,0.0,86096.0,471.524597,0
1,0.0,88,151.845398,0.0,125683.0,151.845398,0
2,0.0,91,55.569191,0.0,113272.0,55.569191,0
3,0.0,59,54.635475,0.0,17243.0,54.635475,0
4,0.0,29,54.431252,0.0,33136.0,54.431252,0
...,...,...,...,...,...,...,...
1232195,123219.0,12445150,161.293121,123219.0,125679.0,161.293121,0
1232196,123219.0,12445180,148.890564,123219.0,103028.0,148.890564,0
1232197,123219.0,12445119,75.745598,123219.0,71850.0,75.745598,1
1232198,123219.0,12445196,62.743561,123219.0,4008.0,62.743561,0


In [None]:
final = df_prediction_group.groupby(['User_inp_x'])['Val'].sum().reset_index()
final

Unnamed: 0,User_inp_x,Val
0,0.0,0
1,1.0,0
2,2.0,0
3,3.0,0
4,4.0,0
...,...,...
123215,123215.0,0
123216,123216.0,1
123217,123217.0,0
123218,123218.0,0


In [None]:
score = len(final[final.Val > 0])
score

14927

In [None]:
total = len(test_array)
acc = (score/total)* 100
acc

12.11410485310826

123220

In [1]:
# k = 10
# success = 0
# total = len(test_array)
# results = []

# for i in range(total):
#   print(i)
#   k_largest_artists = []
#   for pair in test_array[i]:
#     # print(pair)
#     # pair = np.array(pair, dtype = 'float32')
#     # pair = pair.reshape(1,2)
#     # print(type(pair))
#     score = model.predict([np.array(pair[0]).reshape(1,1), np.array(pair[1]).reshape(1,1)]) # Inputs to predict should be a 2 D array or a list of 2 D arrays if model takes more
#     # print(score[0][0])
#     # score = model(inputs=[pair[0], pair[1]], training = False).numpy()
#     results.append([pair[1], score[0][0]])
#   # print(results)
#   k_largest = sorted(results, key=lambda l:l[1], reverse=True)[:k]
  
#   k_largest_artists = [artist[0] for artist in k_largest]
#   # print(k_largest_artists)
#   # print(k_largest)
#   if test_array[i][0][1] in k_largest_artists:
#     success = success+1
#   print(success)
#   print(total)

# acc = (success/total)* 100
# print(acc)

In [None]:
# Define my Matrix factorization network

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Input
from keras.models import Model

num_neg = 4
epochs = 1
batch_size = 256
latent_features = 8  # Dimension of our features 

input1 = Input(shape=(1,))
user_embedding = Embedding(len(users)+1, latent_features, input_length = 1)(input1)
user_embedding = Flatten()(user_embedding)

input2 = Input(shape=(1,))
artist_embedding = Embedding(len(artists)+1, latent_features, input_length = 1)(input2)
artist_embedding = Flatten()(artist_embedding)

prediction_matrix = tf.multiply(user_embedding, artist_embedding)

output = tf.keras.layers.Dense(1)(prediction_matrix)  # I have changed function initializer, it was lecun_initializer in code
model = Model([input1, input2], output)

model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy'])

model.summary()

In [None]:
model.fit([df_train['Userid'], df_train['Artistid']], df_train['Plays'], batch_size = batch_size, epochs = epochs, verbose = 1)

In [None]:
# Save the model

model.save('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model_MF.h5')

from keras.models import load_model

loaded_model = load_model('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model_MF.h5')

In [None]:
k = 10
success = 0
total = len(test_array)

for i in range(test_array):
  for pair in test_array[i]:
    score = model.predict(pair)
    results.append([pair[1], score])
  k_largest = sorted(results, key=lambda l:l[1], reverse=True)[:k]
  if test_array[i][0][1] in k_largest:
    success++

acc = (success/total)* 100

In [None]:
# Note: Instead of our very simple matrix factorization function implemented here, we could potentially use a BPR or ALS model to factor our matrices.
#  I have not tested this though so not sure how it would impact the performance or the final result

# BPR - Bayesian Personalized Ranking - how to treat data that user has not seen? it may be that user loves that song but score is zero only because they have not discovered it yet
# So we cant assign score of 0 , so we assign scores based on baysian probablity, based on confidence scores, less score instead of 0, and higher score for data which we have
 
# ALS - ALternating least squares - used to segregate the matrix ito two matrices with user * features  and artists * features  -> thus finding latent/hidden features
# ALS is an iterative optimization process where we for every iteration try to arrive closer and closer to a factorized representation of our original data R = U * V
# With the alternating least squares approach we use the same idea but iteratively alternate between optimizing U and fixing V and vice versa. We do this for each iteration to arrive closer to R = U x V.
# if we fix the user factors or item factors we can calculate a global minimum. The derivative of the above equation gets us the following equation for minimizing the loss of our users:

In [None]:
# Combining both models together
#  The idea is to take advantage of both the linearity and non-linearity of the two networks.

# Define my Multi layer perceptron neural network

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Input
from keras.models import Model

num_neg = 4
epochs = 1
batch_size = 256
latent_features = 8  # Dimension of our features 

# Our Multi perceptron Neural network model

nn_input1 = Input(shape=(1,))
nn_user_embedding = Embedding(len(users)+1, 32, input_length = 1)(nn_input1)
nn_user_embedding = Flatten()(nn_user_embedding)

nn_input2 = Input(shape=(1,))
nn_artist_embedding = Embedding(len(artists)+1, 32, input_length = 1)(nn_input2)
nn_artist_embedding = Flatten()(nn_artist_embedding)

nn_concatenated = concatenate([nn_user_embedding, nn_artist_embedding])
print(nn_concatenated)
nn_dropout = Dropout(rate = 0.2)(nn_concatenated)

nn_layer_1 = Dense(64, activation='relu')(nn_dropout)
nn_batch_norm1 = BatchNormalization()(nn_layer_1)
nn_dropout1 = Dropout(0.2)(nn_batch_norm1)

nn_layer_2 = Dense(32, activation='relu')(nn_layer_1)
nn_batch_norm2 = BatchNormalization()(nn_layer_2)
nn_dropout2 = Dropout(0.2)(nn_batch_norm2)

nn_layer_3 = Dense(16, activation='relu')(nn_layer_2)
nn_layer_4 = Dense(8, activation='relu')(nn_layer_3)

# Our Matrix factorization model

mf_input1 = Input(shape=(1,))
mf_user_embedding = Embedding(len(users)+1, latent_features, input_length = 1)(mf_input1)
mf_user_embedding = Flatten()(mf_user_embedding)

mf_input2 = Input(shape=(1,))
mf_rtist_embedding = Embedding(len(artists)+1, latent_features, input_length = 1)(mf_input2)
mf_artist_embedding = Flatten()(mf_artist_embedding)

mf_prediction_matrix = tf.multiply(mf_user_embedding, mf_artist_embedding)

# We merge the two networks together
merged_vector = concatenate([mf_prediction_matrix, nn_layer_4])

output = tf.keras.layers.Dense(1)(merged_vector)  # I have changed function initializer, it was lecun_initializer in code
model = Model([nn_input1, nn_input2, mf_input1, mf_input2], output)

model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy'])   # Try binary also by keeping 1/0 as output

model.summary()

In [None]:
model.fit([df_train['Userid'], df_train['Artistid']], df_train['Plays'], batch_size = batch_size, epochs = epochs, verbose = 1)

In [None]:
# Save the model

model.save('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model_NN&MF.h5')

from keras.models import load_model

loaded_model = load_model('/content/gdrive/My Drive/Kaggle/Song_Recommendation_Model_NN&MF.h5')

In [None]:
k = 10
success = 0
total = len(test_array)

for i in range(test_array):
  for pair in test_array[i]:
    score = model.predict(pair)
    results.append([pair[1], score])
  k_largest = sorted(results, key=lambda l:l[1], reverse=True)[:k]
  if test_array[i][0][1] in k_largest:
    success++

acc = (success/total)* 100

In [None]:
dataset_1 = tf.data.Dataset.from_tensor_slices(np.random.rand(10, 100, 5))
dataset_1

<TensorSliceDataset shapes: (100, 5), types: tf.float64>