In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from surprise.model_selection import train_test_split,cross_validate
from surprise import SVD, accuracy
from surprise import KNNBasic
import os
from tensorflow.keras import backend as K
import tables

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from surprise import SVDpp, Reader, Dataset
from sklearn.preprocessing import LabelEncoder

### USER PREFERENCE DATA

In [14]:
taste_profile = pd.read_csv('../datasets/train_triplets.txt',sep='\t',header=None,names = ['user_id','song_id','play_count'],nrows= 200000)
song_metadata = pd.read_csv('../datasets/track_metadata.csv')
song_metadata.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.0,0.0,0,2264873,-1,0


In [15]:
def remove_outliers(df):
    # Compute quartiles for the play_count of users and year of publication columns
    q1_play_count = df['play_count'].quantile(0.25)
    q3_play_count = df['play_count'].quantile(0.75)
    iqr_play_count = q3_play_count - q1_play_count
    lower_play_count = q1_play_count - 1.5 * iqr_play_count
    upper_play_count = q3_play_count + 1.5 * iqr_play_count
    print(q1_play_count, q3_play_count,lower_play_count, upper_play_count)

    # Identify and remove outliers based on IQR
    df = df[(df['play_count'] >= lower_play_count) & (df['play_count'] <= upper_play_count)]
    return df

In [9]:
taste_profile = remove_outliers(taste_profile)

1.0 3.0 -2.0 6.0


In [10]:
taste_profile.play_count.value_counts()

1    118872
2     30713
3     13398
5      9332
4      7242
6      4263
Name: play_count, dtype: int64

### METADATA Preprocessing

In [16]:
song_metadata.replace({b'': np.nan}, inplace=True)

In [17]:
song_metadata.drop_duplicates(inplace=True)
song_metadata.shape

(1000000, 14)

In [18]:
song_metadata.columns,song_metadata.dtypes

(Index(['track_id', 'title', 'song_id', 'release', 'artist_id', 'artist_mbid',
        'artist_name', 'duration', 'artist_familiarity', 'artist_hotttnesss',
        'year', 'track_7digitalid', 'shs_perf', 'shs_work'],
       dtype='object'),
 track_id               object
 title                  object
 song_id                object
 release                object
 artist_id              object
 artist_mbid            object
 artist_name            object
 duration              float64
 artist_familiarity    float64
 artist_hotttnesss     float64
 year                    int64
 track_7digitalid        int64
 shs_perf                int64
 shs_work                int64
 dtype: object)

In [9]:
song_metadata = song_metadata.applymap(lambda x: x.decode('utf-8').strip('\'"') if isinstance(x, bytes) else x)
song_metadata.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.0,0.0,0,2264873,-1,0


In [10]:
song_metadata.isna().sum()

track_id                  0
title                    15
song_id                   0
release                   5
artist_id                 0
artist_mbid           62571
artist_name               0
duration                  0
artist_familiarity        0
artist_hotttnesss         0
year                      0
track_7digitalid          0
shs_perf                  0
shs_work                  0
dtype: int64

In [11]:
song_metadata.drop(columns =['artist_mbid','shs_work','shs_perf','track_7digitalid','artist_id'],inplace=True)

In [12]:
print('Song metadata:')
print('Number of rows:', song_metadata.shape[0])
print('Number of unique songs:', len(song_metadata.song_id.unique()))
display(song_metadata.head())

Song metadata:
Number of rows: 1000000
Number of unique songs: 999056


Unnamed: 0,track_id,title,song_id,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,Faster Pussy cat,252.05506,0.649822,0.394032,2003
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,Karkkiautomaatti,156.55138,0.439604,0.356992,1995
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,Hudson Mohawke,138.97098,0.643681,0.437504,2006
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,Yerba Brava,145.05751,0.448501,0.372349,2003
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,Der Mystic,514.29832,0.0,0.0,0


In [13]:
df_merged = pd.merge(taste_profile,song_metadata, on='song_id',how='left')

In [14]:
display(df_merged.head())
df_merged.play_count.describe().reset_index().T

Unnamed: 0,user_id,song_id,play_count,track_id,title,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,The Cove,Thicker Than Water,Jack Johnson,112.63955,0.832012,0.677482,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,TRIRLYL128F42539D1,Nothing from Nothing,To Die For,Billy Preston,153.59955,0.580555,0.482492,1974
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,TRMHBXZ128F4238406,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,358.24281,0.633119,0.417718,1976
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,TRYQMNI128F147C1C7,Under Cold Blue Stars,Under Cold Blue Stars,Josh Rouse,266.34404,0.675339,0.509058,2002
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,TRAHZNE128F9341B86,Riot Radio (Soundtrack Version),Nick & Norah's Infinite Playlist - Original Mo...,The Dead 60s,141.42649,0.650281,0.451749,0


Unnamed: 0,0,1,2,3,4,5,6,7
index,count,mean,std,min,25%,50%,75%,max
play_count,205187.0,2.859426,7.425793,1.0,1.0,1.0,3.0,1890.0


In [19]:
print('{:d} users, {:.2%} of total play counts, listening a single more than 100 times'.format(
    df_merged.user_id[df_merged.play_count>150].unique().shape[0],
    df_merged.play_count[df_merged.play_count>150].count()/df_merged.shape[0]))
display(df_merged.play_count[df_merged.play_count>150].describe().reset_index().T)

28 users, 0.02% of total play counts, listening a single more than 100 times


Unnamed: 0,0,1,2,3,4,5,6,7
index,count,mean,std,min,25%,50%,75%,max
play_count,33.0,287.878788,310.559149,151.0,184.0,193.0,270.0,1890.0


In [16]:
df_merged.isna().sum()

user_id               0
song_id               0
play_count            0
track_id              0
title                 0
release               0
artist_name           0
duration              0
artist_familiarity    0
artist_hotttnesss     0
year                  0
dtype: int64

In [166]:
df_merged.dropna(inplace=True)

In [175]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df_merged[['play_count']])
df_merged['play_count_norm'] = scaler.transform(df_merged[['play_count']])


In [217]:
# train_data cols
training_cols = ['song_id', 'user_id','duration','artist_familiarity','artist_hotttnesss','year']
output_cols = ['play_count_norm']
training_data = df_merged[training_cols].copy()
training_output = df_merged[output_cols]

# meta data text
text_cols = ['artist_name', 'title', 'release']
meta_text= df_merged[text_cols].copy()

training_data.shape,meta_text.shape

((203338, 6), (203338, 3))

In [218]:
from gensim.models import Word2Vec

meta_text['text'] = meta_text.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
corpus = [text.split() for text in meta_text['text']]
emb_size = 500
# Train Word2Vec model
model_w2v = Word2Vec(sentences=corpus, vector_size=emb_size, window=5, min_count=1, workers=4, sg=0)

In [219]:
# Encode metadata using Word2Vec
meta_text_vec = []
for text in meta_text['text']:
    vec = np.zeros(emb_size)
    tokens = text.split()
    for token in tokens:
        if token in model_w2v.wv:
            vec += model_w2v.wv[token]
    meta_text_vec.append(vec)
meta_text_vec = np.array(meta_text_vec)
meta_text_vec.shape

(203338, 500)

In [220]:
concatenated_array = np.concatenate((meta_text_vec, training_data[['duration','artist_familiarity','artist_hotttnesss','year']].values), axis=1)
concatenated_array.shape

(203338, 504)

In [221]:
training_data.isna().sum()


song_id               0
user_id               0
duration              0
artist_familiarity    0
artist_hotttnesss     0
year                  0
dtype: int64

In [222]:
# training_data.fillna(0,inplace=True)
# training_data.dropna(inplace=True)


In [223]:
# training_data

In [224]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split data into train and test sets
train_idx, test_idx = train_test_split(range(len(training_data)), test_size=0.2, random_state=42)

# Subset data into train and test sets using the indices
train_df = training_data.iloc[train_idx]
train_meta_array = concatenated_array[train_idx]
train_out = training_output.iloc[train_idx]

test_df = training_data.iloc[test_idx]
test_meta_array = concatenated_array[test_idx]
test_out = training_output.iloc[test_idx]


In [225]:
train_df.head()

Unnamed: 0,song_id,user_id,duration,artist_familiarity,artist_hotttnesss,year
170237,SOIEYWJ12AB0184438,3e6ef2a572d1f6f06df71bf28190eae9e1934a61,247.27465,0.865129,0.53482,2008
1164,SOHGYWI12A67ADCD65,baf47ed8da24d607e50d8684cde78b923538640f,253.85751,0.596613,0.425667,1990
182524,SOWEJXA12A6701C574,c4bccc5a73a417581d3f6eec86299720e3fdd597,294.1122,0.865022,0.916053,2005
189779,SOGQJKF12A8C13729E,e9870c5d36cd50fe8b513df5de6e1dc0e4c6f37c,231.23546,0.905477,0.84338,2007
138285,SOTQJCD12A6701D129,74bf50ca7c10e05be0a0eb8dc4e658e63388d82e,242.1024,0.740061,0.553463,2005


In [226]:
train_df['song_id'] = pd.Categorical(train_df['song_id'])
train_inp1 = train_df['song_id'].cat.codes
train_df['user_id'] = pd.Categorical(train_df['user_id'])
train_inp2 = train_df['user_id'].cat.codes

test_df['song_id'] = pd.Categorical(test_df['song_id'])
test_inp1 = test_df['song_id'].cat.codes
test_df['user_id'] = pd.Categorical(test_df['user_id'])
test_inp2 = test_df['user_id'].cat.codes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['song_id'] = pd.Categorical(train_df['song_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = pd.Categorical(train_df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['song_id'] = pd.Categorical(test_df['song_id'])
A value is trying to be set on a 

In [227]:
#nunique() will return the total number of unique items

song_id = len(train_inp1) 
user_id = len(train_inp2)
song_input_dim = 1#song_input_data.shape[1]
user_input_dim = 1#user_input_data.shape[1]
meta_text_dim = train_meta_array.shape[1]

song_id,user_id,song_input_dim,user_input_dim,meta_text_dim

(162670, 162670, 1, 1, 504)

In [228]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, Concatenate, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import keras
from keras.layers import Add, Activation, Lambda, BatchNormalization, Concatenate, Dropout, Input, Embedding, Dot, Reshape, Dense, Flatten
from keras import regularizers

In [229]:
#Embedding layer for songs
songs_input = Input(shape=(song_input_dim,))#1st Input Layer
embedding_layer_songs = Embedding(song_id+1 ,10)(songs_input)#Embedding layer
embedding_output_songs = Flatten()(embedding_layer_songs)#Embedding layer output

#Embedding layer for users
users_input = Input(shape=(user_input_dim,))#1st Input Layer
embedding_layer_users = Embedding(user_id+1 ,10)(users_input)#Embedding layer
embedding_output_users = Flatten()(embedding_layer_users)#Embedding layer output

# #Embedding layer for meta data
meta_input = Input(shape=(meta_text_dim,), name='metadata_input')#1st Input Layer
dense_layer = Dense(64, activation='relu')(meta_input)
metadata_embedding = Dense(10, activation='relu',name='metadata_embedding')(dense_layer)

#Concatination and Dense layer
joining_layer = Concatenate()([embedding_output_songs, embedding_output_users,metadata_embedding])# metadata_embedding
hidden_layer_1 = Dense(16, activation='relu')(joining_layer)
hidden_layer_1 = Dense(8, activation='relu')(hidden_layer_1)
output_layer = Dense(1, activation= 'sigmoid')(hidden_layer_1)

model1 = tf.keras.Model([songs_input, users_input,meta_input], output_layer) 

In [230]:
#Model compilation

optimizer = Adam(lr=0.001, epsilon = 1e-6, amsgrad=True) #epsilon = decay rate
model1.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
model1.summary()



Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_17 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_18 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 metadata_input (InputLayer)    [(None, 504)]        0           []                               
                                                                                                  
 embedding_16 (Embedding)       (None, 1, 10)        1626710     ['input_17[0][0]']               
                                                                                            

In [231]:
train_inp1.shape, train_inp2.shape,train_meta_array.shape,train_out.shape

((162670,), (162670,), (162670, 504), (162670, 1))

In [232]:
#Training model

early_stopping = EarlyStopping(monitor = 'val_loss', patience = 1)

history = model1.fit(
  [train_inp1, train_inp2,train_meta_array], train_out, 
  batch_size=64, 
  epochs=20, 
  verbose=1,
  callbacks = [early_stopping],
  validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20


In [233]:
test_df['song_id'] = pd.Categorical(test_df['song_id'])
test_inp1 = test_df['song_id'].cat.codes
test_df['user_id'] = pd.Categorical(test_df['user_id'])
test_inp2 = test_df['user_id'].cat.codes

test_df = training_data.iloc[test_idx]
test_meta_array = concatenated_array[test_idx]
test_out = training_output.iloc[test_idx]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['song_id'] = pd.Categorical(test_df['song_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['user_id'] = pd.Categorical(test_df['user_id'])


In [234]:
ans = model1.predict([test_inp1,test_inp2,test_meta_array])



In [235]:
model1.evaluate([test_inp1,test_inp2,test_meta_array],test_out)



[0.018955158069729805, 0.13767772912979126]

### WITHOUT NORMALIZING

In [17]:
# train_data cols
training_cols = ['song_id', 'user_id','duration','artist_familiarity','artist_hotttnesss','year']
output_cols = ['play_count']
training_data = df_merged[training_cols].copy()
training_output = df_merged[output_cols]

# meta data text
text_cols = ['artist_name', 'title', 'release']
meta_text= df_merged[text_cols].copy()

training_data.shape,meta_text.shape

((205187, 6), (205187, 3))

In [18]:
from gensim.models import Word2Vec

meta_text['text'] = meta_text.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
corpus = [text.split() for text in meta_text['text']]
emb_size = 500
# Train Word2Vec model
model_w2v = Word2Vec(sentences=corpus, vector_size=emb_size, window=5, min_count=1, workers=4, sg=0)

In [19]:
# Encode metadata using Word2Vec
meta_text_vec = []
for text in meta_text['text']:
    vec = np.zeros(emb_size)
    tokens = text.split()
    for token in tokens:
        if token in model_w2v.wv:
            vec += model_w2v.wv[token]
    meta_text_vec.append(vec)
meta_text_vec = np.array(meta_text_vec)
meta_text_vec.shape

(205187, 500)

In [20]:
concatenated_array = np.concatenate((meta_text_vec, training_data[['duration','artist_familiarity','artist_hotttnesss','year']].values), axis=1)
concatenated_array.shape

(205187, 504)

In [21]:
training_data.isna().sum()


song_id               0
user_id               0
duration              0
artist_familiarity    0
artist_hotttnesss     0
year                  0
dtype: int64

In [22]:
# training_data.fillna(0,inplace=True)
# training_data.dropna(inplace=True)


In [23]:
# training_data

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split data into train and test sets
train_idx, test_idx = train_test_split(range(len(training_data)), test_size=0.2, random_state=42)

# Subset data into train and test sets using the indices
train_df = training_data.iloc[train_idx]
train_meta_array = concatenated_array[train_idx]
train_out = training_output.iloc[train_idx]

test_df = training_data.iloc[test_idx]
test_meta_array = concatenated_array[test_idx]
test_out = training_output.iloc[test_idx]


In [25]:
train_df.head()

Unnamed: 0,song_id,user_id,duration,artist_familiarity,artist_hotttnesss,year
106606,SOFLJQZ12A6D4FADA6,2f27bb3405561e8c5cd2a0f10a35c639e06c38f2,127.242,0.629097,0.41132,1974
54200,SORLVZE12A67ADBF3C,1f2d0f82aae27c374b5c09ab5ef122f6e39fbad2,215.87546,0.757952,0.498424,2007
51185,SOPTZCE12AB0187866,6a4424678ae575822d6a368e2a51d61d9c79e3a4,217.83465,0.65512,0.433727,1996
74566,SOYCGVD12AF729F8C2,d521d6f5f6bec4f64521a2605bdcaede4191d1b8,323.63057,0.871011,0.80483,2002
201041,SOBZDFV12AB0182373,29d207f0191b03c04ad6fb26cd431c9d31f7933b,217.91302,0.789104,0.576197,1994


In [26]:
train_df['song_id'] = pd.Categorical(train_df['song_id'])
train_inp1 = train_df['song_id'].cat.codes
train_df['user_id'] = pd.Categorical(train_df['user_id'])
train_inp2 = train_df['user_id'].cat.codes

test_df['song_id'] = pd.Categorical(test_df['song_id'])
test_inp1 = test_df['song_id'].cat.codes
test_df['user_id'] = pd.Categorical(test_df['user_id'])
test_inp2 = test_df['user_id'].cat.codes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['song_id'] = pd.Categorical(train_df['song_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = pd.Categorical(train_df['user_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['song_id'] = pd.Categorical(test_df['song_id'])
A value is trying to be set on a 

In [27]:
#nunique() will return the total number of unique items

song_id = len(train_inp1) 
user_id = len(train_inp2)
song_input_dim = 1#song_input_data.shape[1]
user_input_dim = 1#user_input_data.shape[1]
meta_text_dim = train_meta_array.shape[1]

song_id,user_id,song_input_dim,user_input_dim,meta_text_dim

(164149, 164149, 1, 1, 504)

In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, Concatenate, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import keras
from keras.layers import Add, Activation, Lambda, BatchNormalization, Concatenate, Dropout, Input, Embedding, Dot, Reshape, Dense, Flatten
from keras import regularizers

2023-04-30 13:34:04.842616: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
#Embedding layer for songs
songs_input = Input(shape=(song_input_dim,))#1st Input Layer
embedding_layer_songs = Embedding(song_id+1 ,10)(songs_input)#Embedding layer
embedding_output_songs = Flatten()(embedding_layer_songs)#Embedding layer output

#Embedding layer for users
users_input = Input(shape=(user_input_dim,))#1st Input Layer
embedding_layer_users = Embedding(user_id+1 ,10)(users_input)#Embedding layer
embedding_output_users = Flatten()(embedding_layer_users)#Embedding layer output

# #Embedding layer for meta data
meta_input = Input(shape=(meta_text_dim,), name='metadata_input')#1st Input Layer
dense_layer = Dense(64, activation='relu')(meta_input)
metadata_embedding = Dense(10, activation='relu',name='metadata_embedding')(dense_layer)

#Concatination and Dense layer
joining_layer = Concatenate()([embedding_output_songs, embedding_output_users,metadata_embedding])# metadata_embedding
hidden_layer_1 = Dense(16, activation='relu')(joining_layer)
hidden_layer_1 = Dense(8, activation='relu')(hidden_layer_1)
output_layer = Dense(1)(hidden_layer_1)

model2 = tf.keras.Model([songs_input, users_input,meta_input], output_layer) 

2023-04-30 13:34:32.852965: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [30]:
#Model compilation

optimizer = Adam(lr=0.001, epsilon = 1e-6, amsgrad=True) #epsilon = decay rate
model2.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
model2.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 metadata_input (InputLayer)    [(None, 504)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 10)        1641500     ['input_1[0][0]']                
                                                                                              

In [31]:
train_inp1.shape, train_inp2.shape,train_meta_array.shape,train_out.shape

((164149,), (164149,), (164149, 504), (164149, 1))

In [32]:
#Training model

early_stopping = EarlyStopping(monitor = 'val_loss', patience = 1)

history = model2.fit(
  [train_inp1, train_inp2,train_meta_array], train_out, 
  batch_size=128, 
  epochs=20, 
  verbose=1,
  callbacks = [early_stopping],
  validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [33]:
test_df['song_id'] = pd.Categorical(test_df['song_id'])
test_inp1 = test_df['song_id'].cat.codes
test_df['user_id'] = pd.Categorical(test_df['user_id'])
test_inp2 = test_df['user_id'].cat.codes

test_df = training_data.iloc[test_idx]
test_meta_array = concatenated_array[test_idx]
test_out = training_output.iloc[test_idx]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['song_id'] = pd.Categorical(test_df['song_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['user_id'] = pd.Categorical(test_df['user_id'])


In [34]:
ans = model2.predict([test_inp1,test_inp2,test_meta_array])



In [35]:
model2.evaluate([test_inp1,test_inp2,test_meta_array],test_out)



[50.80177688598633, 7.127536296844482]