In [9]:
import pandas as pd
import numpy
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Concatenate, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.losses import MeanSquaredError

# Preprocessing

In [None]:
file_path = '/content/spotify_dataset.csv'

df_playlist = pd.read_csv(file_path, on_bad_lines='skip')

df_playlist.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


## Làm sạch tên cột

In [None]:
df_playlist.columns = df_playlist.columns.str.replace('"', '')
df_playlist.columns = df_playlist.columns.str.replace('name', '')
df_playlist.columns = df_playlist.columns.str.replace(' ', '')
df_playlist.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

## Chỉ giữ lại các artist có tần suất xuất hiện cao hơn 50

In [None]:
df_playlist = df_playlist.groupby('artist').filter(lambda x : len(x)>=50)

## Nhóm để lấy số lần xuất hiện của nghệ sĩ trong danh sách phát của người dùng (user_id)

In [None]:
size = lambda x: len(x)
df_freq = df_playlist.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['user_id', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_freq.head()

Unnamed: 0,user_id,artist,freq
103167,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,3306
343024,7ee2b92c5bcf6133b8132363e5bda960,Jamey Aebersold Play-A-Long,1633
475405,b26235eefbfaad98b38fa26f839b8592,Johnny Cash,1371
260566,6095ff89fc71c514fb0a2aeda5b96c90,Grateful Dead,1181
188,00123e0f544dee3ab006aa7f1e5725a7,Hot Tuna,914


## Tạo DataFrame cho artist và thêm artist_id

In [None]:
df_artist = pd.DataFrame(df_freq["artist"].unique())
df_artist = df_artist.reset_index()
df_artist = df_artist.rename(columns={'index':'artist_id', 0:'artist'})
df_artist.head()

Unnamed: 0,artist_id,artist
0,0,Vitamin String Quartet
1,1,Jamey Aebersold Play-A-Long
2,2,Johnny Cash
3,3,Grateful Dead
4,4,Hot Tuna


## Xuất file CSV

In [None]:
output_path = '/content/data_preprocess.csv'
df_playlist.to_csv(output_path, index=False)

print(f"File đã được lưu thành công tại {output_path}")

File đã được lưu thành công tại /content/data_preprocess.csv


In [None]:
file_path = '/content/data_preprocess.csv'
df = pd.read_csv(file_path)

df_info = df.info()
df_shape = df.shape
df_head = df.head()

df_shape, df_info, df_head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3050033 entries, 0 to 3050032
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   object
 1   artist    object
 2   track     object
 3   playlist  object
dtypes: object(4)
memory usage: 93.1+ MB


((3050033, 4),
 None,
                             user_id                            artist  \
 0  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 1  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 2  9cc0cfd4d7d7885102480dd99e7a90d6  Elvis Costello & The Attractions   
 3  9cc0cfd4d7d7885102480dd99e7a90d6                    Elvis Costello   
 4  9cc0cfd4d7d7885102480dd99e7a90d6                            Lissie   
 
                                                track        playlist  
 0               (The Angels Wanna Wear My) Red Shoes  HARD ROCK 2010  
 1  (What's So Funny 'Bout) Peace, Love And Unders...  HARD ROCK 2010  
 2                              Accidents Will Happen  HARD ROCK 2010  
 3                                             Alison  HARD ROCK 2010  
 4                                        All Be Okay  HARD ROCK 2010  )

## Xử lý dữ liệu bị thiếu và trùng lặp

In [None]:
missing_values = df.isnull().sum()
duplicate_rows = df.duplicated().sum()

missing_values, duplicate_rows

(user_id       0
 artist        0
 track         7
 playlist    157
 dtype: int64,
 0)

## Thay thế các giá trị thiếu và trùng lặp bằng giá trị mặc định

In [None]:
df.fillna({"playlist": "Unknown Playlist"}, inplace=True)
df.fillna({"track": "Unknown Track"}, inplace=True)

missing_values_after = df.isnull().sum()
missing_values_after

Unnamed: 0,0
user_id,0
artist,0
track,0
playlist,0


## Chuẩn hóa dữ liệu

In [None]:
df["user_id"] = df["user_id"].str.strip().str.lower()
df["artist"] = df["artist"].str.strip().str.lower()
df["track"] = df["track"].str.strip().str.lower()
df["playlist"] = df["playlist"].str.strip().str.lower()

df.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello,(the angels wanna wear my) red shoes,hard rock 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello & the attractions,"(what's so funny 'bout) peace, love and unders...",hard rock 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello & the attractions,accidents will happen,hard rock 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello,alison,hard rock 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,lissie,all be okay,hard rock 2010


## Xuất file CSV

In [None]:
output_path = '/content/data_process.csv'
df.to_csv(output_path, index=False)

print(f"File đã được lưu thành công tại {output_path}")

File đã được lưu thành công tại /content/data_process.csv


In [None]:
file_path = '/content/data_process.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello,(the angels wanna wear my) red shoes,hard rock 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello & the attractions,"(what's so funny 'bout) peace, love and unders...",hard rock 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello & the attractions,accidents will happen,hard rock 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,elvis costello,alison,hard rock 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,lissie,all be okay,hard rock 2010


## Mã hóa user_id và track thành số

In [None]:
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()

df["user_id_encoded"] = user_encoder.fit_transform(df["user_id"])
df["track_encoded"] = track_encoder.fit_transform(df["track"])

## Tính số lượng user có bài hát trong playlist (play_count)

In [None]:
play_count_df = df.groupby("track_encoded")["user_id_encoded"].nunique().reset_index()
play_count_df.rename(columns={"user_id_encoded": "play_count"}, inplace=True)

df = df.merge(play_count_df, on="track_encoded", how="left")
df_model = df[["user_id_encoded", "track_encoded", "play_count"]]
print(df_model.head())

   user_id_encoded  track_encoded  play_count
0             2967           1541          20
1             2967           1718          23
2             2967          17911          25
3             2967          23619          78
4             2967          23974           1


## Chuẩn hóa play_count

In [None]:
scaler = MinMaxScaler()
df_model.loc[:, "play_count"] = scaler.fit_transform(df_model[["play_count"]]).astype(float)

## Xuất file pkl

In [None]:
df_model.to_pickle("dataset_encoded.pkl")

# Analysis

In [None]:
train_data, test_data = train_test_split(df_model, test_size=0.2, random_state=42)

train_tensor = tf.data.Dataset.from_tensor_slices(({
    "user_id": train_data["user_id_encoded"].values,
    "track_id": train_data["track_encoded"].values
}, train_data["play_count"].values)).batch(512)

# Xây dựng mô hình
embedding_size = 50
num_users = df_model["user_id_encoded"].nunique()
num_tracks = df_model["track_encoded"].nunique()

user_input = Input(shape=(1,), name="user_id")
track_input = Input(shape=(1,), name="track_id")

user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name="user_embedding")(user_input)
track_embedding = Embedding(input_dim=num_tracks, output_dim=embedding_size, name="track_embedding")(track_input)

user_embedding = Flatten()(user_embedding)
track_embedding = Flatten()(track_embedding)

dot_product = Dot(axes=1)([user_embedding, track_embedding])

concat = Concatenate()([user_embedding, track_embedding])
dense1 = Dense(128, activation="relu")(concat)
dense2 = Dense(64, activation="relu")(dense1)
output = Dense(1, activation="sigmoid")(dense2)

model = Model(inputs=[user_input, track_input], outputs=output)
model.compile(loss="mse", optimizer="adam", metrics=["mae"])

model.summary()

## Training

In [None]:
batch_size = 512

train_tensor = tf.data.Dataset.from_tensor_slices(({
    "user_id": train_data["user_id_encoded"].values,
    "track_id": train_data["track_encoded"].values
}, train_data["play_count"].values)).batch(batch_size)

test_tensor = tf.data.Dataset.from_tensor_slices(({
    "user_id": test_data["user_id_encoded"].values,
    "track_id": test_data["track_encoded"].values
}, test_data["play_count"].values)).batch(batch_size)

model.fit(train_tensor, validation_data=test_tensor, epochs=10, verbose=1)

Epoch 1/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2544s[0m 533ms/step - loss: 0.0063 - mae: 0.0291 - val_loss: 4.5395e-05 - val_mae: 0.0043
Epoch 2/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2500s[0m 525ms/step - loss: 1.6938e-05 - mae: 0.0027 - val_loss: 1.2704e-05 - val_mae: 0.0022
Epoch 3/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2522s[0m 529ms/step - loss: 5.4623e-06 - mae: 0.0015 - val_loss: 9.2417e-06 - val_mae: 0.0017
Epoch 4/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2508s[0m 526ms/step - loss: 2.3542e-06 - mae: 9.7050e-04 - val_loss: 7.4812e-06 - val_mae: 0.0013
Epoch 5/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2573s[0m 533ms/step - loss: 1.7026e-06 - mae: 7.5249e-04 - val_loss: 7.7961e-06 - val_mae: 0.0013
Epoch 6/10
[1m4766/4766[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2521s[0m 528ms/step - loss: 1.1715e-06 - mae: 6.3051e-04 - val_loss: 7.0767e-06 - val_ma

<keras.src.callbacks.history.History at 0x7db1540e4c10>

In [None]:
model.save("/content/recommendation_model.h5")



# Test

In [15]:
# Load mô hình mà không compile để tránh lỗi mất hàm loss
model = load_model("recommendation_model.h5", compile=False)

# Lưu lại mô hình mới
model.save("recommendation_model_fixed.h5")



In [17]:
# Tải mô hình với custom_objects
model = load_model("recommendation_model_fixed.h5")
# Tải dữ liệu từ file pickle
df_model = pd.read_pickle("dataset_encoded.pkl")

file_path = 'data_process.csv'
df = pd.read_csv(file_path)

df.head()
print(df_model.head())



   user_id_encoded  track_encoded  play_count
0             2967           1541    0.018234
1             2967           1718    0.021113
2             2967          17911    0.023033
3             2967          23619    0.073896
4             2967          23974    0.000000


In [18]:
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()

df["user_id_encoded"] = user_encoder.fit_transform(df["user_id"])
df["track_encoded"] = track_encoder.fit_transform(df["track"])

In [19]:
sample_user_id = np.random.choice(df_model["user_id_encoded"].unique())

all_tracks = np.array(df_model["track_encoded"].unique())

user_inputs = np.full_like(all_tracks, sample_user_id)
predictions = model.predict([user_inputs, all_tracks])

top_indices = predictions.flatten().argsort()[-10:][::-1]

recommended_tracks = track_encoder.inverse_transform(all_tracks[top_indices])

original_user_id = user_encoder.inverse_transform([sample_user_id])[0]
print(f"🎵 Gợi ý bài hát cho người dùng {original_user_id}:")
for i, track in enumerate(recommended_tracks, 1):
    print(f"{i}. {track}")

[1m17088/17088[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 690us/step
🎵 Gợi ý bài hát cho người dùng d0a7e55e5736f047b541dc7e40d9e529:
1. intro
2. home
3. radioactive
4. closer
5. runaway
6. alive
7. wake me up
8. hold on
9. happy
10. midnight city
