In [None]:
!pip -q install keras "tensorflow==2.15.1" "tf2onnx" "onnx" "seaborn" "onnxruntime"

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from pathlib import Path
import pickle
import os

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
import tf2onnx
import onnx
import tensorflow as tf

from sklearn.metrics import confusion_matrix
import numpy as np
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
import onnxruntime as rt

In [None]:
# Read the song properties and song rankings tables straight from the remote parquet files.
song_properties = pd.read_parquet(
    'https://github.com/rhoai-mlops/jukebox/raw/refs/heads/main/1-data_prep/song_properties.parquet'
)
song_rankings = pd.read_parquet(
    'https://github.com/rhoai-mlops/jukebox/raw/refs/heads/main/1-data_prep/song_rankings.parquet'
)

# Display the column labels of the properties table as the cell output.
song_properties.columns

In [None]:
# Keep only the ranking rows that have no missing value in any column.
song_rankings = song_rankings.dropna(axis=0, how='any')

In [None]:
# Attach the properties of each song to its ranking rows. `validate` makes the merge fail loudly
# if `song_properties` lists a spotify_id more than once: duplicated ids would add rows to X and
# break the row-by-row pairing with the labels, which are taken from `song_rankings` below.
X = song_rankings.merge(song_properties, on='spotify_id', how='left', validate='many_to_one')
X = X[['is_explicit', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
y = song_rankings['country']

# Turn the country labels into integer class ids, then into one-hot vectors
# (the target format used with the categorical cross-entropy loss).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_one_hot = tf.keras.utils.to_categorical(y_encoded)

# Split the data into training and testing sets so you have something to test the trained model with.
# shuffle=False keeps the row order: the last 20% of the rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size = 0.2, shuffle = False)
# Carve a validation set out of the training data, keeping the class proportions (stratify).
# Stratifying on the class ids gives the same strata as stratifying on the one-hot rows without
# comparing whole rows, and the fixed random_state makes the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, stratify = y_train.argmax(axis=1), random_state = 42)

# Rescale every feature to the [0, 1] range (min-max scaling), which makes it a lot easier for the
# model to learn than raw (and potentially large) values.
# It is important to only fit the scaler to the training data, otherwise you are leaking information
# about the global distribution of variables (which is influenced by the test set) into the training set.
scaler = MinMaxScaler()
scaled_x_train = scaler.fit_transform(X_train.values)

In [None]:
# Create the artifact directory (and any missing parents) if it is not there yet.
Path("models/music/1/artifacts").mkdir(parents=True, exist_ok=True)

# Pickle the fitted scaler and label encoder next to the model so the same
# preprocessing objects can be loaded again later.
for target, artifact in (
    ("models/music/1/artifacts/scaler.pkl", scaler),
    ("models/music/1/artifacts/label_encoder.pkl", label_encoder),
):
    Path(target).write_bytes(pickle.dumps(artifact))

In [None]:
# Fully connected classifier: one input per feature column, four hidden ReLU layers of growing
# width, and one output unit per one-hot encoded class.
model = Sequential()
model.add(Dense(32, activation = 'relu', input_dim = len(X.columns)))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(256))
model.add(Activation('relu'))
# Each sample belongs to exactly one class, so the output layer uses softmax: it yields a
# probability distribution over the classes, which is what categorical cross-entropy expects.
# Independent sigmoid units do not sum to 1 and are meant for multi-label targets.
model.add(Dense(y_one_hot.shape[1], activation = 'softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy', 'Precision', 'Recall'])
model.summary()

In [None]:
# Number of passes over the training data.
epochs = 2

# The validation features go through the same scaler as the training features.
scaled_x_val = scaler.transform(X_val.values)

history = model.fit(
    scaled_x_train,
    y_train,
    epochs=epochs,
    validation_data=(scaled_x_val, y_val),
    verbose=True,
)
print("Training of model is complete")

In [None]:
# Make sure the output directory exists before the serialized model is written into it.
Path("models/music/1").mkdir(parents=True, exist_ok=True)

# The conversion returns a pair; only its first element, the ONNX model proto, is saved.
model_proto = tf2onnx.convert.from_keras(model)[0]
onnx.save(model_proto, "models/music/1/model.onnx")

In [None]:
# Load the exported ONNX model with every execution provider available in this runtime.
sess = rt.InferenceSession("models/music/1/model.onnx", providers=rt.get_available_providers())
input_name, output_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# Scale the test features with the fitted scaler and cast them to float32 before inference.
scaled_x_test = scaler.transform(X_test.values).astype(np.float32)

# Only one output is requested, so take the first (and only) entry of the returned list.
y_pred_temp = sess.run([output_name], {input_name: scaled_x_test})[0]

# Predicted class id = index of the highest score in each row.
y_pred_argmax = y_pred_temp.argmax(axis=1)

In [None]:
# Convert the one-hot test labels back to class ids so they can be compared with the predictions.
y_test_argmax = y_test.argmax(axis=1)

In [None]:
# Accuracy is the fraction of test samples whose predicted class id equals the true one.
accuracy = np.mean(y_pred_argmax == y_test_argmax)
print(f"Accuracy: {accuracy}")

# Confusion matrix: rows are the actual classes, columns the predicted classes.
c_matrix = confusion_matrix(y_true=y_test_argmax, y_pred=y_pred_argmax)

fig, ax = plt.subplots()
sns.heatmap(c_matrix, cmap='Blues', ax=ax)
ax.set(xlabel="Prediction", ylabel="Actual", title='Confusion Matrix')
plt.show()