In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import gc

# Load Data
spotify_data = pd.read_csv('cleaned_spotify_data.csv')
user_profiles = pd.read_csv('cleaned_user_profiles.csv')
user_profiles = user_profiles.sample(frac = 0.1, random_state = 42)
spotify_data = spotify_data.sample(frac = 0.25, random_state = 42)

features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
spotify_features = spotify_data[features]
user_features = user_profiles[features]

# Data Scaling
scaler = StandardScaler()
spotify_features_scaled = scaler.fit_transform(spotify_features)
spotify_features_scaled = spotify_features_scaled
user_features_scaled = scaler.transform(user_features)
user_features_scaled = user_features_scaled

In [26]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [27]:
spotify_data

Unnamed: 0,track,artist,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
30056,I Miss You,Jeriqo,edm,0.520,0.789,0,-7.717,1,0.0432,0.004910,0.000013,0.0816,0.4150,174.026,216347
11827,Who Are You,The Who,rock,0.651,0.661,9,-11.405,1,0.0511,0.265000,0.003130,0.1060,0.4890,156.371,378707
23571,Happy,The Beef Seeds,r&b,0.640,0.758,10,-5.204,1,0.1600,0.665000,0.000000,0.1270,0.9320,86.529,218044
14741,ONE,Rev Theory,rock,0.398,0.966,4,-2.352,0,0.0453,0.000006,0.000000,0.3030,0.5200,90.016,208196
25570,Palace/Curse,The Internet,r&b,0.447,0.625,10,-8.212,0,0.3230,0.035100,0.000727,0.2430,0.2610,67.104,440013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7836,"Captain Save a Hoe (feat. The Click, D-Shot, B...",E-40,rap,0.753,0.514,7,-12.235,1,0.2600,0.061200,0.000000,0.2920,0.5630,166.015,287600
764,Macarena - Bayside Boys Remix,Los Del Rio,pop,0.746,0.909,11,-6.032,1,0.0580,0.206000,0.000002,0.0656,0.9620,103.189,222027
23613,How Crazy Is That,Derrick Ryan,r&b,0.782,0.326,6,-12.198,0,0.0754,0.058500,0.000009,0.1320,0.1970,123.940,193548
3422,Song To The Siren (Remastered),This Mortal Coil,pop,0.216,0.240,10,-12.598,1,0.0406,0.843000,0.000006,0.1350,0.0875,128.679,211093


In [28]:
user_index = 4  # Change based on your user profile index
user_id = user_profiles.iloc[user_index]['userid']

print(user_id)

34c5b62e9510fbf18271381aba7cf683


In [29]:
import numpy as np
import gc  # Garbage Collector interface

def calculate_interaction_batch(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100):
    num_users = user_profiles.shape[0]
    num_songs = spotify_data.shape[0]
    interaction_scores = np.zeros((num_users, num_songs))
    
    print(f"Starting batch processing... Total users: {num_users}, Total songs: {num_songs}")

    # Batch processing
    for start_idx in range(0, num_users, batch_size):
        end_idx = min(start_idx + batch_size, num_users)
        print(f"Processing batch from user index {start_idx} to {end_idx - 1}")

        # Calculate distances and similarity scores for the batch
        user_batch = user_features_scaled[start_idx:end_idx]
        distances = np.sqrt(((user_batch[:, np.newaxis, :] - spotify_features_scaled[np.newaxis, :, :]) ** 2).sum(axis=2))
        feature_similarity = np.exp(-distances)
        print(f"Feature similarity calculated for batch. Shape: {feature_similarity.shape}")

        # Calculate genre and artist similarity
        genre_similarity = (user_profiles['genre'].values[start_idx:end_idx, np.newaxis] == spotify_data['genre'].values[np.newaxis, :]).astype(int)
        artist_similarity = (user_profiles['artist'].values[start_idx:end_idx, np.newaxis] == spotify_data['artist'].values[np.newaxis, :]).astype(int)
        print(f"Genre and artist similarity calculated for batch.")

        # Calculate composite score for the batch
        interaction_scores[start_idx:end_idx] = 0.7 * feature_similarity + 0.2 * genre_similarity + 0.1 * artist_similarity
        print(f"Interaction scores updated for batch. Current shape of scores array: {interaction_scores.shape}")

        # Explicitly call garbage collection
        gc.collect()
        print(f"Garbage collection triggered after processing batch.")

    return interaction_scores

# Usage example with debugging
interaction_scores = calculate_interaction_batch(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100)
interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
interaction = (interaction_scores >= interaction_threshold).astype(int)

# Flatten interaction matrix and features for neural network input
X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
y = interaction.flatten()
print("Data prepared for neural network input.")

# def calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data):
#     # Euclidean distances
#     distances = np.sqrt(((user_features_scaled[:, np.newaxis, :] - spotify_features_scaled[np.newaxis, :, :]) ** 2).sum(axis=2))
#     print(distances)
#     # Scale distances into similarity scores (exp(-distance))
#     feature_similarity = np.exp(-distances)
#     print(feature_similarity)
#     # Genre and artist similarity (binary 0 or 1)
#     genre_similarity = (user_profiles['genre'].values[:, np.newaxis] == spotify_data['genre'].values[np.newaxis, :]).astype(int)
#     print(genre_similarity)
#     artist_similarity = (user_profiles['artist'].values[:, np.newaxis] == spotify_data['artist'].values[np.newaxis, :]).astype(int)
#     print(artist_similarity)
    
#     # Composite score
#     interaction_scores = 0.7 * feature_similarity + 0.2 * genre_similarity + 0.1 * artist_similarity
#     print(interaction_scores)
#     return interaction_scores

# # Generate interaction scores
# interaction_scores = calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data)
# interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
# interaction = (interaction_scores >= interaction_threshold).astype(int)

# # Flatten interaction matrix and features for neural network input
# X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
# y = interaction.flatten()

Starting batch processing... Total users: 1294, Total songs: 8208
Processing batch from user index 0 to 99
Feature similarity calculated for batch. Shape: (100, 8208)


Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (1294, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 100 to 199
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (1294, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 200 to 299
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (1294, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 300 to 399
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (1294, 8208)
G

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Neural network setup with Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1], kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# Fitting the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), 
          callbacks=[early_stopping, model_checkpoint, reduce_lr])

gc.collect()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m265529/265529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1609s[0m 6ms/step - accuracy: 0.7494 - loss: 0.5874 - val_accuracy: 0.7497 - val_loss: 0.5658 - learning_rate: 0.0010
Epoch 2/100
[1m265529/265529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1495s[0m 6ms/step - accuracy: 0.7502 - loss: 0.5653 - val_accuracy: 0.7497 - val_loss: 0.5658 - learning_rate: 0.0010
Epoch 3/100
[1m265529/265529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1446s[0m 5ms/step - accuracy: 0.7501 - loss: 0.5654 - val_accuracy: 0.7497 - val_loss: 0.5657 - learning_rate: 0.0010
Epoch 4/100
[1m265529/265529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1591s[0m 6ms/step - accuracy: 0.7500 - loss: 0.5654 - val_accuracy: 0.7497 - val_loss: 0.5658 - learning_rate: 0.0010
Epoch 5/100
[1m265529/265529[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1318s[0m 5ms/step - accuracy: 0.7500 - loss: 0.5655 - val_accuracy: 0.7497 - val_loss: 0.5658 - learning_rate: 0.0010
Epoch 6/100
[1m2655

KeyboardInterrupt: 

In [None]:
# Predict interaction scores for all songs using batch prediction
predicted_scores = model.predict(spotify_features_scaled).flatten()
spotify_data['predicted_interaction'] = predicted_scores
gc.collect()  # Clear memory of no longer needed large objects
print("prediction complete and memory cleared.")

In [None]:
# k-NN model using enhanced features
features_with_score = features + ['predicted_interaction']
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(spotify_data[features_with_score])
print("k-NN model set up.")

user_index = 3  # Change based on your user profile index
user_id = user_profiles.iloc[user_index]['userid']
user_top_genre = user_profiles.iloc[user_index]['genre']
user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
user_predicted_score = model.predict(user_feature_vector).flatten()[0]
query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

In [None]:
# Finding top 5 nearest songs
distances, indices = knn.kneighbors(query_vector)
recommended_songs = spotify_data.iloc[indices[0]]

# Output recommended songs
print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
print(recommended_songs[['track', 'artist', 'genre']])

# Validation
predicted_interactions = model.predict(X_test).flatten()
rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
print("RMSE for neural network predictions:", rmse)

In [None]:
# # Predict interaction scores for all songs

# print("starting prediction") 

# predicted_scores = model.predict(spotify_features_scaled).flatten()
# spotify_data['predicted_interaction'] = predicted_scores

# # k-NN model using enhanced features
# features_with_score = features + ['predicted_interaction']
# knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
# knn.fit(spotify_data[features_with_score])

# # Example user query using k-NN with neural network outputs
# user_index = 3  # Change based on your user profile index
# user_id = user_profiles.iloc[user_index]['userid']
# user_top_genre = user_profiles.iloc[user_index]['genre']
# user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
# user_predicted_score = model.predict(user_feature_vector).flatten()[0]
# query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

# # Finding top 5 nearest songs
# distances, indices = knn.kneighbors(query_vector)
# recommended_songs = spotify_data.iloc[indices[0]]

# # Output recommended songs
# print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
# print(recommended_songs[['track', 'artist', 'genre']])

# # Validation
# predicted_interactions = model.predict(X_test).flatten()
# rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
# print("RMSE for neural network predictions:", rmse)
