In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import gc # garbage collection
# Load Data
spotify_data = pd.read_csv('cleaned_spotify_data.csv')
user_profiles = pd.read_csv('cleaned_user_profiles.csv')
user_profiles = user_profiles.sample(frac = 0.1, random_state = 42)
spotify_data = spotify_data.sample(frac = 0.25, random_state = 42)

spotify_data = pd.get_dummies(spotify_data, columns=['genre'], prefix='genre')
user_profiles = pd.get_dummies(user_profiles, columns=['genre'], prefix='genre')

spotify_genres = spotify_data.columns[spotify_data.columns.str.startswith('genre_')]
user_genres = user_profiles.columns[user_profiles.columns.str.startswith('genre_')]
for column in set(spotify_genres).union(set(user_genres)):
    if column not in spotify_data:
        spotify_data[column] = 0
    if column not in user_profiles:
        user_profiles[column] = 0

spotify_data = spotify_data.sort_index(axis=1)
user_profiles = user_profiles.sort_index(axis=1)



In [14]:
# Define features
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo'] + list(spotify_genres)
spotify_features = spotify_data[features]
user_features = user_profiles[features]

# Scale data (excluding one-hot encoded genres)
non_genre_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']
scaler = StandardScaler()
spotify_features[non_genre_features] = scaler.fit_transform(spotify_features[non_genre_features])
user_features[non_genre_features] = scaler.transform(user_features[non_genre_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_features[non_genre_features] = scaler.fit_transform(spotify_features[non_genre_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_features[non_genre_features] = scaler.transform(user_features[non_genre_features])


In [15]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [16]:
spotify_data

Unnamed: 0,acousticness,artist,danceability,duration_ms,energy,genre_edm,genre_latin,genre_pop,genre_r&b,genre_rap,genre_rock,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,track,valence
30056,0.004910,Jeriqo,0.520,216347,0.789,True,False,False,False,False,False,0.000013,0,0.0816,-7.717,1,0.0432,174.026,I Miss You,0.4150
11827,0.265000,The Who,0.651,378707,0.661,False,False,False,False,False,True,0.003130,9,0.1060,-11.405,1,0.0511,156.371,Who Are You,0.4890
23571,0.665000,The Beef Seeds,0.640,218044,0.758,False,False,False,True,False,False,0.000000,10,0.1270,-5.204,1,0.1600,86.529,Happy,0.9320
14741,0.000006,Rev Theory,0.398,208196,0.966,False,False,False,False,False,True,0.000000,4,0.3030,-2.352,0,0.0453,90.016,ONE,0.5200
25570,0.035100,The Internet,0.447,440013,0.625,False,False,False,True,False,False,0.000727,10,0.2430,-8.212,0,0.3230,67.104,Palace/Curse,0.2610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7836,0.061200,E-40,0.753,287600,0.514,False,False,False,False,True,False,0.000000,7,0.2920,-12.235,1,0.2600,166.015,"Captain Save a Hoe (feat. The Click, D-Shot, B...",0.5630
764,0.206000,Los Del Rio,0.746,222027,0.909,False,False,True,False,False,False,0.000002,11,0.0656,-6.032,1,0.0580,103.189,Macarena - Bayside Boys Remix,0.9620
23613,0.058500,Derrick Ryan,0.782,193548,0.326,False,False,False,True,False,False,0.000009,6,0.1320,-12.198,0,0.0754,123.940,How Crazy Is That,0.1970
3422,0.843000,This Mortal Coil,0.216,211093,0.240,False,False,True,False,False,False,0.000006,10,0.1350,-12.598,1,0.0406,128.679,Song To The Siren (Remastered),0.0875


In [17]:
user_profiles

Unnamed: 0,acousticness,artist,danceability,duration_ms,energy,genre_edm,genre_latin,genre_pop,genre_r&b,genre_rap,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,userid,valence
7277,0.118041,MGMT,0.625620,244504.687389,0.722032,False,False,False,False,False,...,0.120698,5.124334,0.228630,-7.030853,0.563055,0.078786,116.860638,4.0,8f9087f6282ab8940e0a05b8d8d72cf9,0.570952
5321,0.181056,Arctic Monkeys,0.555587,244961.344340,0.700704,False,False,False,False,False,...,0.079662,5.292453,0.191401,-7.286527,0.576651,0.067234,123.241649,4.0,6836dde6f2c7a33f2cafb9469e4a1202,0.490708
8420,0.188772,Florence + The Machine,0.564706,245634.352941,0.703518,False,False,True,False,False,...,0.095612,6.035294,0.180544,-5.656824,0.670588,0.048295,118.410988,4.0,a6c22b2427bab3befa12c198f68c4c6f,0.390315
1739,0.612133,Bon Iver,0.518190,237518.047619,0.345952,False,False,True,False,False,...,0.054606,6.142857,0.120857,-11.615143,0.571429,0.051938,130.011000,4.0,21d45478089ae25edde1c7a255ff5b47,0.264229
2677,0.304905,Jimi Hendrix,0.555882,230432.462185,0.577571,False,False,False,False,False,...,0.136620,5.411765,0.139308,-9.705840,0.663866,0.063864,115.163176,4.0,34c5b62e9510fbf18271381aba7cf683,0.568057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,0.170129,Jennifer Lopez,0.621918,234689.062257,0.721430,False,False,True,False,False,...,0.023088,4.194553,0.156794,-6.002946,0.665370,0.066950,116.684696,4.0,1fdc364dce244a7c39335a0fffe1a221,0.544607
8963,0.131267,Lykke Li,0.719333,256157.666667,0.753333,False,False,True,False,False,...,0.106600,9.000000,0.113933,-5.298000,0.000000,0.038433,127.963667,4.0,b263af1b49ee150de26e55e744820381,0.449333
4617,0.926000,Daughter,0.509000,240560.000000,0.141000,False,False,True,False,False,...,0.049600,1.000000,0.124000,-16.469000,1.000000,0.033900,125.012000,4.0,5a4add9f7c91ef3292b3993bf514138e,0.115000
2218,0.235443,alt-J,0.581212,243006.817308,0.655636,False,False,True,False,False,...,0.096350,5.272436,0.161989,-7.062340,0.641026,0.052084,119.791381,4.0,2b374164c54d35a59f89eb54d157f346,0.431925


In [18]:
# unique vals in genre
spotify_data[spotify_genres].sum().sort_values(ascending=False)

genre_edm      1511
genre_rap      1453
genre_pop      1361
genre_r&b      1326
genre_rock     1287
genre_latin    1270
dtype: int64

In [19]:
def calculate_interaction_batch(user_features, spotify_features, batch_size=100):
    num_users = user_features.shape[0]
    num_songs = spotify_features.shape[0]
    interaction_scores = np.zeros((num_users, num_songs))

    # Convert DataFrame to numpy for calculation
    user_features_np = user_features.to_numpy()
    spotify_features_np = spotify_features.to_numpy()

    print(f"Starting batch processing... Total users: {num_users}, Total songs: {num_songs}")

    # Batch processing
    for start_idx in range(0, num_users, batch_size):
        end_idx = min(start_idx + batch_size, num_users)
        print(f"Processing batch from user index {start_idx} to {end_idx - 1}")

        # Calculate distances and similarity scores for the batch
        user_batch = user_features_np[start_idx:end_idx]
        distances = np.sqrt(((user_batch[:, np.newaxis, :] - spotify_features_np[np.newaxis, :, :]) ** 2).sum(axis=2))
        feature_similarity = np.exp(-distances)
        print(f"Feature similarity calculated for batch. Shape: {feature_similarity.shape}")

        # Calculate genre similarity using dot product of one-hot encoded vectors
        genre_similarity = np.dot(user_profiles.iloc[start_idx:end_idx][spotify_genres].to_numpy(), spotify_data[spotify_genres].to_numpy().T)
        print(f"Genre similarity calculated for batch.")

        # Calculate composite score for the batch
        interaction_scores[start_idx:end_idx] = 0.7 * feature_similarity + 0.3 * genre_similarity
        print(f"Interaction scores updated for batch. Current shape of scores array: {interaction_scores.shape}")

        # Explicitly call garbage collection
        gc.collect()
        print(f"Garbage collection triggered after processing batch.")

    return interaction_scores

# Usage example with debugging
interaction_scores = calculate_interaction_batch(user_features, spotify_features, batch_size=100)
interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
interaction = (interaction_scores >= interaction_threshold).astype(int)

# Flatten interaction matrix and features for neural network input
X = spotify_features.values.repeat(len(user_profiles), axis=0)
y = interaction.flatten()
print("Data prepared for neural network input.")

# def calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data):
#     # Euclidean distances
#     distances = np.sqrt(((user_features_scaled[:, np.newaxis, :] - spotify_features_scaled[np.newaxis, :, :]) ** 2).sum(axis=2))
#     print(distances)
#     # Scale distances into similarity scores (exp(-distance))
#     feature_similarity = np.exp(-distances)
#     print(feature_similarity)
#     # Genre and artist similarity (binary 0 or 1)
#     genre_similarity = (user_profiles['genre'].values[:, np.newaxis] == spotify_data['genre'].values[np.newaxis, :]).astype(int)
#     print(genre_similarity)
#     artist_similarity = (user_profiles['artist'].values[:, np.newaxis] == spotify_data['artist'].values[np.newaxis, :]).astype(int)
#     print(artist_similarity)
    
#     # Composite score
#     interaction_scores = 0.7 * feature_similarity + 0.2 * genre_similarity + 0.1 * artist_similarity
#     print(interaction_scores)
#     return interaction_scores

# # Generate interaction scores
# interaction_scores = calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data)
# interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
# interaction = (interaction_scores >= interaction_threshold).astype(int)

# # Flatten interaction matrix and features for neural network input
# X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
# y = interaction.flatten()

Starting batch processing... Total users: 1294, Total songs: 8208
Processing batch from user index 0 to 99


TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation, LeakyReLU, PReLU, GaussianNoise
from tensorflow.keras.optimizers import Adam

model = Sequential([
    # Adding Gaussian Noise to input layer for regularization
    GaussianNoise(0.1, input_shape=(X_train.shape[1],)),
    
    # First dense layer with LeakyReLU
    Dense(256),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.3),

    # Second dense layer with ELU
    Dense(128),
    BatchNormalization(),
    Activation('elu'),
    Dropout(0.3),

    # Third dense layer with PReLU
    Dense(64),
    PReLU(),
    BatchNormalization(),
    Dropout(0.2),

    # Fourth layer, going back to a standard ReLU to compare performance
    Dense(64),
    Activation('relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Fifth layer with sigmoid activation used for binary classification output
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

In [None]:
predicted_scores = model.predict(spotify_features.values).flatten()
spotify_data['predicted_interaction'] = predicted_scores
gc.collect()  # Clear memory of no longer needed large objects
print("Prediction complete and memory cleared.")

In [None]:
features_with_score = features + ['predicted_interaction']
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(spotify_data[features_with_score])
print("k-NN model set up.")

user_index = 3  # Change based on your user profile index
user_id = user_profiles.iloc[user_index]['userid']
user_top_genre = user_profiles.iloc[user_index]['genre']
user_feature_vector = user_features.iloc[user_index].values.reshape(1, -1)
user_predicted_score = model.predict(user_feature_vector).flatten()[0]
query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)
print("User profile and query vector set up.")

In [None]:
# Finding top 5 nearest songs
distances, indices = knn.kneighbors(query_vector)
recommended_songs = spotify_data.iloc[indices[0]]

# Output recommended songs
print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
print(recommended_songs[['track', 'artist', 'genre']])

# Validation
predicted_interactions = model.predict(X_test).flatten()
rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
print("RMSE for neural network predictions:", rmse)

In [None]:
# # Predict interaction scores for all songs

# print("starting prediction") 

# predicted_scores = model.predict(spotify_features_scaled).flatten()
# spotify_data['predicted_interaction'] = predicted_scores

# # k-NN model using enhanced features
# features_with_score = features + ['predicted_interaction']
# knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
# knn.fit(spotify_data[features_with_score])

# # Example user query using k-NN with neural network outputs
# user_index = 3  # Change based on your user profile index
# user_id = user_profiles.iloc[user_index]['userid']
# user_top_genre = user_profiles.iloc[user_index]['genre']
# user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
# user_predicted_score = model.predict(user_feature_vector).flatten()[0]
# query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

# # Finding top 5 nearest songs
# distances, indices = knn.kneighbors(query_vector)
# recommended_songs = spotify_data.iloc[indices[0]]

# # Output recommended songs
# print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
# print(recommended_songs[['track', 'artist', 'genre']])

# # Validation
# predicted_interactions = model.predict(X_test).flatten()
# rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
# print("RMSE for neural network predictions:", rmse)
