In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import gc

# Load Data
spotify_data = pd.read_csv('cleaned_spotify_data.csv')
user_profiles = pd.read_csv('cleaned_user_profiles.csv')
user_profiles = user_profiles.sample(frac = 0.05, random_state = 42)
features_to_drop = ['key', 'mode', 'duration_ms', 'liveness']
#check if the features are in the dataframe before dropping them
spotify_data = spotify_data.drop(features_to_drop, axis = 1, errors = 'ignore')
user_profiles = user_profiles.drop(features_to_drop, axis = 1, errors = 'ignore')



In [33]:
spotify_data = spotify_data.sample(frac = 0.25, random_state = 42)

features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']
spotify_features = spotify_data[features]
user_features = user_profiles[features]

# Data Scaling
scaler = StandardScaler()
spotify_features_scaled = scaler.fit_transform(spotify_features)
spotify_features_scaled = spotify_features_scaled
user_features_scaled = scaler.transform(user_features)
user_features_scaled = user_features_scaled

In [34]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [35]:
spotify_data

Unnamed: 0,track,artist,genre,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo
30056,I Miss You,Jeriqo,edm,0.520,0.789,-7.717,0.0432,0.004910,0.000013,0.4150,174.026
11827,Who Are You,The Who,rock,0.651,0.661,-11.405,0.0511,0.265000,0.003130,0.4890,156.371
23571,Happy,The Beef Seeds,r&b,0.640,0.758,-5.204,0.1600,0.665000,0.000000,0.9320,86.529
14741,ONE,Rev Theory,rock,0.398,0.966,-2.352,0.0453,0.000006,0.000000,0.5200,90.016
25570,Palace/Curse,The Internet,r&b,0.447,0.625,-8.212,0.3230,0.035100,0.000727,0.2610,67.104
...,...,...,...,...,...,...,...,...,...,...,...
7836,"Captain Save a Hoe (feat. The Click, D-Shot, B...",E-40,rap,0.753,0.514,-12.235,0.2600,0.061200,0.000000,0.5630,166.015
764,Macarena - Bayside Boys Remix,Los Del Rio,pop,0.746,0.909,-6.032,0.0580,0.206000,0.000002,0.9620,103.189
23613,How Crazy Is That,Derrick Ryan,r&b,0.782,0.326,-12.198,0.0754,0.058500,0.000009,0.1970,123.940
3422,Song To The Siren (Remastered),This Mortal Coil,pop,0.216,0.240,-12.598,0.0406,0.843000,0.000006,0.0875,128.679


In [36]:
user_index = 4  # Change based on your user profile index
user_id = user_profiles.iloc[user_index]['userid']

print(user_id)

34c5b62e9510fbf18271381aba7cf683


In [37]:
import numpy as np
import gc  # Garbage Collector interface

def calculate_interaction_batch(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100):
    num_users = user_profiles.shape[0]
    num_songs = spotify_data.shape[0]
    interaction_scores = np.zeros((num_users, num_songs))
    
    print(f"Starting batch processing... Total users: {num_users}, Total songs: {num_songs}")

    # Batch processing
    for start_idx in range(0, num_users, batch_size):
        end_idx = min(start_idx + batch_size, num_users)
        print(f"Processing batch from user index {start_idx} to {end_idx - 1}")

        # Calculate distances and similarity scores for the batch
        user_batch = user_features_scaled[start_idx:end_idx]
        distances = np.sqrt(((user_batch[:, np.newaxis, :] - spotify_features_scaled[np.newaxis, :, :]) ** 2).sum(axis=2))
        feature_similarity = np.exp(-distances)
        print(f"Feature similarity calculated for batch. Shape: {feature_similarity.shape}")

        # Calculate genre and artist similarity
        genre_similarity = (user_profiles['genre'].values[start_idx:end_idx, np.newaxis] == spotify_data['genre'].values[np.newaxis, :]).astype(int)
        artist_similarity = (user_profiles['artist'].values[start_idx:end_idx, np.newaxis] == spotify_data['artist'].values[np.newaxis, :]).astype(int)
        print(f"Genre and artist similarity calculated for batch.")

        # Calculate composite score for the batch
        interaction_scores[start_idx:end_idx] = 0.4 * feature_similarity + 0.55 * genre_similarity + 0.05 * artist_similarity
        print(f"Interaction scores updated for batch. Current shape of scores array: {interaction_scores.shape}")

        # Explicitly call garbage collection
        gc.collect()
        print(f"Garbage collection triggered after processing batch.")

    return interaction_scores

# Usage example with debugging
interaction_scores = calculate_interaction_batch(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100)
interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
interaction = (interaction_scores >= interaction_threshold).astype(int)

# Flatten interaction matrix and features for neural network input
X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
y = interaction.flatten()
print("Data prepared for neural network input.")

# def calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data):
#     # Euclidean distances
#     distances = np.sqrt(((user_features_scaled[:, np.newaxis, :] - spotify_features_scaled[np.newaxis, :, :]) ** 2).sum(axis=2))
#     print(distances)
#     # Scale distances into similarity scores (exp(-distance))
#     feature_similarity = np.exp(-distances)
#     print(feature_similarity)
#     # Genre and artist similarity (binary 0 or 1)
#     genre_similarity = (user_profiles['genre'].values[:, np.newaxis] == spotify_data['genre'].values[np.newaxis, :]).astype(int)
#     print(genre_similarity)
#     artist_similarity = (user_profiles['artist'].values[:, np.newaxis] == spotify_data['artist'].values[np.newaxis, :]).astype(int)
#     print(artist_similarity)
    
#     # Composite score
#     interaction_scores = 0.7 * feature_similarity + 0.2 * genre_similarity + 0.1 * artist_similarity
#     print(interaction_scores)
#     return interaction_scores

# # Generate interaction scores
# interaction_scores = calculate_interaction_vectorized(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data)
# interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
# interaction = (interaction_scores >= interaction_threshold).astype(int)

# # Flatten interaction matrix and features for neural network input
# X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
# y = interaction.flatten()

Starting batch processing... Total users: 647, Total songs: 8208
Processing batch from user index 0 to 99
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (647, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 100 to 199
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (647, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 200 to 299
Feature similarity calculated for batch. Shape: (100, 8208)
Genre and artist similarity calculated for batch.
Interaction scores updated for batch. Current shape of scores array: (647, 8208)
Garbage collection triggered after processing batch.
Processing batch from user index 300 to 399
Feature similarity calculated 

In [38]:
# Neural network setup with Dropout
model = Sequential([
    Dense(64, activation='relu', input_dim=len(features)),
    Dropout(0.1),  # Dropout to prevent overfitting
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output a score between 0 and 1
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("Model compiled.")

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets. Training model...")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model compiled.
Data split into training and testing sets. Training model...


In [39]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
gc.collect()

Epoch 1/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 1ms/step - accuracy: 0.7498 - loss: 0.5634 - val_accuracy: 0.7497 - val_loss: 0.5627
Epoch 2/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 1ms/step - accuracy: 0.7499 - loss: 0.5625 - val_accuracy: 0.7497 - val_loss: 0.5627
Epoch 3/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 1ms/step - accuracy: 0.7499 - loss: 0.5624 - val_accuracy: 0.7497 - val_loss: 0.5627
Epoch 4/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 1ms/step - accuracy: 0.7499 - loss: 0.5624 - val_accuracy: 0.7497 - val_loss: 0.5627
Epoch 5/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 1ms/step - accuracy: 0.7502 - loss: 0.5621 - val_accuracy: 0.7497 - val_loss: 0.5627
Epoch 6/10
[1m132765/132765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 1ms/step - accuracy: 0.7501 - loss: 0.5622 - val_accuracy: 0.7497

4041

In [40]:
# Predict interaction scores for all songs using batch prediction
predicted_scores = model.predict(spotify_features_scaled).flatten()
spotify_data['predicted_interaction'] = predicted_scores
gc.collect()  # Clear memory of no longer needed large objects
print("prediction complete and memory cleared.")

[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
prediction complete and memory cleared.


In [41]:
# k-NN model using enhanced features
features_with_score = features + ['predicted_interaction']
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(spotify_data[features_with_score])
print("k-NN model set up.")

user_index = 4  # Change based on your user profile index
user_id = user_profiles.iloc[user_index]['userid']
user_top_genre = user_profiles.iloc[user_index]['genre']
user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
user_predicted_score = model.predict(user_feature_vector).flatten()[0]
query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

k-NN model set up.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


In [42]:
# Finding top 5 nearest songs
distances, indices = knn.kneighbors(query_vector)
recommended_songs = spotify_data.iloc[indices[0]]

# Output recommended songs
print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
print(recommended_songs[['track', 'artist', 'genre']])

# Validation
predicted_interactions = model.predict(X_test).flatten()
rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
print("RMSE for neural network predictions:", rmse)

Recommended Songs for User: 34c5b62e9510fbf18271381aba7cf683, Top Genre: rock
                                   track             artist  genre
11363              Hi, How're You Doin'?   DREAMS COME TRUE   rock
13764  Still Crazy After All These Years         Paul Simon   rock
6502                         cold nights             itssvd    rap
19325                               Culo            Pitbull  latin
22513                       Heard a Word  Michelle Williams    r&b
[1m   59/33192[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28s[0m 871us/step 



[1m33192/33192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 793us/step
RMSE for neural network predictions: 0.4332059377779928




In [43]:
# # Predict interaction scores for all songs

# print("starting prediction") 

# predicted_scores = model.predict(spotify_features_scaled).flatten()
# spotify_data['predicted_interaction'] = predicted_scores

# # k-NN model using enhanced features
# features_with_score = features + ['predicted_interaction']
# knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
# knn.fit(spotify_data[features_with_score])

# # Example user query using k-NN with neural network outputs
# user_index = 3  # Change based on your user profile index
# user_id = user_profiles.iloc[user_index]['userid']
# user_top_genre = user_profiles.iloc[user_index]['genre']
# user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
# user_predicted_score = model.predict(user_feature_vector).flatten()[0]
# query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

# # Finding top 5 nearest songs
# distances, indices = knn.kneighbors(query_vector)
# recommended_songs = spotify_data.iloc[indices[0]]

# # Output recommended songs
# print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
# print(recommended_songs[['track', 'artist', 'genre']])

# # Validation
# predicted_interactions = model.predict(X_test).flatten()
# rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
# print("RMSE for neural network predictions:", rmse)


In [44]:
import plotly.graph_objects as go

# Assuming you have selected three features from your dataset for the plot
# Let's say 'feature1', 'feature2', and 'feature3' are the columns you're interested in

# Extract these features for the nearest neighbors
nn_features = spotify_data.iloc[indices[0]][['feature1', 'feature2', 'feature3']]

# Extract the query user's features (assuming the query_vector includes these three features)
query_features = query_vector[0, :3]  # Make sure this slicing matches the feature indices

# Create a scatter plot for the neighbors
trace_neighbors = go.Scatter3d(
    x=nn_features['feature1'],
    y=nn_features['feature2'],
    z=nn_features['feature3'],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.8),
    name='Nearest Neighbors'
)

# Add the query user's point
trace_query = go.Scatter3d(
    x=[query_features[0]],
    y=[query_features[1]],
    z=[query_features[2]],
    mode='markers',
    marker=dict(size=10, color='red', opacity=1),
    name='Query User'
)

# Define the layout and plot everything
layout = go.Layout(
    title='3D Scatter Plot of User and Nearest Neighbors in Feature Space',
    margin=dict(l=0, r=0, b=0, t=0),
    scene=dict(
        xaxis_title='Feature 1',
        yaxis_title='Feature 2',
        zaxis_title='Feature 3'
    )
)

fig = go.Figure(data=[trace_neighbors, trace_query], layout=layout)
fig.show()


KeyError: "None of [Index(['feature1', 'feature2', 'feature3'], dtype='object')] are in the [columns]"