In [17]:
import pandas as pd
import numpy as np


# Load data
spotify_data = pd.read_csv('cleaned_spotify_data.csv').sample(frac=0.1, random_state=42)
user_profiles = pd.read_csv('cleaned_user_profiles.csv').sample(frac=0.25, random_state=42)

In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

categorical_features = ['genre', 'artist', 'mode']  # mode as categorical if it represents major/minor
numerical_features = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
                      'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Handling missing values for numerical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Scale features
])

In [19]:
from sklearn.decomposition import TruncatedSVD

# Adjust the categorical transformer to include dimensionality reduction
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('svd', TruncatedSVD(n_components=50))  # Reduce dimensions to 50
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

spotify_features_scaled = preprocessor.fit_transform(spotify_data)
user_features_scaled = preprocessor.transform(user_profiles)

In [20]:
spotify_features_scaled = preprocessor.fit_transform(spotify_data)
user_features_scaled = preprocessor.transform(user_profiles)

In [21]:
spotify_features_scaled.shape, user_features_scaled.shape

((3283, 61), (3236, 61))

In [22]:
# Fetching new column names for categorical features
genre_columns = [col for col in preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out() if 'genre' in col]
artist_columns = [col for col in preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out() if 'artist' in col]

print("Genre columns:", genre_columns)
print("Sample artist columns:", artist_columns[:5])  # Displaying first few artist columns to keep the output manageable


Genre columns: ['genre_edm', 'genre_latin', 'genre_pop', 'genre_r&b', 'genre_rap', 'genre_rock']
Sample artist columns: ['artist_!deladap', 'artist_$ANFI', 'artist_$IFRA', 'artist_$uicideBoy$', 'artist_-M-']


In [23]:
import numpy as np
import gc  # Import garbage collector

def calculate_interaction_vectorized_batched(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100):
    user_features_scaled = user_features_scaled.astype(np.float32)
    spotify_features_scaled = spotify_features_scaled.astype(np.float32)

    num_users = user_features_scaled.shape[0]
    num_songs = spotify_features_scaled.shape[0]
    interaction_scores = np.zeros((num_users, num_songs), dtype=np.float32)
    
    # Define the genre and artist column indices
    genre_indices = np.array([user_profiles.columns.get_loc(name) for name in genre_columns])
    artist_indices = np.array([user_profiles.columns.get_loc(name) for name in artist_columns])

    for start in range(0, num_users, batch_size):
        end = min(start + batch_size, num_users)
        batch_user_features = user_features_scaled[start:end]
        print(f"Processing user batch: {start//batch_size+1}/{(num_users-1)//batch_size+1}, Users {start}-{end-1}")
        for start_song in range(0, num_songs, batch_size):
            end_song = min(start_song + batch_size, num_songs)
            batch_spotify_features = spotify_features_scaled[start_song:end_song]

            distances = np.sqrt(((batch_user_features[:, np.newaxis, :] - batch_spotify_features[np.newaxis, :, :]) ** 2).sum(axis=2))
            feature_similarity = np.exp(-distances).astype(np.float32)

            genre_similarity = np.equal(user_profiles.iloc[start:end, genre_indices].values[:, np.newaxis, :], 
                                        spotify_data.iloc[start_song:end_song, genre_indices].values[np.newaxis, :, :]).astype(np.float32).mean(axis=2)
            artist_similarity = np.equal(user_profiles.iloc[start:end, artist_indices].values[:, np.newaxis, :], 
                                         spotify_data.iloc[start_song:end_song, artist_indices].values[np.newaxis, :, :]).astype(np.float32).mean(axis=2)

            batch_interaction_scores = 0.4 * feature_similarity + 0.5 * genre_similarity + 0.1 * artist_similarity
            interaction_scores[start:end, start_song:end_song] = batch_interaction_scores

            # Optional: Clear memory of temporary variables
            del distances, feature_similarity, genre_similarity, artist_similarity
            gc.collect()

    return interaction_scores

# def calculate_interaction_vectorized_batched(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data, batch_size=100):
#     # Define data as float32 to save memory
#     user_features_scaled = user_features_scaled.astype(np.float32)
#     spotify_features_scaled = spotify_features_scaled.astype(np.float32)

#     num_users = user_features_scaled.shape[0]
#     num_songs = spotify_features_scaled.shape[0]
#     interaction_scores = np.zeros((num_users, num_songs), dtype=np.float32)  # Use float32 for interaction scores
    
#     for start in range(0, num_users, batch_size):
#         end = min(start + batch_size, num_users)
#         batch_user_features = user_features_scaled[start:end]
#         print(f"Processing user batch: {start//batch_size+1}/{(num_users-1)//batch_size+1}, Users {start}-{end-1}")
#         for start_song in range(0, num_songs, batch_size):
#             end_song = min(start_song + batch_size, num_songs)
#             batch_spotify_features = spotify_features_scaled[start_song:end_song]

#             distances = np.sqrt(((batch_user_features[:, np.newaxis, :] - batch_spotify_features[np.newaxis, :, :]) ** 2).sum(axis=2))
#             feature_similarity = np.exp(-distances).astype(np.float32)  # Convert to float32 immediately

#             genre_similarity = np.equal(user_profiles['genre'].values[start:end][:, np.newaxis], spotify_data['genre'].values[start_song:end_song][np.newaxis, :]).astype(np.float32)
#             artist_similarity = np.equal(user_profiles['artist'].values[start:end][:, np.newaxis], spotify_data['artist'].values[start_song:end_song][np.newaxis, :]).astype(np.float32)

#             batch_interaction_scores = 0.4 * feature_similarity + 0.5 * genre_similarity + 0.1 * artist_similarity
#             interaction_scores[start:end, start_song:end_song] = batch_interaction_scores

#             # Optional: Clear memory of temporary variables
#             del distances, feature_similarity, genre_similarity, artist_similarity
#             gc.collect()

#     return interaction_scores


In [24]:
# Convert user_profiles and spotify_data back to DataFrame for genre and artist columns
user_profiles = pd.DataFrame(user_profiles, columns=preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out())
spotify_data = pd.DataFrame(spotify_data, columns=preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out())

# Execute interaction calculations
interaction_scores = calculate_interaction_vectorized_batched(user_features_scaled, spotify_features_scaled, user_profiles, spotify_data)

# Compute interaction threshold
interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction

# Calculate interaction
interaction = (interaction_scores >= interaction_threshold).astype(int)

# Flatten interaction matrix and features for neural network input
X = spotify_features_scaled.repeat(len(user_profiles), axis=0)
y = interaction.flatten()

interaction_scores.shape, interaction_threshold


Processing user batch: 1/33, Users 0-99


Processing user batch: 2/33, Users 100-199
Processing user batch: 3/33, Users 200-299
Processing user batch: 4/33, Users 300-399
Processing user batch: 5/33, Users 400-499
Processing user batch: 6/33, Users 500-599
Processing user batch: 7/33, Users 600-699
Processing user batch: 8/33, Users 700-799
Processing user batch: 9/33, Users 800-899
Processing user batch: 10/33, Users 900-999
Processing user batch: 11/33, Users 1000-1099
Processing user batch: 12/33, Users 1100-1199
Processing user batch: 13/33, Users 1200-1299
Processing user batch: 14/33, Users 1300-1399
Processing user batch: 15/33, Users 1400-1499
Processing user batch: 16/33, Users 1500-1599
Processing user batch: 17/33, Users 1600-1699
Processing user batch: 18/33, Users 1700-1799
Processing user batch: 19/33, Users 1800-1899
Processing user batch: 20/33, Users 1900-1999
Processing user batch: 21/33, Users 2000-2099
Processing user batch: 22/33, Users 2100-2199
Processing user batch: 23/33, Users 2200-2299
Processing use

((3236, 3283), 0.01602406334131956)

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
import numpy as np
import gc

# Assuming X, y, and spotify_features_scaled are properly computed as discussed

# Custom focal loss implementation
def focal_loss(gamma=2., alpha=4.):
    gamma = float(gamma)
    alpha = float(alpha)
    def focal_loss_fixed(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        y_true = tf.cast(y_true, tf.float32)
        alpha_t = y_true*alpha + (tf.keras.backend.ones_like(y_true) - y_true)*(1-alpha)
        p_t = y_true*y_pred + (tf.keras.backend.ones_like(y_true) - y_true)*(1-y_pred)
        fl = - alpha_t * tf.keras.backend.pow((tf.keras.backend.ones_like(y_true) - p_t), gamma) * tf.keras.backend.log(p_t)
        return tf.keras.backend.mean(fl)
    return focal_loss_fixed

# Neural network setup with Dropout
model = Sequential([
    Dense(64, activation='relu', input_dim=X.shape[1]),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss=focal_loss(), metrics=['accuracy'])

print("Model compiled.")

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets. Training model...")

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
gc.collect()

# Predict interaction scores for all songs using batch prediction
predicted_scores = model.predict(spotify_features_scaled).flatten()
spotify_data['predicted_interaction'] = predicted_scores
gc.collect()  # Clear memory of no longer needed large objects
print("Prediction complete and memory cleared.")

# k-NN model using enhanced features
features_with_score = np.concatenate([genre_columns, artist_columns, ['predicted_interaction']])
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(spotify_data[features_with_score])
print("k-NN model set up.")

user_index = 3  # Adjust based on user profile index
user_id = user_profiles.iloc[user_index]['userid']
user_top_genre = user_profiles.iloc[user_index]['genre']
user_feature_vector = user_features_scaled[user_index].reshape(1, -1)
user_predicted_score = model.predict(user_feature_vector).flatten()[0]
query_vector = np.append(user_feature_vector, user_predicted_score).reshape(1, -1)

# Finding top 5 nearest songs
distances, indices = knn.kneighbors(query_vector)
recommended_songs = spotify_data.iloc[indices[0]]

# Output recommended songs
print(f"Recommended Songs for User: {user_id}, Top Genre: {user_top_genre}")
print(recommended_songs[['track', 'artist', 'genre']])

# Validation
predicted_interactions = model.predict(X_test).flatten()
rmse = mean_squared_error(y_test, predicted_interactions, squared=False)
print("RMSE for neural network predictions:", rmse)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model compiled.
Data split into training and testing sets. Training model...
Epoch 1/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m792s[0m 3ms/step - accuracy: 0.2497 - loss: -35.8333 - val_accuracy: 0.2506 - val_loss: -35.8444
Epoch 2/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m557s[0m 2ms/step - accuracy: 0.2497 - loss: -35.9030 - val_accuracy: 0.2506 - val_loss: -35.8444
Epoch 3/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m556s[0m 2ms/step - accuracy: 0.2499 - loss: -35.8948 - val_accuracy: 0.2506 - val_loss: -35.8444
Epoch 4/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m550s[0m 2ms/step - accuracy: 0.2499 - loss: -35.8951 - val_accuracy: 0.2506 - val_loss: -35.8444
Epoch 5/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 2ms/step - accuracy: 0.2499 - loss: -35.8944 - val_accuracy: 0.2506 - val_loss: -35.8444
Epoch 6/10
[1m265595/265595[0m [32m━━━━━━━━━━━━━━━

ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values