In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import gc  # Garbage Collector interface

# Load Data
spotify_data = pd.read_csv('cleaned_spotify_data.csv')
user_profiles = pd.read_csv('cleaned_user_profiles.csv')
user_profiles = user_profiles.sample(frac = 0.05, random_state = 42)
spotify_data = spotify_data.sample(frac = 0.25, random_state = 42)

# One-hot encode the 'genre' column for both datasets
spotify_data = pd.get_dummies(spotify_data, columns=['genre'], prefix='genre')
user_profiles = pd.get_dummies(user_profiles, columns=['genre'], prefix='genre')

# Ensuring both dataframes have the same genre columns
spotify_genres = spotify_data.columns[spotify_data.columns.str.startswith('genre_')]
user_genres = user_profiles.columns[user_profiles.columns.str.startswith('genre_')]
for column in set(spotify_genres).union(set(user_genres)):
    if column not in spotify_data:
        spotify_data[column] = 0
    if column not in user_profiles:
        user_profiles[column] = 0

spotify_data = spotify_data.sort_index(axis=1)
user_profiles = user_profiles.sort_index(axis=1)

features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo'] + list(spotify_genres)
spotify_features = spotify_data[features]
user_features = user_profiles[features]

# Data Scaling (exclude one-hot encoded genres from scaling)
non_genre_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']
scaler = StandardScaler()
spotify_features[non_genre_features] = scaler.fit_transform(spotify_features[non_genre_features])
user_features[non_genre_features] = scaler.transform(user_features[non_genre_features])

spotify_features = spotify_features.astype(np.float16)
user_features = user_features.astype(np.float16)
def calculate_interaction_batch(user_features, spotify_features, batch_size=100):
    num_users = user_features.shape[0]
    num_songs = spotify_features.shape[0]
    interaction_scores = np.zeros((num_users, num_songs))
    
    print(f"Starting batch processing... Total users: {num_users}, Total songs: {num_songs}")

    # Batch processing
    for start_idx in range(0, num_users, batch_size):
        end_idx = min(start_idx + batch_size, num_users)
        print(f"Processing batch from user index {start_idx} to {end_idx - 1}")

        # Calculate distances and similarity scores for the batch
        user_batch = user_features[start_idx:end_idx]
        distances = np.sqrt(((user_batch[:, np.newaxis, :] - spotify_features[np.newaxis, :, :]) ** 2).sum(axis=2))
        feature_similarity = np.exp(-distances)
        print(f"Feature similarity calculated for batch. Shape: {feature_similarity.shape}")

        # Calculate genre similarity using dot product of one-hot encoded vectors
        genre_similarity = np.dot(user_profiles.iloc[start_idx:end_idx][spotify_genres].values, spotify_data[spotify_genres].values.T)
        print(f"Genre similarity calculated for batch.")

        # Calculate composite score for the batch
        interaction_scores[start_idx:end_idx] = 0.7 * feature_similarity + 0.3 * genre_similarity
        print(f"Interaction scores updated for batch. Current shape of scores array: {interaction_scores.shape}")

        # Explicitly call garbage collection
        gc.collect()
        print(f"Garbage collection triggered after processing batch.")

    return interaction_scores

# Prepare data for model input
X = spotify_features.values.repeat(len(user_profiles), axis=0)
interaction_scores = calculate_interaction_batch(user_features, spotify_features)  # Assuming function is defined to calculate interaction scores
interaction_threshold = np.percentile(interaction_scores, 75)  # top 25% as positive interaction
y = (interaction_scores >= interaction_threshold).astype(int).flatten()

# Define the model architecture
model = Sequential([
    Dense(64, activation='relu', input_dim=X.shape[1]),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

    # Evaluate the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[1]} of {scores[1]}; Loss of {scores[0]}')

    # Predictions
    predictions = (model.predict(X_test) > 0.5).astype(int)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print(f'Precision for fold {fold_no}: {precision}')
    print(f'Recall for fold {fold_no}: {recall}')
    print(f'F1 Score for fold {fold_no}: {f1}')

    fold_no += 1

gc.collect()  # Clear memory after processing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_features[non_genre_features] = scaler.fit_transform(spotify_features[non_genre_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_features[non_genre_features] = scaler.transform(user_features[non_genre_features])


Starting batch processing... Total users: 647, Total songs: 8208
Processing batch from user index 0 to 99


InvalidIndexError: (slice(None, None, None), None, slice(None, None, None))