In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from scipy.spatial.distance import cdist

In [2]:
import sys
project_root = '../../src/'
sys.path.insert(0, project_root)

In [3]:
user_data = pd.read_csv('../../datasets/user_month_datasets/user1_1month_listening_history.csv')
print(user_data.columns)

Index(['duration (ms)', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'spec_rate', 'labels', 'uri', 'user_id', 'group_no', 'day'],
      dtype='object')


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def convert_days_to_datetime(df, start_date=None, time_range=('09:00', '23:00')):
    """
    Convert numeric days (1-30) to datetime format, distributing songs from the same day
    across different times within a specified range.
    
    Args:
        df: DataFrame containing 'day' column with values 1-30
        start_date: Optional starting date (defaults to 30 days ago)
        time_range: Tuple of strings ('HH:MM', 'HH:MM') for the time range
    
    Returns:
        DataFrame with 'day' column converted to datetime
    """
    df = df.copy()
    
    # Convert time range to datetime objects for easier manipulation
    time_start = pd.to_datetime(time_range[0]).time()
    time_end = pd.to_datetime(time_range[1]).time()
    
    # Calculate total minutes in the time range
    minutes_start = time_start.hour * 60 + time_start.minute
    minutes_end = time_end.hour * 60 + time_end.minute
    
    # Set start date if not provided
    if start_date is None:
        start_date = datetime.now() - timedelta(days=30)
    
    # Create a dictionary to store times for each day
    day_times = {}
    
    # Group by days to handle multiple songs per day
    for day_num in df['day'].unique():
        # Count songs for this day
        songs_in_day = df[df['day'] == day_num].shape[0]
        
        # Generate random minutes within the time range
        random_minutes = sorted(
            random.sample(
                range(minutes_start, minutes_end),
                min(songs_in_day, minutes_end - minutes_start)
            )
        )
        
        # If we have more songs than minutes, randomly assign times to remaining songs
        if songs_in_day > len(random_minutes):
            extra_times = [
                random.randint(minutes_start, minutes_end)
                for _ in range(songs_in_day - len(random_minutes))
            ]
            random_minutes.extend(extra_times)
            random_minutes.sort()
        
        # Convert minutes to time objects
        times = [
            (datetime.min + timedelta(minutes=m)).time()
            for m in random_minutes
        ]
        
        # Store in dictionary
        day_times[day_num] = times
    
    # Function to get next available time for a day
    time_counters = {day: 0 for day in day_times.keys()}
    
    def get_datetime_for_song(row):
        day_num = row['day']
        day_date = start_date + timedelta(days=day_num - 1)
        
        # Get time for this song
        times = day_times[day_num]
        time_idx = time_counters[day_num]
        song_time = times[time_idx]
        
        # Update counter
        time_counters[day_num] = (time_idx + 1) % len(times)
        
        # Combine date and time
        return datetime.combine(day_date.date(), song_time)
    
    # Apply the conversion
    df['day'] = df.apply(get_datetime_for_song, axis=1)
    
    return df

# used to convert the days column from (0,30) to actual datetime

In [5]:
user_data = convert_days_to_datetime(user_data)


In [32]:
user_data.columns

Index(['duration (ms)', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'spec_rate', 'labels', 'uri', 'user_id', 'group_no', 'day'],
      dtype='object')

In [6]:
class ContentBasedMusicRecommender:
    def __init__(self, data, feature_weights=None):
        """
        Initialize the recommender system.
        
        Args:
            feature_weights (dict): Optional weights for different features
        """
        self.scaler = StandardScaler()
        self.data = data
        self.uri_to_index = {uri: idx for idx, uri in enumerate(data['uri'])}
        # self.index_to_uri = {idx: uri for uri, idx in self.uri_to_index.items()}
        self.index_to_uri = {idx: uri for idx, uri in enumerate(self.data['uri'])}

        self.feature_cols = data.columns
        self.feature_cols = [col for col in self.feature_cols if col not in ['user_id', 'group_no', 'uri', 'day']]
        
        # Default feature weights if none provided
        self.feature_weights = feature_weights or {
            'duration (ms)': 0.05,
            'danceability': 0.15,
            'energy': 0.15,
            'loudness': 0.05,
            'speechiness': 0.1,
            'acousticness': 0.1,
            'instrumentalness': 0.1,
            'liveness': 0.05,
            'valence': 0.15,
            'tempo': 0.05,
            'spec_rate': 0.05,
            'labels': 0.05,
        }
        
        self.temporal_model = None
        self.similarity_matrix = None
        self.song_features = None
        self.song_index = None

    def preprocess_features(self, df):
        """
        Preprocess the features by scaling and applying weights.
        """
        # Scale the features
       
        scaled_features = self.scaler.fit_transform(df[self.feature_cols])
        
        # Apply feature weights
        weighted_features = scaled_features * np.array(
            [self.feature_weights[col] for col in self.feature_cols]
        )
        
        return weighted_features

    def fit(self):
        """
        Fit the recommender system with user's listening history.
        
        Args:
            df: DataFrame containing song features and listening history
        """
        # Preprocess features
        self.song_features = self.preprocess_features(self.data)
        self.song_index = self.data.index
        
        # Calculate similarity matrix
        self.similarity_matrix = cosine_similarity(self.song_features)
        
        # Train temporal model (XGBoost for feature importance over time)
        self.train_temporal_model()

    def train_temporal_model(self):
        """
        Train a model to learn temporal patterns in feature preferences.
        """
        # Create temporal features
        self.data['day_of_week'] = pd.to_datetime(self.data['day']).dt.dayofweek
        # Prepare features for temporal model
        X = self.data[self.feature_cols + ['day_of_week']]
        y = self.data[self.feature_cols].shift(-1)  # Predict next day's preferences
        
        # Remove last row with NaN target
        X = X[:-1]
        y = y[:-1]
        
        # Train XGBoost model
        self.temporal_model = xgb.XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3
        )
        self.temporal_model.fit(X, y)

    def get_recommendations(self, current_song_uri, n_recommendations=5, 
                          current_day=None, diversity_weight=0.3):
        """
        Get song recommendations based on current song URI and temporal patterns.
        
        Args:
            current_song_uri: URI of the current song
            n_recommendations: Number of recommendations to return
            current_day: Current day of week (0-6)
            diversity_weight: Weight for diversity in recommendations
        
        Returns:
            List of recommended song URIs and their similarity scores
        """
        # Get index for current song URI
        current_idx = self.uri_to_index[current_song_uri]
        
        # Get similar songs based on content
        similar_scores = self.similarity_matrix[current_idx]
        
        if current_day is not None:
            # Get temporal predictions
            current_features = self.song_features[current_idx]
            temporal_input = np.append(current_features, current_day)
            predicted_preferences = self.temporal_model.predict(
                temporal_input.reshape(1, -1)
            )
            
            # Combine content-based and temporal scores
            temporal_similarities = cosine_similarity(
                predicted_preferences, 
                self.song_features
            )[0]
            
            # Weighted combination
            final_scores = (0.7 * similar_scores + 
                          0.3 * temporal_similarities)
        else:
            final_scores = similar_scores
        
        # Apply diversity penalty
        recommendations = []
        recommended_scores = []
        excluded_indices = set()
        
        for _ in range(n_recommendations):
            # Mask already recommended songs
            masked_scores = final_scores.copy()
            masked_scores[list(excluded_indices)] = -np.inf
            
            # Get next best recommendation
            next_idx = np.argmax(masked_scores)
            next_uri = self.index_to_uri[next_idx]
            
            recommendations.append(next_uri)
            recommended_scores.append(masked_scores[next_idx])
            excluded_indices.add(next_idx)
            
            # Apply diversity penalty to similar songs
            penalty = self.similarity_matrix[next_idx] * diversity_weight
            final_scores -= penalty
        
        # Return recommendations with their similarity scores
        return list(zip(recommendations, recommended_scores))
    def get_feature_importance(self):
        """
        Get feature importance from the temporal model.
        """
        importance_dict = dict(zip(
            self.feature_cols + ['day_of_week'],
            self.temporal_model.feature_importances_
        ))
        return importance_dict

def create_recommendation_pipeline(data):
    """
    Create and train a complete recommendation pipeline.
    
    Args:
        df: DataFrame with song features and listening history
    
    Returns:
        Trained recommender system
    """
    # Initialize recommender with custom weights
    recommender = ContentBasedMusicRecommender(
        data=data,
        feature_weights={
            'duration (ms)': 0.05,
            'danceability': 0.15,
            'energy': 0.15,
            'loudness': 0.05,
            'speechiness': 0.1,
            'acousticness': 0.1,
            'instrumentalness': 0.1,
            'liveness': 0.05,
            'valence': 0.15,
            'tempo': 0.05,
            'spec_rate': 0.05,
            'labels': 0.05,
        }
    )
    
    # Fit the recommender
    recommender.fit()
    
    return recommender

In [7]:
# user_data.drop('day_of_week', axis=1, inplace=True)
recommender = create_recommendation_pipeline(user_data)

In [8]:
print(len(user_data['uri'].unique()))
print(len(list(recommender.index_to_uri.keys())))

794
1550


In [9]:
current_song_uri = recommender.index_to_uri[8]
recommendations = recommender.get_recommendations(
    current_song_uri,
    n_recommendations=10,
    current_day=2  # e.g., Wednesday
)

# Print recommendations
for uri, score in recommendations:
    print(f"Recommended song URI: {uri}, Similarity score: {score:.3f}")

Recommended song URI: spotify:track:0Y0N7rSoFH5Qm5kackabuy, Similarity score: 0.691
Recommended song URI: spotify:track:18YDZkzXYFjqCNYXBqQ77O, Similarity score: 0.393
Recommended song URI: spotify:track:3pgh2eNJYm63jFn2SEpR2r, Similarity score: 0.159
Recommended song URI: spotify:track:4wFIX5TaIwCJEUp1Hl6kOR, Similarity score: 0.178
Recommended song URI: spotify:track:3nUOnATHktg7VW5TtWeJnd, Similarity score: 0.144
Recommended song URI: spotify:track:3MPcfaxPTQPR3w58qaODvX, Similarity score: 0.106
Recommended song URI: spotify:track:02XDObqhj6pi5TCPPM2qK0, Similarity score: 0.195
Recommended song URI: spotify:track:0eddjTFvjJ0bHab5dR6CvH, Similarity score: 0.129
Recommended song URI: spotify:track:4vSqkSAxdKQ7IYCrfkO9IZ, Similarity score: 0.148
Recommended song URI: spotify:track:3x4Zx0rPoW6kErSDK8SSnW, Similarity score: 0.170


In [13]:
def gen_playlists(users, n_recommendations = 20):
    gen_playlists = {}
    for user in users:
        file_path = f'../../datasets/user_month_datasets/user{user}_1month_listening_history.csv'
        user_data = pd.read_csv(file_path)
        user_data = convert_days_to_datetime(user_data)
        recommender = create_recommendation_pipeline(user_data)
        current_song = np.random.choice(list(recommender.index_to_uri.keys()))
        current_day = datetime.now().weekday()
        recommendations = recommender.get_recommendations(
            recommender.index_to_uri[current_song],
            n_recommendations=n_recommendations,
            current_day= current_day
        )
        gen_playlists[user] = [x[0] for x in recommendations]
    return gen_playlists


In [15]:
import json

users = []
for i in range(1, 11):
    users.append(i)

playlists = gen_playlists(users)
# write playlists to json file
with open('generated_playlists.json', 'w') as f:
    f.write(json.dumps(playlists, indent=4))