In [52]:
import pandas as pd
import numpy as np

## Data Preparation

In [53]:
# Load the raw playlist export; lines that cannot be parsed are skipped
# instead of raising.
df = pd.read_csv(
    '../data/spotify_dataset.csv',
    on_bad_lines='skip',
)

In [54]:
# Normalise the header labels: drop double quotes, trim surrounding whitespace,
# and remove every "name" substring (later cells rely on the shortened labels
# such as `track` and `artist`).
df.columns = (
    df.columns
    .str.replace('"', "")
    .str.strip()
    .str.replace("name", "")
)

In [55]:
# Keep only fully populated rows: a missing value in any column discards the row.
df = df.dropna(axis=0, how='any')

In [56]:
# Single string key per song, formatted as "<track>, <artist>".
df['track_artist'] = df['track'].add(', ').add(df['artist'])

In [57]:
# Dataset size summary: distinct songs, distinct users, and total rows.
n_tracks = df['track_artist'].nunique()
n_users = df['user_id'].nunique()
n_interactions = df.shape[0]

## Data Validation

In [58]:
# Hold out 20% of the rows for validation. The split is over rows, not users,
# so the same user can have tracks on both sides.
# random_state pins the shuffle: without it every run produces a different
# split and the baseline precision shifts by roughly .002 between runs.
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

In [59]:
# Popularity baseline: the n_rec most frequent track_artist values in the
# training split, most common first (value_counts sorts by count, descending).
n_rec = 10
freq = df_train['track_artist'].value_counts().head(n_rec).index.values

In [60]:
# One set of track_artist values per user_id in the validation split.
val_group = (
    df_val
    .groupby('user_id')['track_artist']
    .apply(set)
)

In [61]:
# Every validation user gets the same list: stack `freq` once per user,
# giving one row of recommendations per entry in val_group.
n_val = len(val_group)
recommendations = np.tile(freq, (n_val, 1))

In [62]:
# Mean precision of the recommendation lists against the validation playlists
def avg_prec(recommendations, val_group, k=None):
    """Average, over all playlists, the fraction of recommended tracks that
    appear in the corresponding playlist.

    Parameters
    ----------
    recommendations : indexable of sequences
        ``recommendations[i]`` holds the tracks recommended for the i-th
        entry of ``val_group``.
    val_group : sized iterable of containers
        One container (e.g. a set) of held-out tracks per playlist, in the
        same order as ``recommendations``.
    k : int, optional
        Fixed denominator for each precision (precision@k). When omitted,
        each list is scored against its own length.

    Returns
    -------
    float
        Mean precision across ``val_group``; ``0.0`` when it is empty.

    Raises
    ------
    ValueError
        If ``k`` is given and is not positive.
    """
    if k is not None and k <= 0:
        raise ValueError("k must be a positive integer")

    # Sizes come from the arguments rather than from notebook-level globals,
    # so the function scores whatever it is handed.
    n_playlists = len(val_group)
    if n_playlists == 0:
        return 0.0  # nothing to score; avoids a ZeroDivisionError

    total_precision = 0.0
    for i, playlist in enumerate(val_group):
        recs = recommendations[i]
        denom = len(recs) if k is None else k
        if denom == 0:
            continue  # an empty recommendation list contributes precision 0
        hits = sum(1 for track_artist in recs if track_artist in playlist)
        total_precision += hits / denom

    return total_precision / n_playlists

In [63]:
# Score the popularity baseline on the validation playlists.
avg_prec(recommendations=recommendations, val_group=val_group)

0.02673762087092024

From the validation above, the popularity baseline's average precision is approximately between 0.026 and 0.028; the exact value varies with the random split.

## Content-Based Recommendations

In [64]:
# Users present in both splits; recommendations are only needed for them.
# The exact count depends on how the random split fell.
target_users = set(df_val['user_id']).intersection(df_train['user_id'])
len(target_users)

15341

We need to make recommendations for just over 15,000 users: those who appear in both the training and validation datasets.

In [65]:
# Training rows restricted to the users that also appear in validation.
is_target_user = df_train['user_id'].isin(target_users)
df_train_subset = df_train[is_target_user]

Now we create vectorizers for turning users into feature vectors.

In [66]:
# Sklearn Imports
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import TransformerMixin, BaseEstimator

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import normalize

In [None]:
class ToDictTranformer(TransformerMixin, BaseEstimator):
    """Transformer step that turns a DataFrame into a list of per-row dicts.

    ``fit`` only records the input's column labels; ``transform`` returns
    ``df.to_dict(orient='records')``. Presumably meant to feed the
    ``DictVectorizer`` imported in this notebook — confirm against the
    pipeline that uses it.
    """

    def __init__(self):
        # Column labels seen by fit(); stays None until fit() has been called.
        self.columns = None
    
    def fit(self, df, y=None):
        """Store ``df.columns`` and return ``self``; ``y`` is not used."""
        self.columns = df.columns
        return self

    def transform(self, df):
        """Return the rows of ``df`` as a list of ``{column: value}`` dicts."""
        return df.to_dict(orient='records')

    def get_feature_names_out(self, *args, **kwargs):
        """Return the column labels stored by ``fit``; arguments are ignored."""
        return self.columns