In [32]:
import pandas as pd
import numpy as np

## Data Preparation

In [33]:
df = pd.read_csv('../data/spotify_dataset.csv', on_bad_lines='skip')

In [34]:
len(df)

12891680

In [35]:
df.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [36]:
# Fixed Column Headers (removed quotes, whitespace, and simplified the column names)
df.columns = df.columns.str.replace("\"", "").str.strip().str.replace("name","")

In [37]:
df.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [38]:
df.dtypes

user_id     object
artist      object
track       object
playlist    object
dtype: object

In [39]:
df.isnull().sum()

user_id         0
artist      33568
track          85
playlist     1246
dtype: int64

In [40]:
df = df.dropna() # Removes rows with NULL values

In [41]:
df.isnull().sum()

user_id     0
artist      0
track       0
playlist    0
dtype: int64

In [42]:
len(df)

12856838

## Exploratory Data Analysis

In [43]:
for col in df.columns:
    print(col)
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print()

user_id
['9cc0cfd4d7d7885102480dd99e7a90d6' '07f0fc3be95dcd878966b1f9572ff670'
 '944c80d26922ae634d6ce445b1fdff7f' 'c5cdf06b5f1836006ef2a2fe4f5ff103'
 'f3743cac98b7255c3c4a23be09dee7e6']
15914

artist
['Elvis Costello' 'Elvis Costello & The Attractions' 'Tiffany Page'
 'Lissie' 'Paul McCartney']
289603

track
['(The Angels Wanna Wear My) Red Shoes'
 "(What's So Funny 'Bout) Peace, Love And Understanding"
 '7 Years Too Late' 'Accidents Will Happen' 'Alison']
2004523

playlist
['HARD ROCK 2010' 'IOW 2012' '2080' 'C418' 'Chill out']
157320



In [44]:
# Possible ideas:
# Group song and artist, so that same name songs are distinct
# Goal is to recommend songs


In [45]:
# Column that combines song and artist
df['track_artist'] = df['track'] + ', ' + df['artist']

Checking Sparsity

In [46]:
# Checking Sparsity
n_tracks = df.track_artist.nunique()
n_playlists = df.playlist.nunique()
n_interactions = len(df)

print(n_tracks, n_playlists, n_interactions)
n_interactions / (n_tracks * n_playlists)

2789650 157320 12856838


2.9295473070294357e-05

We will have a very sparse matrix.

## Validation

In [47]:
row_split = int(n_interactions * .9)
print(row_split)

11571154


In [48]:
# Split into training and validation datasets
df_train = df.iloc[:row_split]
df_val = df.iloc[row_split:]

In [49]:
# Find 10 most common songs
df_train.track_artist.value_counts().iloc[:10]

Midnight City, M83                                           2363
Get Lucky - Radio Edit, Daft Punk                            2109
Radioactive, Imagine Dragons                                 2083
Wake Me Up, Avicii                                           2032
Little Talks, Of Monsters and Men                            2021
Royals, Lorde                                                2010
Ho Hey, The Lumineers                                        1988
Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis    1843
Pompeii, Bastille                                            1841
Blurred Lines, Robin Thicke                                  1806
Name: track_artist, dtype: int64

In [50]:
# Frequency of most common songs
n_rec = 10
freq = df_train.track_artist.value_counts().iloc[:n_rec].index.values
freq

array(['Midnight City, M83', 'Get Lucky - Radio Edit, Daft Punk',
       'Radioactive, Imagine Dragons', 'Wake Me Up, Avicii',
       'Little Talks, Of Monsters and Men', 'Royals, Lorde',
       'Ho Hey, The Lumineers',
       "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
       'Pompeii, Bastille', 'Blurred Lines, Robin Thicke'], dtype=object)

In [51]:
# Grouping Songs by Playlist
val_group = df_val.groupby('playlist').track_artist.apply(set)
val_group

playlist
 !!                                              {Christine, The House Of Love}
 . bounc ' in . beats .       {Nights Off, Siriusmo, Doin' it Right, Daft Pu...
 ASLA Birthday April 13/13    {Scream & Shout, will.i.am, One Way Or Another...
 Ambient-Chill                {Ghostwriter, RJD2, Titanium / Pavane, The Pia...
 Desire                       {A Sky Full of Stars, Coldplay, Salsa Tequila,...
                                                    ...                        
🗽🗽🗽                           {Automatic, Miranda Lambert, Last Name, Carrie...
😔❤️😴😘                         {All of Me, John Legend, Stay, Hurts, Love Me ...
😕                             {Own It, Drake, Sober, P!nk, Furthest Thing, D...
🙌                             {Only, Nicki Minaj, Poetic Justice, Kendrick L...
🚙                             {American Oxygen, Rihanna, Get Low, Dillon Fra...
Name: track_artist, Length: 19450, dtype: object

In [52]:
n_val = len(val_group)
recommendations = np.repeat([freq], n_val, axis=0)
recommendations[:10]

array([['Midnight City, M83', 'Get Lucky - Radio Edit, Daft Punk',
        'Radioactive, Imagine Dragons', 'Wake Me Up, Avicii',
        'Little Talks, Of Monsters and Men', 'Royals, Lorde',
        'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
        'Pompeii, Bastille', 'Blurred Lines, Robin Thicke'],
       ['Midnight City, M83', 'Get Lucky - Radio Edit, Daft Punk',
        'Radioactive, Imagine Dragons', 'Wake Me Up, Avicii',
        'Little Talks, Of Monsters and Men', 'Royals, Lorde',
        'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
        'Pompeii, Bastille', 'Blurred Lines, Robin Thicke'],
       ['Midnight City, M83', 'Get Lucky - Radio Edit, Daft Punk',
        'Radioactive, Imagine Dragons', 'Wake Me Up, Avicii',
        'Little Talks, Of Monsters and Men', 'Royals, Lorde',
        'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",

In [53]:
# Finds out how often the top 10 tracks appear in validation dataset playlists
average_precision = 0

for i, playlist in enumerate(val_group):
    count = 0
    for track_artist in recommendations[i]:
        if track_artist in playlist:
            count = count + 1
    precision = count / n_rec
    average_precision = average_precision + precision

average_precision = average_precision / n_val
average_precision

0.008483290488431786

This is our baseline. Now we can try to improve it using different methods.