In [81]:
import pandas as pd
import numpy as np

## Data Preparation

In [82]:
df = pd.read_csv('../data/spotify_dataset.csv', on_bad_lines='skip')

In [83]:
len(df)

12891680

In [84]:
df.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [85]:
# Fixed Column Headers (removed quotes, whitespace, and simplified the column names)
df.columns = df.columns.str.replace("\"", "").str.strip().str.replace("name","")

In [86]:
df.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [87]:
df.dtypes

user_id     object
artist      object
track       object
playlist    object
dtype: object

In [88]:
df.isnull().sum()

user_id         0
artist      33568
track          85
playlist     1246
dtype: int64

In [89]:
df = df.dropna() # Removes rows with NULL values

In [90]:
df.isnull().sum()

user_id     0
artist      0
track       0
playlist    0
dtype: int64

In [91]:
len(df)

12856838

## Exploratory Data Analysis

In [92]:
for col in df.columns:
    print(col)
    print(df[col].unique()[:5])
    print(df[col].nunique())
    print()

user_id
['9cc0cfd4d7d7885102480dd99e7a90d6' '07f0fc3be95dcd878966b1f9572ff670'
 '944c80d26922ae634d6ce445b1fdff7f' 'c5cdf06b5f1836006ef2a2fe4f5ff103'
 'f3743cac98b7255c3c4a23be09dee7e6']
15914

artist
['Elvis Costello' 'Elvis Costello & The Attractions' 'Tiffany Page'
 'Lissie' 'Paul McCartney']
289603

track
['(The Angels Wanna Wear My) Red Shoes'
 "(What's So Funny 'Bout) Peace, Love And Understanding"
 '7 Years Too Late' 'Accidents Will Happen' 'Alison']
2004523

playlist
['HARD ROCK 2010' 'IOW 2012' '2080' 'C418' 'Chill out']
157320



In [93]:
# Column that combines song and artist
df['track_artist'] = df['track'] + ', ' + df['artist']

Checking Sparsity

In [94]:
# Checking Sparsity
n_tracks = df.track_artist.nunique()
n_users = df.user_id.nunique()
n_interactions = len(df)

print(n_tracks, n_users, n_interactions)
n_interactions / (n_tracks * n_users)

2789650 15914 12856838


0.0002896043624116318

We will have a very sparse matrix.

## Validation

In [95]:
# Split into training and validation datasets randomly
# Random sampling of users changes the baseline (.002...)
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, test_size=0.2)

In [96]:
# Find 10 most common songs
df_train.track_artist.value_counts().iloc[:10]

Midnight City, M83                                           2107
Radioactive, Imagine Dragons                                 1870
Get Lucky - Radio Edit, Daft Punk                            1864
Little Talks, Of Monsters and Men                            1828
Wake Me Up, Avicii                                           1810
Royals, Lorde                                                1742
Ho Hey, The Lumineers                                        1723
Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis    1657
Blurred Lines, Robin Thicke                                  1605
Pompeii, Bastille                                            1600
Name: track_artist, dtype: int64

In [97]:
# Frequency of most common songs
n_rec = 10
freq = df_train.track_artist.value_counts().iloc[:n_rec].index.values
freq

array(['Midnight City, M83', 'Radioactive, Imagine Dragons',
       'Get Lucky - Radio Edit, Daft Punk',
       'Little Talks, Of Monsters and Men', 'Wake Me Up, Avicii',
       'Royals, Lorde', 'Ho Hey, The Lumineers',
       "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
       'Blurred Lines, Robin Thicke', 'Pompeii, Bastille'], dtype=object)

In [98]:
# Grouping Songs by User
val_group = df_val.groupby('user_id').track_artist.apply(set)
val_group

user_id
00055176fea33f6e027cd3302289378b    {Treasure, Bruno Mars, I Miss You, blink-182, ...
0007f3dd09c91198371454c608d47f22    {Hurt, Johnny Cash, Bittersweet Memories, Bull...
000b0f32b5739f052b9d40fcc5c41079    {My Love, Sofia Källgren, Let It Snow, Christe...
000c11a16c89aa4b14b328080f5954ee    {Gravity - Live At The Fillmore Version, Sara ...
00123e0f544dee3ab006aa7f1e5725a7    {Free Four - 2011 Remastered Version, Pink Flo...
                                                          ...                        
ffe32d5412269f3041c58cbf0dde3306    {Be True, Bruce Springsteen, Do You Love Me, T...
ffec270eae226caa14ddaef291d73fff    {Up 2 U, Walk the Moon, Freedom, Afrojack, We ...
fff60baf392613ed33f745b89a9b38f7    {Anthrax in the Pick 'n Mix, Jasper Carrott, A...
fff616055993498d6127f3f467cf9f2b    {Holland Road, Mumford & Sons, Down in the Val...
fff77dadf8528083c920b9c018847e8b    {42, Coldplay, Season, The Academy Is..., Anth...
Name: track_artist, Length: 15419, dtype: obje

In [99]:
n_val = len(val_group)
recommendations = np.repeat([freq], n_val, axis=0)
recommendations[:10]

array([['Midnight City, M83', 'Radioactive, Imagine Dragons',
        'Get Lucky - Radio Edit, Daft Punk',
        'Little Talks, Of Monsters and Men', 'Wake Me Up, Avicii',
        'Royals, Lorde', 'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
        'Blurred Lines, Robin Thicke', 'Pompeii, Bastille'],
       ['Midnight City, M83', 'Radioactive, Imagine Dragons',
        'Get Lucky - Radio Edit, Daft Punk',
        'Little Talks, Of Monsters and Men', 'Wake Me Up, Avicii',
        'Royals, Lorde', 'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",
        'Blurred Lines, Robin Thicke', 'Pompeii, Bastille'],
       ['Midnight City, M83', 'Radioactive, Imagine Dragons',
        'Get Lucky - Radio Edit, Daft Punk',
        'Little Talks, Of Monsters and Men', 'Wake Me Up, Avicii',
        'Royals, Lorde', 'Ho Hey, The Lumineers',
        "Can't Hold Us - feat. Ray Dalton, Macklemore & Ryan Lewis",

In [100]:
# Finds out how often the top 10 tracks appear in validation dataset users
average_precision = 0

for i, user in enumerate(val_group):
    count = 0
    for track_artist in recommendations[i]:
        if track_artist in user:
            count = count + 1
    precision = count / n_rec
    average_precision = average_precision + precision

average_precision = average_precision / n_val
average_precision

0.027083468448019267

This is our baseline. Now we can try to improve it using different methods.