# Million song dataset playground

In [57]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [58]:
df = pd.read_csv('kaggle_visible_evaluation_triplets.txt', delimiter='\t', names=['user','song','playcount'])

In [59]:
print df.shape
df.head()

(1450933, 3)


Unnamed: 0,user,song,playcount
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


In [20]:
# top users
df.user.value_counts().head()

7d90be8dfdbde170f036ce8a4b915440137cb11c    53
03ad93fdb01506ce205f4708decf8e4b1ae90fff    52
2e424b28bff1f62a2dae22a918f09f9c30c06d1b    52
d30e18323f15426c3cdc8585252ed34459916f51    52
016a24e91a72c159a5048ab1b9b2ba5ce761b526    52
Name: user, dtype: int64

# Playcount for each song

In [60]:
# count number of plays for each song
song_hash = {}
for idx, row in df.iterrows():
    if row.song in song_hash:
        song_hash[row.song] += row.playcount
    else:
        song_hash[row.song] = row.playcount

In [61]:
# total play of song
song_hash['SOBONKR12A58A7A7E0']

35432

In [28]:
np.sum(df.song=='SOBONKR12A58A7A7E0')

4136

# Sort songs per playcount

In [62]:
# sort songs by total playcount
song_sorted = sorted(song_hash, key=song_hash.get, reverse=True)

In [63]:
# top 5 songs
song_sorted[:5]

['SOBONKR12A58A7A7E0',
 'SOAUWYT12A81C206F1',
 'SOSXLTC12AF72A7F54',
 'SOFRQTD12A81C233C0',
 'SOEGIYH12A6D4FC0E3']

# Listening history per user

In [64]:
# hash of songs per user
user_hash = {}
for idx, row in df.iterrows():
    if row.user in user_hash:
        user_hash[row.user].append(row.song)
    else:
        user_hash[row.user] = [row.song]

In [67]:
user_hash['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d']

['SOBONKR12A58A7A7E0',
 'SOEGIYH12A6D4FC0E3',
 'SOFLJQZ12A6D4FADA6',
 'SOHTKMO12AB01843B0',
 'SODQZCY12A6D4F9D11',
 'SOXLOQG12AF72A2D55']

# canonical ordering for kaggle submission

In [65]:
canonical_user = pd.read_csv('kaggle_users.txt', delimiter='\t', names=['user'])

In [66]:
canonical_user.head()

Unnamed: 0,user
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d
1,d7083f5e1d50c264277d624340edaaf3dc16095b
2,d68dc6fc25248234590d7668a11e3335534ae4b4
3,9be82340a8b5ef32357fe5af957ccd54736ece95
4,841b2394ae3a9febbd6b06497b4a8ee8eb24b7f8


In [68]:
canonical_song = pd.read_csv('kaggle_songs.txt', delimiter=' ', names=['song', 'index'])

In [69]:
canonical_song.head()

Unnamed: 0,song,index
0,SOAAADD12AB018A9DD,1
1,SOAAADE12A6D4F80CC,2
2,SOAAADF12A8C13DF62,3
3,SOAAADZ12A8C1334FB,4
4,SOAAAFI12A6D4F9C66,5


In [70]:
hash_song_index = {}
for idx, row in canonical_song.iterrows():
    hash_song_index[row.song] = row['index']

In [71]:
hash_song_index['SOAAADF12A8C13DF62']

3

In [82]:
rec_dic = {}
for idx, row in canonical_user.iterrows():
    recommendation = []
    for song in song_sorted:
        if song not in user_hash[row.user]:
            recommendation.append(hash_song_index[song])
            if len(recommendation) >= 500:
                break
    rec_dic[row.user] = recommendation

KeyboardInterrupt: 

In [81]:
rec_dic['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d']

{'841b2394ae3a9febbd6b06497b4a8ee8eb24b7f8': [25150,
  12985,
  288653,
  91177,
  68212,
  14397,
  217471,
  319911,
  307202,
  123630,
  244143,
  221730,
  25323,
  54386,
  177172,
  291341,
  87433,
  302247,
  315812,
  311604,
  314086,
  305991,
  334187,
  177486,
  243307,
  333259,
  19682,
  52478,
  266750,
  212702,
  243769,
  24825,
  241705,
  177574,
  384072,
  192716,
  142602,
  351764,
  165401,
  277126,
  281075,
  248603,
  45592,
  311262,
  261596,
  49781,
  289658,
  86545,
  348629,
  170541,
  38941,
  183796,
  302369,
  205438,
  245936,
  190006,
  180413,
  8402,
  329869,
  310418,
  373947,
  357810,
  52176,
  2078,
  208383,
  154239,
  115162,
  242151,
  301674,
  195955,
  105434,
  49770,
  187097,
  55364,
  26935,
  307140,
  329834,
  334240,
  40311,
  172688,
  345265,
  126831,
  207916,
  107193,
  97972,
  285387,
  211334,
  136611,
  337625,
  183790,
  374902,
  354625,
  292298,
  383719,
  223714,
  65358,
  212017,
  233632,
  

In [56]:
rec = pd.Dataframe(rec_dic, names=['user', 'recommendation'])

KeyboardInterrupt: 