In [49]:
import pandas as pd
import numpy as np

# Read songs data

In [74]:
songs = pd.read_csv('song_data.csv')

In [75]:
songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


# Truncate data so that the computations fit in your tiny memory of a laptop

In [52]:
len(songs)

807666

In [76]:
songs = songs[0:10000]
len(songs)

10000

# Convert string based user and song ids into number based ones

In [77]:
songs[['user_id_no', 'song_id_no']] = songs[['user_id', 'song_id']].apply(lambda x: x.astype('category').cat.codes)
songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song,user_id_no,song_id_no
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson,418,79
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia,418,224
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West,418,379
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson,418,385
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters,418,616


# Get number of unique users and songs

In [55]:
n_users = songs.user_id.unique().shape[0]
n_items = songs.song_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of songs = ' + str(n_items)

Number of users = 595 | Number of songs = 5173


# Split data into training and test sets

In [56]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(songs, test_size=0.25)

# Create two user-item matrices, one for training and another for testing

In [57]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[7], line[8]] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[7], line[8]] = line[3]

# Calculate cosine similarity amongst users and songs

In [58]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

# Make Predictions

In [59]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [60]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [61]:
item_prediction

array([[ 0.01182354,  0.01182833,  0.01185597, ...,  0.01181935,
         0.01184605,  0.0118131 ],
       [ 0.00581485,  0.00581721,  0.00583081, ...,  0.0058128 ,
         0.00582593,  0.00580972],
       [ 0.00232594,  0.00232688,  0.00233232, ...,  0.00232512,
         0.00233037,  0.00232389],
       ..., 
       [ 0.00910994,  0.00911363,  0.00913493, ...,  0.00910671,
         0.00912729,  0.0091019 ],
       [ 0.00348891,  0.00349033,  0.00349848, ...,  0.00348768,
         0.00349556,  0.00348583],
       [ 0.00891611,  0.00891972,  0.00894057, ...,  0.00891296,
         0.00893309,  0.00890824]])

In [62]:
user_prediction

array([[ 0.00583323,  0.00583323,  0.01089873, ...,  0.00752173,
         0.00583323,  0.02440674],
       [-0.00017023, -0.00017023,  0.00489303, ...,  0.00151753,
        -0.00017023,  0.01839506],
       [-0.00365113, -0.00365113,  0.0014087 , ..., -0.00196452,
        -0.00365113,  0.01490158],
       ..., 
       [ 0.0031261 ,  0.0031261 ,  0.00818144, ...,  0.00481122,
         0.0031261 ,  0.02166232],
       [-0.00247553, -0.00247553,  0.00257896, ..., -0.0007907 ,
        -0.00247553,  0.01605757],
       [ 0.00293038,  0.00293038,  0.00798598, ...,  0.00461558,
         0.00293038,  0.02146757]])

# Look at predictions for the 0th user

In [63]:
predictions=user_prediction[0, :]

In [64]:
len(predictions)

5173

In [65]:
predictions

array([ 0.00583323,  0.00583323,  0.01089873, ...,  0.00752173,
        0.00583323,  0.02440674])

# Look at predictions for the 100th user

In [66]:
predictions=user_prediction[100, :]

In [67]:
predictions

array([-0.00403966, -0.00403966,  0.00101495, ..., -0.00235479,
       -0.00403966,  0.01449389])

In [68]:
predictions.max()

0.50479029925586305

In [69]:
predictions.min()

-0.0057245237357444355

In [70]:
np.argmax(predictions)

322

In [71]:
predictions[322]

0.50479029925586305

# Get song details for the top song for 100th user

In [73]:
songs[songs['song_id_no'] == 322].head(1)

Unnamed: 0,user_id,song_id,listen_count,title,artist,song,user_id_no,song_id_no
91,e006b1a48f466bf59feefed32bec6494495a4436,SOBONKR12A58A7A7E0,2,You're The One,Dwight Yoakam,You're The One - Dwight Yoakam,514,322


# Check that this song was not listened by the user as per our data

In [80]:
songs[(songs['song_id_no'] == 322) & (songs['user_id_no'] == 100)]

Unnamed: 0,user_id,song_id,listen_count,title,artist,song,user_id_no,song_id_no
