In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Index
* [Load data](#Load-data)
* [Answer question 1](#Answer-question-1)
* [Answer question 2](#Answer-question-2)
* [Answer question 3](#Answer-question-3)
    * [count each song's play times by each user](#count-each-song's-play-times-by-each-user)
    * [build song similarity matrix](#build-song-similarity-matrix)
    * [recommend](#recommend)
* [Answer question 4](#Answer-question-4)

## Load data

In [2]:
def load_data():
    with open("song.json", "rt") as inf:
        data = json.load(inf)

    data = pd.DataFrame(data)
    data.set_index("id", inplace=True)
    data["time_played"] = pd.to_datetime(data.time_played)
    data['user_sign_up_date'] = pd.to_datetime(data.user_sign_up_date)

    return data

data = load_data()

In [3]:
data.head()

Unnamed: 0_level_0,song_played,time_played,user_id,user_sign_up_date,user_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


## Answer question 1
What are the top 3 and the bottom 3 states in terms of number of users?

In [4]:
user_counts = data.groupby("user_state").user_id.agg(lambda ids: len(np.unique(ids)))
user_counts.sort_values(inplace=True, ascending=False)

In [6]:
print "top 3 states in #users: "
user_counts.iloc[:3]

top 3 states in #users: 


user_state
New York      23
California    21
Texas         15
Name: user_id, dtype: int64

In [7]:
print "bottom 3 states in #users: "
user_counts.iloc[:-4:-1]

bottom 3 states in #users: 


user_state
Arizona        1
New Mexico     1
Connecticut    1
Name: user_id, dtype: int64

## Answer question 2
What are the top 3 and the bottom 3 states in terms of user engagement? You can choose how to mathematically deﬁne user engagement. What the CEO cares about here is in which states users are using the product a lot/very little

I define <span style='color:orange;font-weight:bold;font-size:1.5em'>'average play event per hour'</span> as a metric to measure user engagement of a state.

In [13]:
def count_by_state(df):
    """ all data in df come from the same state """
    total_played = df.shape[0]
    first_play_dt = df.time_played.min()
    last_play_dt = df.time_played.max()
    duration = last_play_dt - first_play_dt
    duration_hours = duration.total_seconds()/60.0
    return pd.Series([first_play_dt,last_play_dt, duration,duration_hours, total_played],
                     index=["first_play_dt",'last_play_dt','duration','duration_hours','total_played'])

In [14]:
counts_by_states = data.groupby("user_state").apply(count_by_state)

In [17]:
counts_by_states.head()

Unnamed: 0_level_0,first_play_dt,last_play_dt,duration,duration_hours,total_played,hr_average
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,2015-06-01 06:14:45,2015-06-28 21:36:40,27 days 15:21:55,39801.916667,469,0.011783
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425,0.010699
Texas,2015-06-01 06:09:04,2015-06-28 20:28:35,27 days 14:19:31,39739.516667,230,0.005788
Ohio,2015-06-01 05:02:54,2015-06-28 22:22:25,27 days 17:19:31,39919.516667,209,0.005236
Florida,2015-06-01 09:29:39,2015-06-28 22:59:27,27 days 13:29:48,39689.8,180,0.004535


In [18]:
counts_by_states["hr_average"] = counts_by_states.total_played/counts_by_states.duration_hours
counts_by_states.sort_values(by="hr_average",ascending=False,inplace=True)
counts_by_states

Unnamed: 0_level_0,first_play_dt,last_play_dt,duration,duration_hours,total_played,hr_average
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,2015-06-01 06:14:45,2015-06-28 21:36:40,27 days 15:21:55,39801.916667,469,0.011783
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425,0.010699
Texas,2015-06-01 06:09:04,2015-06-28 20:28:35,27 days 14:19:31,39739.516667,230,0.005788
Ohio,2015-06-01 05:02:54,2015-06-28 22:22:25,27 days 17:19:31,39919.516667,209,0.005236
Florida,2015-06-01 09:29:39,2015-06-28 22:59:27,27 days 13:29:48,39689.8,180,0.004535
Pennsylvania,2015-06-01 05:19:08,2015-06-28 21:44:20,27 days 16:25:12,39865.2,179,0.00449
North Carolina,2015-06-01 12:40:31,2015-06-28 23:26:38,27 days 10:46:07,39526.116667,154,0.003896
Illinois,2015-06-01 12:15:13,2015-06-28 18:07:10,27 days 05:51:57,39231.95,149,0.003798
Georgia,2015-06-01 06:41:36,2015-06-28 21:37:34,27 days 14:55:58,39775.966667,135,0.003394
Missouri,2015-06-01 05:36:55,2015-06-28 18:32:34,27 days 12:55:39,39655.65,127,0.003203


In [19]:
print "top 3 states in user engagement"
counts_by_states.iloc[:3]

top 3 states in user engagement


Unnamed: 0_level_0,first_play_dt,last_play_dt,duration,duration_hours,total_played,hr_average
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,2015-06-01 06:14:45,2015-06-28 21:36:40,27 days 15:21:55,39801.916667,469,0.011783
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425,0.010699
Texas,2015-06-01 06:09:04,2015-06-28 20:28:35,27 days 14:19:31,39739.516667,230,0.005788


In [21]:
print "bottom 3 states in user engagement"
counts_by_states.iloc[-3:]

bottom 3 states in user engagement


Unnamed: 0_level_0,first_play_dt,last_play_dt,duration,duration_hours,total_played,hr_average
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Connecticut,2015-06-06 19:23:58,2015-06-28 13:16:32,21 days 17:52:34,31312.566667,16,0.000511
New Mexico,2015-06-01 05:22:30,2015-06-28 13:15:58,27 days 07:53:28,39353.466667,17,0.000432
Kansas,2015-06-05 15:01:50,2015-06-27 09:02:15,21 days 18:00:25,31320.416667,8,0.000255


## Answer question 3
The CEO wants to send a gift to the ﬁrst user who signed-up for each state. That is, the ﬁrst user who signed-up from California, from Oregon, etc. Can you give him a list of those users?

In [22]:
def find_first_signup(df):
    idx = df.user_sign_up_date.argmin()
    return df.loc[idx,["user_id","user_sign_up_date"]]

first_users = data.groupby("user_state").apply(find_first_signup)
first_users.sort_values(by="user_sign_up_date")

Unnamed: 0_level_0,user_id,user_sign_up_date
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,5,2015-05-01
Texas,7,2015-05-01
Oregon,1,2015-05-01
Ohio,3,2015-05-01
North Carolina,2,2015-05-01
New Mexico,4,2015-05-01
New Jersey,6,2015-05-01
Pennsylvania,11,2015-05-02
New York,19,2015-05-02
Minnesota,8,2015-05-02


## Answer question 4
Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next. That is, if, for instance, a user is currently listening to "Eight Days A Week", which song has the highest probability of being played right after it by the same user? This is going to be v1 of a song recommendation model.

To answer this question, I follow the 'Collaborative Filtering' idea, that is: if two songs are both liked by the same set of users, then they must be similar.

### count each song's play times by each user

In [23]:
def count_by_song(df):
    """ all data in df come from the same song"""
    return pd.Series( Counter(df.user_id) )

counts_by_songs = data.groupby("song_played").apply(count_by_song)
counts_by_songs = counts_by_songs.unstack(fill_value=0)

In [25]:
# each row is a song
# each column represents a user
# [i,j] represents number of times user 'j' plays song 'i'
counts_by_songs.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### build song similarity matrix

In [30]:
# normalize each song's vector to have unit norm
# which simplifies 'cosine similarity' to dot-product of two vectors
cnts_by_songs_normed = normalize(counts_by_songs,axis=1)

# songs_similarity is a [S,S] matrix, where 'S' is #songs
# the higher songs_similarity[i,j] indicates the more similar between song[i] and song[j]
songs_similarity = cnts_by_songs_normed.dot(cnts_by_songs_normed.T)

# transform np.ndarray to pd.DataFrame
songs_similarity = pd.DataFrame(songs_similarity,index=counts_by_songs.index,columns=counts_by_songs.index)

In [31]:
songs_similarity.head()

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,1.0,0.235702,0.074536,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.037268,0.318198,0.35322,0.087841,0.0
A Hard Day's Night,0.235702,1.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.074536,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.119523,0.0,0.0,1.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.183942,0.0,0.0,0.146845,0.0,0.0
Across The Universe,0.212132,0.1,0.0,0.0,1.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.043437,0.0,0.0


### recommend

In [32]:
### find top K most similar of each song
def most_similar_songs(s,topk):
    # [0] must be itself
    similar_ones = s.sort_values(ascending=False)[1:topk+1].index.values
    return pd.Series(similar_ones,index = ["similar#{}".format(i) for i in xrange(1,topk+1)])

songs_similarity.apply(most_similar_songs,topk=1,axis=1)

Unnamed: 0_level_0,similar#1
song_played,Unnamed: 1_level_1
A Day In The Life,Come Together
A Hard Day's Night,Come Together
A Saturday Club Xmas/Crimble Medley,GIRL
ANYTIME AT ALL,Can't Buy Me Love
Across The Universe,Revolution
All My Loving,Let It Be
All You Need Is Love,A Day In The Life
And Your Bird Can Sing,All My Loving
BAD BOY,Hey Jude
BALLAD OF JOHN AND YOKO,Golden Slumbers


## Answer question 4

How would you set up a test to check whether your model works well and is improving engagement?

we need to perform a A/B test:
1. randomly split users into two groups, one Control group and one Experiment group
2. Control group has no recommendation strategy
3. Experiment group recommend the next song
4. after running some time, perform a one-tailed t-test on 'average #play per hour'
    * H0: population 'average #play per hour' is same in two groups
    * HA: experiment group's population 'average #play per hour' is higher than control group's 