## My Sportify - Music Recommendation System -

### README

In [2]:
# Let's get started !

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import datetime
import math
import gc

from subprocess import check_output
# Show which files are available in the KKBox data directory.
kkbox_listing = check_output(["ls", "../KKBox"])
print(kkbox_listing.decode("utf8"))

kkbox.ipynb
members.csv
sample_submission.csv
song_extra_info.csv
songs.csv
sportify.ipynb
test.csv
train.csv



This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Create one DataFrame per KKBox CSV file.
# The files are read in the same order as the names on the left-hand side.

path = '../KKBox/'
train, test, songs, members, song_extra_info = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "train.csv",
        "test.csv",
        "songs.csv",
        "members.csv",
        "song_extra_info.csv",
    )
)
print("Done Loading...")

Done Loading...


In [4]:
# Print the shape and the column index of every loaded DataFrame.
# Each entry pairs the banner printed before a frame with the frame itself.

frame_reports = (
    ("-----train-----", train),
    ("\n-----test-----", test),
    ("\n-----songs-----", songs),
    ("\n-----members-----", members),
    ("\n-----song_extra_info-----", song_extra_info),
)
for banner, frame in frame_reports:
    print(banner)
    print(frame.shape, frame.columns)

-----train-----
(7377418, 6) Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target'],
      dtype='object')

-----test-----
(2556790, 6) Index(['id', 'msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type'],
      dtype='object')

-----songs-----
(2296320, 7) Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')

-----members-----
(34403, 7) Index(['msno', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date'],
      dtype='object')

-----song_extra_info-----
(2295971, 3) Index(['song_id', 'name', 'isrc'], dtype='object')


### Data1. training data

In [5]:
# Drop the three `source_*` context columns, which are not used for the recommendation.
# Rename the remaining columns positionally to UserID / SongID / Target.
# Print the per-column null counts as a sanity check, then preview three rows.

train.drop(columns=["source_system_tab", "source_screen_name", "source_type"], inplace=True)
train.columns = ["UserID", "SongID", "Target"]
train_null_counts = train.isna().sum()
print(train_null_counts)
train.head(n=3)

UserID    0
SongID    0
Target    0
dtype: int64


Unnamed: 0,UserID,SongID,Target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,1


In [6]:
# Distinct values per column of the training data.
# Recorded run: 30,755 unique users and 359,966 unique songs.

train.nunique(axis=0)

UserID     30755
SongID    359966
Target         2
dtype: int64

### Data2. songs data

In [7]:
# Keep only the song attributes used for the recommendation, in this order,
# and rename them positionally to SongID / Language / Artist / Category / Length.
# Print the per-column null counts, then preview three rows.

song_source_columns = ["song_id", "language", "artist_name", "genre_ids", "song_length"]
songs = songs[song_source_columns]
songs.columns = ["SongID", "Language", "Artist", "Category", "Length"]
songs_null_counts = songs.isna().sum()
print(songs_null_counts)
songs.head(n=3)

SongID          0
Language        1
Artist          0
Category    94116
Length          0
dtype: int64


Unnamed: 0,SongID,Language,Artist,Category,Length
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,3.0,張信哲 (Jeff Chang),465,247640
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,31.0,BLACKPINK,444,197328
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,31.0,SUPER JUNIOR,465,231781


In [8]:
# Distinct values per column of the songs data (nulls are not counted).
# Recorded run:
#   Songs    : 2,296,320 (359,966 of them appear in the training data)
#   Language : 10
#   Artist   : 222,363
#   Category : 1,045

songs.nunique(axis=0)

SongID      2296320
Language         10
Artist       222363
Category       1045
Length       146534
dtype: int64

### Data3. members data

In [9]:
# Keep only the member attributes used for the recommendation
# and rename them positionally to UserID / Age / Sex / City.
# Report the row count and the per-column null counts, then preview three rows.

member_source_columns = ["msno", "bd", "gender", "city"]
members = members[member_source_columns]
members.columns = ["UserID", "Age", "Sex", "City"]
member_count = len(members)
print("The total number of data is:", member_count)
print("--------------------")
print(members.isna().sum())
members.head(n=3)

The total number of data is: 34403
--------------------
UserID        0
Age           0
Sex       19902
City          0
dtype: int64


Unnamed: 0,UserID,Age,Sex,City
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,0,,1
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,0,,1
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,0,,1


In [10]:
# Summary statistics of the numeric member columns, followed by distinct counts.
# The 'Age' range in the summary is where outliers would show up.

members_stats = members.describe()
print(members_stats)
members.nunique(axis=0)

                Age          City
count  34403.000000  34403.000000
mean      12.280935      5.371276
std       18.170251      6.243929
min      -43.000000      1.000000
25%        0.000000      1.000000
50%        0.000000      1.000000
75%       25.000000     10.000000
max     1051.000000     22.000000


UserID    34403
Age          95
Sex           2
City         21
dtype: int64

In [11]:
# Clean the 'Age' column.
# Ages outside MIN_AGE..MAX_AGE (inclusive) are treated as invalid and masked to NaN,
# then every NaN is replaced by the rounded mean of the remaining valid ages.
#
# The cleaned column is assigned back through `assign` instead of calling
# `members["Age"].fillna(..., inplace=True)`: an in-place call on a selected column is
# chained assignment, which pandas deprecates and which has no effect on `members`
# when Copy-on-Write is enabled. `Series.where` also avoids writing NaN into an
# integer column through `.loc`.

MIN_AGE, MAX_AGE = 6, 90

valid_age = members["Age"].where(members["Age"].between(MIN_AGE, MAX_AGE))
members = members.assign(Age=valid_age.fillna(round(valid_age.mean())))
members["Age"].describe()

count    34403.000000
mean        28.952126
std          5.894283
min          7.000000
25%         29.000000
50%         29.000000
75%         29.000000
max         90.000000
Name: Age, dtype: float64

### Data4. song_extra_info data

In [32]:
# Reload the extra song information from disk and rename its columns positionally
# to SongID / SongTitle / ISRC. Print the per-column null counts, then preview three rows.

extra_info_csv = path + "song_extra_info.csv"
song_extra_info = pd.read_csv(extra_info_csv)
song_extra_info.columns = ["SongID", "SongTitle", "ISRC"]
extra_info_null_counts = song_extra_info.isna().sum()
print(extra_info_null_counts)
song_extra_info.head(n=3)

SongID            0
SongTitle         2
ISRC         136548
dtype: int64


Unnamed: 0,SongID,SongTitle,ISRC
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303


In [33]:
# The ISRC code embeds a two-digit year.
# Converting from ISRC to a four-digit year of release.

def convert(ISRC, pivot=18):
    """Derive a four-digit year from an ISRC code.

    Characters 5 and 6 (0-based) of the code are read as a two-digit year.
    Values below ``pivot`` are mapped to 20xx, all other values to 19xx.

    Parameters
    ----------
    ISRC : str or any
        The ISRC code. Anything that is not a string (for example the NaN
        pandas uses for a missing value) is treated as missing.
    pivot : int, default 18
        First two-digit year that is interpreted as 19xx instead of 20xx.

    Returns
    -------
    int or float
        The four-digit year, or ``np.nan`` when the input is not a string or
        its year field cannot be parsed as an integer (too short, non-numeric).
    """
    if not isinstance(ISRC, str):
        return np.nan
    try:
        # Parse the year field once; an empty or non-numeric slice raises ValueError.
        two_digit_year = int(ISRC[5:7])
    except ValueError:
        return np.nan
    century = 2000 if two_digit_year < pivot else 1900
    return century + two_digit_year

# Derive the 'Year' column element-wise from the ISRC codes and preview the result.
release_years = song_extra_info["ISRC"].map(convert)
song_extra_info["Year"] = release_years
song_extra_info.head(n=5)

Unnamed: 0,SongID,SongTitle,ISRC,Year
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043,2012.0
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015,2016.0
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303,2008.0
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446,2013.0
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001,2013.0


In [36]:
# Fill missing 'Year' values with the rounded mean year.
# Then drop ISRC (already encoded as 'Year') and SongTitle (not used as a feature).
#
# The filled column is assigned back instead of calling
# `song_extra_info["Year"].fillna(..., inplace=True)`: an in-place call on a selected
# column is chained assignment, which pandas deprecates and which leaves the frame
# unchanged when Copy-on-Write is enabled.

mean_year = round(song_extra_info["Year"].mean())
song_extra_info = (
    song_extra_info
    .assign(Year=song_extra_info["Year"].fillna(mean_year))
    .drop(columns=["SongTitle", "ISRC"])
)
song_extra_info.head()

Unnamed: 0,SongID,Year
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,2012.0
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,2016.0
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,2008.0
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,2013.0
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,2013.0


### Data5. test data

In [13]:
# Last dataset: show the dimensions of the test data and preview three rows.

test_dimensions = test.shape
print(test_dimensions)
test.head(n=3)

(2556790, 6)


Unnamed: 0,id,msno,song_id,source_system_tab,source_screen_name,source_type
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,my library,Local playlist more,local-library
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,my library,Local playlist more,local-library
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,discover,,song-based-playlist


In [14]:
# Keep only the row id, user and song columns of the test data
# and rename them positionally to RowID / UserID / SongID.
# Print the per-column null counts, then preview three rows.

test_source_columns = ["id", "msno", "song_id"]
test = test[test_source_columns]
test.columns = ["RowID", "UserID", "SongID"]
test_null_counts = test.isna().sum()
print(test_null_counts)
test.head(n=3)

RowID     0
UserID    0
SongID    0
dtype: int64


Unnamed: 0,RowID,UserID,SongID
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=


### Now Let's get started

In [15]:
# Attach the song attributes to every train and test row.
# A left join on SongID keeps all rows, including those without a match in `songs`.

train = pd.merge(train, songs, how="left", on="SongID")
test = pd.merge(test, songs, how="left", on="SongID")

In [42]:
# Attach the release year to the training data (left join on SongID),
# then reorder the columns so that the label 'Target' comes last.

train_column_order = ["UserID", "SongID", "Language", "Artist", "Category", "Length", "Year", "Target"]
train = pd.merge(train, song_extra_info, how="left", on="SongID")[train_column_order]
train.head(n=3)

Unnamed: 0,UserID,SongID,Language,Artist,Category,Length,Year,Target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,52.0,Bastille,359,206471.0,2016.0,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,52.0,Various Artists,1259,284584.0,1999.0,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,52.0,Nas,1259,225396.0,2006.0,1


In [46]:
# Attach the release year to the test data with a left join on SongID.

test = pd.merge(test, song_extra_info, how="left", on="SongID")
test.head(n=3)

Unnamed: 0,RowID,UserID,SongID,Language,Artist,Category,Length,Year
0,0,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,WmHKgKMlp1lQMecNdNvDMkvIycZYHnFwDT72I5sIssc=,3.0,梁文音 (Rachel Liang),458,224130.0,2014.0
1,1,V8ruy7SGk7tDm3zA51DPpn6qutt+vmKMBKa21dp54uM=,y/rsZ9DC7FwK5F2PK2D5mj+aOBUJAjuu3dZ14NgE0vM=,3.0,林俊傑 (JJ Lin),465,320470.0,2010.0
2,2,/uQAlrAkaczV+nWCd2sPF2ekvXPRipV7q0l+gbLuxjw=,8eZLFOdGVdXBSqoAv5nsLigeH2BvKXzTQYtUM53I0k4=,17.0,Yu Takahashi (高橋優),2022,315899.0,2010.0
