### WSDM-KKBOX Music Recommendation System

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime
import math
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from sklearn import metrics
%matplotlib inline

## Load data

In [2]:
df_train = pd.read_csv('../input/1208_train_add_popular.csv')
df_test = pd.read_csv('../input/1208_test_add_popular.csv')

In [3]:
age_bin = pd.read_csv('../input/members_quant_age.csv')
temp = age_bin[['msno','quant_age']]
df_train = df_train.merge(temp, on='msno', how='left')
df_test = df_test.merge(temp, on='msno', how='left')

In [4]:
member_svd = pd.read_csv('./member_svd.csv')
song_svd = pd.read_csv('./song_svd.csv')

In [5]:
df_train = df_train.merge(member_svd, on='msno', how='left')
df_test = df_test.merge(member_svd, on='msno', how='left')
df_train = df_train.merge(song_svd, on='song_id', how='left')
df_test = df_test.merge(song_svd, on='song_id', how='left')

In [6]:
n_component = 48

song_list = ['song_component_%d'%i for i in range(n_component)]
member_list = ['member_component_%d'%i for i in range(n_component)]

In [7]:
for col in df_train.columns:
    try:
        if col in song_list:
            df_train[col] = df_train[col].astype(np.float16)
            df_test[col] = df_test[col].astype(np.float16)
    except:
        print col,'error!'

In [8]:
for col in df_train.columns:
    try:
        if col in member_list:
            df_train[col] = df_train[col].astype(np.float16)
            df_test[col] = df_test[col].astype(np.float16)
    except:
        print col,'error!'

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Columns: 143 entries, Unnamed: 0 to song_component_47
dtypes: bool(1), float16(96), float64(3), int64(32), object(11)
memory usage: 3.9+ GB


In [10]:
artist_prob = pd.read_csv('../input/artist_prob.csv')
artist_prob = artist_prob[artist_prob['one']+artist_prob['zero'] > 20]
art_probability = artist_prob[['artist_name','prob']]
art_probability = art_probability[art_probability['prob'] > 0.5]

In [11]:
art_probability

Unnamed: 0,artist_name,prob
1,Various Artists,0.509710
6,BIGBANG TAEYANG,0.590254
8,OneRepublic,0.527190
9,Lindsey Stirling,0.542553
10,嚴爵 (Yen-j),0.505250
11,林俊傑 (JJ Lin),0.586352
12,周杰倫 (Jay Chou),0.600864
13,吳汶芳 (Fang Wu),0.530609
14,aMEI (張惠妹),0.501190
18,五月天 (Mayday),0.554825


In [12]:
df_train['artist_prob'] = df_train['artist_name'].isin(art_probability['artist_name'])
df_test['artist_prob'] = df_test['artist_name'].isin(art_probability['artist_name'])

In [13]:
import re

def find_chinese (x):
    if re.findall(ur'[\u4e00-\u9fff]+', x.decode("utf-8")):
        return True
    else:
        return False

In [14]:
df_train['Chinese_artist'] = df_train['artist_name'].apply(find_chinese)
df_test['Chinese_artist'] = df_test['artist_name'].apply(find_chinese)

In [15]:
df_train.drop(['Unnamed: 0'],axis=1,inplace=True)
df_train.drop(['Unnamed: 0.1'],axis=1,inplace=True)
df_test.drop(['Unnamed: 0'],axis=1,inplace=True)
df_test.drop(['Unnamed: 0.1'],axis=1,inplace=True)

In [16]:
df_train.drop(['count_artist_played'],axis=1,inplace=True) # don't know where it come from
df_test.drop(['count_artist_played'],axis=1,inplace=True)

In [17]:
df_train['gender'] =  df_train['gender'].fillna('None')
df_test['gender'] =  df_test['gender'].fillna('None')

In [18]:
df_train['city_age_gender'] =  (df_train['bd'] == 0) & (df_train['city'] == 1) & (df_train['gender'] == 'None')
df_test['city_age_gender'] =  (df_test['bd'] == 0) & (df_test['city'] == 1) & (df_test['gender'] == 'None' )

In [19]:
cols_as_npint32 = [
    'membership_days',
    'lyricists_count',
    'composer_count',
    'artist_count',
    'artist_composer',
    'count_song_played',
    'lyricists_1',
    'lyricists_2',
    'lyricists_3',
    'artist_name_1',
    'artist_name_2',
    'artist_name_3',
    'composer_1',
    'composer_2',
    'composer_3'
]

cols_as_npint16 = [
    'genre_ids_count',
    'is_featured',
    'registered_via',                 
    'registration_year',       
    'registration_month',       
    'registration_day',         
    'expiration_year',          
    'expiration_month',         
    'expiration_day',
    'popular',
    'quant_age',
    'city',
]

cols_as_npfloat16 = [
    'language',
    'song_year',
    'bd'
]


for col in df_train.columns:
    try:
        if df_train[col].dtype == object:
            df_train[col] = df_train[col].astype('category')
            df_test[col] = df_test[col].astype('category')
        elif col in cols_as_npint32:
            df_train[col] = df_train[col].astype(np.int32)
            df_test[col] = df_test[col].astype(np.int32)
        elif col in cols_as_npint8:
            df_train[col] = df_train[col].astype(np.int16)
            df_test[col] = df_test[col].astype(np.int16)
        elif col in cols_as_npfloat16:
            df_train[col] = df_train[col].astype(np.float16)
            df_test[col] = df_test[col].astype(np.float16)
    except:
        print col,'error!'

target error!
city error!
bd error!
registered_via error!
registration_year error!
registration_month error!
registration_day error!
expiration_year error!
expiration_month error!
expiration_day error!
song_length error!
language error!
song_year error!
genre_ids_count error!
is_featured error!
chinese error!
popular error!
quant_age error!
member_component_0 error!
member_component_1 error!
member_component_2 error!
member_component_3 error!
member_component_4 error!
member_component_5 error!
member_component_6 error!
member_component_7 error!
member_component_8 error!
member_component_9 error!
member_component_10 error!
member_component_11 error!
member_component_12 error!
member_component_13 error!
member_component_14 error!
member_component_15 error!
member_component_16 error!
member_component_17 error!
member_component_18 error!
member_component_19 error!
member_component_20 error!
member_component_21 error!
member_component_22 error!
member_component_23 error!
member_component_24

In [20]:
df_train['msno_dup'] = df_train['msno'].duplicated()
df_test['msno_dup'] = df_test['msno'].duplicated()

In [21]:
def source_type (x):
    if x == 'local-library' or x == 'local-playlist':
        return True
    else:
        return False

df_train['local_source'] = df_train['source_type'].apply(source_type)
df_test['local_source'] = df_test['source_type'].apply(source_type)

def source_screen_name (x):
    if x == 'Local playlist more':
        return True
    else:
        return False

df_train['local_source_screen'] = df_train['source_screen_name'].apply(source_screen_name)
df_test['local_source_screen'] = df_test['source_screen_name'].apply(source_screen_name)

In [22]:
#df_train.drop(['song_id'],axis=1,inplace=True)
#df_test.drop(['song_id'],axis=1,inplace=True)

In [23]:
#df_train['timestamp'] = np.arange(len(df_train))
#df_test['timestamp'] = np.arange(len(df_test)) + len(df_train)

In [24]:
## varience
def timestamp_map(x):
    if x < 7377418:
        x = (x - 0.0) / (7377417.0 - 0.0) * (1484236800.0 - 1471190400.0) + 1471190400.0
    else:
        x = (x - 7377417.0) / (9934207.0 - 7377417.0) * (1488211200.0 - 1484236800.0) + 1484236800.0

    return x

In [25]:
#df_train['timestamp'] = df_train['timestamp'].apply(timestamp_map)
#df_test['timestamp'] = df_test['timestamp'].apply(timestamp_map)

In [26]:
df_train_1 = df_train
#df_train_2 = df_train.drop(['msno'],axis=1)

In [27]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Columns: 146 entries, msno to local_source_screen
dtypes: bool(7), category(11), float16(96), float64(3), int32(15), int64(14)
memory usage: 3.0 GB


In [28]:
#df_train.drop(['count_song_played'],axis=1,inplace=True)
#df_test.drop(['count_song_played'],axis=1,inplace=True)

In [29]:
#msno_check = df_train['msno'].drop_duplicates()

In [30]:
del df_train

In [31]:
del artist_prob
del age_bin
del member_svd
del song_svd

---

# Train & predict

### train/test sets

In [32]:
X_train_1 = df_train_1.drop(['target'], axis=1)
Y_train_1 = df_train_1['target'].values

X_test = df_test.drop(['id'], axis=1)
ids = df_test['id'].values



d_train_final_1 = lgb.Dataset(X_train_1, Y_train_1)
watchlist_final_1 = lgb.Dataset(X_train_1, Y_train_1)
#watchlist_final_1 = lgb.Dataset(X_train_val[vali_length:], y_train_val[vali_length:])

In [33]:
del df_train_1

### Model 1

In [34]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'learning_rate': 0.3 ,
    'verbose': 0,
    'num_leaves': 128,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': -1,
    'num_rounds': 200,
    'metric' : 'auc'
}

model_f1 = lgb.train(params, train_set=d_train_final_1,
                     valid_sets=watchlist_final_1, verbose_eval=5)



[5]	valid_0's auc: 0.75789
[10]	valid_0's auc: 0.780771
[15]	valid_0's auc: 0.794585
[20]	valid_0's auc: 0.803608
[25]	valid_0's auc: 0.81112
[30]	valid_0's auc: 0.815642
[35]	valid_0's auc: 0.82017
[40]	valid_0's auc: 0.823275
[45]	valid_0's auc: 0.826572
[50]	valid_0's auc: 0.829239
[55]	valid_0's auc: 0.831529
[60]	valid_0's auc: 0.833449
[65]	valid_0's auc: 0.835719
[70]	valid_0's auc: 0.838234
[75]	valid_0's auc: 0.840022
[80]	valid_0's auc: 0.84156
[85]	valid_0's auc: 0.843178
[90]	valid_0's auc: 0.84437
[95]	valid_0's auc: 0.845629
[100]	valid_0's auc: 0.846415
[105]	valid_0's auc: 0.847797
[110]	valid_0's auc: 0.849136
[115]	valid_0's auc: 0.850244
[120]	valid_0's auc: 0.851862
[125]	valid_0's auc: 0.853242
[130]	valid_0's auc: 0.854632
[135]	valid_0's auc: 0.855732
[140]	valid_0's auc: 0.856985
[145]	valid_0's auc: 0.858409
[150]	valid_0's auc: 0.859584
[155]	valid_0's auc: 0.860283
[160]	valid_0's auc: 0.861346
[165]	valid_0's auc: 0.862516
[170]	valid_0's auc: 0.863795
[175]

### Model 1-2

In [35]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'dart',
    'learning_rate': 0.3 ,
    'verbose': 0,
    'num_leaves': 128,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': -1,
    'num_rounds': 200,
    'metric' : 'auc'
}

model_f1_2 = lgb.train(params, train_set=d_train_final_1,
                     valid_sets=watchlist_final_1, verbose_eval=5)

[5]	valid_0's auc: 0.75789
[10]	valid_0's auc: 0.775139
[15]	valid_0's auc: 0.788589
[20]	valid_0's auc: 0.799102
[25]	valid_0's auc: 0.806366
[30]	valid_0's auc: 0.810755
[35]	valid_0's auc: 0.814132
[40]	valid_0's auc: 0.818404
[45]	valid_0's auc: 0.820198
[50]	valid_0's auc: 0.821302
[55]	valid_0's auc: 0.824328
[60]	valid_0's auc: 0.825945
[65]	valid_0's auc: 0.826951
[70]	valid_0's auc: 0.827681
[75]	valid_0's auc: 0.828908
[80]	valid_0's auc: 0.829681
[85]	valid_0's auc: 0.829639
[90]	valid_0's auc: 0.830253
[95]	valid_0's auc: 0.830963
[100]	valid_0's auc: 0.832671
[105]	valid_0's auc: 0.833016
[110]	valid_0's auc: 0.832735
[115]	valid_0's auc: 0.83494
[120]	valid_0's auc: 0.835692
[125]	valid_0's auc: 0.836338
[130]	valid_0's auc: 0.837367
[135]	valid_0's auc: 0.838498
[140]	valid_0's auc: 0.838927
[145]	valid_0's auc: 0.839272
[150]	valid_0's auc: 0.839794
[155]	valid_0's auc: 0.840307
[160]	valid_0's auc: 0.839576
[165]	valid_0's auc: 0.840666
[170]	valid_0's auc: 0.843032
[1

In [None]:
X_train_2 = df_train_2.drop(['target'], axis=1)
Y_train_2 = df_train_2['target'].values

d_train_final_2 = lgb.Dataset(X_train_2, Y_train_2)
watchlist_final_2 = lgb.Dataset(X_train_2, Y_train_2)
#watchlist_final_2 = lgb.Dataset(X_train_val[vali_length:], y_train_val[vali_length:])

In [None]:
del df_train_2

### Model 2

In [None]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'learning_rate': 0.3 ,
    'verbose': 0,
    'num_leaves': 128,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': -1,
    'num_rounds': 200,
    'metric' : 'auc'
}

model_f2 = lgb.train(params, train_set=d_train_final_2,
                     valid_sets=watchlist_final_2, verbose_eval=5)

In [None]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'dart',
    'learning_rate': 0.3 ,
    'verbose': 0,
    'num_leaves': 128,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': -1,
    'num_rounds': 200,
    'metric' : 'auc'
}

model_f2_2 = lgb.train(params, train_set=d_train_final_2,
                     valid_sets=watchlist_final_2, verbose_eval=5)

In [None]:
df_test['check'] = df_test['msno'].isin(msno_check)

In [None]:
X_test['name'] = X_test['name'].astype('category')

### Make predictions

In [36]:
X_test = df_test.drop(['id'], axis=1)
ids = df_test['id'].values

p_test_1 = model_f1.predict(X_test)
p_test_1_2 = model_f1_2.predict(X_test)
#p_test_2 = model_f2.predict(X_test[X_test['check'] != True].drop(['check','msno'], axis=1))
#p_test_2_2 = model_f2_2.predict(X_test[X_test['check'] != True].drop(['check','msno'], axis=1))
p_test_avg_1 = np.mean([p_test_1, p_test_1_2], axis = 0)
#p_test_avg_2 = np.mean([p_test_2, p_test_2_2], axis = 0)

In [None]:
#p_test_1 = model_f1.predict(X_test[X_test['check'] == True].drop(['check'], axis=1))
#p_test_2 = model_f2.predict(X_test[X_test['check'] != True].drop(['check','msno'], axis=1))
#p_test_avg_1 = np.mean([p_test_1, p_test_1_2], axis = 0)
#p_test_avg_2 = np.mean([p_test_2, p_test_2_2], axis = 0)

### Save to submission.csv

In [39]:
cur_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
file_name = 'submission_' + cur_time + '.csv'

subm = pd.DataFrame()
#ids = np.append(df_test[df_test['check'] == True]['id'].values, df_test[df_test['check'] != True]['id'].values)
#p_test_avg = np.append(p_test_1_2, p_test_2_2)
#p_test_avg = np.append(p_test_avg_1, p_test_avg_2)
subm['id'] = ids
subm['target'] = p_test_avg_1
#subm['target'] = subm['target'].apply(alignment)
subm.to_csv(file_name, index=False, float_format = '%.5f')
print('saved as '+ file_name)

saved as submission_20171228-202310.csv
