In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import time

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head(1)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1


In [3]:
members = pd.read_csv('members.csv')
members.head(1)

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time,expiration_date
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,,7,20110820,20170920


In [4]:
def process_date(df, columns):
    for column in columns:
        df[column + '_year'] = df[column].apply(lambda x: int(str(x)[0:4]))
        df[column + '_month'] = df[column].apply(lambda x: int(str(x)[4:6]))
        df[column + '_date'] = df[column].apply(lambda x: int(str(x)[6:8]))
        
    df.drop(columns, axis=1, inplace=True)
    return df

members = process_date(members, ['registration_init_time', 'expiration_date'])

train = train.merge(members, on='msno', how='left')
test = test.merge(members, on='msno', how='left')
del members

In [5]:
songs = pd.read_csv('songs.csv')
songs.head(1)

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0


In [6]:
songs_genres = songs['genre_ids'].apply(lambda x: [int(v) for v in str(x).split('|') if v != 'nan'])

In [7]:
unique_genres = None
for genres in songs_genres:
    genres = np.array(genres)
    if unique_genres is None:
        unique_genres = genres
        continue
    unique_genres = np.unique(np.concatenate([unique_genres, genres]))

In [8]:
one_hot = np.zeros((len(songs_genres), len(unique_genres)))
for song_id, song_genres in enumerate(songs_genres):
    for genre in song_genres:
        genre_id = list(unique_genres).index(genre)
        one_hot[song_id, genre_id] = 1
        
for genre_id, genre in enumerate(unique_genres):
    songs['genre_' + str(int(genre))] = one_hot[:, genre_id]

In [9]:
songs = songs.drop(['genre_ids'], axis=1)

train = train.merge(songs, on='song_id', how='left')

In [10]:
extra_songs = pd.read_csv('song_extra_info.csv')
extra_songs.head(1)

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043


In [11]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        val = int(isrc[5:7])
        if val > 17:
            return 1900 + val
        else:
            return 2000 + val
    else:
        return np.nan
    
extra_songs['song_year'] = extra_songs['isrc'].apply(isrc_to_year)
extra_songs.drop(['isrc', 'name'], axis=1, inplace=True)

train = train.merge(extra_songs, on='song_id', how='left')
del extra_songs 

In [12]:
#train = train.head(100000)

In [13]:
len(train)

7377418

# Task 1

In [14]:
train = train.fillna(0)

X = train.drop(['target'], axis=1)
y = train['target'].values

In [15]:
for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

In [16]:
code2song_id = dict(list(zip(X['song_id'].cat.codes, X['song_id'])))

In [17]:
for col in X.select_dtypes(include=['category']).columns:
    X[col] = X[col].cat.codes

In [18]:
X_values = X.values

In [19]:
from sklearn.utils import shuffle

X_values, y = shuffle(X_values, y)

In [None]:
from sklearn.model_selection import KFold, train_test_split
from datetime import datetime

start = datetime.now()
X_train, X_test, y_train, y_test = train_test_split(X_values, y, test_size=0.2)
print(datetime.now() - start)

In [21]:
# Xgboost
params = {}
params['objective'] = 'binary:logistic'
params['tree_method'] = 'gpu_hist'
params['learning_rate'] = 0.1
params['n_estimators'] = 500
params['predictor'] = 'gpu_predictor'
params['eta'] = 0.75
params['max_depth'] = 10
params['eval_metric'] = 'auc'

In [22]:
ml = xgb.XGBClassifier(**params)

In [23]:
start = datetime.now()
ml.fit(X_train, y_train)
print(datetime.now() - start)

0:01:59.124885


In [24]:
prediction = ml.predict(X_test)

In [25]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, prediction))
print(metrics.classification_report(y_test, prediction))

0.7116524245166352
              precision    recall  f1-score   support

           0       0.72      0.69      0.70    732231
           1       0.71      0.73      0.72    743253

    accuracy                           0.71   1475484
   macro avg       0.71      0.71      0.71   1475484
weighted avg       0.71      0.71      0.71   1475484



In [26]:
from sklearn.model_selection import KFold

# Evaluation
kf = KFold(5)

ground_truths = []
predictions = []
start = datetime.now()
for iteration, (train_idxs, test_idxs) in enumerate(kf.split(X)):
    print('fold number', iteration + 1)
    X_train, X_test = X_values[train_idxs], X_values[test_idxs]
    y_train, y_test = y[train_idxs], y[test_idxs]

    ml = xgb.XGBClassifier(**params)
    ml.fit(X_train, y_train)
    prediction = ml.predict(X_test)
    
    ground_truths.append(y_test)
    predictions.append(prediction)
print(datetime.now() - start)

fold number 1
fold number 2
fold number 3
fold number 4
fold number 5
0:12:40.328978


In [27]:
from sklearn.metrics import roc_auc_score

def score_auc(ground_truths, predictions):
    aucs = []
    for gt, pred in zip(ground_truths, predictions):
        roc_auc = roc_auc_score(gt, pred)
        print(roc_auc)
        aucs.append(roc_auc)
    mean_auc = np.mean(aucs)
    print('mean', mean_auc)
    
score_auc(ground_truths, predictions)

0.7109350188174736
0.7116193357529643
0.7110302702352628
0.7104240678080854
0.7109987373604509
mean 0.7110014859948474


In [30]:
from catboost import CatBoostClassifier

start = datetime.now()
catboost = CatBoostClassifier(iterations=100, task_type='GPU', 
                              verbose=False, depth=16, devices='0')
catboost.fit(X_train, y_train)
print(datetime.now() - start)

0:04:17.993207


In [31]:
prediction = catboost.predict(X_test)

In [32]:
print(roc_auc_score(y_test, prediction))
print(metrics.classification_report(y_test, prediction))

0.7051889900337003
              precision    recall  f1-score   support

           0       0.71      0.68      0.70    732069
           1       0.70      0.73      0.72    743415

    accuracy                           0.71   1475484
   macro avg       0.71      0.71      0.71   1475484
weighted avg       0.71      0.71      0.71   1475484



In [33]:
from sklearn.model_selection import KFold

# Evaluation
kf = KFold(5)

cat_ground_truths = []
cat_predictions = []
start = datetime.now()
for iteration, (train_idxs, test_idxs) in enumerate(kf.split(X)):
    print('fold number', iteration + 1)
    X_train, X_test = X_values[train_idxs], X_values[test_idxs]
    y_train, y_test = y[train_idxs], y[test_idxs]

    catboost = CatBoostClassifier(n_estimators=600, task_type='GPU', verbose=False)
    catboost.fit(X_train, y_train)
    prediction = catboost.predict(X_test)
    
    cat_ground_truths.append(y_test)
    cat_predictions.append(prediction)
print(datetime.now() - start)

fold number 1
fold number 2
fold number 3
fold number 4
fold number 5
0:22:54.830234


In [34]:
score_auc(cat_ground_truths, cat_predictions)

0.641851324696947
0.6417614445986597
0.6420560858664655
0.6417603580499702
0.6417692636523153
mean 0.6418396953728716


# Task 2

In [150]:
X.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time_year',
       ...
       'genre_2192', 'genre_2194', 'genre_2206', 'genre_2213', 'genre_2215',
       'genre_2219', 'genre_2238', 'genre_2245', 'genre_2248', 'song_year'],
      dtype='object', length=212)

In [141]:
data = X_train[:50000]
y_data = y_train[:50000]

In [155]:
def extract_histories(X):
    users = np.unique(X['msno'].values)
    return [X[X['msno'] == user]['song_id'].astype(str).tolist() for user in users]
        
histories = extract_histories(X.head(50000))
len(histories)

4240

In [185]:
from gensim.models import Word2Vec

model = Word2Vec(window=10, sg=1, hs=0, size=300, negative=10, alpha=0.03, min_alpha=0.0007)
model.build_vocab(histories, progress_per=100)

In [186]:
model.train(histories, total_examples=model.corpus_count, epochs=10, report_delay=1)

(258745, 500000)

In [187]:
print(model[model.wv.vocab].shape)
print(model)

(1853, 300)
Word2Vec(vocab=1853, size=300, alpha=0.03)


  """Entry point for launching an IPython kernel.


In [188]:
model.init_sims(replace=True)

In [222]:
def get_similars(category_code, top=5):
    similars = model.similar_by_vector(model[category_code], topn=top + 1)
    df = []
    for code, similarity in similars:
        song_id = code2song_id[int(code)]
        info = songs[songs['song_id'] == song_id].to_dict('records')[0]
        #print(info)
        info['similarity'] = similarity
        df.append(info)
        
    return pd.DataFrame(df)
code = list(model.wv.vocab.items())[0][0]
get_similars(code)

  
  


Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,similarity
0,T8ZrM8qHA5EckfuOWkaACBvXTK8kc1Dfi3rxnkMSscE=,154749,465,The Age Of Innocence II,,,52.0,1.0
1,qcVW/MncLvTIEE6d0EKb3SZNhU5qNsJ09LlRp4VY4so=,250044,465,Janet Jackson,,,52.0,0.998845
2,RL5pk3IUmdveT6HKPtkpwIaDzz81x+TCcSoxxyd2avM=,191373,465,Jason Mraz,Colbie Caillat| Jason Mraz| Timothy Fagan,,52.0,0.996742
3,IDb1NqYAL1TVoqnfPExvRyuHq4gQ+Fsz96l1Ef8zHMk=,238840,465,Dream5,菊谷知樹,ラッキィ池田＆高木貴司,17.0,0.993482
4,nt+hY8boPKCRNeo7Oha7roBlTsJMhStJvKL2jJ4DPEo=,239281,465,畢書盡 (Bii),畢書盡+天才,天才,3.0,0.992076
5,IY9FSyfgKX5R4wK4XUiHXNgrg3VI0/L3ORGbH5AVpCc=,330710,458,徐佳瑩 (Lala Hsu),鄭楠,陳信延,3.0,0.99104
