In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Loading Data

# Read each CSV under data/ into its own DataFrame; the order of the paths
# matches the order of the names they are unpacked into.
csv_paths = (
    'data/train.csv',
    'data/songs.csv',
    'data/song_labels.csv',
    'data/test.csv',
    'data/save_for_later.csv',
    'data/dummy_submission.csv',
)
(train, songs, song_labels,
 test, save_for_later, dummy_submission) = map(pd.read_csv, csv_paths)

In [3]:
# songs preprocessing

# Number of most frequent labels that become per-song count features.
TOP_N_LABELS = 100

# Rank labels by their total count over all songs. Only `count` is summed, so
# the platform_id key is never aggregated along with it.
song_labels_new = (
    song_labels.groupby('label_id')[['count']]
    .sum()
    .sort_values('count', ascending = False)
    .reset_index()
)
principal_song_labels = song_labels_new['label_id'].to_numpy()[:TOP_N_LABELS]
count_columns = [f'count{i}' for i in principal_song_labels]

# Build one wide table (one row per platform_id, one `count<label_id>` column
# per principal label) and join it onto songs a single time instead of merging
# once per label. Because every platform_id appears exactly once in this table,
# repeated (platform_id, label_id) rows in song_labels are summed rather than
# duplicating song rows in the join.
label_counts = (
    song_labels[song_labels['label_id'].isin(principal_song_labels)]
    .pivot_table(index = 'platform_id', columns = 'label_id', values = 'count', aggfunc = 'sum')
    .reindex(columns = principal_song_labels)
)
label_counts.columns = count_columns
songs = pd.merge(songs, label_counts.reset_index(), on = 'platform_id', how = 'left')

# A song that does not carry a given label gets a count of 0 for it.
songs[count_columns] = songs[count_columns].fillna(0)

# platform_id was only needed as the join key for the label counts.
songs = songs.drop(['platform_id'], axis = 1)

# Mean rating per song, exposed as `score_y`. The mean is taken over the
# `score` column only: averaging the whole frame would also average identifier
# columns such as customer_id, and raises a TypeError on pandas >= 2.0 when any
# of them is non-numeric. sort = False keeps songs in order of first appearance
# in train.
# NOTE(review): this statistic is computed from every row of train and is later
# joined back onto those same rows as a feature, before the train/validation
# split, so each row's feature includes its own target. The validation error is
# therefore likely optimistic; consider an out-of-fold encoding.
song_scores = (
    train.groupby('song_id', sort = False)['score']
    .mean()
    .rename('score_y')
    .reset_index()
)

songs = songs.merge(song_scores, on = 'song_id', how = 'left')

# Number of ratings each song received in train, as columns
# ['song_id', 'num_ratings']. groupby(...).size() lets both names be set
# explicitly; value_counts().to_frame() is avoided because the name of its
# count column differs between pandas versions ('song_id' before 2.0, 'count'
# from 2.0 on), which makes column lookups on it version-dependent.
song_num_ratings = (
    train.groupby('song_id')
    .size()
    .rename('num_ratings')
    .reset_index()
)
songs = songs.merge(song_num_ratings, on = 'song_id', how = 'left')

# Keep a single row per song so later joins on song_id cannot multiply rows.
songs = songs.drop_duplicates('song_id', keep = 'first')

In [4]:
# train preprocessing

# One row per (customer, song) pair, so the left merge below cannot multiply
# training rows when the same pair was saved more than once.
saved_pairs = save_for_later.drop_duplicates(['customer_id', 'song_id'])

# `Exist` is the merge indicator: 'both' when the rated pair also appears in
# save_for_later, 'left_only' otherwise.
train = pd.merge(train, saved_pairs, on = ['customer_id', 'song_id'], how = 'left', indicator = 'Exist')

# Attach the per-song features, then split off the target. song_id is dropped
# after the join and is therefore not used as a feature.
train_with_songs = pd.merge(train, songs, on = ['song_id'], how = 'left')
Y_train = train_with_songs['score']
X_train = train_with_songs.drop(['score', 'song_id'], axis = 1)

# Sentinel values for missing song metadata: -999 for the numeric columns and
# the string 'none' for the language.
X_train = X_train.fillna({
    'released_year': -999,
    'language': 'none',
    'number_of_comments': -999,
})

In [None]:
# Training

# Fixed seed so the hold-out split and the fitted model are the same on every
# "Restart & Run All".
SEED = 42

# Positional indices of the categorical columns of X_train, shared by the
# validation pool and the fit call so the two cannot drift apart.
# NOTE(review): presumably customer_id, Exist and language given the merge
# order in the train-preprocessing cell; verify against X_train.columns.
CAT_FEATURES = [0, 1, 3]

# Hold out 20% of the rows as a validation set.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, train_size = 0.8, random_state = SEED
)

model = CatBoostRegressor(random_seed = SEED)

eval_dataset = catboost.Pool(data = x_test,
                             label = y_test,
                             cat_features = CAT_FEATURES)

# use_best_model keeps the iteration that scored best on eval_dataset;
# plot = True renders CatBoost's interactive training chart in the notebook.
model.fit(
    x_train, y_train,
    eval_set = eval_dataset,
    cat_features = CAT_FEATURES,
    use_best_model = True,
    plot = True,
)

In [None]:
# test preprocessing

# Row count of the test set as loaded; checked again at the end of this cell.
n_test_rows = len(test)

# One row per (customer, song) pair, so the left merge below cannot multiply
# test rows when the same pair was saved more than once.
saved_pairs = save_for_later.drop_duplicates(['customer_id', 'song_id'])

# Same feature construction as for train: the `Exist` merge indicator, the
# per-song features, and song_id dropped after the join.
test = pd.merge(test, saved_pairs, on = ['customer_id', 'song_id'], how = 'left', indicator = 'Exist')
X_test = pd.merge(test, songs, on = ['song_id'], how = 'left')
X_test = X_test.drop('song_id', axis = 1)

# Same sentinel values for missing song metadata as in the training features.
X_test = X_test.fillna({
    'released_year': -999,
    'language': 'none',
    'number_of_comments': -999,
})

# Predictions are produced from the rows of X_test, so it must still line up
# one-to-one with the test rows that were loaded.
assert len(X_test) == n_test_rows, 'test rows were duplicated or lost during preprocessing'

In [None]:
# Predicting
# Score the preprocessed test features with the model fitted in the training cell.
# NOTE(review): dummy_submission is loaded at the top of the notebook, but the
# predictions are not written to any file in the cells visible here.
y_test_pred = model.predict(X_test)