In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost

In [2]:
# Loading Data
# Every input table is a CSV under data/; read them in one pass and unpack
# into the names the rest of the notebook uses.
train, songs, song_labels, test, save_for_later, dummy_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/songs.csv',
        'data/song_labels.csv',
        'data/test.csv',
        'data/save_for_later.csv',
        'data/dummy_submission.csv',
    )
)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the labelled interactions; the fixed random_state keeps the
# split identical across runs.
train_init, train_rem = train_test_split(train, train_size = 0.6, random_state = 1023)

# The cells below read `train` and `validation`. The factor dictionaries are
# keyed by the ids found in `train_init`, so `train` must be that same 60%
# split (training on the full table would both leak the hold-out rows and
# look up ids that have no factor vector), and `validation` is the remainder.
# NOTE: this rebinds `train`; re-run the data-loading cell before re-running
# this one, otherwise the already-reduced frame is split a second time.
train = train_init.reset_index(drop=True)
validation = train_rem.reset_index(drop=True)

In [4]:
# Distinct customer and song ids that occur in the training split.
customer_id_list, song_id_list = (
    train_init[id_column].unique() for id_column in ('customer_id', 'song_id')
)

In [6]:
# Hyperparameters of the matrix-factorisation model trained below.
learning_rate = 1e-2   # SGD step size
iters = 35             # number of passes over the training rows
dim = 10               # length of each latent-factor vector
reg = 0.05             # L2 shrinkage applied in every update

# Initialization
# Draw the starting factors from a seeded generator so that a
# Restart & Run All reproduces the same model. The upper bound 10e-10
# (i.e. 1e-9) is kept from the original: factors start just above zero.
init_rng = np.random.default_rng(1023)

customer_weights = {
    customer: init_rng.uniform(0, 10e-10, dim) for customer in customer_id_list
}
song_weights = {
    song: init_rng.uniform(0, 10e-10, dim) for song in song_id_list
}

In [None]:
# Training
# Stochastic gradient descent on the regularised squared error of
# score ~ dot(customer_vector, song_vector), one update per training row.
import time

# Pull ids and targets out of pandas once: per-row `.iloc` access inside the
# loops below is far slower than iterating over plain arrays.
train_customer_ids = train['customer_id'].to_numpy()
train_song_ids = train['song_id'].to_numpy()
y_train = train['score'].to_numpy()
val_customer_ids = validation['customer_id'].to_numpy()
val_song_ids = validation['song_id'].to_numpy()
y_val = validation['score'].to_numpy()

# Ids that only occur in the hold-out rows have no learned vector; a zero
# vector makes their prediction 0.0 instead of raising KeyError.
cold_start_vector = np.zeros(dim)


def predict_scores(customer_ids, song_ids):
    """Return dot(customer_vector, song_vector) for each (customer, song) pair.

    Ids missing from `customer_weights` / `song_weights` fall back to
    `cold_start_vector`, i.e. a prediction of 0.0.
    """
    return np.array([
        np.dot(customer_weights.get(customer, cold_start_vector),
               song_weights.get(song, cold_start_vector))
        for customer, song in zip(customer_ids, song_ids)
    ])


start = time.time()
for i in range(iters):
    # Training rows are expected to use ids present in the factor dicts.
    for customer, song, y in zip(train_customer_ids, train_song_ids, y_train):
        customer_weight = customer_weights[customer]
        song_weight = song_weights[song]

        temp = y - np.dot(customer_weight, song_weight)  # prediction error
        # Both updates use the pre-update vectors (simultaneous step).
        customer_weights[customer] = customer_weight + learning_rate*(temp*song_weight-reg*customer_weight)
        song_weights[song] = song_weight + learning_rate*(temp*customer_weight-reg*song_weight)

    # Mean squared error on both splits after this pass.
    estimates_train = predict_scores(train_customer_ids, train_song_ids)
    estimates_validation = predict_scores(val_customer_ids, val_song_ids)
    train_error = np.mean((y_train - estimates_train)**2)
    val_error = np.mean((y_val - estimates_validation)**2)
    print(f'iter {i} train {train_error} val {val_error}')
print(time.time()-start)

In [None]:
# Leftover scratch cell: a bare literal that only echoes 10 as the cell
# output; it assigns nothing.
10

In [None]:
# Number of validation targets.
y_val.shape[0]

In [None]:
# Spot-check one learned customer vector. `.get` shows nothing instead of
# raising KeyError when this id did not land in the training split.
customer_weights.get('K47325')

In [None]:
# One row per customer: the factor vector spread across columns, plus the
# id repeated as a regular column so it can serve as a join key.
customer_weights_df = pd.DataFrame(customer_weights).T.assign(
    customer_id=lambda frame: frame.index
)

# Attach each customer's factors to the interaction rows.
train = pd.merge(train, customer_weights_df, how = 'left', on = 'customer_id')
validation = pd.merge(validation, customer_weights_df, how = 'left', on = 'customer_id')

In [None]:
# One row per song: the factor vector spread across columns, plus the id
# repeated as a regular column so it can serve as a join key.
song_weights_df = pd.DataFrame(song_weights).T.assign(
    song_id=lambda frame: frame.index
)

# Attach each song's factors to the song metadata table.
songs = pd.merge(songs, song_weights_df, how = 'left', on = 'song_id')

In [None]:
# Predicted score of the factor model for every interaction row, stored as
# the `estimates` column of both splits.
# Ids without a learned vector (present only in the hold-out rows) fall back
# to a zero vector, giving an estimate of 0.0 instead of a KeyError.
cold_start_vector = np.zeros(dim)

estimates_train = [
    np.dot(customer_weights.get(customer, cold_start_vector),
           song_weights.get(song, cold_start_vector))
    for customer, song in zip(train['customer_id'].to_numpy(),
                              train['song_id'].to_numpy())
]

estimates_validation = [
    np.dot(customer_weights.get(customer, cold_start_vector),
           song_weights.get(song, cold_start_vector))
    for customer, song in zip(validation['customer_id'].to_numpy(),
                              validation['song_id'].to_numpy())
]

validation['estimates'] = estimates_validation
train['estimates'] = estimates_train

In [None]:
# Total `count` per label, largest first. Only `count` is aggregated:
# summing the whole frame would also "sum" the platform_id identifier column,
# which is meaningless and, for string ids, slow.
song_labels_new = (
    song_labels.groupby('label_id', as_index = False)['count']
    .sum()
    .sort_values('count', ascending = False)
    .reset_index(drop = True)
)

# The ten labels with the highest total count.
principal_song_labels = song_labels_new['label_id'].to_numpy()[:10]

In [None]:
# Add one `count<label>` column to `songs` per principal label, joined on
# platform_id.
for label_id in principal_song_labels:
    count_col = f'count{label_id}'
    label_rows = song_labels.loc[song_labels['label_id'] == label_id]
    label_counts = (
        label_rows.reset_index(drop = True)
        .drop(columns = 'label_id')
        .assign(**{count_col: lambda frame: frame['count']})
        .drop(columns = 'count')
    )
    songs = songs.merge(label_counts, how = 'left', on = 'platform_id')

# Songs without a given label get a count of 0.
count_cols = [f'count{label_id}' for label_id in principal_song_labels]
songs[count_cols] = songs[count_cols].fillna(0)

# platform_id was only needed as the join key above.
songs = songs.drop(columns = 'platform_id')

In [None]:
# Mean training score per song, exposed as `score_y` (the column name the
# original self-merge produced). Only `score` is averaged: calling `.mean()`
# on the whole grouped frame raises TypeError on pandas >= 2.0 because of the
# string `customer_id` column. `sort = False` keeps songs in order of first
# appearance in `train`.
# NOTE(review): this mean includes each training row's own score, so it leaks
# the target into the training features — consider out-of-fold means.
song_scores = (
    train.groupby('song_id', sort = False, as_index = False)['score']
    .mean()
    .rename(columns = {'score': 'score_y'})
)

songs = songs.merge(song_scores, on = 'song_id', how = 'left')

In [None]:
# Number of training rows per song as a two-column frame
# (song_id, num_ratings). Naming the index and the value column explicitly
# works on every pandas version; relying on `value_counts().to_frame()`
# producing a column called 'song_id' breaks on pandas >= 2.0, where that
# column is named 'count'.
song_num_ratings = (
    train['song_id']
    .value_counts()
    .rename_axis('song_id')
    .reset_index(name = 'num_ratings')
)

songs = songs.merge(song_num_ratings, on = 'song_id', how = 'left')

# The left merges above can repeat a song; keep the first row for each id.
songs = songs.drop_duplicates('song_id', keep = 'first')

In [None]:
# Left-join the save-for-later table onto both splits. The merge indicator
# column `Exist` records whether the (customer, song) pair was found there.
saved_merge_options = dict(on = ['customer_id', 'song_id'], how = 'left', indicator = 'Exist')

f = train.merge(save_for_later, **saved_merge_options)
fv = validation.merge(save_for_later, **saved_merge_options)

train, validation = f, fv

In [None]:
# Placeholders written into missing song metadata.
missing_value_fill = {
    'released_year': -999,
    'language': 'none',
    'number_of_comments': -999,
}


def split_features_target(interactions):
    """Join song features onto `interactions`; return (features, target).

    The target is the `score` column. `score` and `song_id` are removed from
    the features, and missing song metadata is replaced by the placeholders
    in `missing_value_fill`.
    """
    joined = pd.merge(interactions, songs, on = ['song_id'], how = 'left')
    target = joined['score']
    features = joined.drop(columns = ['score', 'song_id'])
    for column, placeholder in missing_value_fill.items():
        features[column] = features[column].fillna(placeholder)
    return features, target


X_train, Y_train = split_features_target(train)
X_val, Y_val = split_features_target(validation)

In [None]:
# Feature names in column order.
X_train.columns.tolist()

In [None]:
from catboost import CatBoostRegressor

In [None]:
model = CatBoostRegressor()

# Declare categorical features by name rather than by position: positions
# shift whenever a column is added or dropped upstream (e.g. `song_id` is
# removed from the features before this cell). Every non-numeric column has
# to be declared categorical for CatBoost to accept it.
# NOTE(review): presumably this selects customer_id, Exist and language —
# confirm against the column listing above.
cat_feature_names = X_train.select_dtypes(exclude = ['number', 'bool']).columns.tolist()

eval_dataset = catboost.Pool(data=X_val,
                    label=Y_val,
                    cat_features=cat_feature_names)
model.fit(X_train, Y_train, eval_set = eval_dataset, cat_features = cat_feature_names, use_best_model = True, plot = True)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
validation.head()

In [None]:
# Factor row of a single customer (empty frame if the id is absent).
customer_weights_df.loc[customer_weights_df['customer_id'].eq('F38860')]