In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost

In [2]:
# Loading Data

train = pd.read_csv('data/train.csv')
songs = pd.read_csv('data/songs.csv')
song_labels = pd.read_csv('data/song_labels.csv')
test = pd.read_csv('data/test.csv')
save_for_later = pd.read_csv('data/save_for_later.csv')
dummy_submission = pd.read_csv('data/dummy_submission.csv')

In [3]:
from sklearn.model_selection import train_test_split
train_init, train_rem = train_test_split(train, train_size = 0.3, random_state = 1023)

In [4]:
customer_id_list = train['customer_id'].unique()

In [5]:
song_id_list = train['song_id'].unique()

In [6]:
# Initializing Matrix Factorization
learning_rate = 1e-2
iters = 200
dim = 100
reg = 0.05
# Initialization

customer_weights = {}
for customer in customer_id_list:
    np.random.seed(0)
    customer_weights[customer] = np.random.uniform(0, 10e-10, dim)
    
song_weights = {}
for song in song_id_list:
    np.random.seed(0)
    song_weights[song] = np.random.uniform(0, 10e-10, dim)

In [7]:
# Training Matrix Factorization
import time
start = time.time()
for i in range(iters):
    for k in range(len(train_init.index.to_numpy())):
        customer_weight = customer_weights[train_init['customer_id'].iloc[k]]
        song_weight = song_weights[train_init['song_id'].iloc[k]]
        y = train_init['score'].iloc[k]
        
        temp = y - np.dot(customer_weight, song_weight)
        customer_weight_new = customer_weight + learning_rate*(temp*song_weight-reg*customer_weight)
        song_weight_new = song_weight + learning_rate*(temp*customer_weight-reg*song_weight)
        customer_weights[train_init['customer_id'].iloc[k]] = customer_weight_new
        song_weights[train_init['song_id'].iloc[k]] = song_weight_new
    estimates_train_rem = []
    for k in range(len(train_rem.index.to_numpy())):
        customer_weight = customer_weights[train_rem['customer_id'].iloc[k]]
        song_weight = song_weights[train_rem['song_id'].iloc[k]]
        estimate = np.dot(customer_weight, song_weight)
        estimates_train_rem.append(estimate)
    estimates_train_init = []
    for k in range(len(train_init.index.to_numpy())):
        customer_weight = customer_weights[train_init['customer_id'].iloc[k]]
        song_weight = song_weights[train_init['song_id'].iloc[k]]
        estimate = np.dot(customer_weight, song_weight)
        estimates_train_init.append(estimate)
    y_train_rem = train_rem['score'].to_numpy()
    y_train_init = train_init['score'].to_numpy()
    estimates_train_rem = np.array(estimates_train_rem)
    estimates_train_init = np.array(estimates_train_init)
    train_error_rem = (1/len(y_train_rem))*np.linalg.norm(y_train_rem - estimates_train_rem)**2
    train_error_init = (1/len(y_train_init))*np.linalg.norm(y_train_init - estimates_train_init)**2
    print(f'iter {i} rem {train_error_rem} init {train_error_init}')
print(time.time()-start)

iter 0 rem 16.468380150358996 init 16.471464380938077
iter 1 rem 16.468380150358815 init 16.471464380937658
iter 2 rem 16.46838015033611 init 16.471464380910817
iter 3 rem 16.468380149296404 init 16.471464379674675
iter 4 rem 16.468380101237393 init 16.471464322481573
iter 5 rem 16.46837787859608 init 16.471461676762562
iter 6 rem 16.46827507643663 init 16.47133929776594
iter 7 rem 16.46352510100062 init 16.465685315418927
iter 8 rem 16.254189181816624 init 16.217798411108355
iter 9 rem 12.97644646798974 init 12.62725456277531
iter 10 rem 8.265125241857886 init 7.86070600249367
iter 11 rem 5.304282714451716 init 4.863500396344908
iter 12 rem 3.5823195478599836 init 3.122908353683876
iter 13 rem 2.5911292296646065 init 2.1429962728478285
iter 14 rem 2.006440004754722 init 1.5868442950640496
iter 15 rem 1.6457175913214688 init 1.2592702417636767
iter 16 rem 1.413674303453657 init 1.0588741270074125
iter 17 rem 1.2594117779500078 init 0.9325113419205444
iter 18 rem 1.1540366232481511 init

KeyboardInterrupt: 

In [None]:
customer_weights_df = pd.DataFrame(customer_weights)

In [None]:
customer_weights_df = customer_weights_df.transpose()

In [None]:
customer_weights_df['customer_id'] = customer_weights_df.index

In [None]:
train_rem = train_rem.merge(customer_weights_df, on = 'customer_id', how = 'left')

In [None]:
song_weights_df = pd.DataFrame(song_weights)

In [None]:
song_weights_df = song_weights_df.transpose()

In [None]:
song_weights_df['song_id'] = song_weights_df.index

In [None]:
songs = songs.merge(song_weights_df, on = 'song_id', how = 'left')

In [None]:
estimates_train = []
for k in range(len(train_rem.index.to_numpy())):
    customer_weight = customer_weights[train_rem['customer_id'].iloc[k]]
    song_weight = song_weights[train_rem['song_id'].iloc[k]]
    estimate = np.dot(customer_weight, song_weight)
    estimates_train.append(estimate)

In [None]:
train_rem['estimates'] = estimates_train

In [None]:
song_labels_pivot = song_labels.pivot_table(index = 'platform_id', columns = 'label_id', values = 'count')

In [None]:
song_labels_pivot = song_labels_pivot.fillna(0)

In [None]:
song_labels_pivot = song_labels_pivot.applymap(lambda x: np.log(1+np.abs(x)))

In [None]:
song_labels_pivot

In [None]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=100, max_iter = 1000, verbose = 1, random_state = 1)

In [None]:
song_labels_transformed = nmf.fit_transform(song_labels_pivot)

In [None]:
song_labels_transformed.shape

In [None]:
song_labels_transformed_df = pd.DataFrame(song_labels_transformed, index = song_labels_pivot.index)

In [None]:
song_labels_transformed_df

In [None]:
songs = pd.merge(songs, song_labels_transformed_df, on = 'platform_id', how = 'left')

In [None]:
songs

In [None]:
songs = songs.drop(['platform_id'], axis = 1)

In [None]:
train_song_mean = train.groupby('song_id').mean()
song_scores = train.merge(train_song_mean, on = 'song_id', how = 'left')
song_scores = song_scores[['song_id', 'score_y']]
song_scores.drop_duplicates('song_id', keep = 'first', inplace = True)

In [None]:
songs = songs.merge(song_scores, on = 'song_id', how = 'left')

In [None]:
song_num_ratings = train['song_id'].value_counts().to_frame()

In [None]:
song_num_ratings['num_ratings'] = song_num_ratings['song_id']

In [None]:
song_num_ratings['song_id'] = song_num_ratings.index

In [None]:
songs = songs.merge(song_num_ratings, on = 'song_id', how = 'left')

In [None]:
songs.drop_duplicates('song_id', keep = 'first', inplace = True)

In [None]:
f = pd.merge(train_rem, save_for_later, on=['customer_id','song_id'], how='left', indicator='Exist')

In [None]:
train_rem = f

In [None]:
X_train = pd.merge(train_rem, songs, on = ['song_id'], how = 'left')

In [None]:
Y_train = X_train['score']

In [None]:
X_train.drop(['score'], axis = 1, inplace = True)

In [None]:
X_train['released_year'] = X_train['released_year'].fillna(-999)
X_train['language'] = X_train['language'].fillna('none')
X_train['number_of_comments'] = X_train['number_of_comments'].fillna(-999)

In [None]:
X_train.columns.to_numpy().tolist().index('Exist')

In [None]:
from catboost import CatBoostRegressor

In [None]:
model = CatBoostRegressor(cat_features = [0,1, 103, 105])

In [None]:
model.fit(X_train, Y_train, cat_features = [0,1, 103, 105], plot = True)

In [None]:
test = test.merge(customer_weights_df, on = 'customer_id', how = 'left')

In [None]:
estimates_test = []
for k in range(len(test.index.to_numpy())):
    customer_weight = customer_weights[test['customer_id'].iloc[k]]
    song_weight = song_weights[test['song_id'].iloc[k]]
    estimate = np.dot(customer_weight, song_weight)
    estimates_test.append(estimate)

In [None]:
test['estimates'] = estimates_test

In [None]:
test = pd.merge(test, save_for_later, on=['customer_id','song_id'], how='left', indicator='Exist')

In [None]:
X_test = pd.merge(test, songs, on = ['song_id'], how = 'left')

In [None]:
X_test['released_year'] = X_test['released_year'].fillna(-999)
X_test['language'] = X_test['language'].fillna('none')
X_test['number_of_comments'] = X_test['number_of_comments'].fillna(-999)

In [None]:
y_test_pred = model.predict(X_test)

In [None]:
y_final = pd.DataFrame(y_test_pred)

In [None]:
y_final['score'] = y_final[0]

In [None]:
y_final.drop(0, axis = 1, inplace = True)

In [None]:
y_final['test_row_id'] = y_final.index

In [None]:
y_final = y_final[['test_row_id', 'score']]

In [None]:
y_final.to_csv('colab_nmfv2.csv', index = False)