In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost

In [2]:
# Loading Data
#
# Every input table is a CSV under data/. The names on the left are bound, in
# order, to the frames read from the paths on the right.
(
    train,
    songs,
    song_labels,
    test,
    save_for_later,
    dummy_submission,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/songs.csv',
        'data/song_labels.csv',
        'data/test.csv',
        'data/save_for_later.csv',
        'data/dummy_submission.csv',
    )
]

In [None]:
from sklearn.model_selection import train_test_split

# Split the ratings once, reproducibly (fixed random_state):
#   train_init - 30% of the rows, used to fit the matrix factorization
#   train_rem  - the remaining rows, used as the held-out set during MF
#                training and as the training set of the downstream regressor
train_init, train_rem = train_test_split(
    train,
    random_state=1023,
    train_size=0.3,
)

In [None]:
# Distinct customer and song ids over the *whole* training file, so that a
# weight vector can be created for every id seen in either split.
customer_id_list, song_id_list = (
    train[id_column].unique() for id_column in ('customer_id', 'song_id')
)

# How many distinct customers ended up in the matrix-factorization split.
train_init['customer_id'].nunique(dropna=True)

In [None]:
# Initializing Matrix Factorization
learning_rate = 1e-2  # SGD step size
iters = 200           # number of training epochs
dim = 100             # latent factors per customer / per song
reg = 0.05            # L2 regularisation coefficient

# Initialization
# A dedicated, seeded generator makes the starting factors -- and therefore
# everything trained from them -- identical after a kernel restart.
MF_SEED = 1023
mf_rng = np.random.default_rng(MF_SEED)

# Factors start as tiny positive values drawn uniformly from [0, 1e-9)
# (1e-9 is the same number as 10e-10).
init_scale = 1e-9

# One latent vector per id, keyed by the id itself.
customer_weights = {
    customer: mf_rng.uniform(0, init_scale, dim) for customer in customer_id_list
}
song_weights = {
    song: mf_rng.uniform(0, init_scale, dim) for song in song_id_list
}

In [None]:
# Training Matrix Factorization
import time

# Extract the id / score columns as NumPy arrays once. Looking them up with
# `frame[col].iloc[k]` for every row of every epoch is loop-invariant work and
# dominates the running time; the values used are exactly the same.
init_customers = train_init['customer_id'].to_numpy()
init_songs = train_init['song_id'].to_numpy()
y_train_init = train_init['score'].to_numpy()

rem_customers = train_rem['customer_id'].to_numpy()
rem_songs = train_rem['song_id'].to_numpy()
y_train_rem = train_rem['score'].to_numpy()

start = time.time()
for i in range(iters):
    # --- one SGD epoch over the factorization split -----------------------
    for customer, song, y in zip(init_customers, init_songs, y_train_init):
        customer_weight = customer_weights[customer]
        song_weight = song_weights[song]

        # Prediction error for this (customer, song) rating.
        residual = y - np.dot(customer_weight, song_weight)

        # Both updates are computed from the *old* vectors: the right-hand
        # sides build new arrays, so `customer_weight` / `song_weight` still
        # refer to the pre-update values when the second line runs.
        customer_weights[customer] = customer_weight + learning_rate*(residual*song_weight - reg*customer_weight)
        song_weights[song] = song_weight + learning_rate*(residual*customer_weight - reg*song_weight)

    # --- mean squared error on the held-out rows and on the training rows --
    estimates_train_rem = np.array([
        np.dot(customer_weights[customer], song_weights[song])
        for customer, song in zip(rem_customers, rem_songs)
    ])
    estimates_train_init = np.array([
        np.dot(customer_weights[customer], song_weights[song])
        for customer, song in zip(init_customers, init_songs)
    ])
    train_error_rem = (1/len(y_train_rem))*np.linalg.norm(y_train_rem - estimates_train_rem)**2
    train_error_init = (1/len(y_train_init))*np.linalg.norm(y_train_init - estimates_train_init)**2
    print(f'iter {i} rem {train_error_rem} init {train_error_init}')

# Total wall-clock training time in seconds.
print(time.time()-start)

In [None]:
# Turn the {customer_id: latent vector} dict into a table with one row per
# customer: the dict keys become the index after transposing, the integer
# columns hold the latent factors, and the id is also exposed as a regular
# `customer_id` column so the table can be merged on it.
customer_weights_df = (
    pd.DataFrame(customer_weights)
    .T
    .assign(customer_id=lambda frame: frame.index)
)

# Appending generated customer features
train_rem = pd.merge(train_rem, customer_weights_df, how='left', on='customer_id')

In [None]:
# Turn the {song_id: latent vector} dict into a table with one row per song:
# the dict keys become the index after transposing, the integer columns hold
# the latent factors, and the id is also exposed as a regular `song_id`
# column so the table can be merged on it.
song_weights_df = (
    pd.DataFrame(song_weights)
    .T
    .assign(song_id=lambda frame: frame.index)
)

# Appending generated song features
songs = pd.merge(songs, song_weights_df, how='left', on='song_id')

In [None]:
# Estimated scores from Matrix Factorization
# For every row of train_rem, the estimate is the dot product of that row's
# customer vector and song vector; the list is in row order and is stored as
# the `estimates` column.
estimates_train = [
    np.dot(customer_weights[customer], song_weights[song])
    for customer, song in zip(train_rem['customer_id'], train_rem['song_id'])
]
train_rem['estimates'] = estimates_train

In [None]:
# Total `count` per label, largest first. Only the `count` column is
# aggregated: summing the whole frame would also "sum" the non-numeric
# `platform_id` column, which is wasted work and not something the ranking
# needs.
song_labels_new = (
    song_labels.groupby('label_id')[['count']]
    .sum()
    .sort_values('count', ascending=False)
    .reset_index()
)

# Ids of the (up to) 100 labels with the highest total count.
principal_song_labels = song_labels_new['label_id'].to_numpy()[:100]

In [None]:
# Add one `count<label_id>` column to `songs` per principal label.
count_columns = []
for label_id in principal_song_labels:
    count_column = f'count{label_id}'

    # Rows of song_labels for this label, without the label id itself and
    # with `count` moved to a label-specific column name.
    per_label = song_labels.loc[song_labels['label_id'] == label_id]
    per_label = per_label.reset_index(drop=True).drop(columns=['label_id'])
    per_label[count_column] = per_label.pop('count')

    songs = songs.merge(per_label, how='left', on='platform_id')
    count_columns.append(count_column)

# A song with no row for a given label gets a count of 0 instead of NaN.
for count_column in count_columns:
    songs[count_column] = songs[count_column].fillna(0)

# platform_id was only needed as the join key for the label tables.
songs = songs.drop(columns=['platform_id'])

In [3]:
# Mean rating per song, one row per song_id, in a column named `score_y`.
# The mean is taken over the `score` column only; averaging every column of
# `train` and merging the result back onto all ratings just to de-duplicate
# it afterwards is unnecessary work and breaks if `train` ever gains a
# non-numeric column.
# NOTE(review): the mean is computed over all of `train`, which includes the
# rows whose `score` is later used as the regression target -- confirm that
# this target leakage is intended.
song_scores = (
    train.groupby('song_id')['score']
    .mean()
    .rename('score_y')
    .reset_index()
)

In [None]:
# Step-by-step computation of `song_scores`: mean rating per song, one row
# per song_id, in a column named `score_y`.
# Only the `score` column is averaged, and the per-song result is used
# directly. This avoids merging it back onto every rating and then dropping
# duplicates in place on a column slice, which can trigger pandas'
# SettingWithCopyWarning.
train_song_mean = train.groupby('song_id')['score'].mean()
song_scores = train_song_mean.rename('score_y').reset_index()

In [4]:
# Attach each song's mean rating (`score_y`); songs without any rating keep NaN.
songs = pd.merge(songs, song_scores, how='left', on='song_id')

In [5]:
# Display the songs table to check the merged `score_y` column.
songs

Unnamed: 0,song_id,platform_id,released_year,language,number_of_comments,score_y
0,8328,T17332564,2013.0,eng,1936,3.562500
1,8591,T773514,1925.0,eng,420,4.105263
2,718,U10975,1929.0,eng,5485,3.706667
3,6593,Y12839,1968.0,,200,4.266667
4,743,Q28881,2002.0,en-US,10411,4.132184
...,...,...,...,...,...,...
9976,2048,X15811568,2013.0,eng,4793,3.657895
9977,2552,W9597,1952.0,eng,1365,3.796875
9978,6741,T105574,2006.0,en-US,1296,3.687500
9979,4564,W13624367,2012.0,eng,1961,3.982143


In [None]:
# Number of ratings each song received in `train`, as a two-column table
# (`song_id`, `num_ratings`).
# The column names are set explicitly instead of relying on the name that
# `value_counts().to_frame()` happens to give the count column: that name is
# `song_id` on pandas < 2.0 but `count` on pandas >= 2.0, where looking up
# `['song_id']` on the frame raises KeyError.
song_num_ratings = (
    train['song_id']
    .value_counts()
    .rename_axis('song_id')
    .reset_index(name='num_ratings')
)

In [None]:
# Attach the per-song rating count, then keep a single row per song_id
# (the first one encountered) so later joins on song_id cannot fan out.
songs = pd.merge(songs, song_num_ratings, how='left', on='song_id')
songs = songs.drop_duplicates('song_id', keep='first')

In [None]:
# Flag whether each (customer_id, song_id) pair of train_rem also appears in
# save_for_later. `indicator='Exist'` adds a column named `Exist` holding
# pandas' merge indicator ('both' when the pair was found, 'left_only'
# otherwise). The result is bound to train_rem directly rather than going
# through a throwaway single-letter global.
train_rem = train_rem.merge(
    save_for_later,
    on=['customer_id', 'song_id'],
    how='left',
    indicator='Exist',
)

In [None]:
# Feature matrix: every train_rem row joined with the features of its song.
X_train = train_rem.merge(songs, how='left', on=['song_id'])

# Target: the `score` column, removed from the feature matrix in the same step.
Y_train = X_train.pop('score')

In [None]:
# Replace missing song metadata with explicit sentinels: -999 for the two
# numeric columns and the string 'none' for the language.
for column, fill_value in {
    'released_year': -999,
    'language': 'none',
    'number_of_comments': -999,
}.items():
    X_train[column] = X_train[column].fillna(fill_value)

# List the final feature columns in order (useful for checking positions).
X_train.columns.tolist()

In [None]:
from catboost import CatBoostRegressor

# Gradient-boosted regressor with library defaults.
model = CatBoostRegressor()

# Columns treated as categorical, given by position in X_train.
# NOTE(review): presumably customer_id, song_id, Exist and language -- verify
# against the X_train column listing, since any change in column order
# silently shifts these positions.
categorical_positions = [0, 1, 103, 105]

model.fit(X_train, Y_train, cat_features=categorical_positions, plot=True)

In [None]:
# Append the learned customer factors to the test rows (NaN for customers
# that have no learned vector, because of the left join).
test = test.merge(customer_weights_df, on='customer_id', how='left')

# Matrix-factorization estimate for every test row: the dot product of the
# row's customer vector and song vector.
# Ids that never appeared in training have no vector; indexing the dicts
# directly would raise KeyError for them, so such rows get NaN instead,
# consistent with the NaN factor columns the left join above produces.
# The id columns are extracted once rather than via `.iloc[k]` on every row.
estimates_test = []
for customer, song in zip(test['customer_id'].to_numpy(), test['song_id'].to_numpy()):
    customer_weight = customer_weights.get(customer)
    song_weight = song_weights.get(song)
    if customer_weight is None or song_weight is None:
        estimates_test.append(np.nan)
    else:
        estimates_test.append(np.dot(customer_weight, song_weight))

test['estimates'] = estimates_test

In [None]:
# Flag whether each (customer_id, song_id) test pair appears in
# save_for_later; the merge indicator is stored in a column named `Exist`.
test = test.merge(
    save_for_later,
    how='left',
    on=['customer_id', 'song_id'],
    indicator='Exist',
)

# Test feature matrix: every test row joined with the features of its song.
X_test = test.merge(songs, how='left', on=['song_id'])

In [None]:
# Replace missing song metadata with the sentinels -999 (numeric columns)
# and 'none' (language).
for column, fill_value in {
    'released_year': -999,
    'language': 'none',
    'number_of_comments': -999,
}.items():
    X_test[column] = X_test[column].fillna(fill_value)

# List the test feature columns in order.
X_test.columns.tolist()

In [None]:
# Predict a score for every row of the test feature matrix.
y_test_pred = model.predict(X_test)

# Submission table: `test_row_id` is the 0-based position of the prediction
# (the frame's default index) and `score` is the predicted value.
y_final = (
    pd.DataFrame(y_test_pred)
    .rename(columns={0: 'score'})
    .assign(test_row_id=lambda frame: frame.index)
    [['test_row_id', 'score']]
)

# Written without the index because test_row_id already carries it.
y_final.to_csv('colab_regularised.csv', index=False)

In [None]:
# Persist the customer factors as CSV. The index is not written; the ids are
# still saved through the `customer_id` column.
customer_weights_csv = 'data/customer_weights.csv'
customer_weights_df.to_csv(customer_weights_csv, index=False)

# Read the file straight back and display it as a sanity check of the export.
temp = pd.read_csv(customer_weights_csv)
temp

In [None]:
# Display the in-memory customer weight table, for comparison with the CSV
# copy that was read back into `temp`.
customer_weights_df

In [None]:
import pickle

# Persist the song factors as CSV. The index is not written; the ids are
# still saved through the `song_id` column.
song_weights_df.to_csv('data/song_weights.csv', index=False)

# Also keep the raw {id: vector} dicts as pickles. `with` guarantees each
# file is flushed and closed even if `pickle.dump` raises, instead of relying
# on a separate cell to call `.close()`.
# NOTE: only unpickle these files if they come from a trusted source --
# `pickle.load` can execute arbitrary code.
with open("data/customer_weights.pkl", "wb") as customer_weights_file:
    pickle.dump(customer_weights, customer_weights_file)

with open("data/song_weights.pkl", "wb") as song_weights_file:
    pickle.dump(song_weights, song_weights_file)