In [5]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost

In [6]:
# Loading Data

# Read every CSV from the local data/ directory, one DataFrame per file.
csv_paths = (
    'data/train.csv',
    'data/songs.csv',
    'data/song_labels.csv',
    'data/test.csv',
    'data/save_for_later.csv',
    'data/dummy_submission.csv',
)
(
    train,
    songs,
    song_labels,
    test,
    save_for_later,
    dummy_submission,
) = map(pd.read_csv, csv_paths)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows. The fixed random_state makes the split,
# and everything derived from `train` further down, reproducible across
# kernel restarts.
# NOTE: `train` is rebound to its own 80% subset, so this cell is not
# idempotent: re-running it without reloading the CSV shrinks `train` again.
train, validation = train_test_split(train, train_size = 0.8, random_state = 42)

In [None]:
# Distinct customer and song identifiers that occur in the training split.
customer_id_list = pd.unique(train['customer_id'])
song_id_list = pd.unique(train['song_id'])

In [None]:
# Hyper-parameters of the latent-factor model
learning_rate = 0.01  # SGD step size
iters = 10            # number of full passes over `train`
dim = 10              # length of every latent vector

# Initialization
# Every customer and every song gets a latent vector drawn from U(0, 1e-9)
# (the original literal 10e-10 is the same number). A seeded generator makes
# the starting point, and therefore the trained weights, reproducible.
rng = np.random.default_rng(42)

customer_weights = {
    customer: rng.uniform(0, 1e-9, dim) for customer in customer_id_list
}
song_weights = {
    song: rng.uniform(0, 1e-9, dim) for song in song_id_list
}

In [None]:
# Training
import time

# Extract the three columns once. Indexing the frame with `.iloc[k]` five
# times per row made pandas scalar access dominate the runtime of the loop.
train_customer_ids = train['customer_id'].to_numpy()
train_song_ids = train['song_id'].to_numpy()
train_scores = train['score'].to_numpy()

start = time.time()
for i in range(iters):
    for customer, song, y in zip(train_customer_ids, train_song_ids, train_scores):
        customer_weight = customer_weights[customer]
        song_weight = song_weights[song]

        # Residual of the dot-product prediction for this (customer, song) pair.
        temp = y - np.dot(customer_weight, song_weight)

        # Both updates use the *old* vectors: new arrays are built first and
        # only then stored back into the dictionaries.
        customer_weight_new = customer_weight + learning_rate*temp*song_weight
        song_weight_new = song_weight + learning_rate*temp*customer_weight
        customer_weights[customer] = customer_weight_new
        song_weights[song] = song_weight_new
    print(i)

print(time.time()-start)

In [None]:
# Spot-check one learned customer vector. `.get` is used because the random
# train/validation split may leave this particular id out of
# `customer_weights`; in that case the cell shows nothing instead of raising
# KeyError and aborting a Run All.
customer_weights.get('O29219')

In [None]:
# Spot-check one learned song vector. `.get` is used because the random
# train/validation split may leave this particular id out of `song_weights`;
# in that case the cell shows nothing instead of raising KeyError.
song_weights.get(3459)

In [None]:
# Display the training frame as it stands after the SGD cell.
train

In [None]:
# Turn the {customer_id: latent vector} dict into a frame with one row per
# customer: one column per latent dimension plus an explicit `customer_id`
# column, on a fresh positional index.
customer_weights_df = (
    pd.DataFrame(customer_weights)
    .T
    .assign(customer_id = lambda frame: frame.index)
    .reset_index(drop = True)
)

In [None]:
# Display the per-customer latent-factor table.
customer_weights_df

In [None]:
# Attach each customer's latent vector to its training rows.
# NOTE: `train` is rebound, so re-running this cell merges the columns twice.
train = pd.merge(train, customer_weights_df, how = 'left', on = 'customer_id')

In [None]:
# Turn the {song_id: latent vector} dict into a frame with one row per song
# (one column per latent dimension plus a `song_id` column), then attach those
# columns to the song metadata.
song_weights_df = pd.DataFrame(song_weights).T.assign(
    song_id = lambda frame: frame.index
)
songs = pd.merge(songs, song_weights_df, how = 'left', on = 'song_id')

In [None]:
# Latent-factor prediction (customer vector · song vector) for every training
# row. The id columns are read out as arrays once instead of calling
# `.iloc[k]` per row, which is far slower and yields the same values.
estimates = [
    np.dot(customer_weights[customer], song_weights[song])
    for customer, song in zip(
        train['customer_id'].to_numpy(), train['song_id'].to_numpy()
    )
]

In [None]:
# Number of estimates produced (one per iterated training row).
len(estimates)

In [None]:
# Store the latent-factor predictions as a feature column.
train = train.assign(estimates = estimates)

In [None]:
# Display the training frame, now including the `estimates` column.
train

In [None]:
# Total `count` per label, largest first. Only `count` is aggregated: summing
# the whole frame would also "sum" `platform_id`, which is meaningless for an
# identifier and wasteful if it is a string column.
song_labels_new = (
    song_labels.groupby('label_id')[['count']]
    .sum()
    .sort_values('count', ascending = False)
    .reset_index()
)

# Ids of the (at most) 100 labels with the highest total count.
principal_song_labels = song_labels_new['label_id'].to_numpy()[:100]

In [None]:
# For every principal label, add a `count<label_id>` column to `songs` holding
# that label's count for the song's platform_id (left join on platform_id).
for label_id in principal_song_labels:
    count_column = f'count{label_id}'
    is_current_label = song_labels['label_id'] == label_id
    label = (
        song_labels[is_current_label]
        .reset_index(drop = True)
        .drop(columns = 'label_id')
        .assign(**{count_column: lambda frame: frame['count']})
        .drop(columns = 'count')
    )
    songs = pd.merge(songs, label, how = 'left', on = 'platform_id')

# Songs without a given label came out of the left join as NaN; treat as 0.
for label_id in principal_song_labels:
    count_column = f'count{label_id}'
    songs[count_column] = songs[count_column].fillna(0)

In [None]:
# `platform_id` was only the join key for the label counts.
songs = songs.drop(columns = 'platform_id')

In [None]:
# Mean training score per song, stored under the column name `score_y` that
# the original self-merge produced. Aggregating only `score` avoids
# (a) a TypeError on pandas >= 2, where `groupby().mean()` no longer silently
#     drops the string `customer_id` column, and
# (b) merging all of `train` with itself just to read one aggregated column.
song_scores = (
    train.groupby('song_id', as_index = False)['score']
    .mean()
    .rename(columns = {'score': 'score_y'})
)

songs = songs.merge(song_scores, on = 'song_id', how = 'left')

In [None]:
# Number of training rows per song, as columns `song_id` / `num_ratings`.
# `rename_axis` + `reset_index(name=...)` is used because the column name
# produced by `value_counts().to_frame()` differs between pandas 1.x
# ('song_id') and pandas 2.x ('count'), which made the original
# `song_num_ratings['song_id']` lookup raise KeyError on 2.x.
song_num_ratings = (
    train['song_id']
    .value_counts()
    .rename_axis('song_id')
    .reset_index(name = 'num_ratings')
)

# Attach the counts and keep a single row per song.
songs = (
    songs.merge(song_num_ratings, on = 'song_id', how = 'left')
    .drop_duplicates('song_id', keep = 'first')
)

In [None]:
# Display the enriched song table.
songs

In [None]:
# Flag whether each (customer, song) training pair also appears in
# `save_for_later`: the merge indicator column `Exist` is 'both' when it does
# and 'left_only' when it does not.
# The pairs are de-duplicated first so that a pair listed twice in
# `save_for_later` cannot duplicate training rows in the left join.
saved_pairs = save_for_later.drop_duplicates(['customer_id', 'song_id'])
f = pd.merge(train, saved_pairs, on=['customer_id','song_id'], how='left', indicator='Exist')
train = f

In [None]:
# Display the training frame, now including the `Exist` merge indicator.
train

In [None]:
# Join the per-song features onto every training row.
X_train = train.merge(songs, how = 'left', on = ['song_id'])

In [None]:
# Display the joined training features.
X_train

In [None]:
# Split off the target, drop the target and the raw identifiers from the
# features, and replace missing song metadata with sentinel values.
Y_train = X_train['score']
X_train = (
    X_train
    .drop(columns = ['score', 'song_id', 'customer_id'])
    .fillna({
        'released_year': -999,
        'language': 'none',
        'number_of_comments': -999,
    })
)

In [None]:
# List the feature columns in order (useful for checking which columns the
# model should treat as categorical).
X_train.columns.tolist()

In [None]:
from sklearn.model_selection import train_test_split

# Fit on 80% of the rows and keep 20% as the evaluation set. The original
# `train_size = 0.2` did the opposite (fit on 20%, evaluate on 80%), which
# looks like an inverted argument given the 0.8 split used earlier.
# NOTE(review): revert to 0.2 if the small training share was deliberate.
# random_state pins the split so the reported metrics are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, train_size = 0.8, random_state = 42
)

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Gradient-boosting regressor with all CatBoost defaults.
model = CatBoostRegressor()

In [None]:
# Categorical features are named instead of given as positions [11, 13], so
# the cell keeps working if columns are added or reordered upstream.
# NOTE(review): 'Exist' (merge indicator) and 'language' (filled with 'none')
# are presumed to be the columns that sat at positions 11 and 13; confirm
# against X_train.columns.tolist().
cat_feature_names = ['Exist', 'language']

# Held-out rows, used for evaluation during fitting and for `use_best_model`.
eval_dataset = catboost.Pool(data=x_test,
                    label=y_test,
                    cat_features=cat_feature_names)
model.fit(x_train, y_train, eval_set = eval_dataset, cat_features = cat_feature_names, use_best_model = True, plot = True)

In [None]:
# Display the raw test frame.
test

In [None]:
# Attach each customer's latent vector to its test rows (same join as for
# `train`). Customers missing from `customer_weights_df` get NaN columns.
test = pd.merge(test, customer_weights_df, how = 'left', on = 'customer_id')

In [None]:
# Display the test frame, now including the customer latent-factor columns.
test

In [None]:
def _latent_estimate(customer, song):
    """Return customer vector · song vector, or 0.0 for an unseen id.

    The weight dictionaries only contain ids from the training split, so a
    test customer or song may be missing. Indexing with `[...]` raised
    KeyError in that case; 0.0 is returned instead, which is effectively what
    the near-zero initial vectors would give for an untrained pair.
    """
    customer_weight = customer_weights.get(customer)
    song_weight = song_weights.get(song)
    if customer_weight is None or song_weight is None:
        return 0.0
    return np.dot(customer_weight, song_weight)


# One latent-factor prediction per test row. The id columns are read out as
# arrays once instead of calling `.iloc[k]` for every row.
estimates_test = [
    _latent_estimate(customer, song)
    for customer, song in zip(
        test['customer_id'].to_numpy(), test['song_id'].to_numpy()
    )
]

In [None]:
# Store the latent-factor predictions as a feature column.
test = test.assign(estimates = estimates_test)

In [None]:
# Display the test frame, now including the `estimates` column.
test

In [None]:
# Flag whether each (customer, song) test pair also appears in
# `save_for_later` via the merge indicator column `Exist`.
# The pairs are de-duplicated first: a pair listed twice would otherwise add
# rows to `test`, and the submission derives `test_row_id` from row position,
# so any extra row would shift every id after it.
saved_pairs = save_for_later.drop_duplicates(['customer_id', 'song_id'])
f = pd.merge(test, saved_pairs, on=['customer_id','song_id'], how='left', indicator='Exist')
test = f

In [None]:
# Display the test frame, now including the `Exist` merge indicator.
test

In [None]:
# Join the per-song features onto every test row.
X_test = test.merge(songs, how = 'left', on = ['song_id'])

In [None]:
# Display the joined test features.
X_test

In [None]:
# Drop the raw identifiers and replace missing song metadata with the same
# sentinel values that were used for the training features.
X_test = (
    X_test
    .drop(columns = ['song_id', 'customer_id'])
    .fillna({
        'released_year': -999,
        'language': 'none',
        'number_of_comments': -999,
    })
)

In [None]:
# Display the final test feature matrix that is passed to the model.
X_test

In [None]:
# Predict a score for every row of the test feature matrix.
y_test_pred = model.predict(X_test)

In [None]:
# Number of predictions produced.
len(y_test_pred)

In [None]:
# Build the submission frame: `test_row_id` is the positional index of each
# prediction and `score` is the predicted value, in that column order.
y_final = pd.DataFrame({'score': y_test_pred})
y_final = y_final.assign(test_row_id = y_final.index)[['test_row_id', 'score']]

In [None]:
# Display the submission frame.
y_final

In [None]:
# Write the submission to the working directory, without the frame's index.
y_final.to_csv('improvedv3.csv', index = False)

In [7]:
# Number of distinct customers in `train`.
# NOTE(review): the execution counts (In [5], In [6], In [4], In [7]) show the
# cells were run out of order, so the saved output may not correspond to the
# `train` produced by a top-to-bottom run.
train['customer_id'].nunique()

14053