In [1]:
# Imports

import time

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

In [7]:
# Loading Data

# Read every competition CSV in one pass; the unpacking order matches the path order.
train, songs, song_labels, test, save_for_later, dummy_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/songs.csv',
        'data/song_labels.csv',
        'data/test.csv',
        'data/save_for_later.csv',
        'data/dummy_submission.csv',
    )
)

In [None]:
# Group the training rows by customer so one customer's rows can be fetched with get_group().
customer_groups = train.groupby(by='customer_id')

In [None]:
# Distinct customer ids found in the training data, in order of first appearance.
customer_id_list = pd.unique(train['customer_id'])

In [None]:
# Rank label ids by their summed 'count' (largest first) and keep the top three.
song_labels_new = (
    song_labels
    .groupby('label_id')
    .sum()
    .sort_values('count', ascending=False)
    .reset_index()
)
principal_song_labels = song_labels_new['label_id'].to_numpy()[:3]
count_columns = [f'count{label_id}' for label_id in principal_song_labels]

# Attach one 'count<label_id>' column per principal label, matched on platform_id.
for label_id, count_column in zip(principal_song_labels, count_columns):
    label_counts = (
        song_labels[song_labels['label_id'] == label_id]
        .reset_index(drop=True)
        .drop(columns='label_id')
        .assign(**{count_column: lambda frame: frame['count']})
        .drop(columns='count')
    )
    songs = songs.merge(label_counts, on='platform_id', how='left')

# Rows left unmatched by the left merges hold NaN in the count columns; store 0 instead.
songs[count_columns] = songs[count_columns].fillna(0)

# platform_id was only needed as the merge key.
songs = songs.drop(columns='platform_id')

In [None]:
# Keep only the first row for each song_id.
songs = songs.drop_duplicates(subset='song_id', keep='first')

In [None]:
# Display the songs table after the label-count merge and the song_id de-duplication.
# NOTE(review): this renders the whole frame; a .head() would keep the output short.
songs

In [None]:
# Sentinel written into each column wherever the value is missing.
missing_markers = {
    'released_year': -999,
    'number_of_comments': -999,
    'language': 'none',
}
for column_name, marker in missing_markers.items():
    songs[column_name] = songs[column_name].fillna(marker)

In [None]:
# The label-count columns are named after whichever label ids ranked highest
# (f'count{label_id}' for each id in principal_song_labels), so derive the names
# instead of hard-coding three ids that raise KeyError once the ranking changes.
for label_id in principal_song_labels:
    count_column = f'count{label_id}'
    songs[count_column] = songs[count_column].fillna(-999)

In [None]:
# Training rows of a single customer, used below to prototype the per-customer model.
group = customer_groups.get_group(name='J15604')

In [None]:
# One indicator column per language value, each named 'language_<value>'.
one_hot = pd.get_dummies(songs['language'], prefix='language')

# one_hot carries songs' index, so an index-on-index left merge pairs the rows;
# the original text column is dropped once its indicators are attached.
songs = (
    songs
    .merge(one_hot, how='left', left_index=True, right_index=True)
    .drop(columns='language')
)

In [None]:
# Attach the song features to this customer's training rows (left join on song_id).
X_train = group.merge(songs, on='song_id', how='left')

In [None]:
# Regression target: the score column.
Y_train = X_train.loc[:, 'score']

In [None]:
# The target must not remain among the features.
X_train = X_train.drop(columns='score')

In [None]:
# song_id has served as the join key; remove it from the feature matrix.
X_train = X_train.drop(columns='song_id')


In [None]:
# Any value still missing after the join becomes -1.
X_train = X_train.fillna(-1)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fix the seed: an unseeded tree breaks ties between equally good splits at
# random, so repeated fits on the same data could otherwise differ.
model = DecisionTreeRegressor(random_state=0)

In [None]:
# Every row belongs to the same customer, so customer_id is not kept as a feature.
X_train = X_train.drop(columns='customer_id')

In [None]:
# Elapsed time for fitting the single-customer tree. perf_counter() is a
# monotonic clock, so the interval cannot be skewed by system clock adjustments.
start = time.perf_counter()
model.fit(X_train, Y_train)
print(time.perf_counter() - start)

In [None]:
# Check: does any released_year value remain missing in the feature matrix?
X_train['released_year'].isna().any()

In [None]:
# Time the whole single-customer pipeline: join, feature/target split, fit.
# perf_counter() is monotonic, so the interval is unaffected by clock adjustments.
start = time.perf_counter()

X_train = pd.merge(group, songs, on='song_id', how='left')
Y_train = X_train['score']
# Drop the target, the join key and the constant customer id; remaining gaps become -1.
X_train = X_train.drop(columns=['score', 'song_id', 'customer_id']).fillna(-1)

# Seeded so that repeated runs of this cell fit the same tree.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, Y_train)

print(time.perf_counter() - start)

In [None]:
# Empty mapping. NOTE(review): no later visible cell reads `models`; the
# per-customer models are stored in `model_dict` instead.
models = dict()

In [None]:
# Display the array of distinct training customer ids.
customer_id_list

In [None]:
# Train one decision tree per customer.
#
# The join against `songs` does not depend on the customer, so it is done once
# for the whole training set instead of once per loop iteration. Grouping with
# sort=False visits customers in order of first appearance, which is also the
# order of train['customer_id'].unique().
train_with_songs = train.merge(songs, on='song_id', how='left')

model_dict = {}
for cust, customer_rows in train_with_songs.groupby('customer_id', sort=False):
    Y_train = customer_rows['score']
    # Features: everything except the target, the join key and the customer id;
    # values still missing after the join become -1.
    X_train = customer_rows.drop(columns=['score', 'song_id', 'customer_id']).fillna(-1)
    # Seeded so that re-running the notebook reproduces the same trees.
    model = DecisionTreeRegressor(random_state=0)
    model.fit(X_train, Y_train)
    model_dict[cust] = model

In [None]:
# Display the test table.
test

In [None]:
# Group the test rows by customer, mirroring the grouping of the training rows.
test_groups = test.groupby(by='customer_id')

In [None]:
# Predict a score for every test row, one customer at a time.
#
# The loop is driven by the customers present in the test set. Driving it from
# the training customers raises KeyError for a customer with no test rows and
# leaves out the test rows of customers that have no trained model.
test_dict = []  # one predicted frame per customer (a list, despite the name)

# Customers without a trained model get the mean training score for every row.
fallback_score = train['score'].mean()

test_customers = test_groups.groups  # customer_id -> row labels of that customer
# Training customers first, in their original order, then test-only customers.
ordered_customers = [cust for cust in customer_id_list if cust in test_customers]
already_listed = set(ordered_customers)
ordered_customers += [cust for cust in test_customers if cust not in already_listed]

for customer in ordered_customers:
    group = test_groups.get_group(customer)
    # reset_index/set_index carries the original test row labels through the merge
    # so the predictions can be put back into test order afterwards.
    X_test = group.reset_index().merge(songs, on='song_id', how='left').set_index('index')
    X_test = X_test.drop(columns=['song_id', 'customer_id']).fillna(-1)
    model = model_dict.get(customer)
    if model is None:
        y_pred = np.full(len(X_test), fallback_score)
    else:
        y_pred = model.predict(X_test)
    X_test['score'] = y_pred
    test_dict.append(X_test)

In [None]:
# Rebuild the merged test frame for the customer at position 10 so that it can be
# inspected next to test_dict[10].
group = test_groups.get_group(customer_id_list[10])
rows_with_labels = group.reset_index()
X_test = pd.merge(rows_with_labels, songs, on='song_id', how='left').set_index('index')

In [None]:
# Display the merged test frame built in the previous cell.
X_test

In [None]:
# Display the entry at position 10 of the per-customer prediction list.
test_dict[10]

In [None]:
# Stack the per-customer prediction frames row-wise into a single frame.
Y_final = pd.concat(test_dict, axis=0)

In [None]:
# Order the rows by their index labels (the original test row labels).
Y_final = Y_final.sort_index(axis=0)

In [None]:
# Display the test table again for comparison with the assembled predictions.
test

In [None]:
# Look up the feature row(s) for song_id 8980.
songs.loc[songs['song_id'] == 8980]

In [None]:
# Display the sorted prediction frame.
Y_final

In [None]:
# Display the dummy submission file loaded at the top of the notebook.
dummy_submission

In [None]:
# Keep only the predicted score column.
y_pred_final = Y_final.loc[:, 'score']

In [None]:
# Promote the score Series to a one-column DataFrame.
y_pred_final = pd.DataFrame(y_pred_final)

In [None]:
# Move the row labels out of the index into an ordinary column.
y_pred_final = y_pred_final.reset_index()

In [None]:
# Copy the 'index' column into a 'test_row_id' column.
y_pred_final = y_pred_final.assign(test_row_id=y_pred_final['index'])

In [None]:
# Keep exactly the two output columns, id first.
y_pred_final = y_pred_final.loc[:, ['test_row_id', 'score']]

In [None]:
# Write the per-customer tree predictions to CSV without the DataFrame index.
y_pred_final.to_csv(path_or_buf='baselinev3.csv', index=False)

In [4]:
# Constant baseline: a float array holding 4.0 for every test row.
all_fours = np.full(len(test['customer_id']), 4.0)

In [5]:
# Display the constant-prediction array.
all_fours

array([4., 4., 4., ..., 4., 4., 4.])

In [6]:
# Wrap the constant array in a DataFrame; its single column is labelled 0.
y_pred_final = pd.DataFrame(data=all_fours)

In [8]:
# Add the score column with the constant prediction 4.
y_pred_final = y_pred_final.assign(score=4)

In [10]:
# Remove the column labelled 0 that came from wrapping the raw array.
y_pred_final = y_pred_final.drop(columns=0)

In [11]:
# Use the frame's own index as the test_row_id column.
y_pred_final = y_pred_final.assign(test_row_id=y_pred_final.index)

In [13]:
# Put the id column first, followed by the score.
y_pred_final = y_pred_final.loc[:, ['test_row_id', 'score']]

In [16]:
# Write the constant-score frame to CSV without the DataFrame index.
y_pred_final.to_csv(path_or_buf='just_for_fun.csv', index=False)

In [2]:
# Scratch: five draws from the standard normal distribution (global NumPy RNG, unseeded).
a = np.random.standard_normal(5)

In [3]:
# Scratch: a second vector of five standard-normal draws (global NumPy RNG, unseeded).
b = np.random.standard_normal(5)

In [4]:
# Inner product of the two scratch vectors.
a @ b

0.01883852325062907

In [5]:
# Display the first scratch vector.
a

array([-0.00679298, -0.30192502, -0.25159855, -0.85877147,  0.74156759])

In [9]:
# Customer id stored in the second training row (position 1).
train['customer_id'].iat[1]

'I50343'

In [10]:
# Display the training table.
train

Unnamed: 0,customer_id,song_id,score
0,O29219,3459,3
1,I50343,5326,4
2,N42888,236,5
3,F5740,724,4
4,K4115,8452,5
...,...,...,...
710104,H6322,1003,4
710105,L6306,705,5
710106,J2494,6028,3
710107,G41961,2397,4
