In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import catboost

In [2]:
# Loading Data
# Reads every CSV this notebook works with from the relative `data/` directory.
# In the cells visible below only `train`, `songs` and `song_labels` are used.

# Used below via its `customer_id`, `song_id` and `score` columns.
train = pd.read_csv('data/train.csv')
# Per-song attributes; joined below on `platform_id` (labels) and `song_id` (train).
songs = pd.read_csv('data/songs.csv')
# Used below via its `platform_id`, `label_id` and `count` columns.
song_labels = pd.read_csv('data/song_labels.csv')
test = pd.read_csv('data/test.csv')
save_for_later = pd.read_csv('data/save_for_later.csv')
dummy_submission = pd.read_csv('data/dummy_submission.csv')

In [3]:
# Group the training rows per customer so one customer's rows can be fetched with `get_group`.
customer_groups = train.groupby(by='customer_id')

In [4]:
# Distinct customer ids, in order of first appearance in `train`.
customer_id_list = train.loc[:, 'customer_id'].unique()

In [5]:
# Total up the numeric columns per label and rank labels by summed `count`, highest first.
song_labels_new = (
    song_labels
    .groupby('label_id')
    .sum()
    .sort_values('count', ascending=False)
    .reset_index()
)
# The three top-ranked label ids each become a feature column on `songs`.
principal_song_labels = song_labels_new['label_id'].to_numpy()[:3]

for label_id in principal_song_labels:
    # Rows of this label only, with `count` renamed to `count<label_id>` so the
    # columns of different labels do not collide after merging.
    label = (
        song_labels[song_labels['label_id'] == label_id]
        .reset_index(drop=True)
        .drop(columns='label_id')
        .rename(columns={'count': f'count{label_id}'})
    )
    songs = songs.merge(label, on='platform_id', how='left')

# Songs without a row for a principal label get a count of 0 instead of NaN.
songs = songs.fillna({f'count{label_id}': 0 for label_id in principal_song_labels})

# `platform_id` is not used again in the visible cells after serving as the join key.
songs = songs.drop(columns='platform_id')

In [6]:
# Keep only the first row for each `song_id`.
# NOTE(review): whether the duplicates come from songs.csv itself or from the
# label merges on `platform_id` cannot be told from here - confirm.
songs = songs.drop_duplicates(subset='song_id', keep='first')

In [7]:
# Rows of a single customer. The same lookup is repeated in a later cell
# before `group` is first used.
group = customer_groups.get_group(name='J15604')

In [8]:
# Preview the song table after the per-label count columns were added.
songs

Unnamed: 0,song_id,released_year,language,number_of_comments,count30574,count8717,count11557
0,8328,2013.0,eng,1936,1759.0,215.0,295.0
1,8591,1925.0,eng,420,594.0,17.0,221.0
2,718,1929.0,eng,5485,281.0,0.0,1710.0
3,6593,1968.0,,200,5174.0,186.0,45.0
4,743,2002.0,en-US,10411,30837.0,2903.0,2190.0
...,...,...,...,...,...,...,...
9976,2048,2013.0,eng,4793,1909.0,294.0,86.0
9977,2552,1952.0,eng,1365,1374.0,482.0,206.0
9978,6741,2006.0,en-US,1296,689.0,258.0,284.0
9979,4564,2012.0,eng,1961,2844.0,212.0,145.0


In [9]:
# One indicator column per distinct language code, named `language_<code>`.
# Rows with a missing language get zeros in every indicator column.
# NOTE(review): variants such as `en`, `eng`, `en-US`, `en-GB` and `en-CA`
# end up as separate columns - consider normalising them first.
one_hot = pd.get_dummies(songs['language'], prefix='language')

In [10]:
# Preview the language indicator columns.
one_hot

Unnamed: 0,language_ara,language_dan,language_en,language_en-CA,language_en-GB,language_en-US,language_eng,language_fil,language_fre,language_ger,...,language_nor,language_per,language_pol,language_por,language_rum,language_rus,language_spa,language_swe,language_tur,language_vie
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9976,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9977,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9978,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9979,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Attach the indicator columns row by row; `one_hot` was built from
# `songs['language']`, so both frames share the same index.
songs = songs.merge(one_hot, left_index=True, right_index=True, how='left')

In [12]:
# The raw `language` column is now represented by its indicator columns.
songs = songs.drop(columns='language')

In [13]:
# Preview the song table after `language` was replaced by its indicator columns.
songs

Unnamed: 0,song_id,released_year,number_of_comments,count30574,count8717,count11557,language_ara,language_dan,language_en,language_en-CA,...,language_nor,language_per,language_pol,language_por,language_rum,language_rus,language_spa,language_swe,language_tur,language_vie
0,8328,2013.0,1936,1759.0,215.0,295.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8591,1925.0,420,594.0,17.0,221.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,718,1929.0,5485,281.0,0.0,1710.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6593,1968.0,200,5174.0,186.0,45.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,743,2002.0,10411,30837.0,2903.0,2190.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9976,2048,2013.0,4793,1909.0,294.0,86.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9977,2552,1952.0,1365,1374.0,482.0,206.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9978,6741,2006.0,1296,689.0,258.0,284.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9979,4564,2012.0,1961,2844.0,212.0,145.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Training rows of customer 'J15604'; the next cell joins the song features onto them.
group = customer_groups.get_group(name='J15604')

In [15]:
# Left-join the song features onto the customer's rows. Every customer row is
# kept; a `song_id` that is absent from `songs` yields NaN feature values.
X_train = group.merge(songs, on='song_id', how='left')

In [16]:
# Regression target: the `score` column of the customer's rows.
Y_train = X_train.loc[:, 'score']

In [17]:
# Remove the target from the feature matrix.
X_train = X_train.drop(columns=['score'])

In [18]:
# `song_id` was only the join key and is not kept as a feature.
X_train = X_train.drop(columns='song_id')

In [19]:
# Mark missing release years and comment counts with the sentinel -999.
X_train = X_train.fillna({'released_year': -999, 'number_of_comments': -999})

In [20]:
from sklearn.tree import DecisionTreeRegressor

In [21]:
# Fix the seed: scikit-learn randomly permutes the candidate features at every
# split, so without `random_state` two fits on the same data can produce
# different trees when several splits are equally good.
model = DecisionTreeRegressor(random_state=42)

In [22]:
# `customer_id` is constant here because `X_train` holds a single customer's rows.
X_train = X_train.drop(columns='customer_id')

In [28]:
import time

# In document order this cell runs before the `fillna(-1)` cell, so on a
# top-to-bottom run the merged song features may still contain NaN here
# (customer rows whose song_id has no match in `songs`). Fill them first so the
# fit does not depend on the cells having been executed out of order.
X_train = X_train.fillna(-1)

# perf_counter is a monotonic, high-resolution clock meant for timing short
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
model.fit(X_train, Y_train)
print(time.perf_counter() - start)

0.006980180740356445


In [27]:
# Replace every remaining NaN in the feature matrix with the sentinel -1.
# NOTE(review): this cell appears after the fit cell but carries the lower
# execution count (In [27] vs In [28]); consider moving it above the fit.
X_train = X_train.fillna(-1)

In [29]:
# Start from an empty dictionary; what it collects is not visible in this part
# of the notebook.
dic = dict()