In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Read the competition dataset from its hosted CSV.
DATA_URL = "https://datahack2020dataset.s3.us-east-2.amazonaws.com/OfficialCompetitionDataset.csv"
data = pd.read_csv(DATA_URL)

# Discard the 'Unnamed: 0' column (presumably a saved row index — it is not used below).
del data['Unnamed: 0']
print(f'data dimensions: {data.shape}')

data dimensions: (26504, 25)


Helper functions and symbols

In [None]:
# Column names shared by the helper functions in this section.
# TIME: debut-date column; extract_year_from_date reduces it to an integer year.
TIME = 'song_debut'
# PLAYS: play-count column that popularize_in_place thresholds on.
PLAYS = 'nplays'
# VULGAR: name of the vulgarity column (not referenced by the helpers below).
VULGAR = 'vulgar'

def extract_year_from_date(df):
  """Return a copy of ``df`` whose ``TIME`` column holds the debut year as int64.

  The year is read from the first four characters of each value's string
  form, so the column may contain date strings (e.g. ``'2007-03-14'``) or
  values that are already integer years. Accepting the latter makes the
  function idempotent: calling it again on its own output (for example when
  a notebook cell is re-run) returns the same years instead of failing.

  Args:
    df: DataFrame with a ``TIME`` column.

  Returns:
    A new DataFrame; ``df`` itself is left unmodified.

  Raises:
    ValueError: if the first four characters of any value do not form an
      integer (this includes missing values).
  """
  df = df.copy()
  # Cast to str first: the .str accessor raises AttributeError on a numeric
  # column, which is what TIME becomes after the first call.
  df[TIME] = df[TIME].astype(str).str.slice(0, 4).astype('int64')
  return df

def group_by_year(df):
  """Return a copy of ``df`` with the ``TIME`` years coarsened into buckets.

  * Years in [1960, 2000) are replaced by the first year of their decade
    (1960, 1970, 1980 or 1990).
  * Years of 2016 or later are all set to 2016.
  * Every other year (before 1960, or 2000 through 2015) is kept as is.

  The input frame is not modified.
  """
  binned = df.copy()
  years = binned[TIME]

  # Floor every 1960-1999 year to the start of its decade in one step.
  in_decade_range = (years >= 1960) & (years < 2000)
  decade_start = (years[in_decade_range] // 10 * 10).to_numpy()
  binned.loc[in_decade_range, TIME] = decade_start

  # Collapse 2016 and anything later into a single 2016 bucket.
  recent = binned[TIME] >= 2016
  binned.loc[recent, TIME] = 2016
  return binned

def popularize_in_place(df):
  """Add a 0/1 ``'popular'`` column to ``df``, modifying ``df`` in place.

  A row is marked popular (1) when its ``PLAYS`` value is greater than or
  equal to the 90th percentile of ``PLAYS`` among the rows that share its
  ``TIME`` value; all other rows get 0. Rows whose ``TIME`` or ``PLAYS`` is
  missing are never marked popular.

  Args:
    df: DataFrame with ``TIME`` and ``PLAYS`` columns. A ``'popular'``
      column is created (or overwritten) on this object.

  Returns:
    None.
  """
  # Select PLAYS before taking the quantile: computing it over every column
  # is wasted work and fails on text columns in newer pandas versions.
  thresholds = df.groupby(TIME)[PLAYS].quantile(0.90)

  # Look up each row's threshold by its own TIME label rather than pairing
  # sorted years with thresholds by position.
  row_threshold = df[TIME].map(thresholds)
  df['popular'] = (df[PLAYS] >= row_threshold).astype('int64')

In [None]:
# Reduce song_debut to an integer year, then coarsen the years into buckets.
# NOTE: this rebinds `data`, so the cell operates on its own output if re-run.
data = data.pipe(extract_year_from_date).pipe(group_by_year)

In [None]:
# Columns excluded from the modelling frame.
excluded_columns = [
    'name', 'album', 'artist', 'critic', 'review',
    'reviewer_type', 'nplays', 'hotness',
]
data_drop = data.drop(columns=excluded_columns)

In [None]:
# Preview the modelling frame (pandas shows only the first and last rows).
data_drop

Unnamed: 0,auditory,beats_per_measure,beats_per_min,concert_probability,critic_rating,danceability,hype,instrumentalness,length_minutes,lyricism,major/minor,positivity,song_debut,styles,tone,volume,vulgar
0,0.240300,4.0,175.934,32.00,70.0,1.216527,0.627541,1.060468,388200,8.0,major,-0.676431,2007,rock,C#,0.794281,NOT VULGAR
1,0.020610,4.0,97.017,24.40,60.0,1.382647,0.575489,1.256408,252560,7.0,major,-0.684217,2003,"electronic, rock",D,0.760773,NOT VULGAR
2,0.882000,4.0,110.005,9.99,91.0,1.469614,0.498955,1.125781,563253,3.0,major,0.016498,2005,"electronic, rock",F,0.738168,NOT VULGAR
3,2.571000,4.0,139.955,10.50,62.0,1.958150,0.456158,-0.844109,229840,9.0,minor,-0.964502,2014,pop/r&b,F,0.824472,NOT VULGAR
4,0.221400,3.0,144.033,35.00,0.0,2.377382,0.400788,-0.844077,262627,8.0,minor,1.857820,2004,pop/r&b,F,0.777020,NOT VULGAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26499,1.719000,3.0,179.885,37.10,75.0,1.375751,0.463105,1.283622,340133,7.0,major,-1.318363,2002,"electronic, rock",C,0.716484,NOT VULGAR
26500,0.030600,4.0,122.984,6.75,58.0,1.890800,0.657002,1.261851,348787,5.0,major,1.102605,2014,"electronic, rock",C,0.811926,NOT VULGAR
26501,0.000023,4.0,110.356,25.90,68.0,1.499303,0.591114,-0.719056,288640,2.0,major,-0.170359,2005,rock,D,0.851976,NOT VULGAR
26502,1.290000,4.0,59.672,10.50,72.0,1.447735,0.427879,-0.844369,301027,7.0,major,-0.742610,2016,rap,F#,0.733683,VULGAR


Create a categorical (CatBoost) regressor model that uses the song features to predict how highly a critic would rate the song, and determine which features are most important in determining a critic rating.

In [None]:
# Separate the regression target (critic_rating) from the predictor columns.
X = data_drop.copy()
y = X.pop('critic_rating')

In [None]:
# Label songs with no listed style as the literal string 'None' (styles is
# passed to the model as a categorical feature), then confirm that no column
# still contains missing values.
X = X.fillna({'styles': 'None'})
X.isna().any()

auditory               False
beats_per_measure      False
beats_per_min          False
concert_probability    False
danceability           False
hype                   False
instrumentalness       False
length_minutes         False
lyricism               False
major/minor            False
positivity             False
song_debut             False
styles                 False
tone                   False
volume                 False
vulgar                 False
dtype: bool

In [None]:
# Columns the model should treat as categorical rather than numeric.
cat_features = ['beats_per_measure', 'major/minor', 'styles', 'tone', 'vulgar']

# beats_per_measure is stored as a float (4.0, 3.0, ...); categorical
# features must be integers or strings, so cast it to int64.
X = X.astype({'beats_per_measure': 'int64'})

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

# Hold out scikit-learn's default 25% of rows for evaluation. A fixed
# random_state keeps the split, and every metric and importance derived from
# it, identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# verbose=100 logs every 100th iteration instead of flooding the output with
# one line per boosting round.
cb = CatBoostRegressor(verbose=100)

# NOTE(review): the held-out split is also used as eval_set, so the "best"
# iteration is selected on the same rows used for evaluation — confirm this
# is acceptable or carve out a separate validation split.
# The trailing semicolon suppresses the fitted model's repr.
cb.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_features);

Learning rate set to 0.081068
0:	learn: 12.5982392	test: 12.7029707	best: 12.7029707 (0)	total: 24.5ms	remaining: 24.5s
1:	learn: 12.5649097	test: 12.6713355	best: 12.6713355 (1)	total: 48ms	remaining: 23.9s
2:	learn: 12.5355334	test: 12.6399020	best: 12.6399020 (2)	total: 59.1ms	remaining: 19.6s
3:	learn: 12.5037926	test: 12.6043618	best: 12.6043618 (3)	total: 78ms	remaining: 19.4s
4:	learn: 12.4744973	test: 12.5718368	best: 12.5718368 (4)	total: 91.7ms	remaining: 18.2s
5:	learn: 12.4319622	test: 12.5306069	best: 12.5306069 (5)	total: 113ms	remaining: 18.7s
6:	learn: 12.4052849	test: 12.5072408	best: 12.5072408 (6)	total: 132ms	remaining: 18.7s
7:	learn: 12.3729689	test: 12.4815995	best: 12.4815995 (7)	total: 157ms	remaining: 19.4s
8:	learn: 12.3406963	test: 12.4521171	best: 12.4521171 (8)	total: 188ms	remaining: 20.7s
9:	learn: 12.3198210	test: 12.4317585	best: 12.4317585 (9)	total: 219ms	remaining: 21.7s
10:	learn: 12.2926394	test: 12.4066533	best: 12.4066533 (10)	total: 237ms	remai

<catboost.core.CatBoostRegressor at 0x7f68dd41e050>

In [None]:
# Feature importances of the model fit on all songs, as a table of
# feature id / importance rows ordered from most to least important.
feature_importance_df = cb.get_feature_importance(prettified=True)
feature_importance_df

Unnamed: 0,Feature Id,Importances
0,song_debut,23.872398
1,styles,12.750852
2,danceability,7.883105
3,instrumentalness,7.357296
4,volume,7.152986
5,length_minutes,6.582944
6,auditory,6.306714
7,beats_per_min,4.768663
8,positivity,4.586957
9,hype,4.2822


Model using the same procedure as above, but fit separately on pre-2000 and post-2000 songs. First: pre-2000 only.

In [None]:
# Keep only songs that debuted before 2000. group_by_year has already
# replaced 1960-1999 debut years with their decade, so song_debut here is a
# decade label for those rows.
is_pre_2000 = data_drop['song_debut'].lt(2000)
data_pre = data_drop.loc[is_pre_2000]
data_pre

Unnamed: 0,auditory,beats_per_measure,beats_per_min,concert_probability,critic_rating,danceability,hype,instrumentalness,length_minutes,lyricism,major/minor,positivity,song_debut,styles,tone,volume,vulgar
60,2.57400,1.0,105.005,11.60,94.0,1.437636,0.103459,-0.842841,202733,2.0,major,-0.769860,1970,rock,G,0.594475,NOT VULGAR
92,2.96700,3.0,94.745,34.40,65.0,1.678667,0.332894,-0.844405,58000,9.0,minor,-0.267681,1990,,B,0.665438,NOT VULGAR
95,0.18030,4.0,176.738,76.40,100.0,1.306040,0.609766,-0.789268,173613,6.0,major,1.262213,1960,pop/r&b,C,0.814445,NOT VULGAR
101,0.53700,4.0,114.031,15.80,100.0,2.243417,0.428530,-0.844242,220627,3.0,major,1.713784,1970,pop/r&b,C#,0.729123,NOT VULGAR
107,2.03100,4.0,138.493,11.30,90.0,2.087569,0.617345,1.302672,161707,7.0,major,1.799427,1990,"experimental, rock",C,0.800424,NOT VULGAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26419,0.00552,4.0,142.754,10.30,91.0,1.323130,0.547543,1.142110,427800,2.0,major,1.530820,1970,rock,F,0.781566,NOT VULGAR
26427,2.66400,3.0,97.675,26.50,73.0,1.663627,0.318454,0.467202,534133,0.0,minor,-1.456560,1990,,G,0.737062,NOT VULGAR
26429,0.35700,4.0,123.694,8.42,84.0,1.568312,0.460584,-0.844512,124760,2.0,major,1.678748,1960,rock,A,0.799533,NOT VULGAR
26454,2.93100,4.0,66.794,28.80,65.0,1.673639,0.355574,-0.844033,48520,9.0,minor,0.468069,1990,,Bb,0.660186,NOT VULGAR


In [None]:
# Separate the regression target (critic_rating) from the predictor columns
# of the pre-2000 subset.
X = data_pre.copy()
y = X.pop('critic_rating')

In [None]:
# Label songs with no listed style as the literal string 'None' (styles is
# passed to the model as a categorical feature), then confirm that no column
# still contains missing values.
X = X.fillna({'styles': 'None'})
X.isna().any()

auditory               False
beats_per_measure      False
beats_per_min          False
concert_probability    False
danceability           False
hype                   False
instrumentalness       False
length_minutes         False
lyricism               False
major/minor            False
positivity             False
song_debut             False
styles                 False
tone                   False
volume                 False
vulgar                 False
dtype: bool

In [None]:
# Columns the model should treat as categorical rather than numeric.
cat_features = ['beats_per_measure', 'major/minor', 'styles', 'tone', 'vulgar']

# beats_per_measure is stored as a float (4.0, 3.0, ...); categorical
# features must be integers or strings, so cast it to int64.
X = X.astype({'beats_per_measure': 'int64'})

In [None]:
# Hold out scikit-learn's default 25% of the pre-2000 rows for evaluation.
# A fixed random_state keeps the split, and every metric and importance
# derived from it, identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# verbose=100 logs every 100th iteration instead of flooding the output with
# one line per boosting round.
cb = CatBoostRegressor(verbose=100)

# NOTE(review): the held-out split is also used as eval_set, so the "best"
# iteration is selected on the same rows used for evaluation — confirm this
# is acceptable or carve out a separate validation split.
# The trailing semicolon suppresses the fitted model's repr.
cb.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_features);

Learning rate set to 0.044939
0:	learn: 14.7291181	test: 14.9388405	best: 14.9388405 (0)	total: 5.09ms	remaining: 5.09s
1:	learn: 14.5817245	test: 14.8150106	best: 14.8150106 (1)	total: 10ms	remaining: 5s
2:	learn: 14.4298476	test: 14.6856641	best: 14.6856641 (2)	total: 15ms	remaining: 4.98s
3:	learn: 14.3005915	test: 14.5625967	best: 14.5625967 (3)	total: 20.2ms	remaining: 5.02s
4:	learn: 14.1785766	test: 14.4464346	best: 14.4464346 (4)	total: 25.2ms	remaining: 5.01s
5:	learn: 14.1490392	test: 14.4140716	best: 14.4140716 (5)	total: 26.5ms	remaining: 4.38s
6:	learn: 14.0916011	test: 14.3419463	best: 14.3419463 (6)	total: 28ms	remaining: 3.97s
7:	learn: 14.0044605	test: 14.2949523	best: 14.2949523 (7)	total: 33.1ms	remaining: 4.1s
8:	learn: 13.9196927	test: 14.2053031	best: 14.2053031 (8)	total: 38ms	remaining: 4.18s
9:	learn: 13.8177975	test: 14.0893805	best: 14.0893805 (9)	total: 42.6ms	remaining: 4.22s
10:	learn: 13.7155584	test: 13.9774656	best: 13.9774656 (10)	total: 50ms	remaining

<catboost.core.CatBoostRegressor at 0x7f68dcb18a10>

In [None]:
# Feature importances of the model fit on pre-2000 songs, as a table of
# feature id / importance rows ordered from most to least important.
feature_importance_df = cb.get_feature_importance(prettified=True)
feature_importance_df

Unnamed: 0,Feature Id,Importances
0,styles,19.053015
1,song_debut,12.496723
2,volume,7.500343
3,length_minutes,6.944125
4,instrumentalness,6.581338
5,positivity,6.052179
6,tone,5.873946
7,auditory,5.571553
8,hype,5.28187
9,concert_probability,5.273852


Model only on post-2000

In [None]:
# Keep only songs that debuted in 2000 or later. group_by_year has already
# folded every year from 2016 onward into 2016.
is_post_2000 = data_drop['song_debut'].ge(2000)
data_post = data_drop.loc[is_post_2000]
data_post

Unnamed: 0,auditory,beats_per_measure,beats_per_min,concert_probability,critic_rating,danceability,hype,instrumentalness,length_minutes,lyricism,major/minor,positivity,song_debut,styles,tone,volume,vulgar
0,0.240300,4.0,175.934,32.00,70.0,1.216527,0.627541,1.060468,388200,8.0,major,-0.676431,2007,rock,C#,0.794281,NOT VULGAR
1,0.020610,4.0,97.017,24.40,60.0,1.382647,0.575489,1.256408,252560,7.0,major,-0.684217,2003,"electronic, rock",D,0.760773,NOT VULGAR
2,0.882000,4.0,110.005,9.99,91.0,1.469614,0.498955,1.125781,563253,3.0,major,0.016498,2005,"electronic, rock",F,0.738168,NOT VULGAR
3,2.571000,4.0,139.955,10.50,62.0,1.958150,0.456158,-0.844109,229840,9.0,minor,-0.964502,2014,pop/r&b,F,0.824472,NOT VULGAR
4,0.221400,3.0,144.033,35.00,0.0,2.377382,0.400788,-0.844077,262627,8.0,minor,1.857820,2004,pop/r&b,F,0.777020,NOT VULGAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26499,1.719000,3.0,179.885,37.10,75.0,1.375751,0.463105,1.283622,340133,7.0,major,-1.318363,2002,"electronic, rock",C,0.716484,NOT VULGAR
26500,0.030600,4.0,122.984,6.75,58.0,1.890800,0.657002,1.261851,348787,5.0,major,1.102605,2014,"electronic, rock",C,0.811926,NOT VULGAR
26501,0.000023,4.0,110.356,25.90,68.0,1.499303,0.591114,-0.719056,288640,2.0,major,-0.170359,2005,rock,D,0.851976,NOT VULGAR
26502,1.290000,4.0,59.672,10.50,72.0,1.447735,0.427879,-0.844369,301027,7.0,major,-0.742610,2016,rap,F#,0.733683,VULGAR


In [None]:
# Separate the regression target (critic_rating) from the predictor columns
# of the post-2000 subset.
X = data_post.copy()
y = X.pop('critic_rating')

In [None]:
# Label songs with no listed style as the literal string 'None' (styles is
# passed to the model as a categorical feature), then confirm that no column
# still contains missing values.
X = X.fillna({'styles': 'None'})
X.isna().any()

auditory               False
beats_per_measure      False
beats_per_min          False
concert_probability    False
danceability           False
hype                   False
instrumentalness       False
length_minutes         False
lyricism               False
major/minor            False
positivity             False
song_debut             False
styles                 False
tone                   False
volume                 False
vulgar                 False
dtype: bool

In [None]:
# Columns the model should treat as categorical rather than numeric.
cat_features = ['beats_per_measure', 'major/minor', 'styles', 'tone', 'vulgar']

# beats_per_measure is stored as a float (4.0, 3.0, ...); categorical
# features must be integers or strings, so cast it to int64.
X = X.astype({'beats_per_measure': 'int64'})

In [None]:
# Hold out scikit-learn's default 25% of the post-2000 rows for evaluation.
# A fixed random_state keeps the split, and every metric and importance
# derived from it, identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# verbose=100 logs every 100th iteration instead of flooding the output with
# one line per boosting round.
cb = CatBoostRegressor(verbose=100)

# NOTE(review): the held-out split is also used as eval_set, so the "best"
# iteration is selected on the same rows used for evaluation — confirm this
# is acceptable or carve out a separate validation split.
# The trailing semicolon suppresses the fitted model's repr.
cb.fit(X_train, y_train, eval_set=(X_test, y_test), cat_features=cat_features);

Learning rate set to 0.08038
0:	learn: 12.2834208	test: 12.5945315	best: 12.5945315 (0)	total: 20.3ms	remaining: 20.3s
1:	learn: 12.2391921	test: 12.5610141	best: 12.5610141 (1)	total: 40ms	remaining: 19.9s
2:	learn: 12.2130056	test: 12.5383474	best: 12.5383474 (2)	total: 59.3ms	remaining: 19.7s
3:	learn: 12.1863230	test: 12.5063260	best: 12.5063260 (3)	total: 79.5ms	remaining: 19.8s
4:	learn: 12.1692037	test: 12.4894702	best: 12.4894702 (4)	total: 102ms	remaining: 20.2s
5:	learn: 12.1473440	test: 12.4708055	best: 12.4708055 (5)	total: 120ms	remaining: 19.9s
6:	learn: 12.1185963	test: 12.4458141	best: 12.4458141 (6)	total: 137ms	remaining: 19.4s
7:	learn: 12.0831327	test: 12.4080257	best: 12.4080257 (7)	total: 158ms	remaining: 19.6s
8:	learn: 12.0607475	test: 12.3914960	best: 12.3914960 (8)	total: 177ms	remaining: 19.5s
9:	learn: 12.0371992	test: 12.3727137	best: 12.3727137 (9)	total: 208ms	remaining: 20.6s
10:	learn: 12.0323029	test: 12.3671104	best: 12.3671104 (10)	total: 216ms	remai

<catboost.core.CatBoostRegressor at 0x7f68d8ac3590>

In [None]:
# Feature importances of the model fit on post-2000 songs, as a table of
# feature id / importance rows ordered from most to least important.
feature_importance_df = cb.get_feature_importance(prettified=True)
feature_importance_df

Unnamed: 0,Feature Id,Importances
0,song_debut,19.710858
1,styles,12.531992
2,danceability,8.547114
3,volume,8.50119
4,instrumentalness,7.761355
5,length_minutes,6.720792
6,auditory,6.283181
7,beats_per_min,5.106004
8,hype,5.057062
9,positivity,4.877573
