# Data exploration, cleaning and manipulation

In [6]:
import pandas as pd 
import numpy as np
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
albums = pd.read_csv('albums.csv', index_col=0, parse_dates=True)[1:]
albums['date'] = pd.to_datetime(albums.date)
print(albums.shape)
albums.head(3)

(573946, 7)


Unnamed: 0_level_0,id,date,artist,album,rank,length,track_length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2,2019-01-19,A Boogie Wit da Hoodie,Hoodie SZN,1.0,20.0,185233.8
2,3,2019-01-19,21 Savage,I Am > I Was,2.0,15.0,211050.733333
3,4,2019-01-19,Soundtrack,Spider-Man: Into The Spider-Verse,3.0,13.0,190866.384615


Albums sorted by # weeks in the Billboard Top 200


In [12]:
albums.groupby(['album', 'artist'])['rank'].count().sort_values(ascending=False).head(40)

album                                       artist                                             
The Dark Side Of The Moon                   Pink Floyd                                             941
Legend: The Best Of...                      Bob Marley And The Wailers                             555
Journey's Greatest Hits                     Journey                                                545
Metallica                                   Metallica                                              516
Greatest Hits                               Guns N' Roses                                          451
Curtain Call: The Hits                      Eminem                                                 426
Nevermind                                   Nirvana                                                406
Doo-Wops & Hooligans                        Bruno Mars                                             404
Chronicle The 20 Greatest Hits              Creedence Clearwater Revival Featuri

Cleaning acoustic features

In [49]:
# Remove tracks less than a minute long 
features = acoustic_features.loc[acoustic_features['duration_ms'] > 60000]
# Drop non numeric columns 
features.drop(columns=['id', 'song', 'artist', 'album_id', 'date'], inplace=True)
# Extract total run time per album 
length = features.groupby('album')['duration_ms'].sum()
# Set index to album  
features.set_index('album', inplace=True)
# Set length name to weights for clarity in weighted mean calculation
length.name = 'weights'
features.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Hoodie SZN,0.0555,0.754,142301.0,0.663,0.0,6.0,0.101,-6.311,0.0,0.427,90.195,4.0,0.207
Hoodie SZN,0.292,0.86,152829.0,0.418,0.0,7.0,0.106,-9.061,0.0,0.158,126.023,4.0,0.374
Hoodie SZN,0.153,0.718,215305.0,0.454,4.6e-05,8.0,0.116,-9.012,1.0,0.127,89.483,4.0,0.196


Aggregating songs by album weighted by the percentage of the album they take up. 

In [50]:
features = features.join(length)
# Extract percentage of the album each song takes up 
features['weights'] = features['duration_ms'] / features['weights']
# Multiply weights by each column 
features = features.drop(['weights', 'duration_ms'], axis=1).mul(features['weights'], axis=0)
length.name = 'length'
# Sum up the weighted data, grouping by album. Then add album length. 
features = features.groupby('album').sum().join(length)
features.head(3)

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,length
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
!!Going Places!!,0.504944,0.63436,0.497121,0.6368716,5.469885,0.107299,-11.909669,0.578203,0.093018,115.741766,4.0,0.636955,1786385.0
!Viva El Amor!,0.102661,0.507975,0.717777,0.002417189,7.703102,0.165748,-5.041795,0.723996,0.037021,129.670102,3.850478,0.530391,2715761.0
!Viva La Cobra!,0.046689,0.58602,0.789629,6.977774e-06,2.522854,0.242735,-5.825477,0.282722,0.071973,122.389089,3.701332,0.70053,2219053.0
"""...Ya Know?""",0.118835,0.563288,0.782915,0.04322188,6.778017,0.221678,-5.055693,0.822073,0.039914,137.824034,3.91679,0.591675,3129906.0
"""Awaken, My Love!""",0.305946,0.588547,0.433997,0.1544895,2.923464,0.155045,-11.101028,0.584477,0.098121,137.320998,3.668442,0.456096,2941866.0
"""Better"": Azusa - The Next Generation 2",0.124813,0.47283,0.757153,0.0001651969,4.290365,0.23295,-6.397189,0.73762,0.123369,127.721247,3.745586,0.466849,3108333.0
"""C"" Ya",0.011373,0.837433,0.498152,0.02142154,6.74814,0.179708,-14.534484,0.489243,0.277518,133.955165,4.0,0.454465,2923426.0
# 1's,0.461954,0.526448,0.436523,0.002012308,4.570754,0.145139,-12.141936,0.774129,0.046757,115.622452,4.0,0.42587,6213269.0
#1,0.716692,0.525251,0.407432,0.000574592,5.129254,0.197711,-10.363529,1.0,0.044034,128.67159,3.80228,0.596143,2295028.0
#1 Girl,0.136888,0.585411,0.742812,2.116328e-05,5.773393,0.115862,-4.497562,0.606676,0.109043,125.969987,4.091933,0.553144,2208121.0


Next, we'll center and scale the features to have a mean of 0 and unit variance to make future regression coefficients significantly more interpretable.

In [51]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
features = pd.DataFrame(ss.fit_transform(features), columns=features.columns, index=features.index)
features.head(3)

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,length
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
!!Going Places!!,0.918999,0.732403,-0.66892,2.949217,0.18426,-0.789433,-0.772536,-0.611254,0.090508,-0.380813,0.612909,0.810201,-0.822032
!Viva El Amor!,-0.744326,-0.263155,0.523593,-0.519468,1.953606,-0.384624,1.027195,0.094299,-0.500926,0.713587,-0.306337,0.175245,-0.282002
!Viva La Cobra!,-0.975756,0.351619,0.911908,-0.532645,-2.150631,0.148572,0.821831,-2.041211,-0.131773,0.141491,-1.223267,1.189003,-0.570623


In [52]:
reviews = pd.read_csv('reviews.csv').drop(['index','id','role'], axis=1)
reviews.set_index('album', inplace=True)
print(reviews.shape)
reviews.head(3)

(20873, 8)


Unnamed: 0_level_0,artist,genre,score,date,author,review,bnm,link
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
“…The Best Live Show of All Time” — NME EP,David Byrne,Rock,5.5,January 11 2019,Andy Beta,"Viva Brother, Terris, Mansun, the Twang, Joe L...",0,https://pitchfork.com/reviews/albums/david-byr...
Lost Lovesongs / Lostsongs Vol. 2,DJ Healer,Electronic,6.2,January 11 2019,Chal Ravens,"The Prince of Denmark—that is, the proper prin...",0,https://pitchfork.com/reviews/albums/dj-healer...
Roman Birds,Jorge Velez,Electronic,7.9,January 10 2019,Philip Sherburne,"Jorge Velez has long been prolific, but that’s...",0,https://pitchfork.com/reviews/albums/jorge-vel...


Joining the features and reviews dataframes, removing albums that don't have review data in the process.

In [53]:
# Join features and reviews, remove albums that don't have review data. 
data = features.join(reviews).dropna()
print(data.shape)
data.head(3)

(2948, 21)


Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,...,valence,length,artist,genre,score,date,author,review,bnm,link
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$O$,-0.712969,1.616222,0.483327,-0.141289,-0.461014,-0.283236,0.52972,-0.764353,1.526188,0.313339,...,0.171074,0.019912,Die Antwoord,Rap,5.5,October 20 2010,Scott Plagenhoef,Die Antwoord were always a group of musicians ...,0.0,https://pitchfork.com/reviews/albums/14766-o/
...And Star Power,-0.330429,-1.447793,-0.062838,0.952225,-0.083678,0.424758,0.11735,1.024819,-0.234753,0.316331,...,-0.268539,0.991286,Foxygen,Rock,7.0,October 13 2014,Stuart Berman,Embedded within the detailed credits to Foxyge...,0.0,https://pitchfork.com/reviews/albums/19769-fox...
...And Then You Shoot Your Cousin,0.55771,0.57623,-0.519751,-0.352142,-0.337685,-0.670624,-0.699499,-1.938464,0.914589,-0.968447,...,-0.788879,-0.718222,The Roots,Rap,7.2,May 23 2014,Jayson Greene,"""Yes, @TheRoots have NEVER been conventional i...",0.0,https://pitchfork.com/reviews/albums/19332-the...


# Predicting Pitchfork album score with acoustic features

In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# Iterate through genres and regress with sklearn.
for genre in data['genre'].unique():
    genre_df = data.loc[data['genre'] == genre]
    X = genre_df.iloc[:,:13]
    y = genre_df['score']
    # Disregard genres with low sample size 
    if len(X) > 50:
        reg = LinearRegression()
        reg.fit(X, y)
        preds = reg.predict(X)
        print(genre + ' R\N{SUPERSCRIPT TWO}: ' + format(reg.score(X, y)) 
              + ', RMSE: ' + format(np.sqrt(mse(y, preds))) + ', sample size: ' + format(len(X)))

Rap R²: 0.07262588423678573, RMSE: 1.3577640514924605, sample size: 490
Rock R²: 0.07821709518357334, RMSE: 1.4611683233816017, sample size: 1216
Experimental R²: 0.19212208876199388, RMSE: 0.744795175028052, sample size: 75
Metal R²: 0.16163304944518941, RMSE: 1.3142091471727597, sample size: 71
Pop/R&B R²: 0.07506539808281698, RMSE: 1.2229843025378582, sample size: 294
Electronic R²: 0.06170000330975788, RMSE: 1.3836397356521215, sample size: 270
Experimental,Rock R²: 0.1792549116031915, RMSE: 1.0120133131688034, sample size: 70
Folk/Country R²: 0.1518886388911922, RMSE: 1.0199806009086203, sample size: 77
Electronic,Rock R²: 0.09760141128058421, RMSE: 1.644883349721011, sample size: 150


The results form the sklearn regression are pretty uninterpretable, except that it's seriously struggling to accurately predict the score. More statistically sound insight may be gained from using the statsmodels package.

In [56]:
import statsmodels.api as sm

for genre in data['genre'].unique():
    genre_df = data.loc[data['genre'] == genre]
    X = genre_df.iloc[:,:13]
    y = genre_df['score']
    # Disregard genres with low sample size 
    if len(X) > 50:
        ols = sm.OLS(y, X)
        res = ols.fit()
        print('\n'+format(genre))
        print(res.summary())


Rap
                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     143.7
Date:                Wed, 12 Jun 2019   Prob (F-statistic):          1.35e-155
Time:                        22:43:29   Log-Likelihood:                -1261.4
No. Observations:                 490   AIC:                             2549.
Df Residuals:                     477   BIC:                             2603.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
acousticness        -1.2454      0.