In [74]:
import numpy as np
import pandas as pd
import requests
import urllib.request
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from bs4 import BeautifulSoup
from urllib.request import urlopen
from numpy.random import choice
import math

# Cleaning

This line of code below was used for selecting the last 25 years as a subset of data that we will be working with. As you can see, we did this so that we work with roughly 50,000 rows of data. Only uncomment the block of code below if you need to create the subset dataset manually, otherwise it should be located in the repo already.

In [15]:
# df = pd.read_csv('data.csv', index_col=0)
# ser = df['year'].value_counts().sort_index(ascending=False)
# check = 0
# years = []
# for items in ser.iteritems(): 
#     if check <= 50000:
#         years.append(items[0])

#         check += items[1]

# new_df = df[df.year >= years[-1]]
# new_df.to_csv('subset_data.csv')

In [16]:
url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1995'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')

In [17]:
tables = soup.find_all('table')
rows = [row for row in tables[0].find_all('tr')]

In [18]:
def get_td(row):
    return [td for td in row.find_all('td')]
songs_list = [get_td(row)for row in rows[1:]]

In [19]:
def get_artist(td):
    try:
        return td[1].a.string
    except:
        return td[1].string
def get_title(td):
    try:
        return (td[0].a.string) 
    except:
        return (td[0].string).strip('\"')

In [20]:
df = pd.DataFrame(columns = ['song', 'artist'])

In [21]:
for i in range(len(songs_list)):
    to_append = [get_title(songs_list[i]), get_artist(songs_list[i])]
    df_length = len(df)
    df.loc[df_length] = to_append

Now we need to repeat this process for every year since 1995

In [22]:
#generate list of urls 
df = pd.DataFrame(columns = ['song', 'artist'])
urls = ['http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{0}'.format(str(i)) for i in range(1995, 2020)]

In [23]:
for url in urls:
    html = urlopen(url) 
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    if url.endswith('2013') or url.endswith('2012'):
        rows = [row for row in tables[1].find_all('tr')]
        songs_list = [get_td(row)for row in rows[1:]]
        for i in range(len(songs_list)):
            to_append = [get_title(songs_list[i]), get_artist(songs_list[i])]
            df_length = len(df)
            df.loc[df_length] = to_append
    else:
        rows = [row for row in tables[0].find_all('tr')]
        songs_list = [get_td(row)for row in rows[1:]]
        for i in range(len(songs_list)):
            to_append = [get_title(songs_list[i]), get_artist(songs_list[i])]
            df_length = len(df)
            df.loc[df_length] = to_append

In [24]:
bb_hits = df
bb_hits = bb_hits.dropna()

In [25]:
data = pd.read_csv('subset_data.csv')
data = data.drop(['id','Unnamed: 0'], axis = 1)

#add a hit column - is a hit or not
data['hit'] = 0

In [26]:
for ind in bb_hits.index: 
    song = df['song'][ind]
    #print(song)
    if ind == 1618:
        continue
    else:
        artist = df['artist'][ind]
        to_check = data[data['name'].str.contains(song)]
        for ind, row in to_check.iterrows():
            if artist in row['artists']:
                data.loc[ind, 'hit'] = 1

  


# EDA

In [33]:
data['hit'].value_counts()[0]

48762

In [54]:
#Number of hits/non-hits in our dataset
print(data['hit'].value_counts())
percenthits = (100*(data['hit'].value_counts()[1] / len(data)))
print('Percent of Hits: ' + str(percenthits))

0    48762
1     2708
Name: hit, dtype: int64
Percent of Hits: 5.261317272197397


In [28]:
#Number of nulls
data.isnull().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
hit                 0
dtype: int64

In [31]:
data.corr()['hit'].sort_values()

acousticness       -0.084031
instrumentalness   -0.075739
mode               -0.027283
liveness           -0.022791
tempo              -0.011095
key                 0.008526
year                0.009097
duration_ms         0.010863
speechiness         0.024907
explicit            0.027443
valence             0.035484
energy              0.039899
loudness            0.085277
danceability        0.104116
popularity          0.170373
hit                 1.000000
Name: hit, dtype: float64

In [30]:
data[['name', 'artists', 'year', 'hit']].head()

Unnamed: 0,name,artists,year,hit
0,Wonderwall - Remastered,['Oasis'],1995,1
1,Gangsta's Paradise,"['Coolio', 'L.V.']",1995,1
2,Check Yes Or No,['George Strait'],1995,0
3,1979 - Remastered 2012,['The Smashing Pumpkins'],1995,1
4,I Got 5 On It,"['Luniz', 'Michael Marshall']",1995,0


danceability seems to have the most affect on the popularity of a song, instrumentalness has a negative affect on it.

In [None]:
#data.to_csv('final_data.csv')

In [70]:
df = pd.read_csv('subset_data.csv').drop('Unnamed: 0', axis=1)

# Baseline Model

Our baseline model randomly selects a song to be a 'hit' ~5.26% of the time (the observed amount in our given data) and 'not a hit' the rest of the time.

In [63]:
features = ['acousticness',
            'instrumentalness',
            'mode',
            'liveness',
            'tempo', 
            'key', 
            'year',
            'duration_ms', 
            'speechiness',
            'explicit',
            'valence', 
            'energy',
            'loudness',
            'danceability',
            'artists',
            'duration_ms',
            'explicit',
            'mode',
            'name',
            'popularity']
X = data[features]
y = data.hit

def baseline_predictor(X):
    predictions = []
    for row in range(len(X)):
        hit = choice([0,1], 1, [(100 - percenthits), percenthits])[0]
        predictions.append(hit)
    return predictions
baseline_predictor(X)

In [69]:
#Calculate RMSE
mse = sklearn.metrics.mean_squared_error(y, predicted)
rmse = math.sqrt(mse)