# Data Collection and Cleaning

Spotify is one of the biggest online music platforms in the world and it has great audio features for every song, such as 'danceability', 'energy', 'loudness', 'instrumentalness', etc., as well as a popularity rank from 0-100 for every song, so we chose to use the Spotify Web API to collect song data.

We decided to start by looking at Spotify's most popular songs in the past year. Spotify has a collection of the top 200 songs in the US for every day since January 1st, 2017, available to download as csv files (https://spotifycharts.com/regional). We decided to download the csv files for the 1st and 15th of every month from January 2017 to May 2018. The variables included in each csv are simple: the artist, title, number of streams, and url.

Using Spotify's Web API, we then decided to use the artist's name from the top tracks csv files to extract all of their songs released in 2017 and 2018. We wanted to extract all of their tracks rather than just their most popular tracks so we could have more tracks and a larger range of popularity for the model.

After getting the songs from Spotify, we could use the ID of the song to request its specific audio features, and put that into a data frame.

We also wanted to get the genre for each song. We realized the best way to do this through Spotify was to request the genres of the artists and combine that with their tracks in the dataframe, as Spotify's individual tracks do not have genres associated with them in the API.

Throughout the process we needed to save a lot of csv files so we could pick back up from them later because the collection process was very lengthy. We left the .to_csv code commented out so that you don't accidentally run those cells and overwrite the existing data.

The two resulting csv files, final_tracks.csv and song_lyrics.csv, are saved in the data301/share folder for use in the analysis and machine learning notebooks. final_tracks.csv ended up with a little over 6,000 songs.

# Song Collection

In [2]:
import json
import requests
import pandas as pd
from pandas.io.json import json_normalize
import time

# Bearer token for the Spotify Web API; it is sent in the Authorization header of every
# Spotify request made in this notebook.
# NOTE(review): this is a credential hard-coded in the notebook -- presumably a short-lived
# token that has since expired; consider reading it from the environment. TODO confirm.
token = "BQDHsbFz-bgWgRdOIksHDMB20MgtKE7dXOQtcC4_-LKKTDqaZz8ivcRhq-AzLxCxBeX7gpHPVWAzctHdSDy1TKT2eEsoiUNk8a7V-VTRfhvog0OwQblohfbGv1Y9mzuQkNn0vGn3jx7w"

In [None]:
# read in top200 charts csv files into dataframe
#
# Files are named data/top200-<month>-<day>-<yy>.csv, with day 1 or 15.
# 2017 is read for all twelve months; 2018 only through May.

LAST_2018_MONTH = 5  # Currently May

chart_frames = []
# range() excludes its stop value, so the stop must be 13 for December to be read.
for month in range(1, 13):
    if month <= LAST_2018_MONTH:
        chart_frames.append(pd.read_csv("data/top200-" + str(month) + "-15-18.csv"))
        chart_frames.append(pd.read_csv("data/top200-" + str(month) + "-1-18.csv"))
    chart_frames.append(pd.read_csv("data/top200-" + str(month) + "-1-17.csv"))
    chart_frames.append(pd.read_csv("data/top200-" + str(month) + "-15-17.csv"))

# Concatenate once instead of once per file. The most recently read file goes first,
# which keeps the row order (and so the artist order below) of prepending each file.
top200 = pd.concat(chart_frames[::-1])


# Get artists
artists = list(top200.Artist.unique())

# mistakes from csv
# (list.remove raises ValueError if the name is not present)
artists.remove("Redbone")
artists.remove("Trouble")
artists.append("Cage the Elephant")

In [12]:
# returns artist id
def get_artist_id(artist, a_tracks_df):
    """Return the id of the first credited artist whose name equals ``artist``.

    ``a_tracks_df.artists`` is iterated row by row; each row is itself an
    iterable of mappings with "name" and "id" keys. Rows and credits are
    scanned in order, and the search stops at the first exact name match.

    Returns None when no credit matches.
    """
    credits = (entry for row in a_tracks_df.artists for entry in row)
    return next((entry["id"] for entry in credits if entry["name"] == artist), None)

In [None]:
# use artist to request track id's from every song they released in the US in 2017 and 2018
#
# For each charted artist: search for tracks, then look up the artist's genres. An artist
# is recorded (tracks, name and genres together) only when both requests succeed.
auth_header = {"Authorization": "Bearer " + token}
track_frames = list()  # one dataframe of search hits per recorded artist, in request order
artists_list = list()
genres_list = list()
i = 0
for artist in artists:
    # get songs
    # The year restriction is a field filter and has to live inside the q value; sent as a
    # separate "&year:..." parameter it is not part of the query. Passing the values via
    # params= also URL-encodes artist names that contain characters such as "&" or "+".
    resp = requests.get(
        "https://api.spotify.com/v1/search",
        headers=auth_header,
        params={"q": artist + " year:2017-2018", "type": "track", "market": "US"},
    )
    time.sleep(1.5)  # otherwise I'm making requests too quickly

    if resp.status_code != 200:
        continue  # an error payload has no "tracks" entry to normalize

    a_tracks = resp.json()
    a_tracks_df = json_normalize(a_tracks["tracks"], "items")
    if a_tracks_df.empty:
        continue  # no hits, so there is no "artists" column to search for an id
    a_tracks_df["artist"] = artist

    # get artist id for genres
    artist_id = get_artist_id(artist, a_tracks_df)
    if artist_id is None:
        continue  # no hit is credited to this exact artist name

    resp = requests.get(
        "https://api.spotify.com/v1/artists/" + str(artist_id), headers=auth_header
    )
    time.sleep(1.5)  # otherwise I'm making requests too quickly

    if resp.status_code != 200:
        continue

    artist_json = resp.json()
    artists_list.append(artist)
    genres_list.append(artist_json["genres"])
    track_frames.append(a_tracks_df)

    print(i)  # progress: number of artists recorded so far
    i += 1

# Concatenate once, most recent artist first (the order produced by prepending each frame).
# track_frames still holds the partial results if the loop above is interrupted.
tracks = pd.concat(track_frames[::-1], ignore_index=True) if track_frames else pd.DataFrame()

In [None]:
# store as csv for later
#tracks.to_csv('toptracks.csv')

# pair every recorded artist with the genre list returned for them, ready to store as csv
artists_genres = pd.DataFrame({"artist": artists_list, "genre": genres_list})
#artists_genres.to_csv("artist_genres.csv")

In [20]:
# group tracks ids into groups of 100 (max limit for spotify requests)
#
# Each entry of ids_list is one comma-separated string of at most 100 track ids.
# Slicing handles every length: fewer than 100 ids gives a single short batch, and an
# exact multiple of 100 gives no empty trailing batch.

ids = list(tracks.id)
ids_list = [
    ",".join(str(track_id) for track_id in ids[start:start + 100])
    for start in range(0, len(ids), 100)
]

In [None]:
# get audio features for every track
#
# One request per batch of ids; the per-batch tables are combined after the loop.
feature_frames = []
for ids_str in ids_list:
    if not str(ids_str).strip(","):
        continue  # an empty batch has nothing to request
    url = "https://api.spotify.com/v1/audio-features?ids=" + str(ids_str)
    resp = requests.get(
        url, headers={"Authorization": "Bearer " + token}
    )
    if resp.status_code != 200:
        # An error payload has no "audio_features" entry; report the real cause
        # instead of failing later with a KeyError.
        raise RuntimeError(
            "audio-features request failed with status " + str(resp.status_code)
        )
    features_j = resp.json()
    # entries can be null (falsy) and are dropped before normalizing
    features_list = [entry for entry in features_j["audio_features"] if entry]
    feature_frames.append(json_normalize(features_list))
    time.sleep(1)

# Concatenate once, most recent batch first (the order produced by prepending each batch).
track_features = (
    pd.concat(feature_frames[::-1], ignore_index=True) if feature_frames else pd.DataFrame()
)

In [22]:
# save to csv for later
#track_features.to_csv('track_features.csv')

In [4]:
# get data frames from temporarily saved csv files
# Each file carries an "Unnamed: 0" column (presumably the index written by to_csv),
# which is dropped straight after reading.
artists_genre_df = pd.read_csv("saved_csvs/artist_genres.csv").drop(columns="Unnamed: 0")
tracks = pd.read_csv("saved_csvs/toptracks.csv").drop(columns="Unnamed: 0")
track_features = pd.read_csv("saved_csvs/track_features.csv").drop(columns="Unnamed: 0")

In [5]:
# track columns discarded before merging with the audio features
unused_track_columns = [
    "available_markets",
    "disc_number",
    "external_ids",
    "external_urls",
    "is_local",
    "track_number",
    "type",
    "href",
    "album",
    "duration_ms",
]
tracks_simple = tracks.drop(columns=unused_track_columns)

In [6]:
# merge track dataset with track name and artist with audio features data frame
# remove duplicate tracks
# Steps, in order: inner-join on uri and id, keep the first row per (name, artist), keep the
# first row per uri, renumber the rows, then drop the old index and the unneeded columns.
tracks_df = (
    tracks_simple.merge(track_features, how="inner", on=["uri", "id"])
    .drop_duplicates(subset=["name", "artist"])
    .drop_duplicates(subset=["uri"])
    .reset_index()
    .drop(columns=["index", "analysis_url", "track_href", "artists"])
)

In [None]:
# merge genres in with respective artists
# left join: every track row is kept, with or without a matching artist in the genre table
tracks_genres = pd.merge(tracks_df, artists_genre_df, how="left", on="artist")

In [14]:
# make separate categorical variables for genres
genres_list = ["pop", "hip hop", "rap", "trap", "latin", "rock", "hollywood", "house",
               "country", "edm", "indie", "reggae", "R&B"]

# One boolean column per genre, True when the genre name occurs anywhere in the track's
# genre text. The match is:
#   - case-insensitive, so "R&B" also matches a lowercase "r&b" in the data,
#   - literal (regex=False), so no character of a genre name is read as a regex operator,
#   - False rather than NaN for rows whose genre is missing (na=False).
genres_dict = {
    genre: tracks_genres.genre.str.contains(genre, case=False, regex=False, na=False)
    for genre in genres_list
}
genres_df = pd.DataFrame(genres_dict)

final_tracks = tracks_genres.join(genres_df)
# save for later
#final_tracks.to_csv("final_tracks.csv")

In [3]:
# reload the track table from final_tracks.csv in the working directory
# NOTE(review): presumably the file produced by the commented-out to_csv call in the
# genre-flag cell -- confirm the path, since a later cell reads saved_csvs/final_tracks.csv.
tracks = pd.read_csv("final_tracks.csv")

In [4]:
# get main genre
# specific genre -> the broader genre it is reported as
_GENERAL_GENRES = {
    "rap": "hip hop",
    "trap": "hip hop",
    "hollywood": "pop",
    "a cappella": "pop",
    "disco": "pop",
    "edm": "house",
    "rave": "house",
    "big room": "house",
    "brostep": "house",
    "indie": "alternative",
    "R&B": "soul",
    "funk": "soul",
}


def return_general_genre(num, genre):
    """Return the broad genre that ``genre`` belongs to.

    Genres without an entry in the table are returned unchanged.
    ``num`` is accepted but not used.
    """
    return _GENERAL_GENRES.get(genre, genre)

In [5]:
genres_list = ["pop", "hip hop", "rap", "latin", "rock", "hollywood", "house",
               "country", "edm", "indie", "alternative", "reggae", "R&B", "funk", "soul",
               "a cappella", "disco", "rave", "big room", "brostep"]


def find_genre(song):
    """Return one general genre for a track's genre text.

    ``song`` is the text of a genre list, e.g. "['pop', 'dance pop']". Only the
    first two comma-separated entries are examined: the first entry is tested
    against every name in ``genres_list`` (in list order), then the second.
    The first name found is passed to ``return_general_genre`` (defined in the
    previous cell) and its result is returned.

    Matching is case-insensitive, so "R&B" is found in a lowercase "r&b".

    Returns None when ``song`` is not a string (for example a missing value) or
    is the two-character empty list "[]", and "other" when neither examined
    entry contains a known genre name.
    """
    if not isinstance(song, str) or len(song) == 2:  # missing, or empty list: no genres
        return None
    for entry in song.split(",")[:2]:
        lowered = entry.lower()
        for genre in genres_list:
            num = lowered.find(genre.lower())
            if num > -1:
                return return_general_genre(num, genre)
    return "other"

In [6]:
# add the main_genre column (computed from each row's genre text) and drop the stray
# "Unnamed: 0" column
tracks = tracks.assign(main_genre=tracks["genre"].apply(find_genre)).drop(columns="Unnamed: 0")

In [13]:
# preview the first rows of the track table, including the new main_genre column
tracks.head()

Unnamed: 0,explicit,id,name,popularity,preview_url,uri,artist,acousticness,danceability,duration_ms,...,hollywood,house,indie,latin,pop,rap,reggae,rock,trap,main_genre
0,False,66y7x28jXOPrcmu3D5Zjh6,A Million Dreams (Reprise),78,https://p.scdn.co/mp3-preview/1f475e722fa38c51...,spotify:track:66y7x28jXOPrcmu3D5Zjh6,Austyn Johnson,0.726,0.253,60453,...,False,False,False,False,False,False,False,False,False,
1,False,2vvdTrdryjsl8DmPIMDWZU,EL BAÑO,87,https://p.scdn.co/mp3-preview/0456d479f76cda3f...,spotify:track:2vvdTrdryjsl8DmPIMDWZU,Enrique Iglesias,0.132,0.719,228139,...,False,False,False,True,True,False,False,False,False,pop
2,False,7qCAVkHWZkF44OzOUKf8Cr,El Perdón,78,https://p.scdn.co/mp3-preview/61a4d49e478c5826...,spotify:track:7qCAVkHWZkF44OzOUKf8Cr,Enrique Iglesias,0.446,0.628,205907,...,False,False,False,True,True,False,False,False,False,pop
3,False,32lm3769IRfcnrQV11LO4E,Bailando - Spanish Version,71,,spotify:track:32lm3769IRfcnrQV11LO4E,Enrique Iglesias,0.0467,0.713,243413,...,False,False,False,True,True,False,False,False,False,pop
4,False,1BuTNbYxxFYezD1tT8AJR9,MOVE TO MIAMI,79,https://p.scdn.co/mp3-preview/b0959be241a530f3...,spotify:track:1BuTNbYxxFYezD1tT8AJR9,Enrique Iglesias,0.0166,0.74,169380,...,False,False,False,True,True,False,False,False,False,pop


In [10]:
# save for later
#tracks.to_csv("final_tracks.csv")

# Song Lyrics Collection

Here, we collect the lyrics for each song in our dataframe. The lyrics are retrieved from the lyrics.ovh API. To obtain the lyrics for a particular song, we request the endpoint
https://api.lyrics.ovh/v1/artist/title, substituting the song's artist and title.

Due to the slow responses of the API and the number of songs in our dataset, we decided to collect the lyrics in groups of 1000 songs. This helped us split up the time we spent waiting for responses and ensured that our IP addresses were not blacklisted by the API.

For each request to the lyrics API, we passed in the name and artist of the song. We then created a dataframe with one row per song and stored each song's lyrics in a column called "lyrics" ("N/A" if no lyrics were returned). We returned that dataframe to be saved to a csv file for later use.

In [None]:
from time import sleep
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
# load the saved track table; its "name" and "artist" columns drive the lyrics requests below
# NOTE(review): an earlier cell reads final_tracks.csv from the working directory rather than
# saved_csvs/ -- confirm both point at the same file.
final_tracks = pd.read_csv("saved_csvs/final_tracks.csv")

In [None]:
# initialize
# Trial run: read the first track's fields and make one request to the lyrics API.
track = final_tracks.loc[0]
name = track["name"]
artist = track["artist"]
# the trial request uses a fixed artist/title; name and artist above are not part of it
url = "https://api.lyrics.ovh/v1/Post Malone/Go Flex"
resp = requests.get(url)
# reads the "lyrics" field of the response; the value is not stored
resp.json()["lyrics"]
# empty table with a single "lyrics" column
lyrics = pd.DataFrame(columns=["lyrics"])

In [None]:
# function that returns lyrics between start and end indices
def obtain_lyrics_for(start_index, end_index):
    """Fetch lyrics for the tracks whose index is in [start_index, end_index).

    Rows are taken from the global ``final_tracks`` table. For each one a GET
    request is made to https://api.lyrics.ovh/v1/<artist>/<name>.

    Returns a dataframe with a single "lyrics" column, indexed by the track's
    index in ``final_tracks``. A row holds the "lyrics" field of the response
    when the status code is 200, and "N/A" otherwise.
    """
    from urllib.parse import quote  # local import so this cell runs on its own

    song_lyrics = pd.DataFrame(columns=["lyrics"])
    # select the requested rows up front instead of walking the whole table
    in_range = (final_tracks.index >= start_index) & (final_tracks.index < end_index)
    for index, track in final_tracks[in_range].iterrows():
        # Percent-encode both path segments: a "/", "?" or "#" inside an artist or title
        # would otherwise be read as URL structure and change the request.
        artist = quote(str(track["artist"]), safe="")
        name = quote(str(track["name"]), safe="")
        url = "https://api.lyrics.ovh/v1/" + artist + "/" + name
        resp = requests.get(url)
        if resp.status_code == 200:
            song_lyrics.loc[index, "lyrics"] = resp.json()["lyrics"]
        else:
            song_lyrics.loc[index, "lyrics"] = "N/A"
    return song_lyrics

In [None]:
def obtain_all_lyrics(batch_size=1000, pause_seconds=20):
    """Fetch lyrics for every row of ``final_tracks`` in consecutive batches.

    batch_size: number of rows requested per batch (default 1000).
    pause_seconds: time to sleep after each batch (default 20).

    The number of rows is taken from ``final_tracks`` itself rather than a
    fixed count, so the loop stays correct if the table changes size.

    Each batch is meant to be saved as "up_to_<last index + 1>_lyrics.csv".
    The save is left commented out so that existing files are not overwritten;
    while it stays commented out, the fetched lyrics are discarded.
    """
    for i in range(0, len(final_tracks), batch_size):
        temp = obtain_lyrics_for(i, i + batch_size)
        sleep(pause_seconds)
        file_name = "up_to_" + str(i + batch_size) + "_lyrics.csv"
        #temp.to_csv(file_name)

In [None]:
# run the full lyrics collection; it requests lyrics batch by batch and sleeps between batches
obtain_all_lyrics()

In [None]:
# initialize
# Repair the first batch file. NOTE(review): presumably one song's lyrics were split
# across csv rows 198-225 when the file was written -- confirm against the raw file.
upto1000 = pd.read_csv("up_to_1000_lyrics.csv")
upto1000.columns = ["song_index", "lyrics"]
# rows 199-225: take the text found in the first column as that row's lyrics
upto1000.loc[199:225, "lyrics"] = upto1000.loc[199:225, "song_index"]
# join the lyrics of rows 198-225 into row 198, separated by spaces
upto1000.loc[198, "lyrics"] = upto1000.loc[198:225, "lyrics"].str.cat(sep=" ")
# keep everything except rows 199-225, which are now folded into row 198
section1 = upto1000[upto1000.index < 199]
section2 = upto1000[upto1000.index > 225]
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0 and later
upto1000 = pd.concat([section1, section2])
upto1000.song_index = upto1000.song_index.astype(int)
#upto1000.to_csv("final_lyrics/up_to_1000_final.csv")

In [None]:
# Rename the columns of the remaining batch files (up_to_2000 ... up_to_7000) and prepare
# the path each one is saved under in final_lyrics/.
for x in range(2000, 7001, 1000):
    # x is an int and must be converted before being joined into the file name
    path1 = "up_to_" + str(x) + "_lyrics.csv"
    uptoX = pd.read_csv(path1)
    uptoX.columns = ["song_index", "lyrics"]
    # "_final.csv" matches the first batch's output name and the files read back afterwards
    path2 = "final_lyrics/up_to_" + str(x) + "_final.csv"
    #uptoX.to_csv(path2)

In [None]:
# read the seven cleaned batch files (up_to_1000 ... up_to_7000) in order
all_lyrics = [
    pd.read_csv("final_lyrics/up_to_{}000_final.csv".format(batch_number))
    for batch_number in range(1, 8)
]

In [None]:
# stack the batch tables into one table, renumbering the rows from 0 (ignore_index=True)
song_lyrics = pd.concat(all_lyrics, ignore_index=True)
# left commented out so the existing song_lyrics.csv is not overwritten
#song_lyrics.to_csv("song_lyrics.csv")