# Extracting Audio Features of Songs
In this notebook, we will use `Spotify's Web API` to scarpe the audio features of the songs we have selected from the top 200 songs.

In [14]:
import pandas as pd

from __future__ import print_function
from pprint import pprint

try: 
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials
except ModuleNotFoundError:
    print("Installing Spotipy library using `pip`.")
    !pip install spotipy --quiet
    import spotipy
    from spotipy.oauth2 import SpotifyClientCredentials    

import json
import time
import sys
import pathlib
import os
import csv

try:
    from tqdm import tqdm
except ModuleNotFoundError:
    print("Installing tqdm library using `pip`.")
    !pip install tqdm --quiet
    from tqdm import tqdm

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [4]:
client_credentials_manager = SpotifyClientCredentials(client_id="7dbe272b9d9b44278d84430e76374e88",
                                                      client_secret="14e94037bec24a3680b9249332a3d129")
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
sp.trace = False

In [5]:
sp.audio_features("7BKLCZ1jbUBVqRi2FVlTVw")

[{'danceability': 0.748,
  'energy': 0.524,
  'key': 8,
  'loudness': -5.599,
  'mode': 1,
  'speechiness': 0.0338,
  'acousticness': 0.414,
  'instrumentalness': 0,
  'liveness': 0.111,
  'valence': 0.661,
  'tempo': 95.01,
  'type': 'audio_features',
  'id': '7BKLCZ1jbUBVqRi2FVlTVw',
  'uri': 'spotify:track:7BKLCZ1jbUBVqRi2FVlTVw',
  'track_href': 'https://api.spotify.com/v1/tracks/7BKLCZ1jbUBVqRi2FVlTVw',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7BKLCZ1jbUBVqRi2FVlTVw',
  'duration_ms': 244960,
  'time_signature': 4}]

In [6]:
# Define location of the data
data_dir = '../data'
filename = 'selected_regions_top_200_daily.csv'
data_path = os.path.join(data_dir, filename)

if not pathlib.Path(data_path).exists():
    raise FileNotFoundError('No file found at the location defined.')

In [7]:
# Load the dataset
filepath = pathlib.Path(data_path)

if filepath.exists():
    data_df = pd.read_csv(filepath, parse_dates=True)
else:
    data_df = pd.DataFrame()

# View the first 5 rows    
data_df.head()

Unnamed: 0,Position,Track Name,Artist,Streams,date,region,spotify_id
0,1.0,Starboy,The Weeknd,3135625.0,2017-01-01,global,5aAx2yezTd8zXrkmtKl66Z
1,2.0,Closer,The Chainsmokers,3015525.0,2017-01-01,global,7BKLCZ1jbUBVqRi2FVlTVw
2,3.0,Let Me Love You,DJ Snake,2545384.0,2017-01-01,global,4pdPtRcBmOSQDlJ3Fk945m
3,4.0,Rockabye (feat. Sean Paul & Anne-Marie),Clean Bandit,2356604.0,2017-01-01,global,5knuzwU65gJK7IF5yJsuaW
4,5.0,One Dance,Drake,2259887.0,2017-01-01,global,1xznGGDReH1oQq0xzbwXa3


In [8]:
track_ids = data_df["spotify_id"].unique()
track_ids, len(track_ids)

(array(['5aAx2yezTd8zXrkmtKl66Z', '7BKLCZ1jbUBVqRi2FVlTVw',
        '4pdPtRcBmOSQDlJ3Fk945m', ..., '7as7OL7cmgFZDADgVjQZjz',
        '1lVwFdNhP5q2HQdHoKX30A', '3rnso1KNGrkXfqNTlBXdPH'], dtype=object),
 22750)

## Extract Audio Features
In this section, the audio features of the tracks that have been in the top 200 are going to be extracted and saved to a CSV file named `spotify_top_tracks_audio_features.csv`.

> **Approximate time required**: `1 hour` and `20 minutes`.<sup>*</sup>

<sup>*</sup>The estimated times can be different based your computer's performance.

### Save Audio Features of Tracks
In this section, the audio features of the tracks are downloaded.

In [89]:
with open(os.path.join(data_dir, 'spotify_top_tracks_audio_features.csv'), 'w') as csvfile:
    fieldnames = list(sp.audio_features("7BKLCZ1jbUBVqRi2FVlTVw")[0].keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    
    for track_id in tqdm(track_ids, desc="Saving Audio Features"):
        data_dict = sp.audio_features(track_id)[0]
        if data_dict is not None:
            writer.writerow(data_dict)

Saving Audio Features: 100%|██████████| 22750/22750 [50:13<00:00,  7.55it/s]  


In [158]:
track_dets = sp.track("7BKLCZ1jbUBVqRi2FVlTVw")

In [159]:
sp.artist(track_dets["artists"][0]["id"])["genres"]

['dance pop', 'edm', 'electropop', 'pop', 'pop dance', 'tropical house']

In [163]:
track = sp.track("7BKLCZ1jbUBVqRi2FVlTVw")
pprint(track["album"]["release_date"])

'2016-07-29'


### Save Release Dates for All Tracks
In this section, we get the information on the release date of the tracks and save it to a different CSV file.

> **Approximate time required**: `1 hour` and `20 minutes`.<sup>*</sup>

<sup>*</sup>The estimated times can be different based your computer's performance.

In [172]:
with open(os.path.join(data_dir, 'spotify_top_tracks_release_dates.csv'), 'w') as csvfile:
    fieldnames = ["id", "release_date"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    
    for track_id in tqdm(track_ids, desc="Saving Release Dates"):
        data_dict = {
            "id": track_id,
            "release_date": sp.track(track_id)["album"]["release_date"]
        }
        writer.writerow(data_dict)

Saving Release Dates: 100%|██████████| 22750/22750 [51:14<00:00,  7.40it/s]  


In [9]:
data_df.head()

Unnamed: 0,Position,Track Name,Artist,Streams,date,region,spotify_id
0,1.0,Starboy,The Weeknd,3135625.0,2017-01-01,global,5aAx2yezTd8zXrkmtKl66Z
1,2.0,Closer,The Chainsmokers,3015525.0,2017-01-01,global,7BKLCZ1jbUBVqRi2FVlTVw
2,3.0,Let Me Love You,DJ Snake,2545384.0,2017-01-01,global,4pdPtRcBmOSQDlJ3Fk945m
3,4.0,Rockabye (feat. Sean Paul & Anne-Marie),Clean Bandit,2356604.0,2017-01-01,global,5knuzwU65gJK7IF5yJsuaW
4,5.0,One Dance,Drake,2259887.0,2017-01-01,global,1xznGGDReH1oQq0xzbwXa3


In [138]:
track_dets = data_df.iloc[0]

In [140]:
track = network.get_track(track_dets["Artist"], track_dets["Track Name"])

In [141]:
track.get_url()

'https://www.last.fm/music/the%2bweeknd/_/starboy'

In [12]:
data_dict["artists"].keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [30]:
regions = {"global": "global",
           "united states": "us",
           "brazil": "br",
           "germany": "de",
           "united kingdom": "gb",
           "spain": "es",
           "canada": "ca"
           }

In [44]:
search_params = []

for region in regions.values():
    for year in range(1920,2021):
        search_params.append({
            "region": region,
            "year": year
        })

pprint(search_params)

[{'region': 'global', 'year': 1920},
 {'region': 'global', 'year': 1921},
 {'region': 'global', 'year': 1922},
 {'region': 'global', 'year': 1923},
 {'region': 'global', 'year': 1924},
 {'region': 'global', 'year': 1925},
 {'region': 'global', 'year': 1926},
 {'region': 'global', 'year': 1927},
 {'region': 'global', 'year': 1928},
 {'region': 'global', 'year': 1929},
 {'region': 'global', 'year': 1930},
 {'region': 'global', 'year': 1931},
 {'region': 'global', 'year': 1932},
 {'region': 'global', 'year': 1933},
 {'region': 'global', 'year': 1934},
 {'region': 'global', 'year': 1935},
 {'region': 'global', 'year': 1936},
 {'region': 'global', 'year': 1937},
 {'region': 'global', 'year': 1938},
 {'region': 'global', 'year': 1939},
 {'region': 'global', 'year': 1940},
 {'region': 'global', 'year': 1941},
 {'region': 'global', 'year': 1942},
 {'region': 'global', 'year': 1943},
 {'region': 'global', 'year': 1944},
 {'region': 'global', 'year': 1945},
 {'region': 'global', 'year': 1946},
 

 {'region': 'ca', 'year': 1920},
 {'region': 'ca', 'year': 1921},
 {'region': 'ca', 'year': 1922},
 {'region': 'ca', 'year': 1923},
 {'region': 'ca', 'year': 1924},
 {'region': 'ca', 'year': 1925},
 {'region': 'ca', 'year': 1926},
 {'region': 'ca', 'year': 1927},
 {'region': 'ca', 'year': 1928},
 {'region': 'ca', 'year': 1929},
 {'region': 'ca', 'year': 1930},
 {'region': 'ca', 'year': 1931},
 {'region': 'ca', 'year': 1932},
 {'region': 'ca', 'year': 1933},
 {'region': 'ca', 'year': 1934},
 {'region': 'ca', 'year': 1935},
 {'region': 'ca', 'year': 1936},
 {'region': 'ca', 'year': 1937},
 {'region': 'ca', 'year': 1938},
 {'region': 'ca', 'year': 1939},
 {'region': 'ca', 'year': 1940},
 {'region': 'ca', 'year': 1941},
 {'region': 'ca', 'year': 1942},
 {'region': 'ca', 'year': 1943},
 {'region': 'ca', 'year': 1944},
 {'region': 'ca', 'year': 1945},
 {'region': 'ca', 'year': 1946},
 {'region': 'ca', 'year': 1947},
 {'region': 'ca', 'year': 1948},
 {'region': 'ca', 'year': 1949},
 {'region'

In [60]:
for year in range(1920,2021):
    print(year)
    data_dict = sp.search(q=f"year:{year}",
                          limit=50, type='artist')

    for artist in data_dict["artists"]["items"]:
        print(artist["name"])
        query = f"artist:{artist['name']}"
        print(query)
        track_dict = sp.search(q=query, limit=50)
#         print(track_dict)
        for track in track_dict["tracks"]["items"]:
            artists = [data["name"] for data in track["artists"]]
            print(track["album"]["release_date"])
#             print(track["name"])
        break
    break

1920
Berliner Philharmoniker
artist:Berliner Philharmoniker
2003-07-01
2008-05-21
2020-04-14
2020-04-14
2008-01-07
2004-07-12
2008-05-21
2005-11-28
2008-01-07
2007-11-12
2002-07-01
2012-11-09
2008-01-07
2012-11-09
2020-04-14
2012-11-09
2008-01-07
2008-01-07
2012-11-09
2006-09-26
2012-10-03
2012-11-09
2012-11-09
2008-01-07
2006-03-01
2000
2012-11-09
2008-05-21
2019-11-22
2012-10-03
2007-11-12
2006-03-01
2004-07-12
2000
2017-05-26
2000
2008-05-21
2012-11-09
2019-12-27
2008-01-07
2012-11-09
2020-04-14
2012-11-09
2012-11-09
2012-11-09
2012-11-09
2020-04-14
2008-05-21
2008-01-07
2020-04-14


In [None]:
data_df.head()

In [25]:
{
    "Track Name": track["name"],
    "Artist": artist["name"],
    "artists": [data["name"] for data in track["artists"]],
    "spotify_id": track["id"],
    "release_date": track["album"]["release_date"],
    "region": region
}

{'Track Name': 'After Dark (feat. Static Major & Ty Dolla $ign)',
 'Artist': 'Drake',
 'artists': ['Drake', 'Static Major', 'Ty Dolla $ign'],
 'spotify_id': '3mvYQKm8h6M5K5h0nVPY9S',
 'release_date': '2018-06-29'}