# Audio Features Data Crawling

### Libraries import

In [2]:
import spotipy
import csv
import os
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv

### Authentication to Spotify Developer

In [3]:
load_dotenv()

client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')

client_credentials_manager = SpotifyClientCredentials()
spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

### Getting 1000 tracks of each year from 2020 to 2022 and saving the received data into a list

In [9]:
tracks = []
for i in range(2020, 2023):
    search_result = spotify.search(q=str(i), limit=50) #the result only consists of tracks

    tracks.extend(search_result['tracks']['items'])

    for i in range(19):
        search_result = spotify.next(search_result['tracks'])
        tracks.extend(search_result['tracks']['items'])

In [10]:
len(tracks)

3000

### Getting tracks' ID from the list of tracks

In [11]:
tracks_id = []
for i in range(len(tracks)):
    tracks_id.append(tracks[i]['id'])

tracks_id = list(dict.fromkeys(tracks_id))

In [12]:
len(tracks_id)

2875

#### There are 2875 unique tracks in the tracks data list, which is synonymous with some of them are duplicated.

## Tracks' audio features

### For each track id, using API to get corresponding audio features of the track and saving all received data into a list

In [13]:
track_features_list = []

for i in range(len(tracks_id)):
    track_features_list.append(spotify.audio_features(tracks_id[i]))

In [15]:
len(track_features_list)

2875

In [16]:
count = 0
for i in range(len(track_features_list)):
    if track_features_list[i] == [None]:
        count += 1
count

11

#### There are some tracks that don't have  audio features so we need to remove them from the list

In [17]:
track_features_list = list(filter(lambda x: x != [None], track_features_list))

In [18]:
len(track_features_list)

2864

In [26]:
type(track_features_list[0])

list

#### As can be seen, each element of the list is a list, and those internal lists represent correspoding track's audio features.
#### Given this, there may be more than 1 audio features for a track ... Let's check this out.

In [27]:
count = 0
for i in range(len(track_features_list)):
    if len(track_features_list[i]) > 1:
        count += 1
count

0

#### We can somehow conclude that it is just about the data type of data we receive from API

#### Now, let's explore each track's audio features

In [23]:
track_features_list[0][0].keys()

dict_keys(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms', 'time_signature'])

#### We only need the id attribute along with all the audio features, and also need to "reshape" the list as the internal lists don't make any sense.

In [28]:
for i in range(len(track_features_list)):
    track_features_list[i] = dict(list(filter(lambda x: x[0] == 'id' or type(x[1]) != str, list(track_features_list[i][0].items()))))

In [30]:
track_features_list[0].keys()

dict_keys(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'id', 'duration_ms', 'time_signature'])

## Spotify API doesn't provide genre(s) of tracks, so we need to somehow initialize it.
### It is notable that artists have genre(s), so we decide to get the first genre of the first artist of each track to become the track genre.

### Getting unique artists' ID from the list of tracks

In [38]:
artists_id = []
for i in range(len(tracks)):
    for j in range(len(tracks[i]['artists'])):
        artists_id.append(tracks[i]['artists'][j]['id'])

artists_id = list(dict.fromkeys(artists_id))

In [39]:
len(artists_id)

1950

### For each artist id, using API to get corresponding genres of the artist and saving all received data into a list

In [53]:
artists = []
for i in range(len(artists_id)):
    artist_info = spotify.artist(artist_id=artists_id[i])
    cols = ['id', 'genres']
    artist = dict.fromkeys(cols)
    for key in artist:
        artist[key] = artist_info[key]
    artists.append(artist)

In [57]:
count = 0
for i in range(len(artists)):
    if artists[i]['genres'] == []:
        count += 1
count

535

#### There are some artists that dont have genres, so we need to remove them out.

In [58]:
artists = list(filter(lambda x: x['genres'] != [], artists))

In [59]:
len(artists)

1415

#### For each artist in the list, we only need the first genre of them (artists may have many genres)
#### For each track in the tracks list, if the first artist of the track is in the artists list above, we will add both the track id and the first genre of the artist as a dictionary to a new list.

In [60]:
genres_list = []

for i in range(len(artists)):
    keys = ['id', 'genre']
    genre = dict.fromkeys(keys)
    for j in range(len(tracks)):
        if artists[i]['id'] == tracks[j]['artists'][0]['id']:
            genre['id'] = tracks[j]['id']
            genre['genre'] = artists[i]['genres'][0]
            genres_list.append(genre)

In [66]:
len(genres_list)

2660

#### Now, we initialize genre for all tracks' audio features.

In [67]:
for i in range(len(track_features_list)):
    for j in range(len(genres_list)):
        if track_features_list[i]['id'] == genres_list[j]['id']:
            track_features_list[i]['genre'] = genres_list[j]['genre']

In [73]:
count = 0
for i in range(len(track_features_list)):
    if len(track_features_list[i]) < 15:
        count += 1
count

1971

#### There are many of tracks' audio features don't have genre. This could be explained by there are many artists don't have genre(s).
#### Now we have to remove them from the list.

In [74]:
track_features_list = list(filter(lambda x: len(x) == 15, track_features_list)) #tracks having genre

In [77]:
len(track_features_list)

893

#### After all steps of handling, only 893 of 3000 tracks have both audio features and genres

### Saving the list of artists' infomation into "audio_features.csv" file

#### Bringing the key "id" to the front

In [75]:
for i in range(len(track_features_list)):
    track_features_list[i] = {'id': track_features_list[i].pop('id'), **track_features_list[i]}

#### Saving to csv file

In [76]:
with open('audio_features.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=track_features_list[0].keys(), delimiter='\t')
    writer.writeheader()
    for data in track_features_list:
        writer.writerow(data)