# <b>Albums Data Crawling</b>

### <b>Libraries Import</b>

In [14]:
import csv
import os

import spotipy
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

### <b>Authentication to Spotify Developer</b>

Get the Client ID, Client Secret and Redirect URI needed to connect to Spotify Developer.

In [15]:
# Load the variables from a local .env file (python-dotenv) into the environment.
load_dotenv()

# Spotify credentials; each name is None when its variable is not set.
client_id, client_secret, redirect_uri = (
    os.environ.get(variable)
    for variable in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET', 'SPOTIPY_REDIRECT_URI')
)

In [16]:
# Confirm the credentials were loaded without echoing the secret values into
# the saved notebook output. The redirect URI is not sensitive, so it is shown as is.
tuple('<redacted>' if secret else None for secret in (client_id, client_secret)) + (redirect_uri,)

('<redacted>',
 '<redacted>',
 'https://localhost:3000/callback')

Authentication - without user

In [17]:
# Client Credentials flow: app-only authentication with no user login or
# browser redirect. The original passed a SpotifyOAuth manager as auth_manager,
# which takes precedence and requires user authorization, while the manager
# built here was never used.
client_credentials_manager = SpotifyClientCredentials(client_id = client_id, client_secret = client_secret)
sp = spotipy.Spotify(auth_manager = client_credentials_manager)

### <b>Call API To Get Data From Last 3 Years</b>

Get 1000 songs of year 2022

In [18]:
# First page of 2022 tracks (50 items per page).
result = sp.search('year:2022', limit = 50, offset = 0)
tracks = result['tracks']['items']

# Follow the paging links for up to 19 more pages (20 pages x 50 = 1000 tracks).
for _ in range(19):
    result = sp.next(result['tracks'])
    # sp.next returns None once there is no further page; stop here instead of
    # failing on result['tracks'] below.
    if result is None:
        break
    tracks.extend(result['tracks']['items'])

Get 1000 songs of year 2021

In [19]:
# Page through the 2021 results. Each request must advance the offset by one
# page, otherwise the same first 50 tracks are fetched 20 times.
for page in range(20):
    result = sp.search('year:2021', limit = 50, offset = page * 50)
    tracks.extend(result['tracks']['items'])

Get 1000 songs of year 2020

In [20]:
# Page through the 2020 results. Each request must advance the offset by one
# page, otherwise the same first 50 tracks are fetched 20 times.
for page in range(20):
    result = sp.search('year:2020', limit = 50, offset = page * 50)
    tracks.extend(result['tracks']['items'])

Total songs we have got

In [21]:
# Number of track records accumulated in `tracks` by the search cells.
len(tracks)

3000

Get albums' uri from 3000 songs above

In [28]:
# Several tracks can belong to the same album, so keep each album URI only
# once. dict.fromkeys preserves first-seen order, which avoids requesting and
# exporting the same album repeatedly.
album_uri = list(dict.fromkeys(track['album']['uri'] for track in tracks))

### <b>Call API To Get Albums' Data</b>

Get the albums' data by calling the API with the album URIs collected in the previous cell.

In [29]:
album_data = []

# sp.albums accepts several albums per request (the Web API caps it at 20),
# so fetch in batches rather than issuing one HTTP call per album.
for start in range(0, len(album_uri), 20):
    batch = sp.albums(album_uri[start:start + 20])
    # Albums that cannot be resolved come back as None; keep only real records.
    album_data.extend(album for album in batch['albums'] if album is not None)

Check features of each data to decide which are necessary.

In [30]:
# Inspect the field names available on the first album record.
album_data[0].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'copyrights', 'external_ids', 'external_urls', 'genres', 'href', 'id', 'images', 'label', 'name', 'popularity', 'release_date', 'release_date_precision', 'total_tracks', 'tracks', 'type', 'uri'])

### <b>Data Export</b>

Store the necessary album information in a dictionary of lists, one list for each of the 6 necessary features.

In [31]:
# One list per exported column. The columns come from a fixed tuple rather
# than from album_data[0], so the cell also works when no albums were fetched
# and does not depend on the shape of the first response.
album_info = {
    field: []
    for field in ('id', 'name', 'popularity', 'available_markets', 'release_date', 'total_tracks')
}

for album in album_data:
    album_info['id'].append(album['id'])
    album_info['name'].append(album['name'])
    # Numeric fields are stored as strings so every column holds text.
    album_info['popularity'].append(str(album['popularity']))
    # Only the number of available markets is kept, not the market list itself.
    album_info['available_markets'].append(str(len(album['available_markets'])))
    album_info['release_date'].append(album['release_date'])
    album_info['total_tracks'].append(str(album['total_tracks']))

Check whether the albums' information was stored successfully.

In [None]:
# Preview a handful of album names instead of printing the whole list.
album_info['name'][:10]

Write albums' information above into a tsv file.

In [21]:
# Column order of the exported file.
columns = ['id', 'name', 'popularity', 'available_markets', 'release_date', 'total_tracks']

# newline = '' lets the csv module control line endings itself.
with open('../../data/albums_data.tsv', 'w', encoding = "utf-8", newline = '') as f:
    # csv.writer quotes any field containing a tab, quote or line break, so an
    # unusual album name cannot split a row or shift its columns.
    writer = csv.writer(f, delimiter = '\t', lineterminator = '\n')
    writer.writerow(columns)
    # Transpose the per-column lists into one row per album.
    writer.writerows(zip(*(album_info[column] for column in columns)))