# <b>Section 1: Data Crawling</b>

### <b><u>Step 1</u>: Import library</b>

These are the main libraries used for data crawling:
- `spotipy`: Spotipy is a lightweight Python library for the Spotify Web API. With Spotipy you get full access to all of the music data provided by the Spotify platform.
- `dotenv`: loads spotipy's client ID, client secret and redirect URI from the `.env` file
- `os`: reads those values from the environment variables once the `.env` file has been loaded
- `csv`: writes the crawled data to CSV files

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv
import spotipy
import csv
import os

### <b><u>Step 2</u>: Request access to the Spotify API using the OAuth method</b>

First, we load the `.env` file to get spotipy's client ID, client secret and redirect URI, which are needed to access Spotify's API service using the OAuth method.

After that, we initialize a `SpotifyClientCredentials` object and a `SpotifyOAuth` object (built from the client ID, client secret and redirect URI) and pass them to `spotipy.Spotify` to get permission to use the API service.

In [None]:
# Load the variables defined in the local .env file into the process
# environment so the os.getenv() calls below can see them.
load_dotenv()

client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')

# Build each credentials manager exactly once and hand the same objects to
# the client, instead of constructing a second SpotifyClientCredentials inline.
client_credentials_manager = SpotifyClientCredentials()
oauth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
)

# NOTE(review): spotipy appears to prefer `auth_manager` when both managers
# are supplied, which would make `client_credentials_manager` redundant here;
# it is kept so the call stays equivalent -- confirm before removing it.
sp = spotipy.Spotify(
    auth_manager=oauth_manager,
    client_credentials_manager=client_credentials_manager,
)

### <b><u>Step 3</u>: Crawl the top 3000 songs on Spotify from 2020-2022</b>

Search Spotify for tracks released in 2022 and request the first page of 50 results.

In [None]:
# First page of the 2022 search; `limit=50` is the page size requested here.
# The following cell reads the tracks from result['tracks'].
result = sp.search(q='year:2022', limit=50)

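Start the song list with that first page, then follow the paging links for 19 more pages (20 pages of 50 tracks, so up to 1,000 tracks for 2022).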

In [None]:
# Seed the running list of tracks with the first page of 2022 results.
songs_data = result['tracks']['items']

# Follow the paging links for up to 19 further pages (20 pages x 50 tracks).
for _ in range(19):
    result = sp.next(result['tracks'])
    # sp.next() gives back None once there is no further page; stop there
    # rather than failing on the subscript below.
    if result is None:
        break
    songs_data.extend(result['tracks']['items'])

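Repeat for 2021: request 20 pages of 50 tracks each, this time using an explicit `offset`, and append them to the song list.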

In [None]:
# Walk the 2021 search results in steps of 50 (offsets 0, 50, ..., 950),
# appending every page of tracks to the running list.
for offset in range(0, 20 * 50, 50):
    result = sp.search(q='year:2021', limit=50, offset=offset)
    songs_data += result['tracks']['items']

Do the same for 2020.

In [None]:
# Walk the 2020 search results in steps of 50 (offsets 0, 50, ..., 950),
# appending every page of tracks to the running list.
for offset in range(0, 20 * 50, 50):
    result = sp.search(q='year:2020', limit=50, offset=offset)
    songs_data += result['tracks']['items']

### <b><u>Step 4</u>: Getting artists' ID from the list of tracks</b>

In [None]:
# One inner list of artist IDs per track, in the same order as songs_data.
artists_id = [
    [artist['id'] for artist in track['artists']]
    for track in songs_data
]

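For each artist ID, request the artist's details from Spotify and keep the `id`, `name`, `genres` (joined into one string), total `followers` and `popularity`.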

In [None]:
# artists_id holds one *list* of IDs per track, while sp.artist() looks up a
# single artist, so flatten the nested lists first. dict.fromkeys() drops
# repeated IDs while keeping first-seen order, so each artist is requested
# (and stored) only once.
unique_artist_ids = dict.fromkeys(
    artist_id
    for track_artist_ids in artists_id
    for artist_id in track_artist_ids
)

artists = []
for artist_id in unique_artist_ids:
    artist_info = sp.artist(artist_id=artist_id)
    artists.append({
        'id': artist_info['id'],
        'name': artist_info['name'],
        # Genres arrive as a list; store them as one comma-separated string.
        'genres': ', '.join(artist_info['genres']),
        # Only the follower count is kept from the nested 'followers' object.
        'followers': artist_info['followers']['total'],
        'popularity': artist_info['popularity'],
    })

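### <b><u>Step 5</u>: Saving the list of artists data into "artists_data.csv" file</b>

Write the artist records to `../../data/artists_data.csv` with a header row. Despite the `.csv` extension, the fields are separated by tabs.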

In [None]:
# Take the column names from the first record; fall back to the expected
# columns so an empty crawl still produces a header-only file instead of
# raising IndexError.
artist_fieldnames = (
    list(artists[0]) if artists
    else ['id', 'name', 'genres', 'followers', 'popularity']
)

# newline='' hands line-ending control to the csv module (without it, blank
# rows can appear between records on Windows). utf-8 keeps non-ASCII artist
# names writable regardless of the platform's default encoding and matches
# the encoding used for albums_data.tsv.
with open('../../data/artists_data.csv', 'w', newline='', encoding='utf-8') as file:
    # Despite the .csv extension, fields are tab-separated.
    writer = csv.DictWriter(file, fieldnames=artist_fieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(artists)

### <b><u>Step 6</u>: Getting albums' URI from the list of songs data</b>

Collect the album URI of every track in the song list.

In [None]:
# Album URI of every track, aligned index-for-index with songs_data.
album_uri = [track['album']['uri'] for track in songs_data]

Request the full album details from Spotify for each URI.

In [None]:
# Tracks can share an album, so request each distinct URI only once
# (dict.fromkeys keeps first-seen order) ...
albums_by_uri = {uri: sp.album(uri) for uri in dict.fromkeys(album_uri)}
# ... then rebuild the list so it stays aligned, entry for entry, with album_uri.
album_data = [albums_by_uri[uri] for uri in album_uri]

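### <b><u>Step 7</u>: Store albums data to 'albums_data.tsv' file</b>

Gather the columns to export for each album: `id`, `name`, `popularity`, the number of `available_markets`, `release_date` and `total_tracks`.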

In [None]:
# Column-oriented view of the albums: one list per exported column, each
# aligned with album_data. The columns are spelled out explicitly so an empty
# album_data yields empty columns instead of raising IndexError.
# Numeric fields are stored as strings, ready to be joined into text rows.
album_info = {
    'id': [album['id'] for album in album_data],
    'name': [album['name'] for album in album_data],
    'popularity': [str(album['popularity']) for album in album_data],
    # Only the number of markets is kept, not the market codes themselves.
    'available_markets': [str(len(album['available_markets'])) for album in album_data],
    'release_date': [album['release_date'] for album in album_data],
    'total_tracks': [str(album['total_tracks']) for album in album_data],
}

Write those columns to `../../data/albums_data.tsv` as tab-separated values with a header row.

In [None]:
# Column order of the output file; it mirrors the header line written below.
album_columns = ('id', 'name', 'popularity', 'available_markets', 'release_date', 'total_tracks')

with open('../../data/albums_data.tsv', 'w', encoding = "utf-8") as f:
    f.write('id\tname\tpopularity\tavailable_markets\trelease_date\ttotal_tracks\n')

    # zip() turns the per-column lists back into one tuple per album.
    for album_row in zip(*(album_info[column] for column in album_columns)):
        f.write('\t'.join(album_row) + '\n')

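### <b><u>Step 8</u>: Store songs data to 'songs_data.tsv' file</b>

For each track, look up its artists and album on Spotify, then write one tab-separated row containing the track, artist and album fields listed in the header.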

In [None]:
# Artists and albums recur across tracks; remember each response so the same
# URI is requested from the API only once during this export.
song_artist_cache = {}
song_album_popularity_cache = {}

# utf-8 keeps non-ASCII track/artist/album names writable regardless of the
# platform's default encoding and matches albums_data.tsv.
with open("../../data/songs_data.tsv", 'w', encoding="utf-8") as f:
    f.write("id\tname\tartist\tgenres\tartist_followers\tartist_popularity\tmarkets\talbum\treleased_date\talbum_popularity\tduration\texplicit\tpopularity\n")
    for track in songs_data:
        # Per-track accumulators; a track with no artists simply yields
        # empty artist fields instead of raising.
        artist_names = []
        artist_genres = []
        artist_followers = []
        artist_popularities = []

        for track_artist in track['artists']:
            artist_uri = track_artist['uri']
            if artist_uri not in song_artist_cache:
                song_artist_cache[artist_uri] = sp.artist(artist_uri)
            artist_details = song_artist_cache[artist_uri]

            artist_names.append(artist_details['name'])
            artist_genres.extend(artist_details['genres'])
            artist_followers.append(str(artist_details['followers']['total']))
            artist_popularities.append(str(artist_details['popularity']))

        track_album_uri = track['album']['uri']
        if track_album_uri not in song_album_popularity_cache:
            song_album_popularity_cache[track_album_uri] = str(sp.album(track_album_uri)['popularity'])
        album_popularity = song_album_popularity_cache[track_album_uri]

        # Field order mirrors the header line above.
        row_fields = [
            track['id'],
            track['name'],
            ','.join(artist_names),
            # dict.fromkeys() removes duplicate genres while keeping
            # first-seen order, so the output is the same on every run
            # (a set would give an arbitrary order).
            ','.join(dict.fromkeys(artist_genres)),
            ','.join(artist_followers),
            ','.join(artist_popularities),
            str(len(track['available_markets'])),
            track['album']['name'],
            track['album']['release_date'],
            album_popularity,
            str(track['duration_ms']),
            str(track['explicit']),
            str(track['popularity']),
        ]
        f.write('\t'.join(row_fields) + '\n')
