# <b>Section 1: Data Crawling</b>

### <b><u>Step 1</u>: Import libraries</b>

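These are the main libraries used for data crawling:
- spotipy: Python client for the Spotify Web API, used here to authenticate and to query tracks, artists, and albums.
- dotenv: loads the Spotify credentials from a local `.env` file into environment variables.
- os: reads those environment variables.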

In [None]:
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv
import spotipy
import os

### <b><u>Step 2</u>: Request access to Spotify API by using OAuth method</b>

In [3]:
# Build the authenticated Spotify client used by every later cell.

# Load the SPOTIPY_* settings from a local .env file into the environment.
load_dotenv()

client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')

# Kept so that any later cell referring to this name still works; it is no
# longer handed to the client (see below).
client_credentials_manager = SpotifyClientCredentials()

# spotipy documents `client_credentials_manager` as deprecated in favour of
# `auth_manager`, so the client is built from a single, explicit OAuth manager
# instead of constructing a second credentials object that is never consulted.
auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

### <b><u>Step 3</u>: Crawl top 3000 songs from Spotify from 2020-2022</b>

In [4]:
# Fetch the first page (up to 50 items) of search results for the query '2022'
# and show which fields the paged 'tracks' object exposes.
# NOTE(review): '2022' is sent as a plain search term; if the intent is tracks
# released in 2022, Spotify's search syntax has a `year:` field filter - confirm.
result = sp.search(q='2022', limit=50)
tracks_page = result['tracks']
tracks_page.keys()

dict_keys(['href', 'items', 'limit', 'next', 'offset', 'previous', 'total'])

In [5]:
# Seed the track collection with the first page of the '2022' search, then
# follow the paging links to gather up to 19 further pages.
songs_data = result['tracks']['items']

for _ in range(19):
    next_page = sp.next(result['tracks'])
    if next_page is None:
        # spotipy returns None once there is no 'next' link; stop here instead
        # of failing on result['tracks'] in the following iteration.
        break
    result = next_page
    songs_data.extend(result['tracks']['items'])

In [6]:
# Collect 20 pages of search results for the query '2021', requesting each
# page by explicit offset, and append them to the shared track list.
page_size = 50
for i in range(20):
    result = sp.search(q='2021', offset=i * page_size, limit=page_size)
    songs_data += result['tracks']['items']

In [7]:
# Collect 20 pages of search results for the query '2020', requesting each
# page by explicit offset, and append them to the shared track list.
page_size = 50
for i in range(20):
    result = sp.search(q='2020', offset=i * page_size, limit=page_size)
    songs_data += result['tracks']['items']

In [10]:
# Peek at the fields available on a single crawled track record.
first_track = songs_data[0]
first_track.keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

### <b><u>Step 4</u>: Store songs data to 'songs_data.tsv' file</b>

In [11]:
# Export one tab-separated row per crawled track to 'songs_data.tsv', enriched
# with details of each credited artist and of the track's album fetched from
# the Spotify API.

# Artist and album lookups are remembered by URI so that an artist or album
# shared by several tracks is only requested once.
artist_cache = {}
album_cache = {}

# An explicit encoding keeps non-ASCII track/artist/album names writable
# regardless of the platform's default text encoding.
with open("songs_data.tsv", 'w', encoding='utf-8') as f:
    f.write("id\tname\tartist\tgenres\tartist_followers\tartist_popularity\tmarkets\talbum\treleased_date\talbum_popularity\tduration\texplicit\tpopularity\n")
    for track in songs_data:
        # Per-artist values, kept in the order the artists are credited.
        # Starting from empty lists also covers a track with no artists.
        artist_names = []
        artist_genres = []
        artist_followers = []
        artist_popularity = []

        for artist in track['artists']:
            artist_uri = artist['uri']
            if artist_uri not in artist_cache:
                artist_cache[artist_uri] = sp.artist(artist_uri)
            details = artist_cache[artist_uri]

            artist_names.append(details['name'])
            artist_genres.extend(details['genres'])
            artist_followers.append(str(details['followers']['total']))
            artist_popularity.append(str(details['popularity']))

        album_uri = track['album']['uri']
        if album_uri not in album_cache:
            album_cache[album_uri] = sp.album(album_uri)
        album_popularity = str(album_cache[album_uri]['popularity'])

        # Progress marker: one line is printed per track processed.
        print(1)

        row = [
            track['id'],
            track['name'],
            ','.join(artist_names),
            # dict.fromkeys drops duplicate genres while keeping first-seen
            # order, so the column is the same on every run (a set is not).
            ','.join(dict.fromkeys(artist_genres)),
            ','.join(artist_followers),
            ','.join(artist_popularity),
            # Only the number of available markets is stored, not the list.
            str(len(track['available_markets'])),
            track['album']['name'],
            track['album']['release_date'],
            album_popularity,
            str(track['duration_ms']),
            str(track['explicit']),
            str(track['popularity']),
        ]
        f.write('\t'.join(row) + '\n')


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
