# Setup

In [1]:
from dotenv import load_dotenv
import os
import spotipy
from lyricsgenius import Genius
from spotipy.oauth2 import SpotifyClientCredentials
import sys

# Adding the parent directory to the module search path so that `src` can be imported
sys.path.insert(0,'..')

from src.data.make_dataset import create_dataset

In [2]:
# Read the Spotify and Genius API credentials from the .env file that lives
# in the parent directory.
dotenv_path = os.path.join(os.pardir, '.env')
load_dotenv(dotenv_path)

# Each lookup yields None when the variable is not defined in the environment.
SPOTIFY_API_CLIENT_ID = os.environ.get('SPOTIFY_API_CLIENT_ID')
SPOTIFY_API_CLIENT_SECRET = os.environ.get('SPOTIFY_API_CLIENT_SECRET')
GENIUS_ACCESS_TOKEN = os.environ.get('GENIUS_ACCESS_TOKEN')

In [3]:
# Destination CSV for the scraped data set: <parent dir>/data/raw/data.csv
path_to_save_df = os.path.join(os.pardir, 'data', 'raw', 'data.csv')

In [4]:
# Build the Spotify client, authenticating with the client id / secret
# loaded from the environment.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIFY_API_CLIENT_ID,
    client_secret=SPOTIFY_API_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=client_credentials_manager)

In [5]:
# Build the Genius client: 10 s timeout, up to 5 retries, no console
# messages, and section headers removed from the returned lyrics.
genius = Genius(
    GENIUS_ACCESS_TOKEN,
    timeout=10,
    retries=5,
    verbose=False,
    remove_section_headers=True,
)

In [6]:
# Selecting the 10 subjectively most popular music genres
# from the list generated by the call below

# sp.recommendation_genre_seeds()

In [7]:
# The ten genres to scrape, in the order they are handed to create_dataset.
chosen_genres_10 = [
    'blues',
    'country',
    'disco',
    'hip-hop',
    'pop',
    'punk',
    'reggae',
    'rock',
    'r-n-b',
    'jazz',
]

# Scraping data

## Example

In [8]:
# Small example run: 6 samples per genre (60 in total, per the progress log),
# with no pause between requests.
df = create_dataset(
    chosen_genres_10,
    sp,
    genius,
    limit=3,
    how_many_in_genre=6,
    sleep_time=0,
)

Number of scraped samples: 0/60
Number of scraped samples: 3/60. Time: 0.04 min
Number of scraped samples: 6/60. Time: 0.03 min
Number of scraped samples: 9/60. Time: 0.03 min
Number of scraped samples: 12/60. Time: 0.03 min
Number of scraped samples: 15/60. Time: 0.04 min
Number of scraped samples: 18/60. Time: 0.09 min
Number of scraped samples: 21/60. Time: 0.10 min
Number of scraped samples: 24/60. Time: 0.05 min
Number of scraped samples: 27/60. Time: 0.05 min
Number of scraped samples: 30/60. Time: 0.04 min
Number of scraped samples: 33/60. Time: 0.05 min
Number of scraped samples: 36/60. Time: 0.04 min
Number of scraped samples: 39/60. Time: 0.08 min
Number of scraped samples: 42/60. Time: 0.04 min
Number of scraped samples: 45/60. Time: 0.05 min
Number of scraped samples: 48/60. Time: 0.04 min
Number of scraped samples: 51/60. Time: 0.05 min
Number of scraped samples: 54/60. Time: 0.04 min
Number of scraped samples: 57/60. Time: 0.05 min
Number of scraped samples: 60/60. Time: 

In [9]:
# Preview the first rows of the example data frame
df.head()

Unnamed: 0,artist_name,track_name,popularity,genre,lyrics
0,Cage The Elephant,Cigarette Daydreams,80,punk,"Did you stand there all alone?\nOh, I cannot e..."
1,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow),96,hip-hop,"(D-D-Daytrip took it to ten, hey)\nBaby back, ..."
2,Michael Bublé,Holly Jolly Christmas,86,jazz,"Have a holly, jolly Christmas\nIt's the best t..."
3,Commodores,Easy,76,disco,"I know it sounds funny, but I just can't stand..."
4,Luke Combs,Forever After All,80,country,A cold beer's got twelve ounces\nA good truck'...


In [10]:
# Summarise the columns, non-null counts and dtypes of the example data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   artist_name  60 non-null     object
 1   track_name   60 non-null     object
 2   popularity   60 non-null     int64 
 3   genre        60 non-null     object
 4   lyrics       60 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.8+ KB


## Final data frame

We will scrape data from 10 popular music genres. For each genre there will be a maximum of 1,000 unique observations. Be aware that the function can take a long time to finish.

In [11]:
# Full run: up to 1,000 samples per genre, with the destination CSV path
# passed to create_dataset and progress output switched off.
df = create_dataset(
    chosen_genres_10,
    sp,
    genius,
    limit=50,
    how_many_in_genre=1_000,
    sleep_time=10,
    path_to_save=path_to_save_df,
    verbose=False,
)