# Data Collection 

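In this notebook we will generate the relevant datasets for analyzing the songs in your *Spotify* library. To make sure the code runs, provide your Spotify and Genius API credentials as environment variables (for example in a `.env` file, which is loaded below) before executing the cells.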

## 0.0 Import Relevant Libraries

In [17]:
# the two must have libraries in any data science project 
import pandas as pd 
import numpy as np


import os
import time
import requests


# for handling our web requests and html 
from bs4 import BeautifulSoup

# for handling environment variables 
from dotenv import dotenv_values
from dotenv import load_dotenv

# handle our language detection 
from langdetect import detect

# for handling spotify endpoints 
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Helps Make our Notebook Pretty
from IPython.display import clear_output

## 1.0 Define Global Variables 

In [18]:
# Load variables from a local `.env` file into the process environment.
load_dotenv()

# Scope requested for the spotipy client.
scope = "user-library-read"

# Authenticated Spotify client shared by every cell and helper below.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# Relative path to where data will be stored.
data_dir = "datasets/"

# Create the output directory up front so the `to_csv` calls later in the
# notebook do not fail on a fresh checkout where it does not exist yet.
os.makedirs(data_dir, exist_ok=True)

## 1.1 Define helper functions 

In [19]:
def get_genre(track_id):
    """Look up the genres attached to a track's first listed artist.

    Genre is not a property of the track itself but of the artist, so the
    track is resolved first and the genres are then read from the artist.

    Args:
        track_id (str): Spotify track id

    Returns:
        list: genres associated with the track's first listed artist
    """
    # Track endpoint -> id of the first listed artist.
    primary_artist_id = sp.track(track_id)['artists'][0]['id']

    # Artist endpoint -> list of genres for that artist.
    return sp.artist(primary_artist_id)['genres']


def get_genius_access_token():
    """Request an access token from the Genius API.

    Posts the module-level ``GENIUS_CLIENT_ID`` and ``GENIUS_CLIENT_SECRET``
    to the token endpoint using the ``client_credentials`` grant.

    Returns:
        str | None: the ``access_token`` field of the response when the
        request returns HTTP 200; otherwise ``None`` (after printing a hint).
    """
    token_url = 'https://api.genius.com/oauth/token'
    credentials = {
        'client_id': GENIUS_CLIENT_ID,
        'client_secret': GENIUS_CLIENT_SECRET,
        'grant_type': 'client_credentials',
    }

    reply = requests.post(
        token_url,
        headers={'Content-Type': 'application/json'},
        json=credentials,
    )

    # Guard clause: anything other than HTTP 200 is treated as a failure.
    if reply.status_code != 200:
        print("Failed to fetch access token. Please check your client ID and client secret.")
        return None

    return reply.json()['access_token']
    
    
def get_lyrics(song_title, artist_name):
    """Get the lyrics for a song using the Genius API.

    Searches Genius for ``"<song_title> <artist_name>"``, takes the first
    search hit, downloads that song page and extracts the text of its
    lyrics containers.

    Args:
        song_title (str): title of the song.
        artist_name (str): artist name.

    Returns:
        str | None:
            * the scraped lyrics, sections separated by a newline (this is
              the empty string when the page has no matching container);
            * ``None`` when no access token could be obtained or the search
              returned no hits;
            * the sentinel string ``"API Call Failed"`` when the search
              request did not return HTTP 200.
    """
    # A token is requested on every call.
    # Credentials come from https://genius.com/api-clients
    access_token = get_genius_access_token()
    if not access_token:
        return None

    # Search endpoint, authenticated with the bearer token.
    endpoint = 'https://api.genius.com/search'
    headers = {'Authorization': f'Bearer {access_token}'}
    params = {
        'q': f'{song_title} {artist_name}',
    }

    response = requests.get(endpoint, headers=headers, params=params)

    if response.status_code != 200:
        print("Failed to get lyrics. Please check your client ID and client secret.")
        # Kept as a sentinel string (rather than None) so existing callers
        # that distinguish a failed API call from "no match" keep working.
        return "API Call Failed"

    matches = response.json()['response']['hits']
    if not matches:
        print(f"No lyrics found for {song_title} by {artist_name}.")
        return None

    # Only the first search hit is used.
    song_lyrics_url = matches[0]['result']['url']

    # The lyrics themselves are scraped from the song's HTML page.
    lyrics_response = requests.get(song_lyrics_url)
    soup = BeautifulSoup(lyrics_response.content, "html.parser")

    sections = []
    for tag in soup.select('div[class^="Lyrics__Container"], .song_body-lyrics p'):
        text = tag.get_text(strip=True, separator='\n')
        if text:
            sections.append(text)

    # Join sections with a newline: concatenating them back-to-back fuses
    # the last line of one container with the first line of the next.
    return '\n'.join(sections)
    
    
def lang_detect(lyrics):
    """Detect the language of the lyrics using langdetect.

    Detection is best-effort: songs whose lyrics could not be retrieved
    from the Genius API (``None``, non-string values, empty or
    whitespace-only text) and any error raised by ``detect`` yield ``None``
    instead of an exception.

    Args:
        lyrics (str): The lyrics to detect the language of
    Returns:
        str | None: The language code reported by ``detect``, or ``None``
        when no language could be determined.
    """
    # Nothing to detect on missing or empty lyrics.
    if not isinstance(lyrics, str) or not lyrics.strip():
        return None

    try:
        return detect(lyrics)
    except Exception:
        # `Exception` rather than a bare `except:` so that a keyboard/kernel
        # interrupt still stops a long-running `.apply(lang_detect)`.
        return None

## 2.0 Get our *liked songs* library from *Spotify*

In [20]:
offset = 0  # position of the first track to retrieve in the next request
limit = 50  # page size per request (max allowed at once before receiving an error)

# One DataFrame per fetched page. They are concatenated once after the loop;
# calling `pd.concat` inside the loop re-copies every previously fetched row
# on each iteration.
pages = []

while True:
    # Get the next page of tracks from the user's `liked songs`
    results = sp.current_user_saved_tracks(limit=limit, offset=offset)

    # Flatten the nested JSON items into columns such as `track.id`
    pages.append(pd.json_normalize(results['items']))

    # A page shorter than `limit` means the end of the library was reached
    if len(results['items']) < limit:
        break

    # Otherwise move the window forward for the next request
    offset += limit

# The DataFrame containing our library. Row labels are not unique at this
# point (each page carries its own labels); the next cell resets the index.
LIB = pd.concat(pages)

In [21]:
# Reset the index of our library `LIB` so that every row gets a unique
# 0..n-1 label; later cells look rows up by label via `LIB.index`.
LIB = LIB.reset_index(drop=True) 

## 2.1 Adding Genre to our library

In [22]:
genres = []  # one list of genres per song, in `LIB` row order
for i, track_id in LIB['track.id'].items():
    # Look up the genres of this track's artist with our helper
    genre = get_genre(track_id)
    genres.append(genre)

    # Progress display: replace the previous output with the current row
    # label and the genres that were just fetched
    clear_output(wait=True)
    print(i, flush=True)
    print(genre, flush=True)

2012
['canadian hip hop', 'canadian pop', 'hip hop', 'pop', 'rap', 'toronto rap']


In [23]:
# Attach the genre lists as a new column. `genres` was built by iterating
# `LIB` in row order, so the values line up with the rows positionally.
LIB['genre'] = genres

## 2.2 Inspect and Save our `LIB` DataFrame 

In [24]:
# Preview the first rows of the library, including the new `genre` column.
LIB.head()

Unnamed: 0,added_at,track.album.album_group,track.album.album_type,track.album.artists,track.album.available_markets,track.album.external_urls.spotify,track.album.href,track.album.id,track.album.images,track.album.is_playable,...,track.href,track.id,track.is_local,track.name,track.popularity,track.preview_url,track.track_number,track.type,track.uri,genre
0,2023-04-11T15:06:37Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6tG8sCK4htJOLjl...,https://api.spotify.com/v1/albums/6tG8sCK4htJO...,6tG8sCK4htJOLjlWwb7gZB,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,https://api.spotify.com/v1/tracks/7HdXRMw14roD...,7HdXRMw14roDx2a0COWk3M,False,This Is Why,74,https://p.scdn.co/mp3-preview/41728b2155b6603b...,1,track,spotify:track:7HdXRMw14roDx2a0COWk3M,"[candy pop, modern rock, pixie, pop, pop emo, ..."
1,2023-04-11T11:33:20Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/1nJzOScwb2SsJte...,https://api.spotify.com/v1/albums/1nJzOScwb2Ss...,1nJzOScwb2SsJtemleHGrI,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,https://api.spotify.com/v1/tracks/1p14xNSMBasf...,1p14xNSMBasfxLqTBn7k6L,False,Pistoleros Famosos,65,https://p.scdn.co/mp3-preview/fd184e0a47c712fa...,1,track,spotify:track:1p14xNSMBasfxLqTBn7k6L,"[banda, corridos clasicos, musica mexicana, no..."
2,2023-04-10T01:13:38Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6kZ42qRrzov54Lc...,https://api.spotify.com/v1/albums/6kZ42qRrzov5...,6kZ42qRrzov54LcAk4onW9,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,https://api.spotify.com/v1/tracks/4OmFmE0fzcMG...,4OmFmE0fzcMG6g0Y8p4eSD,False,Better Man (Taylor's Version) (From The Vault),70,https://p.scdn.co/mp3-preview/63cfaaef1a487995...,22,track,spotify:track:4OmFmE0fzcMG6g0Y8p4eSD,[pop]
3,2023-04-09T07:18:17Z,single,single,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/2OZO8I4Z79M8VN2...,https://api.spotify.com/v1/albums/2OZO8I4Z79M8...,2OZO8I4Z79M8VN2H0wgjEp,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,https://api.spotify.com/v1/tracks/5ZjFa8NE9MHK...,5ZjFa8NE9MHKBPNefxIh88,False,Give It To Me - Sped Up Version,66,https://p.scdn.co/mp3-preview/eaf69b89d382a2e1...,2,track,spotify:track:5ZjFa8NE9MHKBPNefxIh88,[]
4,2023-04-09T02:07:04Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/4aJQ9c9XNSJ9eiC...,https://api.spotify.com/v1/albums/4aJQ9c9XNSJ9...,4aJQ9c9XNSJ9eiCrmqH3S3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",True,...,https://api.spotify.com/v1/tracks/171SFtWNviZ7...,171SFtWNviZ7Tp10zxNdpN,False,Union Maid,9,https://p.scdn.co/mp3-preview/81fa22cf7f759d9c...,24,track,spotify:track:171SFtWNviZ7Tp10zxNdpN,"[american folk revival, folk, protest folk, si..."


In [25]:
# Persist the library as CSV inside `data_dir`, without the row index.
LIB.to_csv(f"{data_dir}LIB.csv", index=False)

In [28]:
# Spot-check the genre entry of the first track (row label 0).
LIB['genre'][0]

['candy pop', 'modern rock', 'pixie', 'pop', 'pop emo', 'pop punk', 'rock']

## 3.0 Generate CORPUS

## 3.1 Load Your Credentials for the Genius API

In [10]:
# Get client ID and client secret from environment variables
GENIUS_CLIENT_ID = os.getenv('GENIUS_CLIENT_ID')
GENIUS_CLIENT_SECRET = os.getenv('GENIUS_CLIENT_SECRET')

## 3.2 Download the Lyrics for Your Library

In [11]:
# Collect one record per track and build the DataFrame once after the loop;
# growing a DataFrame with `pd.concat` inside the loop re-copies all
# previously collected rows on every iteration.
records = []

total_tracks = len(LIB)
for done, i in enumerate(LIB.index, start=1):

    song_title = LIB.loc[i, 'track.name']
    # NOTE(review): assumes `track.artists` still holds the list of artist
    # dicts returned by the API (i.e. `LIB` was not reloaded from the CSV,
    # where this column would be text) - confirm.
    artist_name = LIB.loc[i, 'track.artists'][0]['name']
    lyrics = get_lyrics(song_title, artist_name)
    records.append({'artist': artist_name,
                    'song': song_title,
                    'lyrics': lyrics})

    # slow down our requests so that we can continue making them.
    time.sleep(0.25)

    # Progress display: percentage and [done/total], followed by the lyrics
    # that were just fetched. `done` counts processed tracks, so the display
    # does not depend on the row labels of `LIB`.
    clear_output(wait=True)
    print(str(np.round(100*done/total_tracks,2))+"%"+f'       [{done}/{total_tracks}]   \n', flush=True)
    print(lyrics,flush=True)

# Explicit columns keep the frame well-formed even when the library is empty.
CORPUS = pd.DataFrame(records, columns=['artist', 'song', 'lyrics'])

100.0%       [2013/2013]   

[Intro: Ericka Lee]
Hello?
Yeah, I just walked in
Yeah, I'm good, you still working?
Tonight
, right now?
Did I go out? Yeah, I went out, I went
I went to a couple of clubs
I never went to bed, shit
Wine or water?
Did you say something about a cold drink? I don't know
I'm
delirious
[Verse 1: Drake]
Cups of the rosé
Bitches in my old phone
I should call one and go home
I've been in this club too long
The woman that I would try
Is happy with a good guy
But I've been drinkin' so much
That I'ma call her anyway and say
[Chorus: Drake]
Fuck that nigga that you love so bad
I know you still think about the times we had
I say fuck that nigga that you think you found
And since you pick up, I know he's not around, oh, oh
[Post-Chorus: Drake &
Ericka Lee
]
Are you drunk right now?
I'm just sayin' you could do better
Tell me, have you heard that lately?
I'm just sayin' you could do better
And I'll start hatin' only if you make me
[Verse 2: Drake]
Uh, cups of the XO
All 

In [12]:
# Give `CORPUS` a fresh 0..n-1 row index (the old index is discarded)
# before new columns are added below.
CORPUS= CORPUS.reset_index(drop=True)

## 3.3 Add the Language as a Feature

In [13]:
# Detect the language of every song's lyrics. `lang_detect` returns None
# when detection fails, e.g. for songs whose lyrics could not be retrieved.
CORPUS['language'] = CORPUS['lyrics'].apply(lang_detect)

In [18]:
# Add the full language name for each detected language code.
# The map is available here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# The `langdetect` documentation can be found here: https://pypi.org/project/langdetect/

# NOTE(review): the file name is spelled 'langugage_map.csv'; it is kept
# as-is because it has to match the file on disk - confirm.
lang_map = pd.read_csv('langugage_map.csv')

# Build the code -> name lookup once instead of filtering the whole table for
# every song. Rows without a code are ignored and, when a code appears more
# than once, the first row wins (same result as taking the first match).
first_per_code = (
    lang_map
    .dropna(subset=['639-1'])
    .drop_duplicates(subset='639-1', keep='first')
)
code_to_name = dict(zip(first_per_code['639-1'], first_per_code['ISO language name']))

# Codes without an entry in the map - including None for songs whose language
# could not be detected - get None as their language name.
lang_names = [code_to_name.get(code) for code in CORPUS['language']]

In [22]:
# Store the full language names next to the language codes; `lang_names`
# holds one entry per row of `CORPUS`, in row order.
CORPUS['language_name'] = lang_names

## 3.4 Inspect and Save our `CORPUS` DataFrame 

In [23]:
# Preview the first rows of the lyrics corpus.
CORPUS.head()

Unnamed: 0,artist,song,lyrics,language,language_name
0,Paramore,This Is Why,[Verse 1]\nIf you have an opinion\nMaybe you s...,en,English
1,Los Cadetes De Linares,Pistoleros Famosos,Por las márgenes del rio\nDe reinos hasta lare...,es,"Spanish, Castilian"
2,Taylor Swift,Better Man (Taylor's Version) (From The Vault),[Verse 1]\nI know I'm probably better off on m...,en,English
3,FYLOW,Give It To Me - Sped Up Version,[Verse 1: ReyTheStinger]\nI was scrolling down...,en,English
4,Woody Guthrie,Union Maid,"There once was a union maid, she never was afr...",en,English


In [24]:
# Persist the lyrics corpus as CSV inside `data_dir`, without the row index.
CORPUS.to_csv(f"{data_dir}CORPUS.csv", index=False)