# Data Collection 

In this notebook we will generate the relevant datasets for analyzing the songs in your *Spotify* library. To make sure the code runs and 

## 0.0 Import Relevant Libraries

In [1]:
# the two must have libraries in any data science project 
import pandas as pd 
import numpy as np

import os
import requests

# for handling our web requests and html 
from bs4 import BeautifulSoup

# for handling environment variables 
from dotenv import dotenv_values
from dotenv import load_dotenv

# handle our language detection 
from langdetect import detect

# for handling spotify endpoints 
import spotipy
from spotipy.oauth2 import SpotifyOAuth
load_dotenv()

# 
from IPython.display import clear_output

## 1.0 Define Global Variables 

In [2]:
# load your environment variables 
load_dotenv()

# define the scope of your spotipy client 
scope = "user-library-read"

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# relative path to where data will be stored. 
data_dir = "datasets/"

## 1.1 Define helper functions 

In [36]:
def get_genre(track_id):
    """Gets the genre of a track. Important to notice
    that the genre is not a property of the track, but
    a property of the artist. Therefore, the genre is
    retrieved from the artist.

    Args:
        track_id (str): Spotify track id

    Returns:
        list: list  of genres associated with the artist
    """
    # Uses track endpoint to get artist id
    track_info = sp.track(track_id)
    
    # Uses artist endpoint to get genres associated with artist
    artist_info = sp.artist(track_info['artists'][0]['id'])
    
    # Returns list of genres associated with artist
    genre = artist_info['genres']
    
    return genre


def get_genius_access_token():
    """Get an access token from Genius API"""
    
    # Set API endpoint and headers
    endpoint = 'https://api.genius.com/oauth/token'
    headers = {'Content-Type': 'application/json'}
    
    # Set request parameters
    data = {
        'client_id': GENIUS_CLIENT_ID,
        'client_secret': GENIUS_CLIENT_SECRET,
        'grant_type': 'client_credentials'
    }
    
    # Send API request
    response = requests.post(endpoint, headers=headers, json=data)
    
    # Check if request was successful
    if response.status_code == 200:
        data = response.json()
        access_token = data['access_token']
        
        return access_token
    else:
        print("Failed to fetch access token. Please check your client ID and client secret.")
        return None
    
    
def get_lyrics(song_title, artist_name):
    """Get the lyrics for a song using Genius API.

    Args:
        song_title (str): title of the song.
        artist_name (str): artist name.

    Returns:
        str: lyrics of the song requested.
    """
    # Get your access token from https://genius.com/api-clients
    access_token = get_genius_access_token()
    if not access_token:
        return None
    
    # Set API endpoint and headers
    endpoint = 'https://api.genius.com/search'
    headers = {'Authorization': f'Bearer {access_token}'}
    
    # Set query parameters
    params = {
        'q': f'{song_title} {artist_name}',
    }
    
    # Send API request
    response = requests.get(endpoint, headers=headers, params=params)
    
    # Check if request was successful
    if response.status_code == 200:
        # Extract lyrics from response
        data = response.json()
        matches = data['response']['hits']
        
        # Check a match was found
        if matches:
            # Get the first match
            match = matches[0]
            song_lyrics_url = match['result']['url']
            
            # Get the lyrics from the song page
            lyrics_response = requests.get(song_lyrics_url)
            soup = BeautifulSoup(lyrics_response.content,"html.parser")
            lyrics = str()
            for tag in soup.select('div[class^="Lyrics__Container"], .song_body-lyrics p'):
                t = tag.get_text(strip=True, separator='\n')
                if t:
                    lyrics += t 
            
            
            return lyrics
        else:
            print(f"No lyrics found for {song_title} by {artist_name}.")
            return None
    else:
        print("Failed to get lyrics. Please check your client ID and client secret.")
        return "API Call Failed"
    
    
def lang_detect(lyrics):
    """Detects the language of the lyrics using langdetect. We use a try/except
    in order to handle exceptions where we were not able to retrieve lyrics from
    the genius API
    
    Args:
        lyrics (str): The lyrics to detect the language of
    Returns: 
        str: The language of the lyrics"""
    try:
        return detect(lyrics)
    except:
        return None

## 2.0 Get our *liked songs* library from *Spotify*

In [4]:
offset = 0 # at what track to start retreiving 
limit = 50 # max limit of allowed songs to fetch at once before receiving error 

LIB = pd.DataFrame() # initiate the dataframe containing our library 

while True:
    # Get trackes from the user `liked songs` 
    results = sp.current_user_saved_tracks(limit=limit, offset=offset)
    
    # Append fetched tracks to the list
    LIB = pd.concat([LIB,pd.json_normalize(results['items'])])
    
    # Check if you are at the end of the playlist
    if len(results['items']) < limit:
        # Break if you have reached the end of the playlist
        break
    else:
        # Increment the offset for the next request
        offset += limit

In [5]:
# reset the index of our library `LIB`
LIB = LIB.reset_index() 

## 2.1 Adding Genre to our library

In [6]:
genres = [] # initiate the list of of genres of each song
for i in LIB.index:
    # call our helper function 
    genre = get_genre(LIB['track.id'][i])
    
    # add to the `genres` list
    genres.append(genre)
    
    # print status 
    clear_output(wait=True)
    print(i,flush=True)
    print(genre,flush=True)

2055
['canadian hip hop', 'canadian pop', 'hip hop', 'rap', 'toronto rap']


## 2.2 Save and inspect our `LIB` DataFrame 

In [7]:
LIB.head()

Unnamed: 0,index,added_at,track.album.album_group,track.album.album_type,track.album.artists,track.album.available_markets,track.album.external_urls.spotify,track.album.href,track.album.id,track.album.images,...,track.external_urls.spotify,track.href,track.id,track.is_local,track.name,track.popularity,track.preview_url,track.track_number,track.type,track.uri
0,0,2023-04-09T02:07:04Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/4aJQ9c9XNSJ9eiC...,https://api.spotify.com/v1/albums/4aJQ9c9XNSJ9...,4aJQ9c9XNSJ9eiCrmqH3S3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/171SFtWNviZ7Tp1...,https://api.spotify.com/v1/tracks/171SFtWNviZ7...,171SFtWNviZ7Tp10zxNdpN,False,Union Maid,6,https://p.scdn.co/mp3-preview/81fa22cf7f759d9c...,24,track,spotify:track:171SFtWNviZ7Tp10zxNdpN
1,1,2023-04-04T23:47:45Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/1kTlYbs28MXw7hw...,https://api.spotify.com/v1/albums/1kTlYbs28MXw...,1kTlYbs28MXw7hwO0NLYif,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/0bCCMwWTaYOcQ4v...,https://api.spotify.com/v1/tracks/0bCCMwWTaYOc...,0bCCMwWTaYOcQ4v8EeEYmd,False,Crazy In Love,57,https://p.scdn.co/mp3-preview/260584b4f44684fa...,17,track,spotify:track:0bCCMwWTaYOcQ4v8EeEYmd
2,2,2023-03-30T11:28:56Z,single,single,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6XN8b2NIomlwPOo...,https://api.spotify.com/v1/albums/6XN8b2NIomlw...,6XN8b2NIomlwPOoBlILqXv,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/4IiuExPFijOGZnV...,https://api.spotify.com/v1/tracks/4IiuExPFijOG...,4IiuExPFijOGZnVxGsKWcc,False,Seven Nation Army - The Glitch Mob Remix,64,https://p.scdn.co/mp3-preview/9fbdaf2d131c8c0c...,1,track,spotify:track:4IiuExPFijOGZnVxGsKWcc
3,3,2023-03-30T01:49:21Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/1G8sFT0WajlwPsx...,https://api.spotify.com/v1/albums/1G8sFT0Wajlw...,1G8sFT0WajlwPsxUJBQW4h,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/60vNDz3RWsKEg5J...,https://api.spotify.com/v1/tracks/60vNDz3RWsKE...,60vNDz3RWsKEg5JwIAG31L,False,Alfredo Venegas,7,https://p.scdn.co/mp3-preview/85dc59e35370dd03...,1,track,spotify:track:60vNDz3RWsKEg5JwIAG31L
4,4,2023-03-30T00:07:18Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6kZ42qRrzov54Lc...,https://api.spotify.com/v1/albums/6kZ42qRrzov5...,6kZ42qRrzov54LcAk4onW9,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/4IQkfUsrwXol38V...,https://api.spotify.com/v1/tracks/4IQkfUsrwXol...,4IQkfUsrwXol38VV3U7t7T,False,Run (feat. Ed Sheeran) (Taylor’s Version) (Fro...,70,https://p.scdn.co/mp3-preview/6ea4fafd3a5a7f21...,28,track,spotify:track:4IQkfUsrwXol38VV3U7t7T


In [8]:
LIB.to_csv(data_dir+"LIB.csv")

In [9]:
LIB.head()

Unnamed: 0,index,added_at,track.album.album_group,track.album.album_type,track.album.artists,track.album.available_markets,track.album.external_urls.spotify,track.album.href,track.album.id,track.album.images,...,track.external_urls.spotify,track.href,track.id,track.is_local,track.name,track.popularity,track.preview_url,track.track_number,track.type,track.uri
0,0,2023-04-09T02:07:04Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/4aJQ9c9XNSJ9eiC...,https://api.spotify.com/v1/albums/4aJQ9c9XNSJ9...,4aJQ9c9XNSJ9eiCrmqH3S3,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/171SFtWNviZ7Tp1...,https://api.spotify.com/v1/tracks/171SFtWNviZ7...,171SFtWNviZ7Tp10zxNdpN,False,Union Maid,6,https://p.scdn.co/mp3-preview/81fa22cf7f759d9c...,24,track,spotify:track:171SFtWNviZ7Tp10zxNdpN
1,1,2023-04-04T23:47:45Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/1kTlYbs28MXw7hw...,https://api.spotify.com/v1/albums/1kTlYbs28MXw...,1kTlYbs28MXw7hwO0NLYif,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/0bCCMwWTaYOcQ4v...,https://api.spotify.com/v1/tracks/0bCCMwWTaYOc...,0bCCMwWTaYOcQ4v8EeEYmd,False,Crazy In Love,57,https://p.scdn.co/mp3-preview/260584b4f44684fa...,17,track,spotify:track:0bCCMwWTaYOcQ4v8EeEYmd
2,2,2023-03-30T11:28:56Z,single,single,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6XN8b2NIomlwPOo...,https://api.spotify.com/v1/albums/6XN8b2NIomlw...,6XN8b2NIomlwPOoBlILqXv,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/4IiuExPFijOGZnV...,https://api.spotify.com/v1/tracks/4IiuExPFijOG...,4IiuExPFijOGZnVxGsKWcc,False,Seven Nation Army - The Glitch Mob Remix,64,https://p.scdn.co/mp3-preview/9fbdaf2d131c8c0c...,1,track,spotify:track:4IiuExPFijOGZnVxGsKWcc
3,3,2023-03-30T01:49:21Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/1G8sFT0WajlwPsx...,https://api.spotify.com/v1/albums/1G8sFT0Wajlw...,1G8sFT0WajlwPsxUJBQW4h,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/60vNDz3RWsKEg5J...,https://api.spotify.com/v1/tracks/60vNDz3RWsKE...,60vNDz3RWsKEg5JwIAG31L,False,Alfredo Venegas,7,https://p.scdn.co/mp3-preview/85dc59e35370dd03...,1,track,spotify:track:60vNDz3RWsKEg5JwIAG31L
4,4,2023-03-30T00:07:18Z,album,album,[{'external_urls': {'spotify': 'https://open.s...,"[AD, AE, AG, AL, AM, AO, AR, AT, AU, AZ, BA, B...",https://open.spotify.com/album/6kZ42qRrzov54Lc...,https://api.spotify.com/v1/albums/6kZ42qRrzov5...,6kZ42qRrzov54LcAk4onW9,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",...,https://open.spotify.com/track/4IQkfUsrwXol38V...,https://api.spotify.com/v1/tracks/4IQkfUsrwXol...,4IQkfUsrwXol38VV3U7t7T,False,Run (feat. Ed Sheeran) (Taylor’s Version) (Fro...,70,https://p.scdn.co/mp3-preview/6ea4fafd3a5a7f21...,28,track,spotify:track:4IQkfUsrwXol38VV3U7t7T


## 3.0 Generate Corpus

## 3.1 Get the 

In [10]:
# Get client ID and client secret from environment variables
GENIUS_CLIENT_ID = os.getenv('GENIUS_CLIENT_ID')
GENIUS_CLIENT_SECRET = os.getenv('GENIUS_CLIENT_SECRET')

In [14]:
CORPUS = pd.DataFrame()
import time
import numpy as np
from IPython.display import clear_output
total_tracks = len(LIB)
for i in LIB.index:
    
    song_title = LIB['track.name'][i]
    artist_name = LIB['track.artists'][i][0]['name']
    lyrics = get_lyrics(song_title, artist_name)
    CORPUS = pd.concat([CORPUS,pd.DataFrame({'artist':[artist_name],
                                             'song':[song_title],
                                             'lyrics':[lyrics]})]
                      )
    
    # slow down our requests so that we can continue making them. 
    time.sleep(0.25)
    clear_output(wait=True)
    print(str(np.round(100*(i+1)/total_tracks,2))+"%"+f'       [{i+1}/{total_tracks}]   \n', flush=True)
    print(lyrics,flush=True)

100.0%       [2055/2056]   

[Intro: Ericka Lee]
Hello?
Yeah, I just walked in
Yeah, I'm good, you still working?
Tonight
, right now?
Did I go out? Yeah, I went out, I went
I went to a couple of clubs
I never went to bed, shit
Wine or water?
Did you say something about a cold drink? I don't know
I'm
delirious
[Verse 1: Drake]
Cups of the rosé
Bitches in my old phone
I should call one and go home
I've been in this club too long
The woman that I would try
Is happy with a good guy
But I've been drinkin' so much
That I'ma call her anyway and say
[Chorus: Drake]
Fuck that nigga that you love so bad
I know you still think about the times we had
I say fuck that nigga that you think you found
And since you pick up, I know he's not around, oh, oh
[Post-Chorus: Drake &
Ericka Lee
]
Are you drunk right now?
I'm just sayin' you could do better
Tell me, have you heard that lately?
I'm just sayin' you could do better
And I'll start hatin' only if you make me
[Verse 2: Drake]
Uh, cups of the XO
All 

In [15]:
CORPUS= CORPUS.reset_index()

In [37]:
CORPUS['language'] = CORPUS['lyrics'].apply(lang_detect)

In [38]:
CORPUS.loc[CORPUS['lyrics'].isna()]

Unnamed: 0,index,artist,song,lyrics,language
3,0,Beto Quintanilla,Alfredo Venegas,,
76,0,El Halcon De La Sierra,Pakas De A Kilo,,
107,0,WEEDMANE,SUICIDE YEAR - airshade Remix,,
111,0,David García Díaz,Surtr,,
134,0,Grupo Bagdad,Yo Soy De Matamoros,,
...,...,...,...,...,...
1834,0,Explosions In The Sky,"From West Texas - From ""Friday Night Lights"" S...",,
1848,0,Explosions In The Sky,"A Slow Dance - From ""Friday Night Lights"" Soun...",,
1883,0,Black Sabbath,A National Acrobat - 2013 Remaster,,
1935,0,Chon Arauza Y Su Furia Colombiana,Perdóname,,


In [25]:
CORPUS.iloc[3]['lyrics'] == None

True

TypeError: expected string or bytes-like object

In [40]:
CORPUS

Unnamed: 0,index,artist,song,lyrics,language
0,0,Woody Guthrie,Union Maid,"There once was a union maid, she never was afr...",en
1,0,Eminem,Crazy In Love,[Intro: Heart (Sampled)]\nTell myself that I w...,en
2,0,The White Stripes,Seven Nation Army - The Glitch Mob Remix,[Instrumental Intro]\n[Verse 1]\nI'm gonna fig...,en
3,0,Beto Quintanilla,Alfredo Venegas,,
4,0,Taylor Swift,Run (feat. Ed Sheeran) (Taylor’s Version) (Fro...,"[Intro: Ed Sheeran]\nOne, two, three, four\n[V...",tr
...,...,...,...,...,...
2051,0,Eminem,"Lose Yourself - From ""8 Mile"" Soundtrack",1. Eminem- Higher\n2. Eminem- Gnat\n3. Eminem-...,en
2052,0,Eminem,Till I Collapse,[Intro: Eminem]\n'Cause sometimes you just fee...,en
2053,0,Drake,Make Me Proud,[Verse 1: Drake]\nI like a woman with a future...,en
2054,0,Drake,Headlines,[Verse 1]\nI might be too strung out on compli...,en
