## Script for creating song lyrics database

### Spotify and Genius Authentication

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

import spotipy
from spotipy.oauth2 import SpotifyOAuth, SpotifyClientCredentials

# Get Spotify app credentials
client_id = os.getenv("SPOTIFY_CLIENT_ID")
client_secret = os.getenv("SPOTIFY_CLIENT_SECRET")
redirect_uri = os.getenv("SPOTIFY_REDIRECT_URI")
scope = "user-library-read playlist-read-private user-top-read"

# Authenticate 
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)

oauth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope
)

sp = spotipy.Spotify(oauth_manager=oauth_manager)
# sp.current_user()

### Fetch Spotify songs

5YkfW67PCvWAOO63TjYwLl

In [5]:
user_list = sp.playlist_items('1GfuMQxvw8BcgUx0ozH3fl', offset=100)

In [6]:
user_songs = []
for item in user_list['items']:
    track = item['track']
    user_songs.append({
        'name': track['name'],
        'artist': track['artists'][0]['name'],
        'album': track['album']['name'],
        'release_date': track['album']['release_date'],
        'popularity': track['popularity']
    })

In [7]:
import pandas as pd
user_songs = pd.DataFrame(user_songs)
user_songs

Unnamed: 0,name,artist,album,release_date,popularity
0,Callaita,Bad Bunny,Un Verano Sin Ti,2022-05-06,71
1,Oops!...I Did It Again,Britney Spears,Oops!... I Did It Again,2000-05-16,76
2,Ain't Shit,Doja Cat,Planet Her,2021-06-25,71
3,NO,Meghan Trainor,Thank You (Deluxe Version),2017-05-12,69
4,Hit 'Em Up Style (Oops!),Blu Cantrell,So Blu,2001-01-18,66
5,Work from Home (feat. Ty Dolla $ign),Fifth Harmony,7/27 (Deluxe),2016-05-27,73
6,Swish Swish,Katy Perry,Witness (Deluxe),2017-06-09,59
7,Do It,Chloe x Halle,Do It,2020-05-14,42
8,Angel Of My Dreams,JADE,Angel Of My Dreams,2024-07-19,63
9,Beggin' On Your Knees (feat. Victoria Justice),Victorious Cast,Beggin' On Your Knees (feat. Victoria Justice),2011-04-01,33


### Match songs with lyrics

In [8]:
import lyricsgenius
gen_client_access_token = os.getenv("GENIUS_CLIENT_TOKEN")
genius = lyricsgenius.Genius(gen_client_access_token, timeout=10)

list_lyrics = []

for i, song in user_songs.iterrows():
    title = song['name']
    artist = song['artist']
    retries = 0
    while retries < 3:
        try:
            lyrics = genius.search_song(title, artist)
        except:
            retries += 1
            continue
        if lyrics:
            list_lyrics.append({
                'title': title,
                'lyrics': lyrics.lyrics
            })
        break

Searching for "Callaita" by Bad Bunny...
Done.
Searching for "Oops!...I Did It Again" by Britney Spears...
Done.
Searching for "Ain't Shit" by Doja Cat...
Done.
Searching for "NO" by Meghan Trainor...
Done.
Searching for "Hit 'Em Up Style (Oops!)" by Blu Cantrell...
Done.
Searching for "Work from Home (feat. Ty Dolla $ign)" by Fifth Harmony...
Done.
Searching for "Swish Swish" by Katy Perry...
Done.
Searching for "Do It" by Chloe x Halle...
Done.
Searching for "Angel Of My Dreams" by JADE...
Done.
Searching for "Beggin' On Your Knees (feat. Victoria Justice)" by Victorious Cast...
Done.
Searching for "the boy is mine" by Ariana Grande...
Done.
Searching for "Paint The Town Red" by Doja Cat...
Done.
Searching for "Wild Side (feat. Cardi B)" by Normani...
Done.
Searching for "Low" by SZA...
Done.
Searching for "Good In Goodbye" by Madison Beer...
Done.
Searching for "HOT TO GO!" by Chappell Roan...
Done.
Searching for "Wild Thoughts (feat. Rihanna & Bryson Tiller)" by DJ Khaled...
Done.


Clean and tokenize lyrics

In [9]:
import random
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# nltk.download('stopwords')
# nltk.download('wordnet')   
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')   

def clean_lyrics(df, column):
    """
    Cleans the words without importance and fix the format of the  dataframe's column lyrics 
    Args:
        df (DataFrame): df containing song information
        column (str): column to clean
    Returns:
        df (DataFrame): DataFrame containing the cleaned lyrics
    """
    df[column] = df[column].str.lower()
    # remove section marker
    df[column] = df[column].str.replace(r"(verse\s?\d*|chorus|bridge|outro|intro)", "", regex=True)
    df[column] = df[column].str.replace(r"(instrumental|guitar|solo)", "", regex=True) 
    df[column] = df[column].str.replace(r"\[.*?\]", "", regex=True)
    # remove new line
    df[column] = df[column].str.replace(r"\n", ". ", regex=True)
    # remove special characters
    df[column] = df[column].str.replace(r"[^\w\d'\s.]+", "", regex=True)
    df[column] = df[column].str.strip()
    df[column] = df[column].str[2:] # impromptu removing the 1st period when replacing "\n"

    return df

In [10]:
# Convert list_lyrics to DataFrame
list_lyrics_df = pd.DataFrame(list_lyrics)

In [11]:
# Clean and tokenize lyrics
list_lyrics_df = clean_lyrics(list_lyrics_df, 'lyrics')
list_lyrics_df

Unnamed: 0,title,lyrics
0,Callaita,. . se acostó temprano mañana hay que estudiar...
1,Oops!...I Did It Again,mmm yeah. yeah yeah yeah yeah yeah yeah. yeah ...
2,Ain't Shit,man. this happened one two three times too muc...
3,NO,i think it's so cute and i think it's so sweet...
4,Hit 'Em Up Style (Oops!),get your hands on his cash and. . . while he w...
5,Work from Home (feat. Ty Dolla $ign),i ain't worried about nothin' i ain't wearin' ...
6,Swish Swish,they know what is what. but they don't know wh...
7,Do It,oh. oh oh. oh. . . yeah i beat my face. movin'...
8,Angel Of My Dreams,i wonder if one day that you'll say that you c...
9,Beggin' On Your Knees (feat. Victoria Justice),you had it all. the day you told me told me yo...


In [12]:
df = pd.read_csv('songs_lyrics.csv', index_col=0)
lyrics = pd.concat([df, list_lyrics_df], ignore_index=True).drop_duplicates(subset=['title'], keep='first')

In [15]:
lyrics.to_csv('songs_lyrics.csv')

### Add songs and lyrics to db

In [17]:
import sqlite3

with open('spotify_db/playlist_schema.sql', 'r') as file:
    create_table_query = file.read()
    
connection = sqlite3.connect('spotify_db/spotify.db')
connection.execute(create_table_query)

list_lyrics_df.to_sql('songs_lyrics', connection, if_exists='append', index=False)

connection.commit()
connection.close()