In [5]:
import os
import re
from urllib.parse import urljoin

import lyricsgenius
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# scraping code

# constructs dataframe for a user playlist
def playlist_df(url, genius_key):
    """Build a title/artist/lyrics table for one playlist page.

    Parameters
    ----------
    url : str
        Playlist page URL, handed to ``scrape_titles`` and ``scrape_artists``.
    genius_key : str
        Genius API token, forwarded to ``grab_song`` for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``artist`` and ``lyrics``, one row per scraped track.
    """
    frame = pd.DataFrame({
        "title": scrape_titles(url),
        "artist": scrape_artists(url),
    })

    def _lyrics_for(row):
        # One Genius lookup per track.
        return grab_song(row.title, row.artist, genius_key)

    frame["lyrics"] = frame.apply(_lyrics_for, axis=1)
    return frame

# lyrics for a song and artist pair
def grab_song(title, artist, key):
    """Look up the lyrics of a song on Genius.

    The search is first made with ``artist``; if nothing is found and
    ``artist`` was not already the empty string, it is retried once with an
    empty artist.

    Parameters
    ----------
    title : str
        Song title to search for.
    artist : str
        Artist name; ``""`` searches by title only.
    key : str
        Genius API token used to build the client.

    Returns
    -------
    str
        The lyrics of the matched song, or ``"no lyrics found"``.
    """
    for candidate_artist in (artist, ""):
        # A fresh client is created for every attempt.
        client = lyricsgenius.Genius(key)
        match = client.search_song(title, candidate_artist)

        if match:
            return str(match.lyrics)
        if candidate_artist == "":
            # Title-only search failed too: give up.
            return "no lyrics found"

# Base scraping function
def scrape_something(url, classname, single_val=False, timeout=10):
    """Fetch a page and return the text of every element with a given CSS class.

    Parameters
    ----------
    url : str
        Page to download.
    classname : str
        Value passed to BeautifulSoup's ``class_`` filter.
    single_val : bool, default False
        When true, return only the first match instead of the whole Series.
    timeout : float, default 10
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.Series or str
        The matched elements with their markup removed, or the first of them
        when ``single_val`` is true.

    Raises
    ------
    KeyError
        If ``single_val`` is true and no element matched.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Serialise each match to its HTML string, then drop everything between
    # '<' and '>' (the pattern does not span line breaks).
    html_cleaner = lambda text: re.sub("<.*?>", "", text)
    cleaned = pd.Series(
        [html_cleaner(str(tag)) for tag in soup.find_all(class_=classname)],
        dtype=object,  # keeps an empty result from being inferred as float
    )

    if single_val:
        if cleaned.empty:
            # Same exception type as the bare label lookup raised before,
            # but with a message that says what was missing.
            raise KeyError(
                "no element with class {!r} found at {}".format(classname, url)
            )
        return cleaned.iloc[0]
    return cleaned

# Titles for a single playlist
def scrape_titles(query):
    """Return the text of every ``track-name`` element on the page at ``query``.

    ``query`` is the page URL; the result is whatever ``scrape_something``
    returns for that class (a pandas Series of strings).
    """
    return scrape_something(query, "track-name")

# Artists for a single playlist
def scrape_artists(query):
    """Return the cleaned artist strings for the playlist page at ``query``.

    The text of every ``artists-albums`` element is scraped and each entry is
    passed through ``clean_artists``.
    """
    raw_entries = scrape_something(query, "artists-albums")
    return raw_entries.apply(clean_artists)

# Cleaning the dot and extra space around artist name
def clean_artists(artist_name):
    """Extract a tidy artist string from a scraped ``artists-albums`` entry.

    Parameters
    ----------
    artist_name : str
        Raw scraped text.

    Returns
    -------
    str
        The part before the first bullet (``•``), with every run of
        whitespace collapsed to one space and the ends stripped.
    """
    before_bullet = artist_name.split("•")[0]
    # Raw string: '\s' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    return re.sub(r'\s+', ' ', before_bullet).strip()

# Name for a single user
def scrape_name(query):
    """Return the text of the first ``view-header`` element on the page at ``query``."""
    return scrape_something(query, "view-header", single_val=True)

# List of playlist titles and URLs (for further analysis) for a single user
def scrape_user_playlists(query):
    """Return the text of every ``cover playlist`` element on the page at ``query``.

    Only the element text is returned here; the playlist URLs are extracted
    separately.
    """
    return scrape_something(query, "cover playlist")

In [3]:
# Profile URL of the Spotify user used as the running example in this notebook.
user_url = "https://open.spotify.com/user/priankasubs?si=dIr00M7sQUuZ4fUdbkuBnw"
# Show the text scraped from the profile's "cover playlist" elements.
scrape_user_playlists(user_url)

0    bollywood
1          pop
2         kpop
dtype: object

In [4]:
# Show the text of the profile's first "view-header" element.
scrape_name(user_url)

'priankasubs'

In [5]:
from lxml import html
import requests

# Download the profile page again and parse it with lxml so XPath can be used.
page = requests.get(user_url)
webpage = html.fromstring(page.content)

# Every href on the page; playlist links are mixed in with unrelated ones
# (see the output), so they still need filtering.
urls = webpage.xpath('//a/@href')
urls

['https://www.spotify.com/us/legal/cookies-vendor-list/',
 'https://www.spotify.com/legal/cookies-policy/',
 'https://support.spotify.com/using_spotify/the_basics/webplayer',
 'https://www.spotify.com/download',
 '/browse',
 '/playlist/03WmYJPovtBJNRoYENKuOw?si=dIr00M7sQUuZ4fUdbkuBnw',
 '/playlist/4vykLHrwiPXfS6HnfnM2CU?si=dIr00M7sQUuZ4fUdbkuBnw',
 '/playlist/4Rbe8cn4gxJQnotZeIPSP8?si=dIr00M7sQUuZ4fUdbkuBnw',
 '/browse',
 'https://www.spotify.com/legal/',
 'https://www.spotify.com/legal/privacy-policy/',
 'https://www.spotify.com/legal/cookies-policy/',
 'https://www.spotify.com/legal/privacy-policy/#s3']

In [6]:
# need to construct the urls with a regex

# use a regex to clean out extraneous urls + construct full url list
def process_playlist_urls(urls):
    """Keep only playlist links and turn them into absolute Spotify URLs.

    Parameters
    ----------
    urls : iterable of str
        hrefs scraped from a page, relative or absolute.

    Returns
    -------
    list of str
        One absolute URL per input href that contains ``/playlist/``, in the
        original order.
    """
    spotify_base = "https://open.spotify.com"
    # urljoin resolves relative hrefs against the base and leaves hrefs that
    # are already absolute untouched; plain concatenation would glue the base
    # in front of an absolute URL.
    return [
        urljoin(spotify_base, url)
        for url in urls
        if re.search(r"/playlist/", url)
    ]

# Sanity check against the hrefs scraped above.
process_playlist_urls(urls)

['https://open.spotify.com/playlist/03WmYJPovtBJNRoYENKuOw?si=dIr00M7sQUuZ4fUdbkuBnw',
 'https://open.spotify.com/playlist/4vykLHrwiPXfS6HnfnM2CU?si=dIr00M7sQUuZ4fUdbkuBnw',
 'https://open.spotify.com/playlist/4Rbe8cn4gxJQnotZeIPSP8?si=dIr00M7sQUuZ4fUdbkuBnw']

In [7]:
def pull_user_playlists(url, timeout=10):
    """List a user's playlists as a table of names and URLs.

    Parameters
    ----------
    url : str
        Spotify user profile URL.
    timeout : float, default 10
        Seconds to wait for the profile page before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        Columns ``playlist_name`` and ``playlist_url``. Names and URLs are
        paired by position.
    """
    playlist_names = scrape_user_playlists(url)

    # Collect every href on the profile page.
    page = requests.get(url, timeout=timeout)
    webpage = html.fromstring(page.content)
    hrefs = webpage.xpath('//a/@href')

    # Reuse the shared filter/absolutise helper rather than a second copy of
    # its loop (which also reused the name of the ``url`` parameter).
    playlist_urls = process_playlist_urls(hrefs)

    # NOTE(review): zip stops at the shorter input, so a page where the
    # number of names and playlist links differ is truncated silently.
    return pd.DataFrame(
        list(zip(playlist_names, playlist_urls)),
        columns=['playlist_name', 'playlist_url'],
    )

In [8]:
# Preview the name/URL table for the example user.
pull_user_playlists(user_url)

Unnamed: 0,playlist_name,playlist_url
0,bollywood,https://open.spotify.com/playlist/03WmYJPovtBJ...
1,pop,https://open.spotify.com/playlist/4vykLHrwiPXf...
2,kpop,https://open.spotify.com/playlist/4Rbe8cn4gxJQ...


In [9]:
# Genius API token, read from the environment so the secret is never stored in
# the notebook. Export GENIUS_ACCESS_TOKEN before starting the kernel; the
# value is "" when the variable is unset.
# The token that used to be committed here should be treated as leaked and
# regenerated.
genius_key = os.environ.get("GENIUS_ACCESS_TOKEN", "")

In [10]:
# get lyrics and data for playlists given df of names and urls
def playlist_datagrab(df, token):
    """Scrape every playlist listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``playlist_url`` column.
    token : str
        Genius API token, forwarded to ``playlist_df``.

    Returns
    -------
    dict
        Maps each playlist URL to the DataFrame ``playlist_df`` built for it.
    """
    return {
        link: playlist_df(link, token)
        for link in df["playlist_url"].tolist()
    }

In [11]:
class SpotifyUser:
    """Scraped snapshot of one Spotify user's profile and playlists.

    Attributes
    ----------
    name : str
        Text returned by ``scrape_name`` for the profile.
    playlist_metadata : pandas.DataFrame
        Playlist names and URLs from ``pull_user_playlists``.
    num_playlists : int
        Number of rows in ``playlist_metadata``.
    playlist_data : dict
        Playlist URL -> DataFrame, from ``playlist_datagrab``.
    """

    def __init__(self, url, token=None):
        """Scrape the profile at ``url`` and fetch lyrics for every playlist.

        Parameters
        ----------
        url : str
            Spotify user profile URL.
        token : str, optional
            Genius API token. When omitted, the notebook-level ``genius_key``
            variable is used, so existing ``SpotifyUser(url)`` calls still work.
        """
        # Resolve the global lazily: it is only required when no token is given.
        if token is None:
            token = genius_key

        self.name = scrape_name(url)
        self.playlist_metadata = pull_user_playlists(url)
        self.num_playlists = len(self.playlist_metadata.index)
        self.playlist_data = playlist_datagrab(self.playlist_metadata, token)

In [13]:
# Build the example user. Construction scrapes the profile and looks up lyrics
# for every track, which is what produces the search log in the output.
x = SpotifyUser(user_url)

Searching for "Kal Ho Naa Ho" by Shankar-Ehsaan-Loy, Sonu Nigam...
Done.
Searching for "Nagada Sang Dhol" by Shreya Ghoshal, Osman Mir...
Done.
Searching for "Channa Mereya" by Pritam, Arijit Singh...
Done.
Searching for "Sugar" by Maroon 5...
Done.
Searching for "Havana (feat. Young Thug)" by Camila Cabello, Young Thug...
Done.
Searching for "Cake By The Ocean" by DNCE...
Done.
Searching for "DNA" by BTS...
Done.
Searching for "LOVE SCENARIO" by iKON...
Done.
Searching for "BBoom BBoom" by MOMOLAND...
Done.


In [14]:
# Dict keyed by playlist URL; each value is that playlist's title/artist/lyrics frame.
x.playlist_data

{'https://open.spotify.com/playlist/03WmYJPovtBJNRoYENKuOw?si=dIr00M7sQUuZ4fUdbkuBnw':               title                          artist  \
 0     Kal Ho Naa Ho  Shankar-Ehsaan-Loy, Sonu Nigam   
 1  Nagada Sang Dhol       Shreya Ghoshal, Osman Mir   
 2     Channa Mereya            Pritam, Arijit Singh   
 
                                               lyrics  
 0  [Chorus]\nहर घड़ी बदल रही है रूप ज़िंदगी\nछाँव...  
 1  [Chorus 1]\nहे धिन तड़ाक धिन तड़ाक\nआजा उड़ के...  
 2  [Verse 1]\nअच्छा चलता हूँ\nदुआओं में याद रखना\...  ,
 'https://open.spotify.com/playlist/4vykLHrwiPXfS6HnfnM2CU?si=dIr00M7sQUuZ4fUdbkuBnw':                        title                      artist  \
 0                      Sugar                    Maroon 5   
 1  Havana (feat. Young Thug)  Camila Cabello, Young Thug   
 2          Cake By The Ocean                        DNCE   
 
                                               lyrics  
 0  [Directed by David Dobkin]\n\n[Verse 1]\nI'm h...  
 1  Oh!, so you wan

In [1]:
# after class is constructed should be able to just run this cell + disregard above
from spotifyuser import SpotifyUser

# construct test user
import os  # imported here because this cell is meant to run without the cells above

# Genius API token, read from the environment so the secret is never stored in
# the notebook. Export GENIUS_ACCESS_TOKEN before starting the kernel; the
# value is "" when the variable is unset.
# The token that used to be committed here should be treated as leaked and
# regenerated.
genius_key = os.environ.get("GENIUS_ACCESS_TOKEN", "")

# change this for other spotify users
user_url = "https://open.spotify.com/user/priankasubs?si=dIr00M7sQUuZ4fUdbkuBnw"

# represents @priankasubs test user
# (this cell's output shows the Genius search log followed by the summary)
x = SpotifyUser(user_url, genius_key)
# Per the output: prints the user's name, playlist count and playlist names.
x.user_summary()

Searching for "Kal Ho Naa Ho" by Shankar-Ehsaan-Loy, Sonu Nigam...
Done.
Searching for "Nagada Sang Dhol" by Shreya Ghoshal, Osman Mir...
Done.
Searching for "Channa Mereya" by Pritam, Arijit Singh...
Done.
Searching for "Sugar" by Maroon 5...
Done.
Searching for "Havana (feat. Young Thug)" by Camila Cabello, Young Thug...
Done.
Searching for "Cake By The Ocean" by DNCE...
Done.
Searching for "DNA" by BTS...
Done.
Searching for "LOVE SCENARIO" by iKON...
Done.
Searching for "BBoom BBoom" by MOMOLAND...
Done.
User: priankasubs
Number of playlists: 3
Playlists
**************
1. bollywood
2. pop
3. kpop


In [2]:
# Per the output below, datagrab() yields a dict keyed by playlist URL whose
# values are title/artist/lyrics DataFrames.
pdata = x.datagrab()
pdata

{'https://open.spotify.com/playlist/03WmYJPovtBJNRoYENKuOw?si=dIr00M7sQUuZ4fUdbkuBnw':               title                          artist  \
 0     Kal Ho Naa Ho  Shankar-Ehsaan-Loy, Sonu Nigam   
 1  Nagada Sang Dhol       Shreya Ghoshal, Osman Mir   
 2     Channa Mereya            Pritam, Arijit Singh   
 
                                               lyrics  
 0  [Chorus]\nहर घड़ी बदल रही है रूप ज़िंदगी\nछाँव...  
 1  [Chorus 1]\nहे धिन तड़ाक धिन तड़ाक\nआजा उड़ के...  
 2  [Verse 1]\nअच्छा चलता हूँ\nदुआओं में याद रखना\...  ,
 'https://open.spotify.com/playlist/4vykLHrwiPXfS6HnfnM2CU?si=dIr00M7sQUuZ4fUdbkuBnw':                        title                      artist  \
 0                      Sugar                    Maroon 5   
 1  Havana (feat. Young Thug)  Camila Cabello, Young Thug   
 2          Cake By The Ocean                        DNCE   
 
                                               lyrics  
 0  [Directed by David Dobkin]\n\n[Verse 1]\nI'm h...  
 1  Oh!, so you wan

In [3]:
def merge_dfs(df_dict, ignore_index=False):
    """Stack the DataFrames stored in a dict into a single DataFrame.

    Parameters
    ----------
    df_dict : dict
        Maps any key to a DataFrame; the keys themselves are not kept.
    ignore_index : bool, default False
        With the default, every frame keeps its own row labels, so the result
        usually has repeated labels (0, 1, 2, 0, 1, ...). Pass ``True`` to get
        a fresh 0..n-1 index instead.

    Returns
    -------
    pandas.DataFrame
        The frames concatenated row-wise in the dict's iteration order.
    """
    return pd.concat(list(df_dict.values()), ignore_index=ignore_index)

In [7]:
# One frame for all playlists; row labels restart at 0 for each playlist
# (visible in the output).
df = merge_dfs(pdata)
df.head()

Unnamed: 0,title,artist,lyrics
0,Kal Ho Naa Ho,"Shankar-Ehsaan-Loy, Sonu Nigam",[Chorus]\nहर घड़ी बदल रही है रूप ज़िंदगी\nछाँव...
1,Nagada Sang Dhol,"Shreya Ghoshal, Osman Mir",[Chorus 1]\nहे धिन तड़ाक धिन तड़ाक\nआजा उड़ के...
2,Channa Mereya,"Pritam, Arijit Singh",[Verse 1]\nअच्छा चलता हूँ\nदुआओं में याद रखना\...
0,Sugar,Maroon 5,[Directed by David Dobkin]\n\n[Verse 1]\nI'm h...
1,Havana (feat. Young Thug),"Camila Cabello, Young Thug","Oh!, so you want to hear the verse erh?\nI dey..."


In [44]:
# lang imports
from textblob import TextBlob
import re
import pycountry

# translate identified non-english text
def to_english(lyr, lang):
    """Return ``lyr`` in English.

    Parameters
    ----------
    lyr : str
        Lyrics text.
    lang : str
        Detected language code of ``lyr``; ``'en'`` means no translation.

    Returns
    -------
    str
        ``lyr`` unchanged when it is already English or when the translator
        reports that it could not translate it; otherwise the translation.
    """
    if lang == 'en':
        return lyr

    # Imported here so the English fast path has no extra dependency.
    from textblob.exceptions import NotTranslated

    # NOTE(review): TextBlob deprecated translate() in newer releases;
    # confirm the installed version still provides it.
    try:
        return str(TextBlob(lyr).translate(to='en'))
    except NotTranslated:
        # Raised when the service hands the text back unchanged (for example
        # lyrics that were mis-detected as non-English). Keep the original
        # text instead of aborting the whole DataFrame.apply.
        return lyr

# identify non-english text, return detected language
def detect_lang(lyr):
    """Return the language code TextBlob detects for the text ``lyr``."""
    return TextBlob(lyr).detect_language()

# fix languages on each row of df
def lang_transform(df):
    """Add language and English-lyrics columns to ``df``.

    ``df`` must have a ``lyrics`` column. It is modified in place: ``lang``
    receives the detected language of each row and ``english_lyrics`` the
    result of ``to_english`` for that row. The same frame is returned.
    """
    def _row_to_english(row):
        return to_english(row['lyrics'], row['lang'])

    df['lang'] = df["lyrics"].apply(detect_lang)
    df["english_lyrics"] = df.apply(_row_to_english, axis=1)
    return df

In [24]:
# Detect each song's language and translate non-English lyrics.
# lang_transform adds its columns to df in place and returns the same frame.
df = lang_transform(df)
df.head()

Unnamed: 0,title,artist,lyrics,lang,english_lyrics
0,Kal Ho Naa Ho,"Shankar-Ehsaan-Loy, Sonu Nigam",[Chorus]\nहर घड़ी बदल रही है रूप ज़िंदगी\nछाँव...,hi,[Chorus]\nLife is changing every moment\nSomet...
1,Nagada Sang Dhol,"Shreya Ghoshal, Osman Mir",[Chorus 1]\nहे धिन तड़ाक धिन तड़ाक\nआजा उड़ के...,hi,[Chorus 1]\nHey tad tadak dhin tadak\nAaja uda...
2,Channa Mereya,"Pritam, Arijit Singh",[Verse 1]\nअच्छा चलता हूँ\nदुआओं में याद रखना\...,hi,[Verse 1]\nGood walk\nRemember in prayers\nTo ...
0,Sugar,Maroon 5,[Directed by David Dobkin]\n\n[Verse 1]\nI'm h...,en,[Directed by David Dobkin]\n\n[Verse 1]\nI'm h...
1,Havana (feat. Young Thug),"Camila Cabello, Young Thug","Oh!, so you want to hear the verse erh?\nI dey...",en,"Oh!, so you want to hear the verse erh?\nI dey..."


In [27]:
# Eyeball the translated text in full.
# NOTE(review): this prints every song's lyrics; consider slicing before sharing.
df["english_lyrics"].tolist()

['[Chorus]\nLife is changing every moment\nSometimes there is sunshine\nLive here every moment\nWhich is not tomorrow\nLife is changing every moment\nSometimes there is sunshine\nLive here every moment\nWhich is not tomorrow\n\n[Verse 1]\nEven if you wholeheartedly\nBarely gets\nSomeone who is somewhere\nShe is the only one\nHold that hand\nMay it not be tomorrow\n\n[Chorus]\nLive here every moment\nWhich is not tomorrow\n\n[Verse 2]\nEyelashes\nSomeone who came\nTake care of a million crazy hearts\nHeart beats\nBut think this is the moment\nMay that tale not happen tomorrow\n\n[Chorus]\nLife is changing every moment\nSometimes there is sunshine\nLive here every moment\nWhich is not tomorrow\nLive here every moment\nWhich is not tomorrow\n\n[Outro]\nWhich is not tomorrow',
 "[Chorus 1]\nHey tad tadak dhin tadak\nAaja uda ke sarat\nLittle shell from feet\n\n[Refrain]\nDrum with drums, drum drums\nThank you thank you thank you\nDrum with drums, drum drums\nCary cary cary fary cary cary\n

In [48]:
# regex to remove anything in brackets
def clean_musical_indicators(lyrics):
    """Remove every ``[...]`` marker (e.g. ``[Chorus]``) from ``lyrics``.

    A marker must open and close on the same line; the shortest possible
    bracketed span is removed each time.
    """
    bracketed = re.compile(r"\[.*?\]")
    return bracketed.sub('', lyrics)

# fix all the newlining
def fix_newlines(lyrics):
    """Collapse every run of whitespace in ``lyrics`` (newlines included) to one space.

    Leading and trailing whitespace is collapsed too, not stripped.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(' ', lyrics)

# change language codes to full language name
def full_lang(iso_code):
    """Translate a two-letter language code into the language's full name.

    Parameters
    ----------
    iso_code : str
        Code to look up in pycountry's ``alpha_2`` field.

    Returns
    -------
    str
        The language name, or ``iso_code`` itself when pycountry does not
        know the code, so one unexpected code does not abort a whole column.
    """
    try:
        lang = pycountry.languages.get(alpha_2=iso_code)
    except KeyError:
        # NOTE(review): some older pycountry releases appear to raise on a
        # miss instead of returning None; confirm against the pinned version.
        lang = None

    if lang is None:
        return str(iso_code)
    return str(lang.name)

# apply all cleaning functions
def clean_lyrics(df):
    """Produce the final cleaned table from a translated lyrics frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``english_lyrics`` and ``lang`` columns. It is left
        untouched, so the function can be re-run on the same frame.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``english_lyrics`` in which ``lyrics`` holds the
        English text with bracketed markers removed, whitespace collapsed and
        lower-cased, and ``lang`` holds the lower-cased full language name.
    """
    tidy_lyrics = (
        df["english_lyrics"]
        .apply(clean_musical_indicators)
        .apply(fix_newlines)
        .apply(str.lower)
    )

    # drop() returns a new frame, so the assignments below never write into
    # the caller's DataFrame.
    cleaned = df.drop(["english_lyrics"], axis=1)
    cleaned["lyrics"] = tidy_lyrics
    cleaned["lang"] = cleaned["lang"].apply(full_lang).apply(str.lower)
    return cleaned

In [49]:
# Display the cleaned table.
# NOTE(review): the returned frame is not assigned to a name, so it is only shown here.
clean_lyrics(df)

Unnamed: 0,title,artist,lyrics,lang
0,Kal Ho Naa Ho,"Shankar-Ehsaan-Loy, Sonu Nigam",life is changing every moment sometimes there...,hindi
1,Nagada Sang Dhol,"Shreya Ghoshal, Osman Mir",hey tad tadak dhin tadak aaja uda ke sarat li...,hindi
2,Channa Mereya,"Pritam, Arijit Singh",good walk remember in prayers to taste my zic...,hindi
0,Sugar,Maroon 5,"i'm hurting baby, i'm broken down i need your...",english
1,Havana (feat. Young Thug),"Camila Cabello, Young Thug","oh!, so you want to hear the verse erh? i dey ...",english
2,Cake By The Ocean,DNCE,"nah, nah, nah cake by the ocean oh, no see yo...",english
0,DNA,BTS,i got to know you at first sight as if i had ...,korean
1,LOVE SCENARIO,iKON,we met in love it became a memory i couldn't ...,korean
2,BBoom BBoom,MOMOLAND,"just feel it (yeah) it's exciting, you and my...",korean


In [None]:
# TODO: find a Python library that removes profanity from the lyrics