In [1]:
import asyncio
import csv
import os
import re
import time
from random import randint

import numpy as np
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

In [None]:
# One listing URL per year, covering 2022 and 2023.
SONG_URLS = ['https://maistocadas.mus.br/' + str(year) for year in (2022, 2023)]

# Directory prefix (trailing slash included) for the per-year CSV output files.
SONGS_DIR = 'songs/'

# Landing page of the lyrics site that is searched for every song.
LYRICS_URL = 'https://www.letras.mus.br/'

# Lower-case accented characters and the plain ASCII letter each one is replaced with.
ACCENTS_MAPPING = {
    'é': 'e',
    'á': 'a',
    'í': 'i',
    'ó': 'o',
    'ú': 'u',
    'ã': 'a',
    'ç': 'c',
    'à': 'a',
    'ê': 'e',
    'ô': 'o',
    'õ': 'o',
    'â': 'a',
}

def get_random_geolocation():
    """Return a dict with a random integer 'latitude' and 'longitude'.

    The latitude is drawn from [-90, 90] and the longitude from [-180, 180],
    both bounds inclusive. The latitude is drawn first.
    """
    latitude = randint(-90, 90)
    longitude = randint(-180, 180)
    return dict(latitude=latitude, longitude=longitude)

def get_random_viewport():
    """Return a dict with a random integer 'width' and 'height' in pixels.

    The width is drawn from [800, 1920] and the height from [600, 1080],
    both bounds inclusive. The width is drawn first.
    """
    width = randint(800, 1920)
    height = randint(600, 1080)
    return dict(width=width, height=height)

def remove_accents(string):
    """Return `string` with every key of ACCENTS_MAPPING replaced by its value.

    Only the characters listed in ACCENTS_MAPPING are handled; anything else
    (including upper-case accented letters) is left untouched.
    """
    result = string
    for accented_char in ACCENTS_MAPPING:
        result = result.replace(accented_char, ACCENTS_MAPPING[accented_char])
    return result

def remove_non_alphanumeric_characters(string):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Whitespace (spaces, tabs, newlines) is kept as is; punctuation, underscores
    and non-ASCII letters are deleted.
    """
    disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', string)

In [27]:
songs_without_lyrics = []
non_portuguese_songs = []

async with async_playwright() as playwright:
    browser = None
    if np.random.random() > 0.5:
        browser = await playwright.firefox.launch(headless = False,)
    else:
        browser = await playwright.chromium.launch(headless = False,)
    for url in SONG_URLS:
        csv_lines = []
        songs_page = await browser.new_page(viewport=get_random_viewport(), geolocation=get_random_geolocation())
        await songs_page.goto(url)
        track_list = await songs_page.locator('#tracklist').locator('li').all()
        lyrics_browser = None
        if np.random.random() > 0.5:
            lyrics_browser = await playwright.chromium.launch(headless = False,)
        else:
            lyrics_browser = await playwright.firefox.launch(headless = False,)
        lyrics_page = await lyrics_browser.new_page(viewport=get_random_viewport(), geolocation=get_random_geolocation())
        await lyrics_page.goto(LYRICS_URL)
        lyrics_page.set_default_timeout(3000)
        await lyrics_page.get_by_role('button', name='Consent', exact=True).click()
        for track_card in track_list:
            song_title = str(await track_card.locator('span.musicas').text_content()).strip()
            song_title_transformed = remove_non_alphanumeric_characters(remove_accents(song_title.replace('feat', 'part').replace('&', 'e').lower()))
            artist = str(await track_card.locator('span.artista').text_content()).strip()
            artist_transformed = remove_non_alphanumeric_characters(remove_accents(artist.replace('feat', 'part').replace('&', 'e').lower()))
            search_bar = lyrics_page.locator('#main_suggest')
            await search_bar.wait_for(state='attached')
            await search_bar.fill(song_title + ' ' + artist)
            search_button = lyrics_page.locator("button.header-search-submit")
            await search_button.wait_for(state='attached')
            await search_button.click()
            captcha_text = None
            try:
                captcha_text = (await lyrics_page.locator('.gs-captcha-msg').nth(0).text_content()).strip()
                if captcha_text == 'Confirme que você não é um robô.':
                    raise RuntimeError(f'captcha visible when processing {url}')
            except PlaywrightTimeoutError:
                pass
            # Captcha message 'Confirme que você não é um robô.'
            song_lyrics_link = lyrics_page.locator('a.gs-title').nth(0)
            try:
                song_lyrics_link_title, song_lyrics_link_artist = (await song_lyrics_link.text_content()).split('-')[0:2]
            except PlaywrightTimeoutError:
                songs_without_lyrics.append((song_title, artist))
                continue
            except ValueError:
                songs_without_lyrics.append((song_title, artist))
                continue
            if (remove_non_alphanumeric_characters(remove_accents(song_lyrics_link_title.strip().replace('feat', 'part').replace('&', 'e').lower())) != song_title_transformed or 
                remove_non_alphanumeric_characters(remove_accents(song_lyrics_link_artist.strip().replace('feat', 'part').replace('&', 'e').lower())) != artist_transformed):
                songs_without_lyrics.append((song_title, artist))
                continue
            await song_lyrics_link.click()
            try:
                translation = (await lyrics_page.locator('.js-filterTranslation').text_content()).strip()
                if translation == 'Tradução':
                    non_portuguese_songs.append((song_title, artist))
                    continue    
            except PlaywrightTimeoutError:
                non_portuguese_songs.append((song_title, artist))
                continue
            # translation_languages = await lyrics_page.locator('label.translationModal-label').all_text_contents()
            # if 'Português' in translation_languages:
            #     non_portuguese_songs.append((song_title, artist))
            #     continue
            lyrics = await lyrics_page.locator('div.lyric-original').all_inner_texts()
            csv_lines.append(f'"{song_title}","{artist}","{lyrics[0].replace('"', '')}"\n')
            time.sleep(np.random.randint(low=2, high=6))
        await lyrics_page.close()
        await songs_page.close()
        with open(SONGS_DIR+url.split('/')[-1]+'.csv', 'w') as f:
            f.write('title,artist,lyrics\n')
            f.writelines(csv_lines)
        time.sleep(60)