In [1]:
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def get_songs_pesni_net(artist_name, timeout=30):
    """Download the lyrics of every song listed for an artist on pesni.net.

    Args:
        artist_name: Artist slug as it appears in pesni.net URLs. It is
            appended to ``https://www.pesni.net/text/`` and is also used to
            pick the song links out of the artist page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``"name"`` (link text), ``"url"``
        (absolute URL of the song page) and ``"text"`` (the lyrics, page
        fragments joined with newlines). ``"text"`` stays ``""`` for a song
        whose page contains no lyrics block.

    Raises:
        requests.HTTPError: If any requested page answers with an error status.
        ValueError: If the artist page has no ``div.textcontent`` block.
    """
    url = "https://www.pesni.net/text/" + artist_name

    # One session reuses the connection for the artist page and all song pages.
    with requests.Session() as session:
        r = session.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Block that holds the list of songs.
        listing = soup.find("div", {"class": "textcontent"})
        if listing is None:
            raise ValueError(f"no song list (div.textcontent) found at {url}")

        # Collect the links that point to this artist's songs.
        songs = []
        for link in listing.find_all("a"):
            href = link.get("href")
            # <a> tags without an href (plain anchors) yield None; skip them.
            if href and artist_name in href:
                songs.append(
                    {
                        "name": link.text,
                        # urljoin copes with relative and absolute hrefs alike.
                        "url": urljoin(url, href),
                        "text": "",
                    }
                )

        for song in tqdm(songs):
            r = session.get(song["url"], timeout=timeout)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            # Block that holds the lyrics.
            lyrics = soup.find("div", {"class": "song-block-text"})
            if lyrics is None:
                # Keep going: one page without lyrics must not abort the run.
                tqdm.write(f'"{song["name"]}" пропущена: текст не найден')
                continue
            # Put every text fragment of the block on its own line.
            song["text"] = lyrics.get_text(separator="\n")
            tqdm.write(f'"{song["name"]}" загружена!')

    return songs


def get_songs_altwall_net(artist_name):
    """Download the lyrics of every song listed for an artist on altwall.net.

    Pages are loaded through a Selenium-driven Chrome browser, which is always
    closed before the function returns or raises.

    Args:
        artist_name: Artist identifier used in altwall.net URLs. It is
            appended to ``https://altwall.net/texts.php?show=`` and is also
            used to pick the song links out of the artist page.

    Returns:
        A list of dicts with the keys ``"name"`` (link text), ``"url"``
        (absolute URL of the song page) and ``"text"`` (the lyrics, page
        fragments joined with newlines). ``"text"`` stays ``""`` for a song
        whose page contains no lyrics block.

    Raises:
        ValueError: If the artist page has no ``<tbody>`` with the song table.
    """
    # Imported here so Selenium is only required when this source is used.
    from selenium import webdriver

    url = "https://altwall.net/texts.php?show=" + artist_name

    browser = webdriver.Chrome()
    try:
        browser.get(url)

        # page_source is already a decoded str, so no from_encoding is passed
        # (Beautiful Soup ignores it for str markup and emits a warning).
        soup = BeautifulSoup(browser.page_source, "html.parser")

        # Table that holds the list of songs.
        table = soup.find("tbody")
        if table is None:
            raise ValueError(f"no song table (tbody) found at {url}")

        # Collect the links that point to this artist's songs.
        songs = []
        for link in table.find_all("a"):
            href = link.get("href")
            # <a> tags without an href (plain anchors) yield None; skip them.
            if href and artist_name in href:
                songs.append(
                    {
                        "name": link.text,
                        # urljoin copes with relative and absolute hrefs alike.
                        "url": urljoin(url, href),
                        "text": "",
                    }
                )

        for song in tqdm(songs):
            browser.get(song["url"])
            soup = BeautifulSoup(browser.page_source, "html.parser")
            # Element that holds the lyrics.
            lyrics = soup.find("article", {"id": "main_text_div"})
            if lyrics is None:
                # Keep going: one page without lyrics must not abort the run.
                tqdm.write(f'"{song["name"]}" пропущена: текст не найден')
                continue
            # Put every text fragment of the element on its own line.
            song["text"] = lyrics.get_text(separator="\n")
            tqdm.write(f'"{song["name"]}" загружена!')
    finally:
        # Always shut the browser down, otherwise each call leaks a Chrome process.
        browser.quit()

    return songs


In [2]:
# Artists to scrape, one dict per artist:
#   "name"   - display name (also used for the per-artist output file name),
#   "id"     - identifier passed to the scraper for the artist's site,
#   "source" - base URL of the site the lyrics are taken from,
#   "value"  - NOTE(review): meaning is not visible in this notebook — confirm.
artists = [
    dict(zip(("name", "id", "source", "value"), row))
    for row in (
        ("Orange House", "orangehouse", "https://altwall.net", 20),
        ("Операция Пластилин", "Operatsiya-Plastilin", "https://www.pesni.net", 20),
        ("Адаптация Пчёл", "Adaptatsiya-Pchyol", "https://www.pesni.net", 20),
        ("Аффинаж", "Affinazh", "https://www.pesni.net", 5),
        ("Комсомольск", "Komsomolsk", "https://www.pesni.net", 5),
        ("7 Раса", "7Rasa", "https://www.pesni.net", 5),
        (
            "Самое большое простое число",
            "Samoe-bolshoe-prostoe-chislo",
            "https://www.pesni.net",
            5,
        ),
    )
]

In [None]:
# Download the songs of every artist, choosing the scraper by the artist's source site.

for artist in artists:
    if artist["source"] == "https://www.pesni.net":
        songs = get_songs_pesni_net(artist["id"])
    elif artist["source"] == "https://altwall.net":
        songs = get_songs_altwall_net(artist["id"])
    else:
        # Unknown site: report it and give the artist an empty song list.
        print(artist["name"], "- неизвестный источник")
        artist["songs"] = []
        continue
    # Both known sites store their result the same way.
    artist["songs"] = songs

In [9]:
# Save the songs of every artist to a separate file: songs/<artist name>.txt

folder = Path("songs")
# open() does not create missing directories, so make sure the folder exists.
folder.mkdir(parents=True, exist_ok=True)

for artist in artists:
    filename = artist["name"] + ".txt"
    with open(folder / filename, "w", encoding="utf-8") as f:
        for song in artist["songs"]:
            text = song["text"]
            if not text:
                # Nothing was downloaded for this song.
                continue
            # End every song with a newline so its last line does not fuse
            # with the first line of the next song.
            f.write(text if text.endswith("\n") else text + "\n")

In [7]:
# Write the lyrics of all artists into one corpus file, songs.txt.

with open("songs.txt", "w", encoding="utf-8") as f:
    for artist in artists:
        for song in artist["songs"]:
            text = song["text"]
            if not text:
                # Nothing was downloaded for this song.
                continue
            # End every song with a newline: the corpus is later split into
            # lines, and without it the last line of one song fuses with the
            # first line of the next.
            f.write(text if text.endswith("\n") else text + "\n")

# Markovify

In [None]:
# Install markovify into the environment of the running kernel.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
%pip install markovify

In [19]:
import markovify

# Read the whole scraped corpus as one string.
with open("songs.txt", encoding="utf-8") as f:
    text = f.read()

# Build the model; NewlineText treats every line of the corpus as a sentence.
text_model = markovify.NewlineText(text, state_size=2)

# Print up to ten randomly generated sentences
# (make_sentence() returns None when no acceptable sentence was found).
for _ in range(10):
    sentence = text_model.make_sentence()
    if sentence:
        print(sentence)
        print()

# Print up to three randomly generated sentences of no more than 280 characters.
for _ in range(3):
    sentence = text_model.make_short_sentence(280)
    if sentence:
        print(sentence)
        print()

Мир был бы живым

Лица, что мы с тобой в друг друга небесные часовые

О том, что они унесут.

Таблетки радости в глаза друг другу чувства

Мама, все твои драмы мои

И мы никогда не простят

Рисуя волну, бесконечно бегу

Вместо вернемся из темноты и ты однажды сам себя заебал

