In [9]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def prettify(text):
    """Clean raw lyrics text scraped from a song page.

    Each line is stripped of surrounding whitespace. Blank lines are dropped,
    as are section markers: any line containing "куплет" (verse), "припев"
    (chorus) or a square bracket, compared case-insensitively.

    Args:
        text: Raw lyrics with lines separated by "\n".

    Returns:
        The remaining lines joined with "\n" (no trailing newline).
    """
    bad_words = ("куплет", "припев", "[", "]")
    cleaned = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Strip *before* the emptiness check so that whitespace-only lines
        # (including a stray "\r") are removed as well.
        if not line:
            continue
        lowered = line.lower()
        if any(word in lowered for word in bad_words):
            continue
        cleaned.append(line)
    return "\n".join(cleaned)

def get_songs_pesni_net(artist_name, timeout=30):
    """Download the lyrics of every song listed on an artist's pesni.net page.

    Args:
        artist_name: Artist slug as used in pesni.net URLs; it is appended to
            "https://www.pesni.net/text/" and is also used to pick the song
            links out of the listing.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of dicts with keys "name" (link text), "url" (song page URL)
        and "text" (lyrics cleaned with ``prettify``; left as "" when the
        lyrics block is missing from the song page).

    Raises:
        requests.RequestException: On a network failure, a timeout or an HTTP
            error status.
        ValueError: If the artist page has no song listing block.
    """
    basename = "https://www.pesni.net"
    url = basename + "/text/" + artist_name

    # A single session reuses the connection for the per-song requests.
    with requests.Session() as session:
        r = session.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # The block that holds the links to all songs of the artist.
        listing = soup.find("div", {"class": "textcontent"})
        if listing is None:
            raise ValueError(f"pesni.net: song listing not found at {url}")

        # Keep only the links that point to this artist's songs.
        songs = []
        for link in listing.find_all("a"):
            href = link.get("href")
            # Anchors without an href would otherwise break the `in` check.
            if href and artist_name in href:
                songs.append(
                    {
                        "name": link.text,
                        "url": basename + href,
                        "text": "",
                    }
                )

        for song in tqdm(songs):
            r = session.get(song["url"], timeout=timeout)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            # The block that holds the lyrics on a song page.
            lyrics = soup.find("div", {"class": "song-block-text"})
            if lyrics is None:
                # Keep going with the other songs; this one stays empty.
                tqdm.write(f'"{song["name"]}": текст песни не найден, пропускаем')
                continue
            song["text"] = prettify(lyrics.get_text(separator="\n"))

    return songs


def get_songs_altwall_net(artist_name):
    """Download the lyrics of every song listed on an artist's altwall.net page.

    Pages are loaded through a Selenium-driven Chrome browser, which is
    closed again before the function returns, including on errors.

    Args:
        artist_name: Artist id as used in altwall.net URLs; it is appended to
            "https://altwall.net/texts.php?show=" and is also used to pick the
            song links out of the listing table.

    Returns:
        A list of dicts with keys "name" (link text), "url" (song page URL)
        and "text" (lyrics cleaned with ``prettify``; left as "" when the
        lyrics element is missing from the song page).

    Raises:
        ValueError: If the artist page has no song table.
    """
    # Local import: within this file only this scraper uses selenium.
    from selenium import webdriver

    basename = "https://altwall.net"
    url = basename + "/texts.php?show=" + artist_name

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # page_source is already decoded text, so BeautifulSoup needs no
        # encoding hint (it would ignore one and emit a warning).
        soup = BeautifulSoup(browser.page_source, "html.parser")

        # The table body that holds the links to the songs.
        table = soup.find("tbody")
        if table is None:
            raise ValueError(f"altwall.net: song table not found at {url}")

        # Keep only the links that point to this artist's songs.
        songs = []
        for link in table.find_all("a"):
            href = link.get("href")
            # Anchors without an href would otherwise break the `in` check.
            if href and artist_name in href:
                songs.append(
                    {
                        "name": link.text,
                        "url": basename + href,
                        "text": "",
                    }
                )

        for song in tqdm(songs):
            browser.get(song["url"])
            soup = BeautifulSoup(browser.page_source, "html.parser")
            # The element that holds the lyrics on a song page.
            lyrics = soup.find("article", {"id": "main_text_div"})
            if lyrics is None:
                # Keep going with the other songs; this one stays empty.
                tqdm.write(f'"{song["name"]}": текст песни не найден, пропускаем')
                continue
            song["text"] = prettify(lyrics.get_text(separator="\n"))
    finally:
        # Always shut the browser down so no Chrome process is left running.
        browser.quit()

    return songs

In [10]:
# Artists to scrape, one row each:
#   name   - display name (also used for the per-artist output file name)
#   id     - artist slug passed to the scraper for the source site
#   source - base URL of the site the lyrics are taken from
#   value  - weight of the artist's chain when the models are combined
artists = [
    {"name": name, "id": slug, "source": source, "value": weight}
    for name, slug, source, weight in (
        ("Orange House", "orangehouse", "https://altwall.net", 3),
        ("Операция Пластилин", "Operatsiya-Plastilin", "https://www.pesni.net", 3),
        ("Адаптация Пчёл", "Adaptatsiya-Pchyol", "https://www.pesni.net", 5),
        ("Аффинаж", "Affinazh", "https://www.pesni.net", 1),
        ("Комсомольск", "Komsomolsk", "https://www.pesni.net", 1),
        ("7 Раса", "7Rasa", "https://www.pesni.net", 1),
        (
            "Самое большое простое число",
            "Samoe-bolshoe-prostoe-chislo",
            "https://www.pesni.net",
            1,
        ),
        ("Lumen", "Lumen", "https://www.pesni.net", 1),
    )
]

In [53]:
# Weights as declared in the artist table.
values = [artist["value"] for artist in artists]
# NOTE(review): this hard-coded list immediately replaces the one derived
# above and does not match the weights declared in `artists`; no later cell
# visible here reads `values` - confirm which of the two is intended.
values = [1, 3, 7, 1, 1, 1, 1, 1]

In [11]:
# Fetch the songs of every artist with the scraper that matches its source.

scrapers = {
    "https://www.pesni.net": get_songs_pesni_net,
    "https://altwall.net": get_songs_altwall_net,
}

for artist in artists:
    scraper = scrapers.get(artist["source"])
    if scraper is None:
        # No scraper for this site: report it and store an empty song list.
        print(artist["name"], "- неизвестный источник")
        artist["songs"] = []
    else:
        artist["songs"] = scraper(artist["id"])

100%|██████████| 39/39 [00:16<00:00,  2.43it/s]
100%|██████████| 26/26 [00:04<00:00,  5.83it/s]
100%|██████████| 34/34 [00:07<00:00,  4.30it/s]
100%|██████████| 12/12 [00:02<00:00,  5.87it/s]
100%|██████████| 8/8 [00:02<00:00,  2.90it/s]
100%|██████████| 6/6 [00:01<00:00,  5.03it/s]
100%|██████████| 12/12 [00:02<00:00,  5.95it/s]
100%|██████████| 130/130 [00:28<00:00,  4.55it/s]


### Опционально — сохраняем тексты

In [4]:
# Save the songs of every artist into a separate file under songs/.

songs_dir = Path("songs")
# open() does not create missing directories, so make sure the folder exists.
songs_dir.mkdir(parents=True, exist_ok=True)

for artist in artists:
    with open(songs_dir / (artist["name"] + ".txt"), "w", encoding="utf-8") as f:
        for song in artist["songs"]:
            # Song texts carry no trailing newline; terminate each one so the
            # last line of a song does not fuse with the first line of the next.
            f.write(song["text"] + "\n")

In [5]:
# Write the songs of all artists into a single file.

with open("songs.txt", "w", encoding="utf-8") as f:
    for artist in artists:
        for song in artist["songs"]:
            # Song texts carry no trailing newline; terminate each one so the
            # last line of a song does not fuse with the first line of the next.
            f.write(song["text"] + "\n")

# Markovify

In [None]:
%pip install markovify

In [18]:
import markovify

# Build one Markov chain per artist and keep the artist's weight next to it.
chains = []
for artist in artists:
    text = "\n".join(song["text"] for song in artist["songs"])
    # Artists without any lyrics (the loader stores an empty song list for
    # unknown sources) have nothing to model, so they are left out instead of
    # being handed to markovify as empty text.
    if not text.strip():
        continue
    chains.append(
        {
            "value": artist["value"],
            "chain": markovify.Text(text, state_size=2),
        }
    )

In [24]:
# Merge the per-artist chains into one model, weighting each by its "value".
models = [entry["chain"] for entry in chains]
weights = [entry["value"] for entry in chains]
model = markovify.combine(models, weights)

In [22]:
# Collect unique generated sentences until the target count is reached.
TARGET_TEXTS = 1000
# Upper bound on calls to the model: without it the loop would never end if
# the model cannot produce TARGET_TEXTS distinct sentences.
MAX_ATTEMPTS = TARGET_TEXTS * 100

generated_texts = set()
attempts = 0
while len(generated_texts) < TARGET_TEXTS and attempts < MAX_ATTEMPTS:
    attempts += 1
    text = model.make_sentence()
    # Falsy results (no sentence produced) are ignored; the set drops duplicates.
    if text:
        generated_texts.add(text)

if len(generated_texts) < TARGET_TEXTS:
    print(
        f"Сгенерировано только {len(generated_texts)} из {TARGET_TEXTS} "
        f"уникальных текстов за {attempts} попыток"
    )

In [26]:
# Save the generated sentences, each one followed by a blank line.
with open("generated.txt", "w", encoding="utf-8") as f:
    f.writelines(sentence + "\n\n" for sentence in generated_texts)

In [None]:
# Show the generated sentences, each one followed by a blank line.
print("".join(sentence + "\n\n" for sentence in generated_texts), end="")