In [2]:
from urllib.request import urlopen
from urllib.parse import quote, unquote
from lxml import html
from glob import glob
import re

from functools import cache
import pathlib
import os


In [3]:
def num_to_devanagari(n):
    """Return ``str(n)`` with each digit replaced by its Devanagari numeral.

    Every character of ``str(n)`` is passed through ``int()`` and used as an
    index into the ten Devanagari digits, so a character that ``int()``
    rejects (for example a minus sign or a decimal point) raises
    ``ValueError``.
    """
    digit_glyphs = "०१२३४५६७८९"
    rendered = []
    for ch in str(n):
        rendered.append(digit_glyphs[int(ch)])
    return "".join(rendered)

def kss_url(lambaka, taranga=None):
    """Build the sa.wikisource.org URL of a lambaka page or a taranga page.

    With only ``lambaka`` given, the URL of that lambaka's page is returned.
    When ``taranga`` is also given, the URL of that taranga below the lambaka
    is returned. Both numbers are written with Devanagari digits in the path.
    """
    lambaka_deva = num_to_devanagari(lambaka)
    if taranga is not None:
        taranga_deva = num_to_devanagari(taranga)
        return f"https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_{lambaka_deva}/तरङ्गः_{taranga_deva}"

    return f"https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_{lambaka_deva}"

# def get_url_of_anchors(url, matches=r"/wiki/कथासरित्सागरः/लम्बकः_[०१२३४५६७८९]+/तरङ्गः_[०१२३४५६७८९]+"):
# def get_url_of_anchors(url, matches=r"/wiki/कथासरित्सागरः/लम्बकः_[०१२३४५६७८९]+/तर.*गः_[०१२३४५६७८९]+"):
def get_url_of_anchors(url, matches=r"/wiki/कथासरित्सागरः/लम्बकः_[०१२३४५६७८९]+/तर.*"):
    """Return the hrefs of all anchors on the page at ``url`` that match ``matches``.

    Args:
        url: Page address; it is printed, then percent-encoded (keeping "/"
            and ":") before being opened.
        matches: Regular expression applied with ``re.search`` to each
            percent-decoded href. Only matching hrefs are returned.

    Returns:
        A list of percent-decoded href strings in document order, or an empty
        list when the page could not be opened.
    """
    print(url)
    url = quote(url, safe='/:')
    try:
        page = urlopen(url)
    except Exception:
        # Best-effort scrape: report the failing URL and carry on. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        print("Error: ", url)
        return []

    # Read the body inside a with-block so the response is always closed.
    with page:
        tree = html.fromstring(page.read())

    # hrefs arrive percent-encoded; decode them before matching so the
    # Devanagari pattern can be applied directly.
    decoded_hrefs = [unquote(href) for href in tree.xpath('//a/@href')]
    return [href for href in decoded_hrefs if re.search(matches, href)]


@cache
def get_kss_urls():
    """Collect the absolute URL of every matching anchor on lambaka pages 1-18.

    Each lambaka page is fetched through ``get_url_of_anchors`` and the
    non-empty relative links it returns are prefixed with the wikisource
    host. The result is memoized by ``@cache``, so the pages are fetched at
    most once per kernel session.
    """
    relative_links = []
    for lambaka_no in range(1, 19):
        relative_links.extend(get_url_of_anchors(kss_url(lambaka_no)))

    absolute_links = []
    for link in relative_links:
        if len(link) > 0:
            absolute_links.append(f"https://sa.wikisource.org{link}")
    return absolute_links


In [4]:
@cache
def get_kss_text(url, xpath="//div[@class='poem']//span//text()"):
    """Fetch ``url`` and return the nodes selected by ``xpath`` as a list.

    Args:
        url: Page address; it is percent-encoded (keeping "/" and ":")
            before being opened.
        xpath: XPath expression evaluated on the parsed page. The default
            selects the text nodes of spans inside ``div`` elements whose
            class is ``poem``.

    Returns:
        A list of the XPath results, or an empty list when the page could
        not be opened (the error and the URL are printed).

    Note:
        The function is memoized with ``@cache``, so the empty list returned
        after a failed fetch is cached as well; call
        ``get_kss_text.cache_clear()`` before retrying a failed URL.
    """

    url = quote(url, safe='/:')
    try:
        page = urlopen(url)
    except Exception as e:
        print(f"Error: {e}\n{unquote(url)}\n", url)
        return []

    # Read the body inside a with-block so the response is always closed.
    with page:
        tree = html.fromstring(page.read())
    return list(tree.xpath(xpath))


In [5]:
# Scrape every page listed by get_kss_urls() and write one markdown file per
# taranga. Each extracted text line is normalised, and lines that end in a
# verse terminator get a "(lambaka-taranga-number)" label appended.
kss_urls = get_kss_urls()
for url in kss_urls[:]:
    print(url)
    # Pull the Devanagari lambaka / taranga numbers out of the URL. The
    # taranga path segment is matched loosely ("तर.*_") rather than by its
    # exact spelling. An URL without both numbers raises IndexError here.
    (lambaka, taranga) = re.findall(r"लम्बकः_([०१२३४५६७८९]+)/तर.*_([०१२३४५६७८९]+)", url)[0]
    # int() accepts Devanagari decimal digits directly.
    lambaka = int(lambaka)
    taranga = int(taranga)
    # Devanagari renderings of both numbers; not used by the code below
    # (file names and labels use the integer forms).
    l,t = (num_to_devanagari(x) for x in (lambaka, taranga))
    # NOTE(review): the leading "~" is a literal directory-name character;
    # open() does not expand it to the home directory.
    filename = f"~scraped-topic/kss/kss_{lambaka:02d}_{taranga:02d}.md"
    print(filename)
    pathlib.Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True)
    # seen: 0 until the first line carrying a "॥ <number>" marker, then it
    #       grows by one per line.
    # v:    number of lines processed from that first marked line onwards.
    seen =v=0
    # Explicit UTF-8: the output is Devanagari text and must not depend on
    # the platform's default locale encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"## [कथासरित्सागरः {lambaka:02d} {taranga:02d}]({url})\n\n")
        txt = get_kss_text(url)
        txt = [ x.strip() for x in txt]
        # Drop fragments of three characters or fewer.
        txt = [ x for x in txt if len(x)>3]
        # Two consecutive dandas become one double danda, padded with spaces.
        txt = [ re.sub(r"\s*।।\s*", " ॥ ", x) for x in txt]
        # Pad each remaining single danda with exactly one space per side.
        txt = [ re.sub(r"\s*।\s*", " । ", x) for x in txt]
        # remove newlines
        txt = [ re.sub(r"\n+", "", x) for x in txt]

        labelled = []
        for line in txt:
            if seen != 0 or re.search(r"॥\s*[०१२३४५६७८९०]+", line):
                seen += 1
            if seen > 0:
                v += 1
            # The label number is 1 + v//2, i.e. it advances every second
            # line (presumably two extracted lines per verse; verify
            # against the page markup).
            label = f" ({lambaka}-{taranga}-{(1+v//2)})\n" if seen>0 else ""
            # Only a line that ENDS in "॥", "॥ <number>" or
            # "<any char> <number>" is rewritten: the ending is kept, then
            # the label and a blank line are appended. Any other line is
            # left as is, so it runs straight into the following line when
            # the pieces are joined below.
            labelled.append(
                re.sub(r"((\s*॥\s*([०१२३४५६७८९]+\s*)?)|(\s*.\s*([०१२३४५६७८९]+\s*){1}))$"
                       , r"\1"
                       + label
                       + f"\n\n"
                       , line)
            )
        txt = labelled

        txt = "".join(txt)
        # Prefix the text with a level-3 heading marker, then turn every run
        # of newlines into a blank line followed by another "### " marker.
        txt = re.sub(r"^", "### ", txt)
        txt = re.sub(r"\n+", "\n\n### ", txt)
        f.write(txt)
# get_kss_text(kss_urls[0]) 

https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_२
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_३
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_४
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_५
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_६
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_७
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_८
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_९
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१०
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_११
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१२
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१३
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१४
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१५
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१६
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१७
https://sa.wikisource.org/wiki/कथासरित्सागरः/लम्बकः_१८
https://sa.wikisour