In [3]:
!scrapy startproject pealim

New Scrapy project 'pealim', using template directory '/Users/pabloherrero/Library/Python/3.9/lib/python/site-packages/scrapy/templates/project', created in:
    /Users/pabloherrero/Documents/ManHatTan/mht/notebooks/pealim

You can start your first spider with:
    cd pealim
    scrapy genspider example example.com


In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [7]:
class PealimVerbSpider(scrapy.Spider):
    """Search pealim.com for a verb and scrape conjugation tables from the result pages.

    The search term is read from the spider's ``verb`` attribute (looked up
    with ``getattr`` in :meth:`start_requests`). One item is yielded per
    followed ``/dict/`` link, shaped as
    ``{'verb': <first h1 text>, 'conjugations': {part: {'headings', 'rows'}}}``.
    """

    name = "pealim_verb"
    allowed_domains = ["pealim.com"]

    def start_requests(self):
        """Yield the search request, or log an error and stop when no verb is set."""
        # Local import so the class does not depend on another cell's imports.
        from urllib.parse import quote

        query = getattr(self, 'verb', None)
        if not query:
            self.logger.error("Please pass ?verb=ללכת")
            return
        # Percent-encode the term: a raw '&', '#', '+' or space in the verb
        # would otherwise truncate or alter the query string.
        url = f"https://www.pealim.com/search/?q={quote(str(query), safe='')}"
        yield scrapy.Request(url, self.parse_search)

    def parse_search(self, response):
        """Follow the first link of every ``<li>`` whose href starts with ``/dict/``."""
        for sel in response.css('li'):
            href = sel.css('a::attr(href)').get()
            if href and href.startswith('/dict/'):
                # No filtering on the link text is done here; every /dict/
                # link found in a list item is followed.
                yield response.follow(href, self.parse_verb)

    def parse_verb(self, response):
        """Collect headings and cell texts of the table following each ``#h-<part>-L`` element.

        Yields a single dict with the first text node found under ``h1`` and,
        for each of 'INF', 'PST' and 'FUT', the ``thead`` heading texts and a
        list of per-row ``td`` text lists. When a selector matches nothing the
        corresponding lists are simply empty.
        """
        verb = response.css('h1 ::text').get()
        tables = {}
        for part in ['INF', 'PST', 'FUT']:
            # First <table> sibling after the element whose id is h-<part>-L.
            sel = response.css(f'#h-{part}-L').xpath('following-sibling::table[1]')
            headings = sel.css('thead tr th::text').getall()
            rows = [tr.css('td::text').getall() for tr in sel.css('tbody tr')]
            tables[part] = {
                'headings': headings,
                'rows': rows
            }
        yield {
            'verb': verb,
            'conjugations': tables
        }



In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class PealimSearchSpider(scrapy.Spider):
    """Run a pealim.com search for ``verb`` and yield one item per search result.

    Args:
        verb: The search term. When it is missing or empty no request is made.
        **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.

    Each yielded item is ``{"title": <link text>, "url": <absolute link or None>}``.
    """

    name = "pealim_search"
    allowed_domains = ["pealim.com"]

    def __init__(self, verb=None, **kwargs):
        super().__init__(**kwargs)
        self.verb = verb

    def start_requests(self):
        """Yield the search request, or log an error when no verb was supplied."""
        # Local import so the class does not depend on another cell's imports.
        from urllib.parse import quote

        if not self.verb:
            # Without this guard the spider would search for the literal "None".
            self.logger.error("No verb given; nothing to search for.")
            return
        # Percent-encode the term so reserved characters cannot break the query.
        url = f"https://www.pealim.com/search/?q={quote(str(self.verb), safe='')}"
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Yield the title and absolute URL of every ``li.search-result`` entry."""
        for sel in response.css("li.search-result"):
            href = sel.css("a::attr(href)").get()
            yield {
                "title": sel.css("a::text").get(),
                # urljoin(None) would return the search page URL itself, which
                # would look like a valid result link; report None instead.
                "url": response.urljoin(href) if href else None
            }

# Set up and run the spider
process = CrawlerProcess(settings={
    "LOG_LEVEL": "WARNING",
    "FEEDS": {
        # overwrite=True: local-file feeds append by default, so re-running
        # this cell would leave results.json holding several concatenated
        # JSON arrays, which json.load() below cannot parse.
        "results.json": {"format": "json", "overwrite": True}
    }
})
process.crawl(PealimSearchSpider, verb="ללכת")
# NOTE(review): the output recorded for this cell is
# "RuntimeError: This event loop is already running" - presumably raised by
# the call below because the notebook kernel already runs an event loop.
# Running the crawl from a plain script or a separate process should avoid
# it - TODO confirm.
process.start()  # this blocks until done

# Load and display results
with open("results.json", encoding="utf-8") as f:
    data = json.load(f)

data[:5]  # show up to first 5 results


RuntimeError: This event loop is already running

  warn(



: 

In [163]:
import requests
from bs4 import BeautifulSoup
from hebrew import Hebrew
import random

In [60]:
def get_verb_link(hebrew_verb):
    """Return the absolute pealim.com URL of the first verb found by a site search.

    Args:
        hebrew_verb: The search term sent as the ``q`` query parameter.

    Returns:
        The absolute URL of the first ``.verb-search-result`` entry whose
        ``.verb-search-binyan`` text contains "verb" (case-insensitive) and
        that has a lemma link, or ``None`` when no such entry exists.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import urljoin

    # `params` lets requests percent-encode the term; the timeout keeps a
    # stalled connection from hanging the kernel indefinitely.
    response = requests.get(
        "https://www.pealim.com/search/",
        params={"q": hebrew_verb},
        timeout=10,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    for entry in soup.select('.verb-search-result'):
        lemma = entry.select_one('.verb-search-lemma a')
        binyan = entry.select_one('.verb-search-binyan')
        # select_one() returns None when nothing matches; skip incomplete
        # entries instead of failing on None.get_text().
        if lemma is None or binyan is None:
            continue
        if 'verb' not in binyan.get_text(strip=True).lower():
            continue
        href = lemma.get('href')
        if href:
            # urljoin handles both site-relative and already-absolute hrefs.
            return urljoin("https://www.pealim.com", href)
    return None

In [121]:
def get_conjugation_dict(verb_url):
    """Fetch a verb page and map each conjugation form id to its unvocalised form.

    Args:
        verb_url: URL of the verb page. A falsy value (e.g. ``None``) yields
            an empty dict without any request being made.

    Returns:
        A dict keyed by the ``id`` of every ``<div id=...>`` found inside a
        ``.conj-td`` element that contains a ``span.menukad``; each value is
        the result of ``Hebrew(<span text>).no_niqqud()``. If an id occurs
        more than once, the last occurrence wins.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    if not verb_url:
        return {}
    # The timeout keeps a stalled connection from hanging the kernel indefinitely.
    r = requests.get(verb_url, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    conjugation_entries = soup.select('.conj-td')

    conjug_dict = {}
    for entry in conjugation_entries:
        for div in entry.find_all('div', id=True):
            menukad = div.find('span', class_='menukad')
            if menukad is None:
                # No vocalised span in this div; nothing to record.
                continue
            conjugated_verb_nikkud = menukad.get_text(strip=True)
            conjug_dict[div['id']] = Hebrew(conjugated_verb_nikkud).no_niqqud()
    return conjug_dict

In [137]:
# Look up the verb page and collect its conjugated forms.
hebrew_verb = "ללכת"
verb_url = get_verb_link(hebrew_verb)
conj_dict = get_conjugation_dict(verb_url)

# Remove all items from conj_dict whose key contains 'IMP-' (imperative forms).
# 'IMPF-' (future) keys do not contain that substring, so they are kept.
conj_dict = {k: v for k, v in conj_dict.items() if 'IMP-' not in k}

In [164]:
# Human-readable labels for the dash-separated parts of a conjugation form id
# (for example "PERF-1p" -> "Past" + "1st person plural").
conjugation_key_map = {
    # Tense / mood prefixes
    "AP": "Present",
    "PERF": "Past",
    "IMPF": "Future",
    "IMP": "Imperative",
    "INF": "Infinitive",
    # Gender + number (no person)
    "ms": "masculine singular",
    "fs": "feminine singular",
    "mp": "masculine plural",
    "fp": "feminine plural",
    # First person
    "1s": "1st person singular",
    "1p": "1st person plural",
    # Second person
    "2ms": "2nd person masculine singular",
    "2fs": "2nd person feminine singular",
    "2mp": "2nd person masculine plural",
    "2fp": "2nd person feminine plural",
    # Third person
    "3ms": "3rd person masculine singular",
    "3fs": "3rd person feminine singular",
    "3mp": "3rd person masculine plural",
    "3fp": "3rd person feminine plural",
    "3p": "3rd person plural",
    # Suffix seen on the infinitive id ("INF-L")
    "L": "long form",
}

In [145]:
def parse_conjugation_key(key, key_map):
    """Translate a dash-separated key into a readable label.

    Each dash-separated piece of ``key`` is replaced by its entry in
    ``key_map``; pieces without an entry are kept unchanged. The pieces are
    then joined with " - ".
    """
    labels = []
    for token in key.split('-'):
        labels.append(key_map.get(token, token))
    return " - ".join(labels)


In [162]:
# Show one randomly chosen conjugated form with a readable label.
if conj_dict:
    key, value = random.choice(list(conj_dict.items()))
    parsed_key = parse_conjugation_key(key, conjugation_key_map)
    print(f"Random conjugation for '{hebrew_verb}':")
    print(f"{parsed_key}: {value}")
else:
    # conj_dict is empty when the verb lookup found nothing; random.choice
    # would raise IndexError on an empty sequence.
    print(f"No conjugations found for '{hebrew_verb}'.")

Random conjugation for 'ללכת':
Past - 1st person plural: הלכנו


In [148]:
# Print every raw form id next to its readable label and the conjugated form.
for k in conj_dict:
    v = conj_dict[k]
    parsed_key = parse_conjugation_key(k, conjugation_key_map)
    print(f"{k}: {parsed_key} = {v}")

AP-ms: Present - masculine singular = הולך
AP-fs: Present - feminine singular = הולכת
AP-mp: Present - masculine plural = הולכים
AP-fp: Present - feminine plural = הולכות
PERF-1s: Past - 1st person singular = הלכתי
PERF-1p: Past - 1st person plural = הלכנו
PERF-2ms: Past - 2nd person masculine singular = הלכת
PERF-2fs: Past - 2nd person feminine singular = הלכת
PERF-2mp: Past - 2nd person masculine plural = הלכתם
PERF-2fp: Past - 2nd person feminine plural = הלכתן
PERF-3ms: Past - 3rd person masculine singular = הלך
PERF-3fs: Past - 3rd person feminine singular = הלכה
PERF-3p: Past - 3p = הלכו
IMPF-1s: Future - 1st person singular = אלך
IMPF-1p: Future - 1st person plural = נלך
IMPF-2ms: Future - 2nd person masculine singular = תלך
IMPF-2fs: Future - 2nd person feminine singular = תלכי
IMPF-2mp: Future - 2nd person masculine plural = תלכו
IMPF-2fp: Future - 2nd person feminine plural = תלכנה
IMPF-3ms: Future - 3rd person masculine singular = ילך
IMPF-3fs: Future - 3rd person feminine s

In [143]:
# Dump the raw form-id / conjugated-form pairs, one per line.
for k in conj_dict:
    v = conj_dict[k]
    print(f"{k}: {v}")

AP-ms: הולך
AP-fs: הולכת
AP-mp: הולכים
AP-fp: הולכות
PERF-1s: הלכתי
PERF-1p: הלכנו
PERF-2ms: הלכת
PERF-2fs: הלכת
PERF-2mp: הלכתם
PERF-2fp: הלכתן
PERF-3ms: הלך
PERF-3fs: הלכה
PERF-3p: הלכו
IMPF-1s: אלך
IMPF-1p: נלך
IMPF-2ms: תלך
IMPF-2fs: תלכי
IMPF-2mp: תלכו
IMPF-2fp: תלכנה
IMPF-3ms: ילך
IMPF-3fs: תלך
IMPF-3mp: ילכו
IMPF-3fp: תלכנה
INF-L: ללכת
