# Attempt scrapy

In [3]:
!scrapy startproject pealim

New Scrapy project 'pealim', using template directory '/Users/pabloherrero/Library/Python/3.9/lib/python/site-packages/scrapy/templates/project', created in:
    /Users/pabloherrero/Documents/ManHatTan/mht/notebooks/pealim

You can start your first spider with:
    cd pealim
    scrapy genspider example example.com


In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [7]:
class PealimVerbSpider(scrapy.Spider):
    name = "pealim_verb"
    allowed_domains = ["pealim.com"]

    def start_requests(self):
        query = getattr(self, 'verb', None)
        if not query:
            self.logger.error("Please pass ?verb=ללכת")
            return
        url = f"https://www.pealim.com/search/?q={query}"
        yield scrapy.Request(url, self.parse_search)

    def parse_search(self, response):
        for sel in response.css('li'):
            href = sel.css('a::attr(href)').get()
            if href and href.startswith('/dict/'):
                text = sel.css('a::text').get()
                # Optional: ensure text (e.g. infinitive form) matches criteria
                yield response.follow(href, self.parse_verb)

    def parse_verb(self, response):
        verb = response.css('h1 ::text').get()
        tables = {}
        for part in ['INF', 'PST', 'FUT']:
            sel = response.css(f'#h-{part}-L').xpath('following-sibling::table[1]')
            headings = sel.css('thead tr th::text').getall()
            rows = []
            for tr in sel.css('tbody tr'):
                row = tr.css('td::text').getall()
                rows.append(row)
            tables[part] = {
                'headings': headings,
                'rows': rows
            }
        yield {
            'verb': verb,
            'conjugations': tables
        }



In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json

class PealimSearchSpider(scrapy.Spider):
    name = "pealim_search"
    allowed_domains = ["pealim.com"]

    def __init__(self, verb=None, **kwargs):
        super().__init__(**kwargs)
        self.verb = verb

    def start_requests(self):
        url = f"https://www.pealim.com/search/?q={self.verb}"
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        for sel in response.css("li.search-result"):
            yield {
                "title": sel.css("a::text").get(),
                "url": response.urljoin(sel.css("a::attr(href)").get())
            }

# Set up and run the spider
process = CrawlerProcess(settings={
    "LOG_LEVEL": "WARNING",
    "FEEDS": {
        "results.json": {"format": "json"}
    }
})
process.crawl(PealimSearchSpider, verb="ללכת")
process.start()  # this blocks until done

# Load and display results
with open("results.json") as f:
    data = json.load(f)

data[:5]  # show up to first 5 results


RuntimeError: This event loop is already running

  warn(



: 

# Beautiful Soup

In [25]:
import requests
from bs4 import BeautifulSoup
from hebrew import Hebrew
import random

In [69]:
def scrap_verb_link(hebrew_verb):
    search_url = f"https://www.pealim.com/search/?q={hebrew_verb}"
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Extract all verb entries from the soup
    verb_entries = []

    for entry in soup.select('.verb-search-result'):
        lemma = entry.select_one('.verb-search-lemma a')
        binyan = entry.select_one('.verb-search-binyan')
        if 'verb' in binyan.get_text(strip=True).lower():
        
            if lemma:
                text = lemma.get_text(strip=True)
                url = lemma['href']
                verb_entries.append({'text': text, 'url': url})

    first_url = verb_entries[0]['url'] if verb_entries else None
    return "https://www.pealim.com" + first_url if first_url else None

In [102]:
def scrap_conjugation_dict(verb_url):
    if not verb_url:
        return {}
    r = requests.get(verb_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    conjugation_entries = soup.select('.conj-td')

    conjug_dict = {}
    for entry in conjugation_entries:
        for div in entry.find_all('div', id=True):
            menukad = div.find('span', class_='menukad')
            chaser = div.find('span', class_='chaser')
            if chaser:
                conjugated_verb_nikkud = chaser.get_text(strip=True)
                verb_noniqqud = Hebrew(conjugated_verb_nikkud).no_niqqud()
                verb_noniqqud = str(verb_noniqqud).replace('~', '').replace(' ', '')
                conjug_dict[div['id']] = verb_noniqqud
            elif menukad:
                conjugated_verb_nikkud = menukad.get_text(strip=True)
                verb_noniqqud = Hebrew(conjugated_verb_nikkud).no_niqqud()
                verb_noniqqud = str(verb_noniqqud).replace('~', '').replace(' ', '')
                conjug_dict[div['id']] = verb_noniqqud
    return conjug_dict

In [103]:
conj_dict = scrap_conjugation_dict(verb_url)  
conj_dict 

{'AP-ms': 'מדבר',
 'AP-fs': 'מדברת',
 'AP-mp': 'מדברים',
 'AP-fp': 'מדברות',
 'PERF-1s': 'דיברתי',
 'PERF-1p': 'דיברנו',
 'PERF-2ms': 'דיברת',
 'PERF-2fs': 'דיברת',
 'PERF-2mp': 'דיברתם',
 'PERF-2fp': 'דיברתן',
 'PERF-3ms': 'דיבר',
 'PERF-3fs': 'דיברה',
 'PERF-3p': 'דיברו',
 'IMPF-1s': 'אדבר',
 'IMPF-1p': 'נדבר',
 'IMPF-2ms': 'תדבר',
 'IMPF-2fs': 'תדברי',
 'IMPF-2mp': 'תדברו',
 'IMPF-2fp': 'תדברנה',
 'IMPF-3ms': 'ידבר',
 'IMPF-3fs': 'תדבר',
 'IMPF-3mp': 'ידברו',
 'IMPF-3fp': 'תדברנה',
 'IMP-2ms': 'דבר!\u200f',
 'IMP-2fs': 'דברי!\u200f',
 'IMP-2mp': 'דברו!\u200f',
 'IMP-2fp': 'דברנה!\u200f',
 'INF-L': 'לדבר',
 'passive-AP-ms': 'מדובר',
 'passive-AP-fs': 'מדוברת',
 'passive-AP-mp': 'מדוברים',
 'passive-AP-fp': 'מדוברות',
 'passive-PERF-1s': 'דוברתי',
 'passive-PERF-1p': 'דוברנו',
 'passive-PERF-2ms': 'דוברת',
 'passive-PERF-2fs': 'דוברת',
 'passive-PERF-2mp': 'דוברתם',
 'passive-PERF-2fp': 'דוברתן',
 'passive-PERF-3ms': 'דובר',
 'passive-PERF-3fs': 'דוברה',
 'passive-PERF-3p': 'דוברו',
 

In [None]:
hebrew_verb = "ללכת"
verb_url = scrap_verb_link("לדבר") 
# verb_url = scrap_verb_link(hebrew_verb) 
conj_dict = scrap_conjugation_dict(verb_url)  

# Remove all items from ex_dict whose key contains 'IMP' (imperative forms)
conj_dict = {k: v for k, v in conj_dict.items() if 'IMP-' not in k}

In [104]:
conjugation_key_map = {
    "AP": ("Present", "היום"),
    "PERF": ("Past", "אתמול"),
    "IMPF": ("Future", "מחר"),
    "IMP": ("Imperative", ""),
    "INF": ("Infinitive", "ל..."),
    "ms": ("masculine singular", "הוא"),
    "fs": ("feminine singular", "היא"),
    "mp": ("masculine plural", "הם"),
    "fp": ("feminine plural", "הן"),
    "1s": ("1st person singular", "אני"),
    "1p": ("1st person plural", "אנחנו"),
    "2ms": ("2nd person masculine singular", "אתה"),
    "2fs": ("2nd person feminine singular", "את"),
    "2mp": ("2nd person masculine plural", "אתם"),
    "2fp": ("2nd person feminine plural", "אתן"),
    "3ms": ("3rd person masculine singular", "הוא"),
    "3fs": ("3rd person feminine singular", "היא"),
    "3mp": ("3rd person masculine plural", "הם"),
    "3fp": ("3rd person feminine plural", "הן"),
    "3p": ("3rd person plural", "הם/הן"),
    "L": ("long form", " "),
    "passive": ("passive", "(passive)"),
}

In [105]:
def parse_conjugation_key(key):
    """
    Split the key by dash and map each part using combined_conjugation_map.
    Returns two strings: one for English, one for Hebrew hints.
    """
    parts = key.split('-')
    english_parts = []
    hebrew_parts = []
    for part in parts:
        eng, heb = conjugation_key_map.get(part, (part, part))
        english_parts.append(eng)
        hebrew_parts.append(heb)
    parsed_english = " - ".join(english_parts)
    parsed_hebrew = " ,".join(hebrew_parts)
    return parsed_english, parsed_hebrew

In [108]:
# Select a random conjugation from the dictionary
key, value = random.choice(list(conj_dict.items()))
parsed_english, parsed_hebrew = parse_conjugation_key(key)
print(f"Random conjugation for '{hebrew_verb}':")
print(f"({parsed_english}), {parsed_hebrew}: {value}")

Random conjugation for 'ללכת':
(passive - Future - 1st person singular), (passive) ,מחר ,אני: אדובר


In [111]:
# Display all conjugations with their parsed keys
for k, v in conj_dict.items():
    parsed_english, parsed_hebrew = parse_conjugation_key(k)
    print(f"{k}: {parsed_english, parsed_hebrew} = {v}")

AP-ms: ('Present - masculine singular', 'היום ,הוא') = מדבר
AP-fs: ('Present - feminine singular', 'היום ,היא') = מדברת
AP-mp: ('Present - masculine plural', 'היום ,הם') = מדברים
AP-fp: ('Present - feminine plural', 'היום ,הן') = מדברות
PERF-1s: ('Past - 1st person singular', 'אתמול ,אני') = דיברתי
PERF-1p: ('Past - 1st person plural', 'אתמול ,אנחנו') = דיברנו
PERF-2ms: ('Past - 2nd person masculine singular', 'אתמול ,אתה') = דיברת
PERF-2fs: ('Past - 2nd person feminine singular', 'אתמול ,את') = דיברת
PERF-2mp: ('Past - 2nd person masculine plural', 'אתמול ,אתם') = דיברתם
PERF-2fp: ('Past - 2nd person feminine plural', 'אתמול ,אתן') = דיברתן
PERF-3ms: ('Past - 3rd person masculine singular', 'אתמול ,הוא') = דיבר
PERF-3fs: ('Past - 3rd person feminine singular', 'אתמול ,היא') = דיברה
PERF-3p: ('Past - 3rd person plural', 'אתמול ,הם/הן') = דיברו
IMPF-1s: ('Future - 1st person singular', 'מחר ,אני') = אדבר
IMPF-1p: ('Future - 1st person plural', 'מחר ,אנחנו') = נדבר
IMPF-2ms: ('Future - 2

In [120]:
key, value = random.choice(list(conj_dict.items()))
parsed_english, parsed_hebrew = parse_conjugation_key(key)
print(f"Random conjugation for '{hebrew_verb}':")
print(f"{parsed_english, parsed_hebrew}: {value} ")


Random conjugation for 'ללכת':
('Future - 3rd person feminine singular', 'מחר ,היא'): תדבר 


'מדובר'