Skip to content

Commit

Permalink
Merge pull request #831 from Sefaria/add_or_update_bug_2
Browse files Browse the repository at this point in the history
Add or update bug 2
  • Loading branch information
nsantacruz committed May 15, 2022
2 parents 5fe59df + d82072d commit 631c17c
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 18 deletions.
28 changes: 22 additions & 6 deletions sefaria/model/tests/webpage_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@ def create_good_web_page():
yield {"result": result, "webpage": webpage, "data": data_good_url}
WebPage().load({"url": data_good_url["url"]}).delete()

@pytest.fixture(scope='module')
def create_web_page_wout_desc():
    """Module-scoped fixture: save a webpage with no ``description`` field via the
    linker API and yield the save result, the loaded WebPage, and the input payload.

    Yields:
        dict with keys "result" (status string from add_or_update_from_linker),
        "webpage" (the WebPage loaded back from the DB), and "data" (the payload).
    The page is deleted from the DB on teardown.
    """
    data_good_url = {'url': 'http://blogs.timesofisrael.com/dvar-torah4',
                     'title': title_good_url + " without description",
                     'refs': ["Haamek Davar on Genesis, Kidmat Ha'Emek 2", 'Genesis 3']}

    # add_or_update_from_linker is a staticmethod — call it on the class,
    # consistent with how test_add_search_URL invokes it.
    result = WebPage.add_or_update_from_linker(data_good_url)
    webpage = WebPage().load({"url": data_good_url["url"]})
    yield {"result": result, "webpage": webpage, "data": data_good_url}
    # Teardown: re-load so we delete the current DB record even if a test
    # re-saved the page after this fixture captured `webpage`, and avoid
    # an AttributeError if the page was never saved (load returned None).
    current = WebPage().load({"url": data_good_url["url"]})
    if current:
        current.delete()

def test_add_bad_domain_from_linker():
#localhost:8000 should not be added to the linker, so make sure attempting to do so fails

Expand Down Expand Up @@ -74,15 +85,9 @@ def test_add_and_update_with_same_data(create_good_web_page):

def test_update_blank_title_from_linker(create_good_web_page):
    """A linker payload with an empty title must not clobber the stored title.

    Relies on add_or_update_from_linker substituting the existing title when the
    incoming one is blank ("dont save an empty title if title exists").
    """
    result, webpage, data = create_good_web_page["result"], create_good_web_page["webpage"], create_good_web_page["data"]
    assert result == "saved"

    data["title"] = ""
    # Blank title plus otherwise-identical data still counts as a save.
    assert WebPage.add_or_update_from_linker(data) == "saved"
    # Load once (the original loaded twice and left a debug print behind):
    # the stored title should be the pre-existing one, not the empty string.
    saved_page = WebPage().load({"url": data["url"]})
    assert saved_page.title == title_good_url


Expand All @@ -96,3 +101,14 @@ def test_add_search_URL():
'refs': ['Psalms 1–41', 'Psalms 42–72', 'Psalms 73–89', 'Psalms 90–106', 'Psalms 107–150', 'Psalms 130:1',
'Psalms 63:2-4', 'Psalms 42:2-4']}
assert WebPage.add_or_update_from_linker(linker_data) == "excluded"


def test_page_wout_description(create_web_page_wout_desc):
    """Pages saved without a description update normally, and are excluded
    only when the payload carries no new data (same refs/title/description)."""
    result, webpage, data = create_web_page_wout_desc["result"], create_web_page_wout_desc["webpage"], create_web_page_wout_desc["data"]
    assert result == "saved"

    # Changed refs count as new data -> the page is re-saved.
    # add_or_update_from_linker is a staticmethod; call it on the class for
    # consistency with the rest of this module.
    data["refs"] = ["Exodus 3:3"]
    assert WebPage.add_or_update_from_linker(data) == "saved"

    # Submitting byte-identical data again is a no-op -> "excluded".
    assert WebPage.add_or_update_from_linker(data) == "excluded"

27 changes: 15 additions & 12 deletions sefaria/model/webpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,12 @@ def add_or_update_from_linker(data):
data["url"] = WebPage.normalize_url(data["url"])
webpage = WebPage().load(data["url"])
data["refs"] = WebPage._normalize_refs(data["refs"]) # remove bad refs so pages with empty refs won't get saved
data["title"] = WebPage.clean_title(data["title"], getattr(webpage, "_site_data", {}), getattr(webpage, "site_name", ""))
data["description"] = WebPage.clean_description(data.get("description", ""))

if webpage:
existing = True
if data["title"] == webpage.title and data["description"] == webpage.description and set(data["refs"]) == set(webpage.refs):
if set(data["refs"]) == set(webpage.refs) and data["title"] == webpage.title and data["description"] == getattr(webpage, "description", ""):
return "excluded" # no new data
if data["title"] == "":
data["title"] = webpage.title # dont save an empty title if title exists
Expand Down Expand Up @@ -185,20 +187,21 @@ def client_contents(self):
return d

def clean_client_contents(self, d):
    """Scrub a client-facing contents dict in place and return it.

    Normalizes the "title" (stripping site branding via clean_title) and the
    "description" (via clean_description).
    NOTE(review): assumes d carries "_site_data" and "site_name" keys —
    confirm against client_contents, which builds d.
    """
    cleaned_title = self.clean_title(d["title"], d["_site_data"], d["site_name"])
    cleaned_description = WebPage.clean_description(d["description"])
    d["title"] = cleaned_title
    d["description"] = cleaned_description
    return d

def clean_title(self):
if not self._site_data:
return self.title
title = str(self.title)
@staticmethod
def clean_title(title, site_data, site_name):
if site_data == {}:
return title
title = str(title)
title = title.replace("&", "&")
brands = [self.site_name] + self._site_data.get("title_branding", [])
brands = [site_name] + site_data.get("title_branding", [])
separators = [("-", ' '), ("|", ' '), ("—", ' '), ("–", ' '), ("»", ' '), ("•", ' '), (":", ''), ("⋆", ' ')]
for separator, padding in separators:
for brand in brands:
if self._site_data.get("initial_title_branding", False):
if site_data.get("initial_title_branding", False):
brand_str = f"{brand}{padding}{separator} "
if title.startswith(brand_str):
title = title[len(brand_str):]
Expand All @@ -207,10 +210,10 @@ def clean_title(self):
if title.endswith(brand_str):
title = title[:-len(brand_str)]

return title if len(title) else self._site_data["name"]
return title

def clean_description(self):
description = getattr(self, "description", "")
@staticmethod
def clean_description(description):
for uhoh_string in ["*/", "*******"]:
if description.find(uhoh_string) != -1:
return None
Expand Down

0 comments on commit 631c17c

Please sign in to comment.