Skip to content

Commit

Permalink
🐛 small bugfixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
redadmiral committed Feb 23, 2019
1 parent 91f6d78 commit 483c0e8
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions scraper.py
Expand Up @@ -3,9 +3,11 @@

import scraperwiki
import requests_html
import lxml.html
import re
import json
import locale
import datetime


# Read in a page

Expand All @@ -17,25 +19,29 @@
r = session.get(site_url)
r.html.render(sleep = 5)

chronic = r.html.find("#chronic")
articles = chronic.find("article")
articles = r.html.find("article")
#articles = chronic.find("article")

# Patterns for recognising the textual form of an entry's date.
# NOTE(review): these are not raw strings; "\.", "\w" and "\ " only work
# because Python leaves unknown escape sequences untouched. Prefer r"..." here.
# Digits and dots only (the pattern also matches the empty string).
singledate = re.compile("[0-9.]*")
# Two digit/dot runs separated by a hyphen, i.e. "<start>-<end>".
timespan = re.compile("[0-9\.]*-[0-9\.]*")
# Word characters, one space, then a four-digit number.
month_year = re.compile("\w*\ [0-9]{4}")
# One capital letter followed by lowercase letters.
month = re.compile("[A-Z][a-z]*")

# Year remembered from the most recently parsed start date; the loop below
# overwrites it with startDate.year.
# NOTE(review): this text comes from a commit diff view, so the two
# assignments are presumably the removed and the added version of one line.
# As written, the second assignment wins. Confirm against the real file.
last_year = 0
last_year = 2000

locale.setlocale(locale.LC_ALL, "de_DE.UTF-8") # German locale, for handling German month names

# Placeholders for the ISO-formatted dates of the current entry; the loop
# below assigns them when an entry's date text is parsed.
endDate = ""
startDate = ""

for article in articles:

## Parse dates
date = article.find(".chronic__entry__date")
if singledate.fullmatch(date[0].text):
if timespan.match(date[0].text):
startDate = datetime.datetime.strptime(date[0].text.split("-")[0], "%d.%m.%Y")
print(last_year)
last_year = startDate.year
startDate = startDate.isoformat()
endDate = datetime.datetime.strptime(date[0].text.split("-")[1], "%d.%m.%Y").isoformat()
Expand All @@ -57,7 +63,6 @@

## Parse content
content = article.find("div.chronic__entry__content-wrapper > div > p")[0].text
print(content)

## Parse source
source_primary = "EZRA Chronik"
Expand All @@ -77,8 +82,8 @@
unique_keys=["uri"],
data={
"sources": json.dumps(
{"name": source_primary, "date": "", "url": source_uri_primary},
{"name": source_secondary, "date": "", "url": source_uri_secondary}
[{"name": source_primary, "date": "", "url": source_uri_primary},
{"name": source_secondary, "date": "", "url": source_uri_secondary}]
),
"description": content,
"startDate": startDate,
Expand Down

0 comments on commit 483c0e8

Please sign in to comment.