Merge pull request #26 from robot-lab/parsingCodexes
Parsing codexes
navolotsky committed Nov 23, 2018
2 parents c04fda9 + 504969a commit 2e04323
Showing 8 changed files with 1,076 additions and 9 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
CODES/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Binary file modified dist/web_crawler-0.1-py3-none-any.whl
Binary file modified dist/web_crawler-0.1.tar.gz
2 changes: 1 addition & 1 deletion web_crawler/__init__.py
@@ -17,7 +17,7 @@

def Init(sourceNameList=None, databaseSource=None):
'''
Initialize web_crawler for working.
Initialize web_crawler for working.
Should be invoked before any actions with
Crawler
'''
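The docstring above says Init must run before the crawler is used; a minimal usage sketch, assuming only the signature shown in this hunk (the source name and database argument below are hypothetical placeholders, not values from this commit):

import web_crawler

# Hypothetical arguments: real source names and a database handle
# would come from the calling application.
web_crawler.Init(sourceNameList=['КСРФ'], databaseSource=None)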
10 changes: 5 additions & 5 deletions web_crawler/ksrf.py
@@ -25,8 +25,8 @@
else:
from web_crawler import DataSource, DataSourceType, DataType

PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
'Selenium','chromedriver.exe')
PATH_TO_CHROME_WEB_DRIVER = os.path.join(
os.path.dirname(__file__), 'Selenium', 'chromedriver.exe')
KSRF_PAGE_URI = 'http://www.ksrf.ru/ru/Decision/Pages/default.aspx'
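PATH_TO_CHROME_WEB_DRIVER is presumably handed to Selenium when the Chrome driver is started; the actual setup lives in a part of ksrf.py not shown in this hunk. A sketch of how the two constants might be consumed, assuming the Selenium 3 executable_path API:

from selenium import webdriver

# Assumed usage: start Chrome from the bundled chromedriver.exe and
# open the KSRF decisions page, using the constants defined above.
driver = webdriver.Chrome(executable_path=PATH_TO_CHROME_WEB_DRIVER)
driver.get(KSRF_PAGE_URI)
driver.quit()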


@@ -104,7 +104,7 @@ def get_decision_headers(pagesNumber=None, sourcePrefix='КСРФ'):
decisionID = sourcePrefix + '/' + key
docType = sourcePrefix + '/' + typePattern.search(key)[0]
date = d[0].text_content()
title = d[1].text_content()
title = d[1].text_content().strip()
url = d[2].getchildren()[0].get('href')
headerElements = {'supertype': sourcePrefix,
'release_date': date, 'doc_type': docType,
@@ -135,7 +135,7 @@ def get_decision_headers(pagesNumber=None, sourcePrefix='КСРФ'):
page = html.document_fromstring(get_page_html_by_num(
driver, template, i))
if True: # debug print:
print(f"Pages downloaded: {i-1}/{pagesNumber}")
print(f"Pages downloaded: {i-1}/{pagesNumber}", end='\r')
driver.quit()
return courtSiteContent
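The end='\r' added to the debug print keeps the progress counter on a single console line instead of scrolling. A self-contained sketch of the same pattern (the page count and sleep are arbitrary stand-ins):

import time

pagesNumber = 5  # arbitrary example value
for i in range(2, pagesNumber + 1):
    time.sleep(0.1)  # stands in for downloading one page
    print(f"Pages downloaded: {i-1}/{pagesNumber}", end='\r')
print()  # move to a fresh line once the loop finishes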

@@ -409,7 +409,7 @@ def put_data_collection(self, dataDict, dataType: DataType):
with open(os.path.join(self.folder_path,
self.HEADERS_FILE_NAME),
'wt', encoding='utf-8') as headersFile:
headersFile.write(json.dumps(self.headers))
headersFile.write(json.dumps(self.headers, ensure_ascii=False))

if __name__ == '__main__':
headersOld = get_decision_headers()
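The ensure_ascii=False change matters because the headers contain Cyrillic identifiers; by default json.dumps escapes every non-ASCII character in the output file. A small demonstration with a made-up header entry (the key below is illustrative, not taken from this commit):

import json

headers = {'КСРФ/31-П/2018': {'supertype': 'КСРФ'}}  # made-up entry
print(json.dumps(headers))                      # escaped: "\u041a\u0421\u0420\u0424..."
print(json.dumps(headers, ensure_ascii=False))  # readable Cyrillic in the written file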
