
Commit

Merge pull request #29 from robot-lab/Refactoring
Refactoring
Rexarrior committed Dec 3, 2018
2 parents 310d919 + 61768e2 commit dd3b8c9
Showing 10 changed files with 499 additions and 7 deletions.
Binary file modified dist/web_crawler-0.1-py3-none-any.whl
Binary file not shown.
Empty file modified web_crawler/Selenium/chromedriver.exe
100644 → 100755
37 changes: 32 additions & 5 deletions web_crawler/__init__.py
@@ -1,17 +1,18 @@
 __version__ = '0.1'
 
 if __package__:
-    from web_crawler import ksrf
+    from web_crawler import ksrf_models, tools
     from web_crawler.web_crawler import WebCrawler
     from web_crawler.web_crawler\
         import DataSource, DataSourceType, DataType
+    from web_crawler.models.DatabaseWrapper import DatabaseWrapper
 else:
-    import ksrf
+    import ksrf_models
    from web_crawler import WebCrawler
     from web_crawler\
         import DataSource, DataSourceType, DataType
-Local_database_source = ksrf.LocalFileStorageSource()
-KSRF_Source = ksrf.KSRFSource()
+Local_database_source = ksrf_models.LocalFileStorageSource()
+KSRF_Source = ksrf_models.KSRFSource()
 Crawler = WebCrawler([Local_database_source, KSRF_Source])


@@ -23,8 +24,34 @@ def Init(sourceNameList=None, databaseSource=None):
     '''
     global Crawler
     Crawler.prepare_sources(sourceNameList, databaseSource)
 
+def Init_by_data_model(sourceNameList=None, databaseSource=None):
+    '''
+    Initialize web_crawler for working.
+    Should be invoked before any actions with
+    Crawler.
+    '''
+    global Crawler
+    Crawler.prepare_sources(
+        databaseSource=DatabaseWrapper('DatabaseSource', databaseSource))
+
+def Init_by_KSRF_wrapper(dataModels):
+    '''
+    Initialize web_crawler for working.
+    Should be invoked before any actions with
+    Crawler.
+    Activates the KSRF_Source and KSRF database sources.
+    '''
+    wrapper_name = 'KSRFDatabase'
+    wrapper = ksrf_models.KSRFDatabaseWrapper(wrapper_name,
+                                              dataModels)
+    Crawler.collected_sources[wrapper_name] = wrapper
+    Crawler.prepare_sources([wrapper_name, 'KSRFSource'], wrapper)
+
+
 # Local_database_source.folder_path = 'D:\\programming\\Judyst\\files'
 # Local_database_source.prepare()
 # Init(databaseSource=Local_database_source)
+
-__all__ = ['Crawler', 'DataSourceType', 'DataType', 'Init']
+__all__ = ['Crawler', 'DataSourceType', 'DataType',
+           'Init', 'Init_by_data_model', 'DatabaseWrapper']
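For orientation, a minimal usage sketch of the new entry points follows (not part of this commit). The InMemoryDataModels stub is hypothetical; it only implements the four methods that DatabaseWrapper calls on its backing source (get_data, get_all_data, create_data, edit_data), and Init_by_KSRF_wrapper will still reach out to ksrf.ru when the backing store has no headers.

import web_crawler


class InMemoryDataModels:
    """Hypothetical in-memory stand-in for the project's database layer."""

    def __init__(self):
        self.documents = {}  # doc_id -> {field: value}

    def get_data(self, field_name, model_name=None, **keys):
        doc = self.documents.get(keys.get('doc_id'))
        return None if doc is None else doc.get(field_name)

    def get_all_data(self, field_name, model_name=None):
        return list(self.documents.keys())

    def create_data(self, model_name=None, doc_id=None, **fields):
        self.documents[doc_id] = dict(fields)

    def edit_data(self, data, model_name=None, doc_id=None, **keys):
        self.documents.setdefault(doc_id, {}).update(data)


# Old-style initialisation, as in the commented-out lines above:
# web_crawler.Init(databaseSource=web_crawler.Local_database_source)
# New-style initialisation through the KSRF database wrapper:
web_crawler.Init_by_KSRF_wrapper(InMemoryDataModels())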
12 changes: 10 additions & 2 deletions web_crawler/ksrf.py
@@ -25,8 +25,16 @@
 else:
     from web_crawler import DataSource, DataSourceType, DataType
 
-PATH_TO_CHROME_WEB_DRIVER = os.path.join(
-    os.path.dirname(__file__), 'Selenium', 'chromedriver.exe')
+
+if system_name().lower() == 'windows':
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver.exe')
+else:
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver')
+
+
 
 KSRF_PAGE_URI = 'http://www.ksrf.ru/ru/Decision/Pages/default.aspx'


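The change above only picks the right chromedriver binary for the host OS (system_name is presumably platform.system imported under that alias elsewhere in ksrf.py). Downstream the path is normally handed to Selenium; a hedged sketch using the Selenium 3 API of the time, not taken from this diff:

import os
from platform import system as system_name

from selenium import webdriver  # Selenium 3.x style; executable_path is deprecated in 4.x

driver_name = 'chromedriver.exe' if system_name().lower() == 'windows' else 'chromedriver'
PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__), 'Selenium', driver_name)

# Hypothetical consumer: a typical way to start the browser from this constant
# (whether ksrf.py does exactly this is not shown in the diff).
driver = webdriver.Chrome(executable_path=PATH_TO_CHROME_WEB_DRIVER)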
12 changes: 12 additions & 0 deletions web_crawler/ksrf_models.py
@@ -0,0 +1,12 @@
import os

import json
if __package__:
    # backward compatibility
from web_crawler.models.DatabaseWrapper import DatabaseWrapper as KSRFDatabaseWrapper
from web_crawler.models.KSRFSource import *
from web_crawler.models.LocalFileStorageSource import *
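ksrf_models.py is only a re-export shim kept for backward compatibility: code that used to import ksrf_models keeps working while the classes now live under web_crawler.models. A small illustration (assumes the package is importable; not part of the commit):

from web_crawler import ksrf_models
from web_crawler.models.KSRFSource import KSRFSource
from web_crawler.models.DatabaseWrapper import DatabaseWrapper

# The shim re-exports the moved classes, so both import paths resolve to the same objects.
assert ksrf_models.KSRFSource is KSRFSource
assert ksrf_models.KSRFDatabaseWrapper is DatabaseWrapper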




147 changes: 147 additions & 0 deletions web_crawler/models/DatabaseWrapper.py
@@ -0,0 +1,147 @@
import os

import json
if __package__:
from web_crawler.web_crawler import DataSource,\
DataType, DataSourceType


class DatabaseWrapper(DataSource):
source = None
DOCUMENTS = 'Documents'
LINKS = 'Links'
DOCUMENT_FIELDS = ['supertype', 'doc_type','title',
'release_date' , 'text_source_url',
'effective_date','absolute_path',
'interredaction_id', 'cons_selected_info']
LINK_FIELDS = ['doc_id_from', 'doc_id_to', 'positions_list', 'citations_number']
def __init__(self, name, dataSource):
super().__init__(name, DataSourceType.DATABASE)
self.source = dataSource

def prepare(self):
return True

def _request_fields(self, retDict, fieldNames, modelName, doc_id):
for fieldName in fieldNames:
retDict[fieldName] = self.source.\
get_data(fieldName,
model_name=modelName,
doc_id=doc_id)


def _prepare_data(self, data, fieldsNames):
fieldName = 'positions_list'
if fieldName in fieldsNames and fieldName in data.keys():
data['citations_number'] = len(data[fieldName])
            data[fieldName] = [json.dumps(item) for item in data[fieldName]]

fieldName = 'cons_selected_info'
if fieldName in fieldsNames and fieldName in data.keys():
data[fieldName] = json.dumps(data[fieldName])

return data


def _create_data(self, dataDict, fieldNames, modelName, **requireKwargs):
data = dict()
for fieldName in fieldNames:
if fieldName in dataDict.keys():
data[fieldName] = dataDict[fieldName]
data = self._prepare_data(data, fieldNames)
self.source.create_data(model_name=modelName, **data,
**requireKwargs)


def _edit_data(self, dataDict, fieldNames, modelName, **requireKwargs):
data = dict()
for fieldName in fieldNames:
if fieldName in dataDict.keys():
data[fieldName] = dataDict[fieldName]
data = self._prepare_data(data, fieldNames)
self.source.edit_data(data, model_name=modelName, **requireKwargs)


def get_data(self, dataId, dataType):
if dataType == DataType.DOCUMENT_HEADER:
model_name = self.DOCUMENTS
header = dict()
self._request_fields(header, self.DOCUMENT_FIELDS,
model_name, dataId)
return header
if dataType == DataType.DOCUMENT_TEXT:
text = self.source.get_data('text',
model_name=self.DOCUMENTS,
doc_id=dataId)
return text

raise ValueError('Not supported data type')


def get_all_data(self, dataType):
uids = self.source.get_all_data('doc_id',
model_name=self.DOCUMENTS,
)
if (dataType == DataType.DOCUMENT_HEADER or
dataType == DataType.DOCUMENT_TEXT):
ret = {}
for uid in uids:
ret[uid] = self.get_data(uid, dataType)
return ret

raise ValueError('Not supported data type')


def put_data(self, docId, data, dataType):
if dataType == DataType.DOCUMENT_HEADER:
modelName = self.DOCUMENTS
if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
doc_id=docId) is None):
self._create_data(data, self.DOCUMENT_FIELDS,
modelName, doc_id=docId)
else:
self._edit_data(data, self.DOCUMENT_FIELDS, modelName, doc_id=docId)
return

if dataType == DataType.LINK:
modelName = self.LINKS
doc_id_from = data['doc_id_from']
doc_id_to = data['doc_id_to']
if (self.source.get_data('doc_id_from', model_name=self.LINKS,
doc_id_from=doc_id_from,
doc_id_to=doc_id_to) is None):
self._create_data(data, self.LINK_FIELDS, modelName)
else:
self._edit_data(data, self.LINK_FIELDS, modelName,
doc_id_from=doc_id_from,
doc_id_to=doc_id_to)

return

if dataType == DataType.DOCUMENT_TEXT:
            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
                                     doc_id=docId) is None):
self.source.create_data(model_name=self.DOCUMENTS,
doc_id=docId,
text=data)
else:
self.source.edit_data({'text':data},
model_name=self.DOCUMENTS,
doc_id=docId
)
return

raise ValueError('Not supported data type')

def put_data_collection(self, dataCollection, dataType):
if (dataType == DataType.DOCUMENT_HEADER or
dataType == DataType.DOCUMENT_TEXT):
for uid in dataCollection:
self.put_data(uid, dataCollection[uid], dataType)
return
if dataType == DataType.LINK:
for link in dataCollection:
self.put_data('', link, DataType.LINK)
return
raise ValueError('Not supported data type')
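A standalone illustration (not from the commit; the identifiers are made up) of the flattening that _prepare_data applies to a link record before it is handed to the backend: each positions_list entry is serialised to a JSON string and citations_number is derived from their count.

import json

link = {
    'doc_id_from': 'KSRFDecision_1',  # hypothetical document ids
    'doc_id_to': 'KSRFDecision_2',
    'positions_list': [{'start': 10, 'end': 42}, {'start': 80, 'end': 95}],
}

# Mirrors DatabaseWrapper._prepare_data for the 'positions_list' field.
link['citations_number'] = len(link['positions_list'])
link['positions_list'] = [json.dumps(p) for p in link['positions_list']]

assert link['citations_number'] == 2
assert all(isinstance(p, str) for p in link['positions_list'])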

106 changes: 106 additions & 0 deletions web_crawler/models/KSRFSource.py
@@ -0,0 +1,106 @@
import os

import json
if __package__:
from web_crawler.web_crawler import DataSource,\
DataType, DataSourceType
from web_crawler.ksrf import *
else:
from web_crawler import DataSource,\
DataType, DataSourceType
from ksrf import *


class KSRFSource(DataSource):
_temp_folder = 'ksrf_temp_folder'
_decision_urls = dict()
_database_source = None

def __init__(self):
super().__init__('KSRFSource', DataSourceType.WEB_SOURCE)

def set_database(self, database):
'''
        Set the given database data source as this source's backing database.
'''
self._database_source = database

def prepare(self):
'''
        Tries to prepare the data source for work and
        returns False if preparing has failed.
        Returns True if all is OK.
'''
try:
# TODO repair site available check
# res = ping(KSRF_PAGE_URI)
# if (not res):
# return False
headersFromBase = self._database_source.get_all_data(
DataType.DOCUMENT_HEADER)

if (headersFromBase is None or len(headersFromBase) == 0):
headers = get_decision_headers()
self._database_source.\
put_data_collection(headers,
DataType.DOCUMENT_HEADER)
else:
headers = headersFromBase

self._decision_urls = {}
for dataId in headers:
elem = headers[dataId]
self._decision_urls[dataId] = elem['text_source_url']
return True
except Exception as e:
print(e)
return False

def get_data(self, dataId: str, dataType: DataType):
'''
        Gets data by the given id and dataType and returns it.
If there is no such data, it returns None.
--
Only DataType.DOCUMENT_TEXT is supported.
'''
if (not isinstance(dataType, DataType)):
raise TypeError('dataType isn\'t instance of DataType')
if dataType == DataType.DOCUMENT_TEXT:
text = self._database_source.get_data(dataId, dataType)
if (text is None):
text = download_text(self._decision_urls[dataId],
dataId, self._temp_folder,
needReturnText=True)
self._database_source.put_data(dataId, text, dataType)
return text
raise ValueError("data type is not supported")

def get_all_data(self, dataType: DataType, needReload=False):
'''
        Gets a dict of all data of the given type.
Supported data types:
DataType.DOCUMENT_HEADER
DataType.DOCUMENT_TEXT
'''
if (not isinstance(dataType, DataType)):
raise TypeError('dataType isn\'t instance of DataType')

if (dataType == DataType.DOCUMENT_HEADER):
if (needReload):
headers = get_decision_headers()
self._database_source.\
put_data_collection(headers, DataType.DOCUMENT_HEADER)
else:
headers = self._database_source.get_all_data(
DataType.DOCUMENT_HEADER)
if (headers is None or len(headers) == 0):
headers = get_decision_headers()
self._database_source.\
put_data_collection(headers, DataType.DOCUMENT_HEADER)
return headers

if (dataType == DataType.DOCUMENT_TEXT):
            return {dataId: self.get_data(dataId,
                                          DataType.DOCUMENT_TEXT)
                    for dataId in self._decision_urls}
raise ValueError("data type is not supported")