Merge pull request #29 from robot-lab/Refactoring
Refactoring
Showing 10 changed files with 499 additions and 7 deletions.
Binary file not shown.
Empty file.
@@ -0,0 +1,12 @@
import os

import json

if __package__:
    # backwards compatibility: keep the old KSRF names importable
    from web_crawler.models.DatabaseWrapper import \
        DatabaseWrapper as KSRFDatabaseWrapper
    from web_crawler.models.KSRFSource import *
    from web_crawler.models.LocalFileStorageSource import *
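A minimal sketch of what this shim buys downstream code, assuming the module is the package initializer for web_crawler.models (the path is inferred from the import lines above; the consumer code is hypothetical):

    # Hypothetical consumer: the refactored class stays reachable
    # under its pre-refactoring alias via the re-export above.
    from web_crawler.models import KSRFDatabaseWrapper

    assert KSRFDatabaseWrapper.__name__ == 'DatabaseWrapper'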
@@ -0,0 +1,147 @@
import os

import json

if __package__:
    from web_crawler.web_crawler import DataSource, \
        DataType, DataSourceType


class DatabaseWrapper(DataSource):
    source = None
    DOCUMENTS = 'Documents'
    LINKS = 'Links'
    DOCUMENT_FIELDS = ['supertype', 'doc_type', 'title',
                       'release_date', 'text_source_url',
                       'effective_date', 'absolute_path',
                       'interredaction_id', 'cons_selected_info']
    LINK_FIELDS = ['doc_id_from', 'doc_id_to', 'positions_list',
                   'citations_number']

    def __init__(self, name, dataSource):
        super().__init__(name, DataSourceType.DATABASE)
        self.source = dataSource

    def prepare(self):
        return True

    def _request_fields(self, retDict, fieldNames, modelName, doc_id):
        # Fill retDict with the requested fields of one stored document.
        for fieldName in fieldNames:
            retDict[fieldName] = self.source.\
                get_data(fieldName,
                         model_name=modelName,
                         doc_id=doc_id)

    def _prepare_data(self, data, fieldsNames):
        # Serialize JSON-valued fields and derive citations_number
        # from the length of positions_list.
        fieldName = 'positions_list'
        if fieldName in fieldsNames and fieldName in data:
            data['citations_number'] = len(data[fieldName])
            data[fieldName] = [json.dumps(position)
                               for position in data[fieldName]]

        fieldName = 'cons_selected_info'
        if fieldName in fieldsNames and fieldName in data:
            data[fieldName] = json.dumps(data[fieldName])

        return data

    def _create_data(self, dataDict, fieldNames, modelName, **requireKwargs):
        data = dict()
        for fieldName in fieldNames:
            if fieldName in dataDict:
                data[fieldName] = dataDict[fieldName]
        data = self._prepare_data(data, fieldNames)
        self.source.create_data(model_name=modelName, **data,
                                **requireKwargs)

    def _edit_data(self, dataDict, fieldNames, modelName, **requireKwargs):
        data = dict()
        for fieldName in fieldNames:
            if fieldName in dataDict:
                data[fieldName] = dataDict[fieldName]
        data = self._prepare_data(data, fieldNames)
        self.source.edit_data(data, model_name=modelName, **requireKwargs)

    def get_data(self, dataId, dataType):
        if dataType == DataType.DOCUMENT_HEADER:
            model_name = self.DOCUMENTS
            header = dict()
            self._request_fields(header, self.DOCUMENT_FIELDS,
                                 model_name, dataId)
            return header
        if dataType == DataType.DOCUMENT_TEXT:
            text = self.source.get_data('text',
                                        model_name=self.DOCUMENTS,
                                        doc_id=dataId)
            return text

        raise ValueError('Not supported data type')

    def get_all_data(self, dataType):
        uids = self.source.get_all_data('doc_id',
                                        model_name=self.DOCUMENTS)
        if (dataType == DataType.DOCUMENT_HEADER or
                dataType == DataType.DOCUMENT_TEXT):
            ret = {}
            for uid in uids:
                ret[uid] = self.get_data(uid, dataType)
            return ret

        raise ValueError('Not supported data type')

    def put_data(self, docId, data, dataType):
        # Create the record if its key is unknown, otherwise edit it.
        if dataType == DataType.DOCUMENT_HEADER:
            modelName = self.DOCUMENTS
            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
                                     doc_id=docId) is None):
                self._create_data(data, self.DOCUMENT_FIELDS,
                                  modelName, doc_id=docId)
            else:
                self._edit_data(data, self.DOCUMENT_FIELDS, modelName,
                                doc_id=docId)
            return

        if dataType == DataType.LINK:
            modelName = self.LINKS
            doc_id_from = data['doc_id_from']
            doc_id_to = data['doc_id_to']
            if (self.source.get_data('doc_id_from', model_name=self.LINKS,
                                     doc_id_from=doc_id_from,
                                     doc_id_to=doc_id_to) is None):
                self._create_data(data, self.LINK_FIELDS, modelName)
            else:
                self._edit_data(data, self.LINK_FIELDS, modelName,
                                doc_id_from=doc_id_from,
                                doc_id_to=doc_id_to)
            return

        if dataType == DataType.DOCUMENT_TEXT:
            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
                                     doc_id=docId) is None):
                self.source.create_data(model_name=self.DOCUMENTS,
                                        doc_id=docId,
                                        text=data)
            else:
                self.source.edit_data({'text': data},
                                      model_name=self.DOCUMENTS,
                                      doc_id=docId)
            return

        raise ValueError('Not supported data type')

    def put_data_collection(self, dataCollection, dataType):
        if (dataType == DataType.DOCUMENT_HEADER or
                dataType == DataType.DOCUMENT_TEXT):
            for uid in dataCollection:
                self.put_data(uid, dataCollection[uid], dataType)
            return
        if dataType == DataType.LINK:
            for link in dataCollection:
                self.put_data('', link, DataType.LINK)
            return
        raise ValueError('Not supported data type')
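The backend object passed in as dataSource is never defined in this diff; judging from the calls above, it must expose get_data, get_all_data, create_data, and edit_data, each taking model_name plus key fields as keyword arguments. A runnable sketch against an in-memory stand-in (everything below except DatabaseWrapper and DataType is invented for illustration and mirrors the inferred contract, not a real API):

    # In-memory stand-in for the ORM-like backend DatabaseWrapper drives.
    class InMemorySource:
        def __init__(self):
            self.tables = {'Documents': {}, 'Links': {}}

        def get_data(self, field_name, model_name, doc_id=None, **keys):
            row = self.tables[model_name].get(doc_id)
            return None if row is None else row.get(field_name)

        def get_all_data(self, field_name, model_name):
            return list(self.tables[model_name].keys())

        def create_data(self, model_name, doc_id=None, **fields):
            self.tables[model_name][doc_id] = dict(fields, doc_id=doc_id)

        def edit_data(self, data, model_name, doc_id=None, **keys):
            self.tables[model_name][doc_id].update(data)

    db = DatabaseWrapper('wrapper', InMemorySource())
    db.put_data('ksrf/1-p/2017', {'title': 'Decision 1-P'},
                DataType.DOCUMENT_HEADER)
    db.put_data('ksrf/1-p/2017', 'full decision text',
                DataType.DOCUMENT_TEXT)
    print(db.get_data('ksrf/1-p/2017', DataType.DOCUMENT_TEXT))

The second put_data takes the edit branch because the header put already created the row, which exercises both sides of the create-or-edit logic.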
@@ -0,0 +1,106 @@
import os

import json

if __package__:
    from web_crawler.web_crawler import DataSource, \
        DataType, DataSourceType
    from web_crawler.ksrf import *
else:
    from web_crawler import DataSource, \
        DataType, DataSourceType
    from ksrf import *


class KSRFSource(DataSource):
    _temp_folder = 'ksrf_temp_folder'
    _decision_urls = dict()
    _database_source = None

    def __init__(self):
        super().__init__('KSRFSource', DataSourceType.WEB_SOURCE)

    def set_database(self, database):
        '''
        Set the given database data source for this data source.
        '''
        self._database_source = database

    def prepare(self):
        '''
        Try to prepare the data source for work.
        Return True if all is ok, False if preparing failed.
        '''
        try:
            # TODO repair site available check
            # res = ping(KSRF_PAGE_URI)
            # if (not res):
            #     return False
            headersFromBase = self._database_source.get_all_data(
                DataType.DOCUMENT_HEADER)

            if (headersFromBase is None or len(headersFromBase) == 0):
                headers = get_decision_headers()
                self._database_source.\
                    put_data_collection(headers,
                                        DataType.DOCUMENT_HEADER)
            else:
                headers = headersFromBase

            self._decision_urls = {}
            for dataId in headers:
                elem = headers[dataId]
                self._decision_urls[dataId] = elem['text_source_url']
            return True
        except Exception as e:
            print(e)
            return False

    def get_data(self, dataId: str, dataType: DataType):
        '''
        Get data by the given id and dataType and return it.
        If there is no such data, return None.
        --
        Only DataType.DOCUMENT_TEXT is supported.
        '''
        if (not isinstance(dataType, DataType)):
            raise TypeError('dataType isn\'t an instance of DataType')
        if dataType == DataType.DOCUMENT_TEXT:
            text = self._database_source.get_data(dataId, dataType)
            if (text is None):
                # Cache misses fall through to the web and are stored back.
                text = download_text(self._decision_urls[dataId],
                                     dataId, self._temp_folder,
                                     needReturnText=True)
                self._database_source.put_data(dataId, text, dataType)
            return text
        raise ValueError("data type is not supported")

    def get_all_data(self, dataType: DataType, needReload=False):
        '''
        Get a dict of all data of the given type.
        Supported data types:
            DataType.DOCUMENT_HEADER
            DataType.DOCUMENT_TEXT
        '''
        if (not isinstance(dataType, DataType)):
            raise TypeError('dataType isn\'t an instance of DataType')

        if (dataType == DataType.DOCUMENT_HEADER):
            if (needReload):
                headers = get_decision_headers()
                self._database_source.\
                    put_data_collection(headers, DataType.DOCUMENT_HEADER)
            else:
                headers = self._database_source.get_all_data(
                    DataType.DOCUMENT_HEADER)
                if (headers is None or len(headers) == 0):
                    headers = get_decision_headers()
                    self._database_source.\
                        put_data_collection(headers,
                                            DataType.DOCUMENT_HEADER)
            return headers

        if (dataType == DataType.DOCUMENT_TEXT):
            return {dataId: self.get_data(dataId,
                                          DataType.DOCUMENT_TEXT)
                    for dataId in self._decision_urls}
        raise ValueError("data type is not supported")
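How the two sources compose is only implied by the diff; a hedged wiring sketch, reusing the InMemorySource stub from above and assuming get_decision_headers and download_text from the ksrf module are importable and the court site is reachable:

    # Hypothetical wiring: the database acts as a cache in front of the
    # web source; prepare() crawls the site only when the cache is empty.
    source = KSRFSource()
    source.set_database(DatabaseWrapper('cache', InMemorySource()))

    if source.prepare():
        headers = source.get_all_data(DataType.DOCUMENT_HEADER)
        some_id = next(iter(headers))
        text = source.get_data(some_id, DataType.DOCUMENT_TEXT)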