Refactoring #29

Merged
merged 9 commits on Dec 3, 2018
Binary file modified dist/web_crawler-0.1-py3-none-any.whl
Binary file not shown.
Empty file modified web_crawler/Selenium/chromedriver.exe
100644 → 100755
Empty file.
37 changes: 32 additions & 5 deletions web_crawler/__init__.py
@@ -1,17 +1,18 @@
 __version__ = '0.1'

 if __package__:
-    from web_crawler import ksrf
+    from web_crawler import ksrf_models, tools
     from web_crawler.web_crawler import WebCrawler
     from web_crawler.web_crawler\
         import DataSource, DataSourceType, DataType
+    from web_crawler.models.DatabaseWrapper import DatabaseWrapper
 else:
-    import ksrf
+    import ksrf_models
     from web_crawler import WebCrawler
     from web_crawler\
         import DataSource, DataSourceType, DataType
-Local_database_source = ksrf.LocalFileStorageSource()
-KSRF_Source = ksrf.KSRFSource()
+Local_database_source = ksrf_models.LocalFileStorageSource()
+KSRF_Source = ksrf_models.KSRFSource()
 Crawler = WebCrawler([Local_database_source, KSRF_Source])


@@ -23,8 +24,34 @@ def Init(sourceNameList=None, databaseSource=None):
     '''
     global Crawler
     Crawler.prepare_sources(sourceNameList, databaseSource)

+
+def Init_by_data_model(sourceNameList=None, databaseSource=None):
+    '''
+    Initialize web_crawler for work.
+    Should be invoked before any actions with
+    Crawler.
+    '''
+    global Crawler
+    Crawler.prepare_sources(
+        databaseSource=DatabaseWrapper('DatabaseSource', databaseSource))
+
+
+def Init_by_KSRF_wrapper(dataModels):
+    '''
+    Initialize web_crawler for work.
+    Should be invoked before any actions with
+    Crawler.
+    Activates the KSRF_Source and KSRF_database sources.
+    '''
+    wrapper_name = 'KSRFDatabase'
+    wrapper = ksrf_models.KSRFDatabaseWrapper(wrapper_name,
+                                              dataModels)
+    Crawler.collected_sources[wrapper_name] = wrapper
+    Crawler.prepare_sources([wrapper_name, 'KSRFSource'], wrapper)
+
+
+# Local_database_source.folder_path = 'D:\\programming\\Judyst\\files'
+# Local_database_source.prepare()
+# Init(databaseSource=Local_database_source)
+
-__all__ = ['Crawler', 'DataSourceType', 'DataType', 'Init']
+__all__ = ['Crawler', 'DataSourceType', 'DataType',
+           'Init', 'Init_by_data_model', 'DatabaseWrapper']
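A minimal usage sketch of the new entry points (reviewer illustration, not part of this PR). Here my_backend is a hypothetical stand-in for whatever storage object the caller supplies; it only needs the get_data/create_data/edit_data/get_all_data methods that DatabaseWrapper delegates to.

import web_crawler

my_backend = ...  # hypothetical storage backend supplied by the caller

# Wrap the backend in a DatabaseWrapper and prepare the sources.
web_crawler.Init_by_data_model(databaseSource=my_backend)

# Or activate only the KSRF database wrapper plus the KSRF web source.
web_crawler.Init_by_KSRF_wrapper(my_backend)

# After initialization the module-level Crawler is ready to use.
crawler = web_crawler.Crawler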
12 changes: 10 additions & 2 deletions web_crawler/ksrf.py
@@ -25,8 +25,16 @@
 else:
     from web_crawler import DataSource, DataSourceType, DataType

-PATH_TO_CHROME_WEB_DRIVER = os.path.join(
-    os.path.dirname(__file__), 'Selenium', 'chromedriver.exe')
+if system_name().lower() == 'windows':
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver.exe')
+else:
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver')



KSRF_PAGE_URI = 'http://www.ksrf.ru/ru/Decision/Pages/default.aspx'
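The platform switch above as a standalone sketch; it assumes system_name is platform.system imported under that alias, which is how the surrounding module appears to use it.

import os
from platform import system as system_name  # assumed alias used in ksrf.py

# Pick the bundled chromedriver binary that matches the host OS.
driver_file = ('chromedriver.exe' if system_name().lower() == 'windows'
               else 'chromedriver')
PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
                                         'Selenium', driver_file)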


12 changes: 12 additions & 0 deletions web_crawler/ksrf_models.py
@@ -0,0 +1,12 @@
import os

import json

if __package__:
    # backward compatibility
    from web_crawler.models.DatabaseWrapper import DatabaseWrapper as KSRFDatabaseWrapper
    from web_crawler.models.KSRFSource import *
    from web_crawler.models.LocalFileStorageSource import *
147 changes: 147 additions & 0 deletions web_crawler/models/DatabaseWrapper.py
@@ -0,0 +1,147 @@
import os

import json

if __package__:
    from web_crawler.web_crawler import DataSource,\
        DataType, DataSourceType


class DatabaseWrapper(DataSource):
    source = None
    DOCUMENTS = 'Documents'
    LINKS = 'Links'
    DOCUMENT_FIELDS = ['supertype', 'doc_type', 'title',
                       'release_date', 'text_source_url',
                       'effective_date', 'absolute_path',
                       'interredaction_id', 'cons_selected_info']
    LINK_FIELDS = ['doc_id_from', 'doc_id_to', 'positions_list',
                   'citations_number']

    def __init__(self, name, dataSource):
        super().__init__(name, DataSourceType.DATABASE)
        self.source = dataSource

    def prepare(self):
        return True

    def _request_fields(self, retDict, fieldNames, modelName, doc_id):
        for fieldName in fieldNames:
            retDict[fieldName] = self.source.\
                get_data(fieldName,
                         model_name=modelName,
                         doc_id=doc_id)

    def _prepare_data(self, data, fieldsNames):
        fieldName = 'positions_list'
        if fieldName in fieldsNames and fieldName in data.keys():
            data['citations_number'] = len(data[fieldName])
            data[fieldName] = [json.dumps(data[fieldName][i])
                               for i in range(len(data[fieldName]))]

        fieldName = 'cons_selected_info'
        if fieldName in fieldsNames and fieldName in data.keys():
            data[fieldName] = json.dumps(data[fieldName])

        return data

    def _create_data(self, dataDict, fieldNames, modelName, **requireKwargs):
        data = dict()
        for fieldName in fieldNames:
            if fieldName in dataDict.keys():
                data[fieldName] = dataDict[fieldName]
        data = self._prepare_data(data, fieldNames)
        self.source.create_data(model_name=modelName, **data,
                                **requireKwargs)

    def _edit_data(self, dataDict, fieldNames, modelName, **requireKwargs):
        data = dict()
        for fieldName in fieldNames:
            if fieldName in dataDict.keys():
                data[fieldName] = dataDict[fieldName]
        data = self._prepare_data(data, fieldNames)
        self.source.edit_data(data, model_name=modelName, **requireKwargs)

    def get_data(self, dataId, dataType):
        if dataType == DataType.DOCUMENT_HEADER:
            model_name = self.DOCUMENTS
            header = dict()
            self._request_fields(header, self.DOCUMENT_FIELDS,
                                 model_name, dataId)
            return header
        if dataType == DataType.DOCUMENT_TEXT:
            text = self.source.get_data('text',
                                        model_name=self.DOCUMENTS,
                                        doc_id=dataId)
            return text

        raise ValueError('Not supported data type')

    def get_all_data(self, dataType):
        uids = self.source.get_all_data('doc_id',
                                        model_name=self.DOCUMENTS,
                                        )
        if (dataType == DataType.DOCUMENT_HEADER or
                dataType == DataType.DOCUMENT_TEXT):
            ret = {}
            for uid in uids:
                ret[uid] = self.get_data(uid, dataType)
            return ret

        raise ValueError('Not supported data type')

    def put_data(self, docId, data, dataType):
        if dataType == DataType.DOCUMENT_HEADER:
            modelName = self.DOCUMENTS
            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
                                     doc_id=docId) is None):
                self._create_data(data, self.DOCUMENT_FIELDS,
                                  modelName, doc_id=docId)
            else:
                self._edit_data(data, self.DOCUMENT_FIELDS, modelName,
                                doc_id=docId)
            return

        if dataType == DataType.LINK:
            modelName = self.LINKS
            doc_id_from = data['doc_id_from']
            doc_id_to = data['doc_id_to']
            if (self.source.get_data('doc_id_from', model_name=self.LINKS,
                                     doc_id_from=doc_id_from,
                                     doc_id_to=doc_id_to) is None):
                self._create_data(data, self.LINK_FIELDS, modelName)
            else:
                self._edit_data(data, self.LINK_FIELDS, modelName,
                                doc_id_from=doc_id_from,
                                doc_id_to=doc_id_to)

            return

        if dataType == DataType.DOCUMENT_TEXT:
            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
                                     doc_id=docId) is None):
                self.source.create_data(model_name=self.DOCUMENTS,
                                        doc_id=docId,
                                        text=data)
            else:
                self.source.edit_data({'text': data},
                                      model_name=self.DOCUMENTS,
                                      doc_id=docId
                                      )
            return

        raise ValueError('Not supported data type')

    def put_data_collection(self, dataCollection, dataType):
        if (dataType == DataType.DOCUMENT_HEADER or
                dataType == DataType.DOCUMENT_TEXT):
            for uid in dataCollection:
                self.put_data(uid, dataCollection[uid], dataType)
            return
        if dataType == DataType.LINK:
            for link in dataCollection:
                self.put_data('', link, DataType.LINK)
            return
        raise ValueError('Not supported data type')
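To see the round trip DatabaseWrapper performs, a tiny in-memory stand-in for the backend is enough. InMemoryBackend below is hypothetical (not part of this PR); its method names simply mirror the calls the wrapper makes on self.source.

from web_crawler.models.DatabaseWrapper import DatabaseWrapper
from web_crawler.web_crawler import DataType


class InMemoryBackend:
    '''Hypothetical stand-in for the real database layer.'''

    def __init__(self):
        self.docs = {}

    def get_data(self, field_name, model_name, **keys):
        doc = self.docs.get(keys.get('doc_id'))
        return None if doc is None else doc.get(field_name)

    def get_all_data(self, field_name, model_name):
        return list(self.docs.keys())

    def create_data(self, model_name, **fields):
        self.docs[fields['doc_id']] = dict(fields)

    def edit_data(self, data, model_name, **keys):
        self.docs[keys['doc_id']].update(data)


wrapper = DatabaseWrapper('DatabaseSource', InMemoryBackend())
wrapper.put_data('doc-1', {'title': 'Decision 1'}, DataType.DOCUMENT_HEADER)
header = wrapper.get_data('doc-1', DataType.DOCUMENT_HEADER)
print(header['title'])  # -> Decision 1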

106 changes: 106 additions & 0 deletions web_crawler/models/KSRFSource.py
@@ -0,0 +1,106 @@
import os

import json
if __package__:
    from web_crawler.web_crawler import DataSource,\
        DataType, DataSourceType
    from web_crawler.ksrf import *
else:
    from web_crawler import DataSource,\
        DataType, DataSourceType
    from ksrf import *


class KSRFSource(DataSource):
    _temp_folder = 'ksrf_temp_folder'
    _decision_urls = dict()
    _database_source = None

    def __init__(self):
        super().__init__('KSRFSource', DataSourceType.WEB_SOURCE)

    def set_database(self, database):
        '''
        Set the given database data source for this data source.
        '''
        self._database_source = database

    def prepare(self):
        '''
        Tries to prepare the data source for work and
        returns False if preparing has failed.
        Returns True if everything is OK.
        '''
        try:
            # TODO repair site available check
            # res = ping(KSRF_PAGE_URI)
            # if (not res):
            #     return False
            headersFromBase = self._database_source.get_all_data(
                DataType.DOCUMENT_HEADER)

            if (headersFromBase is None or len(headersFromBase) == 0):
                headers = get_decision_headers()
                self._database_source.\
                    put_data_collection(headers,
                                        DataType.DOCUMENT_HEADER)
            else:
                headers = headersFromBase

            self._decision_urls = {}
            for dataId in headers:
                elem = headers[dataId]
                self._decision_urls[dataId] = elem['text_source_url']
            return True
        except Exception as e:
            print(e)
            return False

    def get_data(self, dataId: str, dataType: DataType):
        '''
        Gets data by the given id and dataType and returns the data.
        If there is no such data, it returns None.
        --
        Only DataType.DOCUMENT_TEXT is supported.
        '''
        if (not isinstance(dataType, DataType)):
            raise TypeError('dataType isn\'t instance of DataType')
        if dataType == DataType.DOCUMENT_TEXT:
            text = self._database_source.get_data(dataId, dataType)
            if (text is None):
                text = download_text(self._decision_urls[dataId],
                                     dataId, self._temp_folder,
                                     needReturnText=True)
                self._database_source.put_data(dataId, text, dataType)
            return text
        raise ValueError("data type is not supported")

    def get_all_data(self, dataType: DataType, needReload=False):
        '''
        Gets a dict of all data of the given type.
        Supported data types:
        DataType.DOCUMENT_HEADER
        DataType.DOCUMENT_TEXT
        '''
        if (not isinstance(dataType, DataType)):
            raise TypeError('dataType isn\'t instance of DataType')

        if (dataType == DataType.DOCUMENT_HEADER):
            if (needReload):
                headers = get_decision_headers()
                self._database_source.\
                    put_data_collection(headers, DataType.DOCUMENT_HEADER)
            else:
                headers = self._database_source.get_all_data(
                    DataType.DOCUMENT_HEADER)
                if (headers is None or len(headers) == 0):
                    headers = get_decision_headers()
                    self._database_source.\
                        put_data_collection(headers, DataType.DOCUMENT_HEADER)
            return headers

        if (dataType == DataType.DOCUMENT_TEXT):
            return {dataId: self.get_data(dataId,
                                          DataType.DOCUMENT_TEXT)
                    for dataId in self._decision_urls}
        raise ValueError("data type is not supported")