diff --git a/dist/web_crawler-0.1-py3-none-any.whl b/dist/web_crawler-0.1-py3-none-any.whl
index ccdd3ec..60dcad7 100644
Binary files a/dist/web_crawler-0.1-py3-none-any.whl and b/dist/web_crawler-0.1-py3-none-any.whl differ
diff --git a/web_crawler/Selenium/chromedriver.exe b/web_crawler/Selenium/chromedriver.exe
old mode 100644
new mode 100755
diff --git a/web_crawler/__init__.py b/web_crawler/__init__.py
index 1e92c67..5798efe 100644
--- a/web_crawler/__init__.py
+++ b/web_crawler/__init__.py
@@ -1,17 +1,18 @@
 __version__ = '0.1'
 if __package__:
-    from web_crawler import ksrf
+    from web_crawler import ksrf_models, tools
     from web_crawler.web_crawler import WebCrawler
     from web_crawler.web_crawler\
         import DataSource, DataSourceType, DataType
+    from web_crawler.models.DatabaseWrapper import DatabaseWrapper
 else:
-    import ksrf
+    import ksrf_models
     from web_crawler import WebCrawler
     from web_crawler\
         import DataSource, DataSourceType, DataType

-Local_database_source = ksrf.LocalFileStorageSource()
-KSRF_Source = ksrf.KSRFSource()
+Local_database_source = ksrf_models.LocalFileStorageSource()
+KSRF_Source = ksrf_models.KSRFSource()

 Crawler = WebCrawler([Local_database_source, KSRF_Source])
@@ -23,8 +24,34 @@ def Init(sourceNameList=None, databaseSource=None):
     '''
     global Crawler
     Crawler.prepare_sources(sourceNameList, databaseSource)
+
+
+def Init_by_data_model(sourceNameList=None, databaseSource=None):
+    '''
+    Initialize web_crawler for work by wrapping the given data model
+    in a DatabaseWrapper. Must be invoked before any other actions
+    with Crawler.
+    '''
+    global Crawler
+    Crawler.prepare_sources(
+        databaseSource=DatabaseWrapper('DatabaseSource', databaseSource))
+
+
+def Init_by_KSRF_wrapper(dataModels):
+    '''
+    Initialize web_crawler for work. Must be invoked before any other
+    actions with Crawler.
+    Activates the KSRFSource and the KSRF database sources.
+    '''
+    wrapper_name = 'KSRFDatabase'
+    wrapper = ksrf_models.KSRFDatabaseWrapper(wrapper_name,
+                                              dataModels)
+    Crawler.collected_sources[wrapper_name] = wrapper
+    Crawler.prepare_sources([wrapper_name, 'KSRFSource'], wrapper)
+
+
 # Local_database_source.folder_path = 'D:\\programming\\Judyst\\files'
 # Local_database_source.prepare()
 # Init(databaseSource=Local_database_source)
-__all__ = ['Crawler', 'DataSourceType', 'DataType', 'Init']
+__all__ = ['Crawler', 'DataSourceType', 'DataType', 'Init',
+           'Init_by_data_model', 'Init_by_KSRF_wrapper', 'DatabaseWrapper']
diff --git a/web_crawler/ksrf.py b/web_crawler/ksrf.py
index 05cc62e..838e6e7 100644
--- a/web_crawler/ksrf.py
+++ b/web_crawler/ksrf.py
@@ -25,8 +25,16 @@ else:
     from web_crawler import DataSource, DataSourceType, DataType

-PATH_TO_CHROME_WEB_DRIVER = os.path.join(
-    os.path.dirname(__file__), 'Selenium', 'chromedriver.exe')
+
+if system_name().lower() == 'windows':
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver.exe')
+else:
+    PATH_TO_CHROME_WEB_DRIVER = os.path.join(os.path.dirname(__file__),
+                                             'Selenium', 'chromedriver')
+
+

 KSRF_PAGE_URI = 'http://www.ksrf.ru/ru/Decision/Pages/default.aspx'
diff --git a/web_crawler/ksrf_models.py b/web_crawler/ksrf_models.py
new file mode 100644
index 0000000..12b73b4
--- /dev/null
+++ b/web_crawler/ksrf_models.py
@@ -0,0 +1,12 @@
+import os
+
+import json
+if __package__:
+    # backward compatibility
+    from web_crawler.models.DatabaseWrapper import DatabaseWrapper as KSRFDatabaseWrapper
+    from web_crawler.models.KSRFSource import *
+    from web_crawler.models.LocalFileStorageSource import *
+
+
+
+
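For reference, a minimal sketch of how the two new entry points are meant to be called. The object my_data_models is purely hypothetical, it stands in for whatever model-access object KSRFDatabaseWrapper expects and is not defined in this diff.

# Illustrative sketch only; `my_data_models` is a hypothetical placeholder.
import web_crawler
from web_crawler import DataType

web_crawler.Init_by_KSRF_wrapper(my_data_models)
database = web_crawler.Crawler.collected_sources['KSRFDatabase']
headers = database.get_all_data(DataType.DOCUMENT_HEADER)

# Alternative path: wrap the same object directly in a DatabaseWrapper.
web_crawler.Init_by_data_model(databaseSource=my_data_models)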
diff --git a/web_crawler/models/DatabaseWrapper.py b/web_crawler/models/DatabaseWrapper.py
new file mode 100644
index 0000000..73e2970
--- /dev/null
+++ b/web_crawler/models/DatabaseWrapper.py
@@ -0,0 +1,147 @@
+import os
+
+import json
+if __package__:
+    from web_crawler.web_crawler import DataSource,\
+        DataType, DataSourceType
+
+
+class DatabaseWrapper(DataSource):
+    source = None
+    DOCUMENTS = 'Documents'
+    LINKS = 'Links'
+    DOCUMENT_FIELDS = ['supertype', 'doc_type', 'title',
+                       'release_date', 'text_source_url',
+                       'effective_date', 'absolute_path',
+                       'interredaction_id', 'cons_selected_info']
+    LINK_FIELDS = ['doc_id_from', 'doc_id_to', 'positions_list',
+                   'citations_number']
+
+    def __init__(self, name, dataSource):
+        super().__init__(name, DataSourceType.DATABASE)
+        self.source = dataSource
+
+    def prepare(self):
+        return True
+
+    def _request_fields(self, retDict, fieldNames, modelName, doc_id):
+        for fieldName in fieldNames:
+            retDict[fieldName] = self.source.\
+                get_data(fieldName,
+                         model_name=modelName,
+                         doc_id=doc_id)
+
+    def _prepare_data(self, data, fieldsNames):
+        fieldName = 'positions_list'
+        if fieldName in fieldsNames and fieldName in data.keys():
+            data['citations_number'] = len(data[fieldName])
+            data[fieldName] = [json.dumps(position)
+                               for position in data[fieldName]]
+
+        fieldName = 'cons_selected_info'
+        if fieldName in fieldsNames and fieldName in data.keys():
+            data[fieldName] = json.dumps(data[fieldName])
+
+        return data
+
+    def _create_data(self, dataDict, fieldNames, modelName, **requireKwargs):
+        data = dict()
+        for fieldName in fieldNames:
+            if fieldName in dataDict.keys():
+                data[fieldName] = dataDict[fieldName]
+        data = self._prepare_data(data, fieldNames)
+        self.source.create_data(model_name=modelName, **data,
+                                **requireKwargs)
+
+    def _edit_data(self, dataDict, fieldNames, modelName, **requireKwargs):
+        data = dict()
+        for fieldName in fieldNames:
+            if fieldName in dataDict.keys():
+                data[fieldName] = dataDict[fieldName]
+        data = self._prepare_data(data, fieldNames)
+        self.source.edit_data(data, model_name=modelName, **requireKwargs)
+
+    def get_data(self, dataId, dataType):
+        if dataType == DataType.DOCUMENT_HEADER:
+            model_name = self.DOCUMENTS
+            header = dict()
+            self._request_fields(header, self.DOCUMENT_FIELDS,
+                                 model_name, dataId)
+            return header
+        if dataType == DataType.DOCUMENT_TEXT:
+            text = self.source.get_data('text',
+                                        model_name=self.DOCUMENTS,
+                                        doc_id=dataId)
+            return text
+
+        raise ValueError('Not supported data type')
+
+    def get_all_data(self, dataType):
+        uids = self.source.get_all_data('doc_id',
+                                        model_name=self.DOCUMENTS)
+        if (dataType == DataType.DOCUMENT_HEADER or
+                dataType == DataType.DOCUMENT_TEXT):
+            ret = {}
+            for uid in uids:
+                ret[uid] = self.get_data(uid, dataType)
+            return ret
+
+        raise ValueError('Not supported data type')
+
+    def put_data(self, docId, data, dataType):
+        if dataType == DataType.DOCUMENT_HEADER:
+            modelName = self.DOCUMENTS
+            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
+                                     doc_id=docId) is None):
+                self._create_data(data, self.DOCUMENT_FIELDS,
+                                  modelName, doc_id=docId)
+            else:
+                self._edit_data(data, self.DOCUMENT_FIELDS, modelName,
+                                doc_id=docId)
+            return
+
+        if dataType == DataType.LINK:
+            modelName = self.LINKS
+            doc_id_from = data['doc_id_from']
+            doc_id_to = data['doc_id_to']
+            if (self.source.get_data('doc_id_from', model_name=self.LINKS,
+                                     doc_id_from=doc_id_from,
+                                     doc_id_to=doc_id_to) is None):
+                self._create_data(data, self.LINK_FIELDS, modelName)
+            else:
+                self._edit_data(data, self.LINK_FIELDS, modelName,
+                                doc_id_from=doc_id_from,
+                                doc_id_to=doc_id_to)
+            return
+
+        if dataType == DataType.DOCUMENT_TEXT:
+            if (self.source.get_data('doc_id', model_name=self.DOCUMENTS,
+                                     doc_id=docId) is None):
+                self.source.create_data(model_name=self.DOCUMENTS,
+                                        doc_id=docId,
+                                        text=data)
+            else:
+                self.source.edit_data({'text': data},
+                                      model_name=self.DOCUMENTS,
+                                      doc_id=docId)
+            return
+
+        raise ValueError('Not supported data type')
+
+    def put_data_collection(self, dataCollection, dataType):
+        if (dataType == DataType.DOCUMENT_HEADER or
+                dataType == DataType.DOCUMENT_TEXT):
+            for uid in dataCollection:
+                self.put_data(uid, dataCollection[uid], dataType)
+            return
+        if dataType == DataType.LINK:
+            for link in dataCollection:
+                self.put_data('', link, DataType.LINK)
+            return
+        raise ValueError('Not supported data type')
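DatabaseWrapper relies on only four methods of the object it wraps (get_data, get_all_data, create_data, edit_data, all keyed by model_name). The stand-in below is illustrative only, covers just the Documents model, and is not part of the package.

# Hypothetical in-memory stand-in for the ORM-backed object DatabaseWrapper wraps;
# it mirrors only the call signatures used above and only the Documents model.
class InMemoryModelSource:
    def __init__(self):
        self.rows = {}    # model_name -> {doc_id -> {field: value}}

    def get_data(self, field, model_name, doc_id=None, **_ignored):
        row = self.rows.get(model_name, {}).get(doc_id)
        return None if row is None else row.get(field)

    def get_all_data(self, field, model_name):
        return [row.get(field) for row in self.rows.get(model_name, {}).values()]

    def create_data(self, model_name, doc_id=None, **fields):
        self.rows.setdefault(model_name, {})[doc_id] = dict(fields, doc_id=doc_id)

    def edit_data(self, new_values, model_name, doc_id=None, **_ignored):
        self.rows[model_name][doc_id].update(new_values)

# wrapper = DatabaseWrapper('DatabaseSource', InMemoryModelSource())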
diff --git a/web_crawler/models/KSRFSource.py b/web_crawler/models/KSRFSource.py
new file mode 100644
index 0000000..d6dafb0
--- /dev/null
+++ b/web_crawler/models/KSRFSource.py
@@ -0,0 +1,106 @@
+import os
+
+import json
+if __package__:
+    from web_crawler.web_crawler import DataSource,\
+        DataType, DataSourceType
+    from web_crawler.ksrf import *
+else:
+    from web_crawler import DataSource,\
+        DataType, DataSourceType
+    from ksrf import *
+
+
+class KSRFSource(DataSource):
+    _temp_folder = 'ksrf_temp_folder'
+    _decision_urls = dict()
+    _database_source = None
+
+    def __init__(self):
+        super().__init__('KSRFSource', DataSourceType.WEB_SOURCE)
+
+    def set_database(self, database):
+        '''
+        Set the given database data source for this data source.
+        '''
+        self._database_source = database
+
+    def prepare(self):
+        '''
+        Tries to prepare the data source for work.
+        Returns True if all is ok and False if preparing has failed.
+        '''
+        try:
+            # TODO repair site availability check
+            # res = ping(KSRF_PAGE_URI)
+            # if (not res):
+            #     return False
+            headersFromBase = self._database_source.get_all_data(
+                DataType.DOCUMENT_HEADER)
+
+            if (headersFromBase is None or len(headersFromBase) == 0):
+                headers = get_decision_headers()
+                self._database_source.\
+                    put_data_collection(headers,
+                                        DataType.DOCUMENT_HEADER)
+            else:
+                headers = headersFromBase
+
+            self._decision_urls = {}
+            for dataId in headers:
+                elem = headers[dataId]
+                self._decision_urls[dataId] = elem['text_source_url']
+            return True
+        except Exception as e:
+            print(e)
+            return False
+
+    def get_data(self, dataId: str, dataType: DataType):
+        '''
+        Gets data by the given id and dataType and returns it.
+        If there is no such data, it returns None.
+        --
+        Only DataType.DOCUMENT_TEXT is supported.
+        '''
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+        if dataType == DataType.DOCUMENT_TEXT:
+            text = self._database_source.get_data(dataId, dataType)
+            if (text is None):
+                text = download_text(self._decision_urls[dataId],
+                                     dataId, self._temp_folder,
+                                     needReturnText=True)
+                self._database_source.put_data(dataId, text, dataType)
+            return text
+        raise ValueError("data type is not supported")
+
+    def get_all_data(self, dataType: DataType, needReload=False):
+        '''
+        Gets a dict of all data of the given type.
+        Supported data types:
+            DataType.DOCUMENT_HEADER
+            DataType.DOCUMENT_TEXT
+        '''
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+
+        if (dataType == DataType.DOCUMENT_HEADER):
+            if (needReload):
+                headers = get_decision_headers()
+                self._database_source.\
+                    put_data_collection(headers, DataType.DOCUMENT_HEADER)
+            else:
+                headers = self._database_source.get_all_data(
+                    DataType.DOCUMENT_HEADER)
+                if (headers is None or len(headers) == 0):
+                    headers = get_decision_headers()
+                    self._database_source.\
+                        put_data_collection(headers,
+                                            DataType.DOCUMENT_HEADER)
+            return headers
+
+        if (dataType == DataType.DOCUMENT_TEXT):
+            return {dataId: self.get_data(dataId,
+                                          DataType.DOCUMENT_TEXT)
+                    for dataId in self._decision_urls}
+        raise ValueError("data type is not supported")
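A minimal wiring sketch for KSRFSource, using LocalFileStorageSource from this diff as the backing database source. The folder path is an assumption, any writable directory works; prepare() hits the KSRF site on first run.

# Illustrative wiring only; uses classes introduced in this diff.
from web_crawler.models.KSRFSource import KSRFSource
from web_crawler.models.LocalFileStorageSource import LocalFileStorageSource
from web_crawler.web_crawler import DataType

storage = LocalFileStorageSource()
storage.folder_path = 'ksrf_temp_folder'   # assumption: any writable folder
storage.prepare()

ksrf_source = KSRFSource()
ksrf_source.set_database(storage)
if ksrf_source.prepare():                   # downloads headers on first run
    headers = ksrf_source.get_all_data(DataType.DOCUMENT_HEADER)
    some_id = next(iter(headers))
    text = ksrf_source.get_data(some_id, DataType.DOCUMENT_TEXT)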
diff --git a/web_crawler/models/LocalFileStorageSource.py b/web_crawler/models/LocalFileStorageSource.py
new file mode 100644
index 0000000..e77a5b8
--- /dev/null
+++ b/web_crawler/models/LocalFileStorageSource.py
@@ -0,0 +1,122 @@
+import os
+import json
+
+if __package__:
+    from web_crawler.web_crawler import DataSource,\
+        DataType, DataSourceType
+    from web_crawler.ksrf import *
+
+
+class LocalFileStorageSource(DataSource):
+    headers = dict()
+    folder_path = 'ksrf_temp_folder'
+    HEADERS_FILE_NAME = 'DecisionHeaders.json'
+
+    def __init__(self):
+        super().__init__('LocalFileStorage', DataSourceType.DATABASE)
+
+    def prepare(self):
+        try:
+            if (not os.path.exists(self.folder_path)):
+                os.mkdir(self.folder_path)
+            headersFilePath = os.path.join(self.folder_path,
+                                           self.HEADERS_FILE_NAME)
+            if (os.path.exists(headersFilePath)):
+                with open(headersFilePath, 'rt', encoding='utf-8')\
+                        as headersFile:
+                    headers = json.loads(headersFile.read())
+                    self.headers = {uid: headers[uid]
+                                    for uid in headers
+                                    if 'not unique' not in headers[uid]}
+        except Exception:
+            return False
+        return True
+
+    def get_data(self, dataId: str, dataType: DataType):
+        '''
+        Gets data by the given id and dataType and returns it.
+        If there is no such data, it returns None.
+        Supported data types:
+            DataType.DOCUMENT_HEADER
+            DataType.DOCUMENT_TEXT
+        '''
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+
+        if (dataType == DataType.DOCUMENT_HEADER):
+            return self.headers[dataId]
+        elif (dataType == DataType.DOCUMENT_TEXT):
+            textFileName = get_possible_text_location(dataId,
+                                                      self.folder_path)
+            if (not os.path.exists(textFileName)):
+                text = download_text(self.headers[dataId]['text_source_url'],
+                                     dataId, self.folder_path, True, True)[1]
+            with open(textFileName, 'rt', encoding='utf-8') as textFile:
+                text = textFile.read()
+
+            if text is None:
+                raise ValueError("Can't get text")
+            else:
+                return text
+        else:
+            raise ValueError("Not supported data type")
+
+    def get_all_data(self, dataType: DataType):
+        '''
+        Gets a dict of all data of the given type.
+        Supported data types:
+            DataType.DOCUMENT_HEADER
+            DataType.DOCUMENT_TEXT
+        '''
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+
+        if (dataType == DataType.DOCUMENT_HEADER):
+            if (len(self.headers) > 0):
+                return self.headers
+            else:
+                return None
+
+        if (dataType == DataType.DOCUMENT_TEXT):
+            return {docID: self.get_data(docID, DataType.DOCUMENT_TEXT)
+                    for docID in self.headers}
+        else:
+            raise ValueError("Not supported data type.")
+
+    def put_data(self, docID, data, dataType: DataType):
+        '''
+        Saves the data in the local file storage.
+        Supported data types:
+            DataType.DOCUMENT_HEADER
+            DataType.DOCUMENT_TEXT
+        '''
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+        if (dataType == DataType.DOCUMENT_HEADER):
+            self.headers[docID] = data
+        elif (dataType == DataType.DOCUMENT_TEXT):
+            with open(
+                    get_possible_text_location(
+                        docID, self.folder_path), 'wt', encoding='utf-8')\
+                    as fileTXT:
+                fileTXT.write(data)
+        else:
+            raise ValueError('dataType isn\'t supported')
+
+    def put_data_collection(self, dataDict, dataType: DataType):
+        '''
+        Iterates the given dataDict and invokes put_data for each
+        element of the dictionary.
+        '''
+        if (not isinstance(dataDict, dict)):
+            raise TypeError('dataDict isn\'t dict')
+        if (not isinstance(dataType, DataType)):
+            raise TypeError('dataType isn\'t instance of DataType')
+        for dataKey in dataDict:
+            self.put_data(dataKey, dataDict[dataKey], dataType)
+        if (dataType == DataType.DOCUMENT_HEADER):
+            with open(os.path.join(self.folder_path,
+                                   self.HEADERS_FILE_NAME),
+                      'wt', encoding='utf-8') as headersFile:
+                headersFile.write(json.dumps(self.headers))
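The header dictionaries exchanged by these sources are keyed by document id and carry a subset of DOCUMENT_FIELDS. In the round-trip sketch below every field value and the document id are made-up placeholders, only the field names come from the diff.

# Round-trip of one made-up header through LocalFileStorageSource.
from web_crawler.models.LocalFileStorageSource import LocalFileStorageSource
from web_crawler.web_crawler import DataType

storage = LocalFileStorageSource()
storage.prepare()
header = {
    'supertype': 'KSRF',                    # placeholder values throughout
    'doc_type': 'decision',
    'title': 'Example decision',
    'release_date': '01.01.2020',
    'text_source_url': 'http://www.ksrf.ru/example',
}
storage.put_data_collection({'example-doc-id': header},
                            DataType.DOCUMENT_HEADER)
assert storage.get_data('example-doc-id',
                        DataType.DOCUMENT_HEADER) == header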
diff --git a/web_crawler/tools.py b/web_crawler/tools.py
new file mode 100644
index 0000000..669fd3c
--- /dev/null
+++ b/web_crawler/tools.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+import time
+import json
+
+if __package__:
+    from web_crawler.web_crawler import DataType
+else:
+    from web_crawler import DataType
+
+DOCUMENT_FIELDS = ['supertype', 'doc_type', 'title',
+                   'release_date', 'text_source_url',
+                   'effective_date', 'absolute_path',
+                   'interredaction_id', 'cons_selected_info']
+
+LINK_FIELDS = ['doc_id_from', 'doc_id_to', 'positions_list',
+               'citations_number']
+
+
+FILE_FORMATS = ['jsonlines', 'json']
+
+
+def fill_data_source_from_file(dataSource, fileName,
+                               fileFormat='jsonlines',
+                               dataType=DataType.DOCUMENT_HEADER):
+    with open(fileName, 'rt', encoding='utf-8') as f:
+        if (fileFormat == FILE_FORMATS[0]):
+            if (dataType == DataType.LINK):
+                for line in f:
+                    link = json.loads(line)
+                    dataSource.put_data('', link, dataType)
+            else:
+                for line in f:
+                    header = json.loads(line)
+                    doc_id = list(header.keys())[0]
+                    header = header[doc_id]
+                    dataSource.put_data(doc_id, header, dataType)
+        elif fileFormat == FILE_FORMATS[1]:
+            if (dataType == DataType.LINK):
+                links = json.loads(f.read())
+                for link in links:
+                    dataSource.put_data('', link, dataType)
+            else:
+                headers = json.loads(f.read())
+                for doc_id in headers:
+                    dataSource.put_data(doc_id, headers[doc_id], dataType)
+
+
+def split_dup_headers(headers):
+    pass
+
+
+def update_database_from_source(databaseSource, source, supertype='KSRF'):
+    '''
+    Update documents in databaseSource with documents from source.
+    '''
+    if (supertype == 'KSRF'):
+        print(time.time())
+        print('Start updating... ')
+        headers = source.get_all_data(DataType.DOCUMENT_HEADER)
+        print(f'headers length: {len(headers)}')
+        databaseSource.put_data_collection(headers,
+                                           DataType.DOCUMENT_HEADER)
+        print('headers loaded')
+        for uid in headers:
+            text = source.get_data(uid, DataType.DOCUMENT_TEXT)
+            databaseSource.put_data(uid, text, DataType.DOCUMENT_TEXT)
+            print(f'uid {uid} put.')
+        print(time.time())
+        print('all done')
diff --git a/web_crawler/web_crawler.py b/web_crawler/web_crawler.py
index 938d979..8591d87 100644
--- a/web_crawler/web_crawler.py
+++ b/web_crawler/web_crawler.py
@@ -8,6 +8,7 @@ class DataType(Enum):
     DOCUMENT_HEADER = 0
     DOCUMENT_TEXT = 1
     ANALYZIS_RESULT = 2
+    LINK = 3


 class DataSourceType(Enum):
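In 'jsonlines' mode fill_data_source_from_file expects one JSON object per line whose single key is the document id. The sketch below assumes a hypothetical headers.jsonl file in that shape; the file name is an example, not part of the diff.

# Hypothetical headers.jsonl (one {doc_id: header} object per line) loaded
# into the local file storage introduced in this diff.
from web_crawler.models.LocalFileStorageSource import LocalFileStorageSource
from web_crawler.web_crawler import DataType
from web_crawler import tools

storage = LocalFileStorageSource()
storage.prepare()
tools.fill_data_source_from_file(storage, 'headers.jsonl',
                                 fileFormat='jsonlines',
                                 dataType=DataType.DOCUMENT_HEADER)
print(len(storage.get_all_data(DataType.DOCUMENT_HEADER)))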