diff --git a/.gitignore b/.gitignore index ea2b3ea..c2b9743 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ ksrf_temp_folder/ TestResults/ link_analysis/json_to_pickle_converter.py link_analysis/my_funs.py +link_analysis/archive.py run.cmd #Decision Files Decision files0/ diff --git a/link_analysis/api_module.py b/link_analysis/api_module.py index d384b56..ca8d8d7 100644 --- a/link_analysis/api_module.py +++ b/link_analysis/api_module.py @@ -330,8 +330,8 @@ def start_process_with( if __name__ == "__main__": import time start_time = time.time() - process_period("18.06.1980", "18.07.2020", showPicture=False, - isNeedReloadHeaders=False, includeIsolatedNodes=False) + # process_period("18.06.1980", "18.07.2020", showPicture=False, + # isNeedReloadHeaders=False, includeIsolatedNodes=False) # process_period("18.06.1980", "18.07.2020", showPicture=False, # isNeedReloadHeaders=False, includeIsolatedNodes=False) # process_period( @@ -352,7 +352,7 @@ def start_process_with( # start_process_with(decisionID='КСРФ/1-П/2015', depth=3) - load_and_visualize() + # load_and_visualize() # start_process_with( # decisionID='КСРФ/1-П/2015', depth=10, @@ -369,5 +369,10 @@ def start_process_with( # showPicture=True, isNeedReloadHeaders=False) # source = web_crawler.Crawler.get_data_source('LocalFileStorage') # text=source.get_data('КСРФ/19-П/2014', web_crawler.DataType.DOCUMENT_TEXT) + + # process_period("18.09.2018", "18.07.2020", showPicture=True, + # isNeedReloadHeaders=False, includeIsolatedNodes=True) + import my_funs + my_funs.saving_all_clean_links() print(f"Headers collection spent {time.time()-start_time} seconds.") input('press any key...') \ No newline at end of file diff --git a/link_analysis/converters.py b/link_analysis/converters.py index d1fe7ad..1a5d49f 100644 --- a/link_analysis/converters.py +++ b/link_analysis/converters.py @@ -3,7 +3,7 @@ import os from typing import Dict, Iterable, TypeVar, Type, List, Union, Any -from models import Header, DuplicateHeader, DocumentHeader +from models import Header, DocumentHeader from final_analysis import CleanLink # Don't forget to add to this place new classes where implemented diff --git a/link_analysis/final_analysis.py b/link_analysis/final_analysis.py index 698b032..b52d272 100644 --- a/link_analysis/final_analysis.py +++ b/link_analysis/final_analysis.py @@ -1,5 +1,5 @@ import re -from models import Header, DuplicateHeader, CleanLink +from models import Header, CleanLink, Positions from models import LinkGraph from rough_analysis import RoughLink from typing import Dict, Tuple, List, Union @@ -10,7 +10,7 @@ def get_clean_links( collectedLinks: Dict[Header, List[RoughLink]], - courtSiteContent: Dict[str, Union[Header, DuplicateHeader]], + courtSiteContent: Dict[str, Header], courtPrefix: str='КСРФ/') -> Tuple[Dict[Header, List[CleanLink]], Dict[Header, List[RoughLink]]]: ''' @@ -35,17 +35,10 @@ def get_clean_links( gottenID = (courtPrefix + number[0].upper() + '/' + years.pop()) if gottenID in courtSiteContent: - try: - if isinstance(courtSiteContent[gottenID], - DuplicateHeader): - raise TypeError("It links on duplicating " - "document") - except TypeError: - break eggs = True years.clear() headerTo = courtSiteContent[gottenID] - positionAndContext = (link.position, link.context) + positionAndContext = link.positions cleanLink = None for cl in checkedLinks[headerFrom]: if cl.header_to == headerTo: diff --git a/link_analysis/models.py b/link_analysis/models.py index 3c128d2..c213702 100644 --- a/link_analysis/models.py +++ b/link_analysis/models.py @@ -1,6 +1,6 @@ import datetime import collections -from typing import Type, Optional, Union +from typing import Type, Optional, Union, Dict import dateutil.parser # License: Apache Software Licenseid, BSD License (Dual License) @@ -46,29 +46,25 @@ def __hash__(self) -> int: @staticmethod def convert_from_dict(key: str, - oldFormatHeader: dict):# -> Type[DocumentHeader]: + oldFormatHeader: Dict):# -> Type[DocumentHeader]: """ Convert dict object to instance of subclass of class DocumentHeader. :param key: str. Key which related with oldFormatHeader. - :param oldFormatHeader: {dict}. + :param oldFormatHeader: dict. Dict object that stores data about document. :return: DocumentHeader. Instance of one of subclasses (Header or DuplicateHeader). """ - + if 'not unique' in oldFormatHeader: + raise TypeError("'class DuplicateHeader' is not supported anymore.") if not isinstance(key, str): raise TypeError(f"'key' must be instance of {str}") - if (not isinstance(oldFormatHeader, dict) and not isinstance(oldFormatHeader, tuple) and - not isinstance(oldFormatHeader, list)): - raise TypeError(f"'oldFormatHeader' must be instance of {dict} or {tuple} or {list}") - - if 'not unique' in oldFormatHeader: - return DuplicateHeader.convert_from_dict(key, oldFormatHeader) - else: - return Header.convert_from_dict(key, oldFormatHeader) + if not isinstance(oldFormatHeader, dict) : + raise TypeError(f"'oldFormatHeader' must be instance of {dict}") + return Header.convert_from_dict(key, oldFormatHeader) class Header(DocumentHeader): @@ -201,7 +197,7 @@ def convert_from_dict(key: str, oldFormatHeader: dict): if not isinstance(key, str): raise TypeError(f"'key' mus be instance of {str}") if not isinstance(oldFormatHeader, dict): - raise TypeError("'oldFormatHeader' must be instance of 'dict'") + raise TypeError(f"'oldFormatHeader' must be instance of {dict}'") try: docID = key @@ -222,275 +218,101 @@ def convert_from_dict(key: str, oldFormatHeader: dict): return Header(docID, supertype, docType, title, releaseDate, textSourceUrl, textLocation) -class DuplicateHeader(DocumentHeader): - - """ - Subclass of DocumentHeader. Implements storage of data - about document whose identifier is not unique. - - :attribute doc_id: str. - Common ID of duplicated documents. - :attribute header_list: list. - List with instances of class Header. - Any of them stores data about one of the duplicated documents. - - :method append: instancemethod. - Append instance of class Header that contains data - about duplicated document at self.header_list. - :method convert_to_dict: instancemethod. - Convert instance to dict object. - :method convert_from_dict: staticmethod. - Convert dict object to instance of own class. - Called from superclass by iterface method with same name. - """ - - def __init__(self, docID, supertype=None, docType=None, title=None, releaseDate=None, - textSourceUrl=None, textLocation=None): +class Link: + def __init__(self, headerFrom): """ - Constructor with optinal arguments. You must specify either - argument 'docID' only to create empty list that ready to append - new elements or all arguments except optional 'textLocation' to create - list with first element. - - :param docID: str. - Common ID of duplicated documents. - :param supertype: str. - Supertype of first duplicated document that be added at list. - :param docType: str. - Type of first duplicated document that be added at list. - :param title: str. - Title of first duplicated document that be added at list. - :param releaseDate: datetime.date. - Release date of first duplicated document that be added at list. - :param textSourceUrl: str. - URL of text source of first duplicated document that be added at list. - :param textLocation: str, optional (default=None). - Text location of first duplicated document that be added at list. + :param headerFrom: class Header + Citing document """ - if isinstance(docID, str): - super().__init__(docID) - else: - raise TypeError(f"'docID' must be instance of {str}") - if isinstance(supertype, str) or supertype is None: - self.supertype = supertype - else: - raise TypeError(f"'supertype' must be instance of {str}") - if isinstance(docType, str) or docType is None: - self.doc_type = docType - else: - raise TypeError(f"'docType' must be instance of {str}") - if isinstance(title, str) or title is None: - self.title = title - else: - raise TypeError(f"'title' must be instance of {str}") - if isinstance(releaseDate, datetime.date) or releaseDate is None: - self.release_date = releaseDate - else: - raise TypeError(f"'release_date' must be instance of {datetime.date}") - if isinstance(textSourceUrl, str) or textSourceUrl is None: - self.text_source_url = textSourceUrl - else: - raise TypeError(f"'textSourceUrl' must be instance of {str}") - if isinstance(textLocation, str) or textLocation is None: - self.text_location = textLocation - else: - raise TypeError(f"'textLocation' must be instance of {str}") - - if (supertype is None and docType is None and title is None and - releaseDate is None and textSourceUrl is None and - textLocation is None): - self.header_list = [] - elif (supertype is not None and docType is not None and - title is not None and releaseDate is not None and - textSourceUrl is not None): - self.header_list = [Header(docID, supertype, docType, title, - releaseDate, textSourceUrl, textLocation)] - else: - raise ValueError("You must specify either argument 'docID' only or" - " all arguments except optional 'textLocation'") + if not isinstance(headerFrom, Header): + raise TypeError(f"'headerFrom' must be instance of {Header}") + self.header_from = headerFrom def __eq__(self, other): if not isinstance(other, type(self)): raise TypeError(f"Compared objects must be of the same type:" f"{type(self)} or {type(other)}") - return (super().__eq__(other) and - (collections.Counter(self.header_list) == - collections.Counter(other.header_list))) + return self.header_from == other.header_from def __ne__(self, other): return not self.__eq__(other) def __hash__(self): - return super().__hash__() + return hash(self.header_from) - def append(self, supertype, docType, title, releaseDate, textSourceUrl, - textLocation=None): - """ - Append instance of class Header that contains data - about duplicated document at self.header_list. +class Positions: - :param supertype: str. - Supertype of document that be added at list. - :param docType: str. - Type of document that be added at list. - :param title: str. - Title of document that be added at list. - :param releaseDate: datetime.date. - Release date of document that be added at list. - :param textSourceUrl: str. - URL of text source of document that be added at list. - :param textLocation: str, optional (default=None). - Text location of document that be added at list. - """ - if isinstance(supertype, str) or supertype is None: - self.supertype = supertype - else: - raise TypeError(f"'supertype' must be instance of {str}") - if isinstance(docType, str) or docType is None: - self.doc_type = docType + def __init__(self, contextStartPos, contextEndPos, linkStartPos, linkEndPos): + if isinstance(contextStartPos, int): + self.context_start = contextStartPos else: - raise TypeError(f"'docType' must be instance of {str}") - if isinstance(title, str) or title is None: - self.title = title + raise TypeError(f"'contextStartPos' must be {int}") + if isinstance(contextEndPos, int): + self.context_end = contextEndPos else: - raise TypeError(f"'title' must be instance of {str}") - if isinstance(releaseDate, datetime.date) or releaseDate is None: - self.release_date = releaseDate + raise TypeError(f"'contextEndPos' must be {int}") + if isinstance(linkStartPos, int): + self.link_start = linkStartPos else: - raise TypeError(f"'releaseDate' must be instance of {datetime.date}") - if isinstance(textSourceUrl, str) or textSourceUrl is None: - self.text_source_url = textSourceUrl + raise TypeError(f"'linkStartPos' must be {int}") + if isinstance(linkEndPos, int): + self.link_end = linkEndPos else: - raise TypeError(f"'textSourceUrl' must be instance of {str}") - if isinstance(textLocation, str) or textLocation is None: - self.text_location = textLocation - else: - raise TypeError(f"'textLocation' must be instance of {str}") - - h = Header(self.doc_id, supertype, docType, title, releaseDate, textSourceUrl, - textLocation) - if h not in self.header_list: - self.header_list.append(h) - - def convert_to_dict(self): - """ - Convert instance to dict object that stores all values of attributes of instance. - - :return: dict. - Dict object that stores values of attributes of instance. - """ - dhList = [] - for dupHeader in self.header_list: - dh = { - 'supertype': dupHeader.supertype, - 'doc_type': dupHeader.doc_type, - 'title': dupHeader.title, - 'release_date': dupHeader.release_date.strftime('%d.%m.%Y'), - 'text_source_url': dupHeader.text_source_url - } - if dupHeader.text_location is not None: - dh['text_location'] = dupHeader.text_location - dhList.append(dh) - return ('not unique', dhList) - - @staticmethod - def convert_from_dict(key: str, oldFormatHeader: dict): - """ - Convert dict object to instance of own class. - Called from superclass by iterface method with same name. - - :param key: str. - Key which related with oldFormatHeader. - :param oldFormatHeader: dict. - Dict object that stores data about document. - - :return: DuplicateHeader. - Instance of own class. - """ - if not isinstance(key, str): - raise TypeError(f"'key' mus be instance of {str}") - if (not isinstance(oldFormatHeader, dict) and not isinstance(oldFormatHeader, tuple) and - not isinstance(oldFormatHeader, list)): - raise TypeError(f"'oldFormatHeader' must be instance of {dict} or {tuple} or {list}") - docID = key - duplicateHeader = DuplicateHeader(docID) - try: - for dh in oldFormatHeader[1]: - supertype = dh['supertype'] - docType = dh['doc_type'] - title = dh['title'] - releaseDate = dateutil.parser.parse(dh['release_date'], - dayfirst=True).date() - textSourceUrl = dh['text_source_url'] - if 'text_location' in dh: - textLocation = dh['text_location'] - else: - textLocation = None - duplicateHeader.append(supertype, docType, title, releaseDate, - textSourceUrl, textLocation) - except KeyError: - raise KeyError( - "'supertype', 'doc_type', 'title', 'release_date', " - "'text_source_url' is required, only 'text_location' " - "is optional") - return duplicateHeader - - -class Link: - def __init__(self, headerFrom): - """ - :param headerFrom: class Header - Citing document - """ - if not isinstance(headerFrom, Header): - raise TypeError("Variable 'headerFrom' is not instance " - "of class Header") - self.header_from = headerFrom + raise TypeError(f"'linkEndPos' must be {int}") def __eq__(self, other): if not isinstance(other, type(self)): raise TypeError(f"Compared objects must be of the same type:" f"{type(self)} or {type(other)}") - return self.header_from == other.header_from + return (self.context_start == other.context_start and + self.context_end == other.context_end and + self.link_start == other.link_start and + self.link_end == other.link_end) def __ne__(self, other): return not self.__eq__(other) - + def __hash__(self): - return hash(self.header_from) - + return hash(tuple(hash(self.context_start), + hash(self.context_end), + hash(self.link_start), + hash(self.link_end) + )) class RoughLink(Link): - def __init__(self, headerFrom, body, context, position): + def __init__(self, headerFrom: Header, body: str, positions: Positions): """ :param headerFrom: class Header Citing document """ if not isinstance(headerFrom, Header): - raise TypeError("Variable 'headerFrom' is not instance " - "of class Header") + raise TypeError(f"'headerFrom' must be instance of {Header}") super().__init__(headerFrom) - self.body = body - self.context = context - self.position = position + if isinstance(body, str): + self.body = body + else: + raise TypeError(f"'body' must be instance of {str}") + if isinstance(positions, Positions): + self.positions = positions + else: + raise TypeError(f"'positions' must be instance of {Positions}") def __eq__(self, other): if not isinstance(other, type(self)): raise TypeError(f"Compared objects must be of the same type:" f"{type(self)} or {type(other)}") return (super().__eq__(other) and - self.context == other.context and self.body == other.body and - self.position == other.position) + self.positions == other.positions) def __ne__(self, other): return not self.__eq__(other) def __hash__(self): return hash(tuple([super().__hash__(), - hash(self.context), hash(self.body), - hash(self.position)])) + hash(self.positions)])) + class CleanLink(Link): @@ -499,19 +321,31 @@ class member: positionsAndContexts: list of tuple(int, str) where int variable (position) is start position of str variable (contex) in text """ + def __init__(self, headerFrom, headerTo, citationsNumber, - positionsAndContexts): + positionsList): """ positionsAndContexts: tuple or list of tuples, " or set of tuples(int, str) """ super().__init__(headerFrom) - self.header_to = headerTo - self.citations_number = citationsNumber - if isinstance(positionsAndContexts, tuple): - self.positions_and_contexts = [positionsAndContexts] + if isinstance(headerTo, Header): + self.header_to = headerTo + else: + raise TypeError(f"'headerTo' must be instance of {Header}") + + if isinstance(citationsNumber, int): + self.citations_number = citationsNumber + else: + raise TypeError(f"'citationsNumber' must be instance of {int}") + if isinstance(positionsList, Positions): + self.positions_list = [positionsList] + elif (isinstance(positionsList, list) or + isinstance(positionsList, set) or + isinstance(positionsList, tuple)): + self.positions_list = list(positionsList) else: - self.positions_and_contexts = list(positionsAndContexts) + raise TypeError(f"'positionsList' must be instance of {list} or {tuple} or {set}") def __eq__(self, other): if not isinstance(other, type(self)): @@ -520,8 +354,8 @@ def __eq__(self, other): return (super().__eq__(other) and self.header_to == other.header_to and self.citations_number == other.citations_number and - (collections.Counter(self.positions_and_contexts) == - collections.Counter(other.positions_and_contexts))) + (collections.Counter(self.positions_list) == + collections.Counter(other.positions_list))) def __ne__(self, other): return not self.__eq__(other) @@ -530,17 +364,15 @@ def __hash__(self): return hash(tuple([super().__hash__(), hash(self.header_to)])) def append(self, positionAndContext: tuple): - self.positions_and_contexts.append(positionAndContext) + self.positions_list.append(positionAndContext) def convert_to_dict(self): cleanLinkDict = { 'doc_id_from': self.header_from.doc_id, - 'doc_id_to': self.header_to.doc_id, - 'to_doc_title': self.header_to.title, - 'citations_number': self.citations_number, - 'contexts_list': [pac[1] for pac in self.positions_and_contexts], - 'positions_list': [pac[0] for pac in self.positions_and_contexts] + 'doc_id_to': self.header_to.doc_id } + positionsDictList = [pos.__dict__ for pos in self.positions_list] + cleanLinkDict['positions_list'] = positionsDictList return cleanLinkDict @@ -553,27 +385,36 @@ def __init__(self, supertypes=None, docTypes=None, firstDate=None, lastDate=None): if hasattr(supertypes, '__iter__'): self.supertypes = set(supertypes) + for st in supertypes: + if not isinstance(st, str): + raise TypeError(f"any element from 'supertypes' must be instance of {str}") + elif supertypes is None: + self.supertypes = supertypes else: - self.supertypes = None + raise TypeError(f"'supertypes' must be iterable structure: {list}, {set}, {tuple}") + if hasattr(docTypes, '__iter__'): self.doc_types = set(docTypes) + for st in docTypes: + if not isinstance(st, str): + raise TypeError(f"any element from 'docTypes' must be instance of {str}") + elif docTypes is None: + self.doc_types = docTypes else: - self.doc_types = None + raise TypeError(f"'docTypes' must be iterable structure: {list}, {set}, {tuple}") if firstDate is None: self.first_date = datetime.date.min elif isinstance(firstDate, datetime.date): self.first_date = firstDate else: - raise TypeError("Variable 'firstDate' is not instance " - "of datetime.date") + raise TypeError(f"'firstDate' must be instance of {datetime.date}") if lastDate is None: self.last_date = datetime.date.max elif isinstance(lastDate, datetime.date): self.last_date = lastDate else: - raise TypeError("Variable 'lastDate' is not instance " - "of datetime.date") + raise TypeError(f"'lastDate' must be instance of {datetime.date}") def __eq__(self, other): if not isinstance(other, type(self)): @@ -594,6 +435,8 @@ def __hash__(self): hash(self.last_date)])) def check_header(self, header): + if not isinstance(header, Header): + raise TypeError(f"'header' must be instance of {Header}") if ((self.supertypes is None or header.supertype in self.supertypes) and (self.doc_types is None or @@ -603,7 +446,7 @@ def check_header(self, header): else: return False - def get_filtered_headers(self, headersDict): + def get_filtered_headers(self, headersDict: Dict[str, Header]) -> Dict[str, Header]: resultDict = {} for key in headersDict: if (isinstance(headersDict[key], Header) and @@ -623,8 +466,19 @@ def __init__(self, supertype=None, docTypes=None, firstDate=None, lastDate=None, indegreeRange=None, outdegreeRange=None): super().__init__(supertype, docTypes, firstDate, lastDate) - self.indegree_range = indegreeRange - self.outdegree_range = outdegreeRange + if (isinstance(indegreeRange, tuple) or isinstance(indegreeRange, list)): + self.indegree_range = tuple(indegreeRange) + elif indegreeRange is None: + self.indegree_range = indegreeRange + else: + raise TypeError(f"'indegreeRange' must be instance of {tuple} or {list}") + if (isinstance(outdegreeRange, tuple) or isinstance(outdegreeRange, list)): + self.outdegree_range = tuple(outdegreeRange) + elif outdegreeRange is None: + self.outdegree_range = outdegreeRange + else: + raise TypeError(f"'outdegreeRange' must be instance of {tuple} or {list}") + def __eq__(self, other): if not isinstance(other, type(self)): @@ -648,12 +502,25 @@ class GraphEdgesFilter(): Arguments contains conditions for which edges will be selected.\n weightsRange: tuple that implements line segment [int, int] """ - def __init__(self, headersFilterFrom=None, headerFilterTo=None, + def __init__(self, headersFilterFrom=None, headersFilterTo=None, weightsRange=None): + if (isinstance(headersFilterFrom, HeadersFilter) or + headersFilterFrom is None): + self.headers_filter_from = headersFilterFrom + else: + raise TypeError(f"'headersFilterFrom' must be instance of {HeadersFilter}") + if (isinstance(headersFilterTo, HeadersFilter) or + headersFilterTo is None): + self.headers_filter_to = headersFilterTo + else: + raise TypeError(f"'headersFilterTo' must be instance of {HeadersFilter}") + if (isinstance(weightsRange, tuple) or isinstance(weightsRange, list)): + self.weights_range = tuple(weightsRange) + elif weightsRange is None: + self.weights_range = weightsRange + else: + raise TypeError(f"'weightsRange' must be instance of {tuple} or {list}") - self.headers_filter_from = headersFilterFrom - self.headers_filter_to = headerFilterTo - self.weights_range = weightsRange def __eq__(self, other): if not isinstance(other, type(self)): @@ -673,6 +540,9 @@ def __hash__(self): def check_edge(self, edge: CleanLink): """edge: class CleanLink""" + if not isinstance(edge, CleanLink): + raise TypeError(f"'edge' must be instance of {CleanLink}") + if ((self.headers_filter_from is None or self.headers_filter_from.check_header(edge.header_from) ) and @@ -690,6 +560,8 @@ def get_filtered_edges(self, edges): edges: list or set of instances of class CleanLink\n returns set of instances of class CleanLink """ + if not (isinstance(edges, set) or isinstance(edges, list) or isinstance(edges, tuple)): + raise TypeError(f"'edge' must be of instance of {set} or {list} or {tuple}") result = {edge for edge in edges if self.check_edge(edge)} return result @@ -726,14 +598,14 @@ def __hash__(self): def add_node(self, node): if not isinstance(node, Header): - raise TypeError("Variable 'node' is not instance " - "of class Header") + raise TypeError(f"'node is not instance " + "of {Header}") self.nodes.add(node) def add_edge(self, edge): if not isinstance(edge, CleanLink): - raise TypeError("Variable 'edge' is not instance " - "of class CleanLink") + raise TypeError(f"'edge' is not instance " + "of {CleanLink}") self.edges.add(edge) def get_all_nodes_degrees(self): @@ -749,14 +621,14 @@ def get_all_nodes_degrees(self): def get_subgraph(self, nodesFilter=None, edgesFilter=None, includeIsolatedNodes=True): - if (nodesFilter is None and edgesFilter is None): - return self + if not isinstance(includeIsolatedNodes, bool): + raise TypeError("'includeIsolatedNodes' must be instance of {bool}") subgraph = LinkGraph() # filters nodes if nodesFilter is not None: if not isinstance(nodesFilter, GraphNodesFilter): - raise TypeError("Variable 'nodesFilter' is not instance " + raise TypeError(f"Variable 'nodesFilter' is not instance " "of class GraphNodesFilter") if (nodesFilter.indegree_range is not None or nodesFilter.outdegree_range is not None): @@ -783,7 +655,7 @@ def get_subgraph(self, nodesFilter=None, edgesFilter=None, # filters edges if edgesFilter is not None: if not isinstance(edgesFilter, GraphEdgesFilter): - raise TypeError("Variable 'edgesFilter' is not instance " + raise TypeError(f"Variable 'edgesFilter' is not instance " "of class GraphEdgesFilter") # If nodes are filtered, we must check the edges @@ -840,12 +712,6 @@ class IterableLinkGraph(LinkGraph): # stub # "https://goto.ru") # h4 = Header("456-О-О/2018", "КСРФ/О-О", "Заголовк", date, # "https://goto.ru") - h5 = DuplicateHeader("456-О-О/2018", "КСРФ/О-О", "Заголовк", date, - "https://goto.ru") - h6 = DuplicateHeader("426-О-О/2018", "КСРФ/О-О", "Заголовк", date, - "https://goto.ru") - print(hash(h5)) - h5 == h6 # h5.append("КСРФ/О-О", "Заголовк", datetime.date(1990, 1, 2), # "https://goto.ru") # h6 = DuplicateHeader("456-О-О/2018", "КСРФ/О-О", "Заголовк", date, diff --git a/link_analysis/rough_analysis.py b/link_analysis/rough_analysis.py index 1a2f940..f696a38 100644 --- a/link_analysis/rough_analysis.py +++ b/link_analysis/rough_analysis.py @@ -1,45 +1,43 @@ import re -from typing import Dict, List, Union -from models import Header, RoughLink, DuplicateHeader +from typing import Dict, List, Union, Type +from models import Header, RoughLink, Positions # link pattern main part lpMP = (r".*?\sот[\s\d]+?(?:(?:января|февраля|марта|апреля|мая|июня|июля|" r"августа|сентября|октября|ноября|декабря)+?[\s\d]+?года|\d{2}\." - r"\d{2}\.\d{4})[\s\d]+?(№|N)[\s\d]+?[-\w/]*.*?") + r"\d{2}\.\d{4})[\s\d]+?(?:№|N)[\s\d]+?[-\w/]*.*?") # link pattern prefix #1 -lpPRF1 = r"(?<=\.\s)\s*?[А-Я]" +lpPRF1 = r"(?<=\.\s)\s*?[А-ЯA-Z]" # link pattern postfix #1 -lpPSF1 = r"(?=\.\s[А-Я])" +lpPSF1 = r"(?=\.\s[А-ЯA-Z])" # link pattern prefix #2 -lpPRF2 = r"(?<=^)\s*?[А-Яа-я]" +lpPRF2 = r"(?<=^)\s*?[А-ЯA-Zа-яa-z]" # link pattern postfix #2 lpPSF2 = r"(?=\.$)" -linkPattern = re.compile( - f"(?:{lpPRF1+lpMP+lpPSF1}|{lpPRF1+lpMP+lpPSF2}|{lpPRF2+lpMP+lpPSF1}|" - f"{lpPRF2+lpMP+lpPSF2})", re.VERBOSE) +linkPattern = re.compile(f"""(?:{lpPRF1+lpMP+lpPSF1}|{lpPRF1+lpMP+lpPSF2}| + {lpPRF2+lpMP+lpPSF1}|{lpPRF2+lpMP+lpPSF2})""", re.VERBOSE) # pattern for removing of redundant leading sentences -reductionPattern = re.compile(r"(?:[А-Я].*[^А-Я]\.\s*(?=[А-Я])|^[А-Яа-я]" - r".*[^А-Я]\.\s*(?=[А-Я]))") +reductionPattern = re.compile(r"(?:[А-ЯA-Z].*[^А-ЯA-Z]\.\s*(?=[А-ЯA-Z])|^[А-ЯA-Zа-яa-z]" + r".*[^А-ЯA-Z]\.\s*(?=[А-ЯA-Z]))") # same part of two regular expressions below -samePart = (r"т[\s\d]+?(?:(?:января|февраля|марта|апреля|мая|июня|июля|" - r"августа|сентября|октября|ноября|декабря)+?[\s\d]+?года|\d{2}" - r"\.\d{2}\.\d{4})(?=\s)") -splitPattern = re.compile( - f"(?i)о(?={samePart})") -datePattern = re.compile( - f"(?i){samePart}") +splitPattern = re.compile(r"""(?i)о(?=т[\s\d]+?(?:(?:января|февраля|марта|апреля|мая|июня|июля| + августа|сентября|октября|ноября|декабря)+?[\s\d]+?года|\d{2} + \.\d{2}\.\d{4})[\s\d]+?(?:№|N))""") +datePattern = re.compile(r"""(?i)т[\s\d]+?(?:(?:января|февраля|марта|апреля|мая|июня|июля| + августа|сентября|октября|ноября|декабря)+?[\s\d]+?года|\d{2} + \.\d{2}\.\d{4})(?=\s)""") numberPattern = re.compile(r'(?:№|N)[\s\d]+[-\w/]*') opinionPattern = re.compile(r'(?i)мнение\s+судьи\s+конституционного') -def get_rough_links(header: Header) -> List[RoughLink]: +def get_rough_links(header: Header) -> Union[List[RoughLink], Type[TypeError], Type[FileNotFoundError]]: """ :param header: instance of class models.Header """ try: - with open(header.text_location, 'r', encoding="utf-8") as file: + with open(header.text_location, 'r', encoding="utf-8") as file: # debug file reading will be deleted soon text = file.read() except TypeError: return TypeError @@ -53,18 +51,31 @@ def get_rough_links(header: Header) -> List[RoughLink]: matchObjects = linkPattern.finditer(text) for match in matchObjects: linksForSplit = match[0] - context = reductionPattern.sub('', linksForSplit) + '.' - position = match.start(0) + len(splitPattern.split(linksForSplit, - maxsplit=1)[0]) + 1 + reduct = reductionPattern.search(linksForSplit) + if reduct is not None: + reductCorrection = reduct.end() + #context = linksForSplit.replace(reduct[0], '') + '.' + else: + reductCorrection = 0 + #context = linksForSplit + linkCorrection = len(splitPattern.split(linksForSplit, + maxsplit=1)[0]) + contextStartPos = match.start(0) + reductCorrection + contextEndPos = match.end(0) + 1 + linkStartPos = match.start(0) + linkCorrection + splitedLinksForDifferentYears = splitPattern.split(linksForSplit)[1:] for oneYearLinks in splitedLinksForDifferentYears: date = datePattern.search(oneYearLinks)[0] - numbers = numberPattern.findall(oneYearLinks) - for number in numbers: - gottenRoughLink = 'о' + date + ' ' + number.upper() - roughLinks.append(RoughLink(header, gottenRoughLink, context, - position)) - position += len(oneYearLinks) + 1 + matchNumbers = list(numberPattern.finditer(oneYearLinks)) + + linkEndPos = linkStartPos + matchNumbers[-1].end(0) + 1 + for number in matchNumbers: + gottenRoughLink = 'о' + date + ' ' + number[0].upper() + roughLinks.append(RoughLink(header, gottenRoughLink, + Positions(contextStartPos, contextEndPos, + linkStartPos, linkEndPos))) + linkStartPos += len(oneYearLinks) + 1 return roughLinks @@ -73,7 +84,7 @@ def get_rough_links(header: Header) -> List[RoughLink]: def get_rough_links_for_multiple_docs( - headers: Dict[str, Union[Header, DuplicateHeader]]) -> Dict[Header, List[RoughLink]]: + headers: Dict[str, Header]) -> Dict[Header, List[RoughLink]]: """ :param header: dict of instances of class models.Header return dict with list of instances of class RoughLink @@ -83,8 +94,8 @@ def get_rough_links_for_multiple_docs( """ result = {} # type: Dict[Header, List[RoughLink]] for decisionID in headers: - if isinstance(headers[decisionID], DuplicateHeader): - continue + if not isinstance(headers[decisionID], Header): + raise TypeError(f"Any element of 'headers' must be instance of {Header}") maybeRoughLinks = get_rough_links(headers[decisionID]) if maybeRoughLinks is TypeError: if PATH_NONE_VALUE_KEY not in result: