Mehchanges #93

Merged · 5 commits · Oct 26, 2018
24 changes: 11 additions & 13 deletions .gitignore
@@ -1,19 +1,21 @@
*.rar
ksrf_temp_folder/
TestResults/
link_analysis/json_to_pickle_converter.py
link_analysis/my_funs.py
link_analysis/archive.py
run.cmd
#Decision Files
Decision files0/
Decision Files/*
<<<<<<< HEAD
Decision Files0/
env64/
link_analysis/json_to_pickle_converter.py
link_analysis/my_funs.py
=======
>>>>>>> 2baad1f36ae87adc1badf8aaf9b08a48e9b3d127
Decision Files/
*.pickle
*.json

# test graph
graph.json


.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -101,6 +103,7 @@ celerybeat-schedule
.env
.venv
env/
env64/
venv/
ENV/
env.bak/
@@ -122,9 +125,4 @@ venv.bak/
# VS code
.vscode

# test graph
graph.json


.DS_Store

Binary file modified dist/link_analysis-0.1-py3-none-any.whl
Binary file not shown.
Binary file modified dist/link_analysis-0.1.tar.gz
Binary file not shown.
111 changes: 67 additions & 44 deletions link_analysis/api_module.py
@@ -5,9 +5,9 @@

# other imports---------------------------------------------------------
import os.path
from datetime import date
import datetime

from dateutil import parser
import dateutil.parser
# License: Apache Software License, BSD License (Dual License)

# imports Core modules--------------------------------------------------
@@ -17,6 +17,7 @@
import visualizer
import converters
from web_crawler import ksrf
import web_crawler
# methods---------------------------------------------------------------


@@ -30,7 +31,7 @@
PICKLE_HEADERS_FILENAME)
PATH_TO_JSON_GRAPH = 'graph.json'

MY_DEBUG = False
MY_DEBUG = True

def collect_headers(pathToFileForSave, pagesNum=None):
headersOld = ksrf.get_decision_headers(pagesNum)
@@ -59,7 +60,7 @@ def download_texts_for_headers(headers, folder=DECISIONS_FOLDER_NAME):
(headers[key].text_location is None or
not os.path.exists(headers[key].text_location))):
oldFormatHeader = headers[key].convert_to_dict()
ksrf.download_decision_texts({key: oldFormatHeader}, folder)
ksrf.download_all_texts({key: oldFormatHeader}, folder)


def load_graph(pathToGraph=PATH_TO_JSON_GRAPH):
Expand All @@ -86,12 +87,15 @@ def load_and_visualize(pathTograph=PATH_TO_JSON_GRAPH):

def process_period(
firstDateOfDocsForProcessing=None, lastDateOfDocsForProcessing=None,
supertypesForProcessing=None,
docTypesForProcessing=None,
firstDateForNodes=None, lastDateForNodes=None,
nodesIndegreeRange=None, nodesOutdegreeRange=None, nodesTypes=None,
includeIsolatedNodes=True,
firstDateFrom=None, lastDateFrom=None, docTypesFrom=None,
supertypesFrom=None,
firstDateTo=None, lastDateTo=None, docTypesTo=None,
supertypesTo=None,
weightsRange=None,
graphOutputFilePath=PATH_TO_JSON_GRAPH,
showPicture=True, isNeedReloadHeaders=False):
@@ -102,10 +106,10 @@ def process_period(
draw graph and show it to user.
'''
if isinstance(firstDateOfDocsForProcessing, str):
firstDateOfDocsForProcessing = parser.parse(
firstDateOfDocsForProcessing = dateutil.parser.parse(
firstDateOfDocsForProcessing, dayfirst=True).date()
if isinstance(lastDateOfDocsForProcessing, str):
lastDateOfDocsForProcessing = parser.parse(
lastDateOfDocsForProcessing = dateutil.parser.parse(
lastDateOfDocsForProcessing, dayfirst=True).date()
if (firstDateOfDocsForProcessing is not None and
lastDateOfDocsForProcessing is not None and
@@ -114,10 +118,10 @@ def process_period(
"than the last date.")

if isinstance(firstDateForNodes, str):
firstDateForNodes = parser.parse(
firstDateForNodes = dateutil.parser.parse(
firstDateForNodes, dayfirst=True).date()
if isinstance(lastDateForNodes, str):
lastDateForNodes = parser.parse(
lastDateForNodes = dateutil.parser.parse(
lastDateForNodes, dayfirst=True).date()
if (firstDateForNodes is not None and
lastDateForNodes is not None and
@@ -126,21 +130,21 @@ def process_period(
"than the last date.")

if isinstance(firstDateFrom, str):
firstDateFrom = parser.parse(
firstDateFrom = dateutil.parser.parse(
firstDateFrom, dayfirst=True).date()
if isinstance(lastDateFrom, str):
lastDateFrom = parser.parse(
lastDateFrom = dateutil.parser.parse(
lastDateFrom, dayfirst=True).date()
if (firstDateFrom is not None and
lastDateFrom is not None and
firstDateFrom > lastDateFrom):
raise ValueError("date error: The first date is later than the last date.")

if isinstance(firstDateTo, str):
firstDateTo = parser.parse(
firstDateTo = dateutil.parser.parse(
firstDateTo, dayfirst=True).date()
if isinstance(lastDateTo, str):
lastDateTo = parser.parse(
lastDateTo = dateutil.parser.parse(
lastDateTo, dayfirst=True).date()
if (firstDateTo is not None and
lastDateTo is not None and
@@ -149,12 +153,14 @@

decisionsHeaders = {}
if (isNeedReloadHeaders or not os.path.exists(PATH_TO_PICKLE_HEADERS)):
num = 3 # stub, del after web_crawler updating
decisionsHeaders = collect_headers(PATH_TO_PICKLE_HEADERS, num)
# num = 3 # stub, del after web_crawler updating
# decisionsHeaders = collect_headers(PATH_TO_PICKLE_HEADERS, num)
decisionsHeaders = collect_headers(PATH_TO_PICKLE_HEADERS)
else:
decisionsHeaders = converters.load_pickle(PATH_TO_PICKLE_HEADERS)

hFilter = models.HeadersFilter(
supertypesForProcessing,
docTypesForProcessing,
firstDateOfDocsForProcessing, lastDateOfDocsForProcessing)
usingHeaders = hFilter.get_filtered_headers(decisionsHeaders)
@@ -173,27 +179,31 @@ def process_period(
if (rough_analysis.PATH_NONE_VALUE_KEY in roughLinksDict or
rough_analysis.PATH_NOT_EXIST_KEY in roughLinksDict):
raise ValueError('Some headers have no text')
links = final_analysis.get_clean_links(roughLinksDict,
decisionsHeaders)[0]


response = final_analysis.get_clean_links(roughLinksDict,
decisionsHeaders)
links, rejectedLinks = response[0], response[1]
if MY_DEBUG:
converters.save_pickle(links, 'allCleanLinks.pickle')
converters.save_pickle(links, 'TestResults\\allCleanLinks.pickle')
converters.save_pickle(rejectedLinks, 'TestResults\\allRejectedLinks.pickle')
linkGraph = final_analysis.get_link_graph(links)
if MY_DEBUG:
converters.save_pickle(linkGraph, 'linkGraph.pickle')
converters.save_pickle(linkGraph, 'TestResults\\linkGraph.pickle')
nFilter = models.GraphNodesFilter(
nodesTypes, firstDateForNodes, lastDateForNodes, nodesIndegreeRange,
nodesOutdegreeRange)
hFromFilter = models.HeadersFilter(
supertypesFrom,
docTypesFrom,
firstDateFrom, lastDateFrom)
hToFilter = models.HeadersFilter(
supertypesTo,
docTypesTo,
firstDateTo, lastDateTo)
eFilter = models.GraphEdgesFilter(hFromFilter, hToFilter, weightsRange)
subgraph = linkGraph.get_subgraph(nFilter, eFilter, includeIsolatedNodes)
if MY_DEBUG:
converters.save_pickle(subgraph, 'subgraph.pickle')
converters.save_pickle(subgraph, 'TestResults\\subgraph.pickle')
linkGraphLists = (subgraph.get_nodes_as_IDs_list(),
subgraph.get_edges_as_list_of_tuples())

@@ -208,7 +218,9 @@ def start_process_with(
firstDateForNodes=None, lastDateForNodes=None, nodesIndegreeRange=None,
nodesOutdegreeRange=None, nodesTypes=None, includeIsolatedNodes=True,
firstDateFrom=None, lastDateFrom=None, docTypesFrom=None,
supertypesFrom=None,
firstDateTo=None, lastDateTo=None, docTypesTo=None,
supertypesTo=None,
weightsRange=None,
graphOutputFilePath=PATH_TO_JSON_GRAPH,
showPicture=True, isNeedReloadHeaders=False,
@@ -221,40 +233,41 @@ def start_process_with(
raise "argument error: depth of the recursion must be large than 0."

if isNeedReloadHeaders or not os.path.exists(PATH_TO_PICKLE_HEADERS):
num = 3 # stub, del after web_crawler updating
headers = collect_headers(PATH_TO_PICKLE_HEADERS, num)
# num = 3 # stub, del after web_crawler updating
# headers = collect_headers(PATH_TO_PICKLE_HEADERS, num)
headers = collect_headers(PATH_TO_PICKLE_HEADERS)
else:
headers = converters.load_pickle(PATH_TO_PICKLE_HEADERS)
if (decisionID not in headers):
raise ValueError("Unknown uid")

if isinstance(firstDateForNodes, str):
firstDateForNodes = parser.parse(
firstDateForNodes = dateutil.parser.parse(
firstDateForNodes, dayfirst=True).date()
if isinstance(lastDateForNodes, str):
lastDateForNodes = parser.parse(
lastDateForNodes = dateutil.parser.parse(
lastDateForNodes, dayfirst=True).date()
if (firstDateForNodes is not None and
lastDateForNodes is not None and
firstDateForNodes > lastDateForNodes):
raise ValueError("date error: The first date is later than the last date.")

if isinstance(firstDateFrom, str):
firstDateFrom = parser.parse(
firstDateFrom = dateutil.parser.parse(
firstDateFrom, dayfirst=True).date()
if isinstance(lastDateFrom, str):
lastDateFrom = parser.parse(
lastDateFrom = dateutil.parser.parse(
lastDateFrom, dayfirst=True).date()
if (firstDateFrom is not None and
lastDateFrom is not None and
firstDateFrom > lastDateFrom):
raise ValueError("date error: The first date is later than the last date.")

if isinstance(firstDateTo, str):
firstDateTo = parser.parse(
firstDateTo = dateutil.parser.parse(
firstDateTo, dayfirst=True).date()
if isinstance(lastDateTo, str):
lastDateTo = parser.parse(
lastDateTo = dateutil.parser.parse(
lastDateTo, dayfirst=True).date()
if (firstDateTo is not None and
lastDateTo is not None and
@@ -292,9 +305,11 @@ def start_process_with(
nodesTypes, firstDateForNodes, lastDateForNodes, nodesIndegreeRange,
nodesOutdegreeRange)
hFromFilter = models.HeadersFilter(
supertypesFrom,
docTypesFrom,
firstDateFrom, lastDateFrom)
hToFilter = models.HeadersFilter(
supertypesTo,
docTypesTo,
firstDateTo, lastDateTo)
eFilter = models.GraphEdgesFilter(hFromFilter, hToFilter, weightsRange)
@@ -316,8 +331,9 @@
import time
start_time = time.time()
# process_period("18.06.1980", "18.07.2020", showPicture=False,
# isNeedReloadHeaders=False, includeIsolatedNodes=True)

# isNeedReloadHeaders=False, includeIsolatedNodes=False)
# process_period("18.06.1980", "18.07.2020", showPicture=False,
# isNeedReloadHeaders=False, includeIsolatedNodes=False)
# process_period(
# firstDateOfDocsForProcessing='18.03.2013',
# lastDateOfDocsForProcessing='14.08.2018',
@@ -335,21 +351,28 @@
# showPicture=True, isNeedReloadHeaders=False)

# start_process_with(decisionID='КСРФ/1-П/2015', depth=3)

# load_and_visualize()
start_process_with(
decisionID='КСРФ/1-П/2015', depth=10,
firstDateForNodes='18.03.2014', lastDateForNodes='14.08.2018',
nodesIndegreeRange=(0, 25), nodesOutdegreeRange=(0, 25),
nodesTypes={'КСРФ/О', 'КСРФ/П'},
includeIsolatedNodes=False,
firstDateFrom='18.03.2011', lastDateFrom='14.08.2019',
docTypesFrom={'КСРФ/О', 'КСРФ/П'},
firstDateTo='18.03.2011', lastDateTo='14.08.2018',
docTypesTo={'КСРФ/О', 'КСРФ/П'},
weightsRange=(1, 5),
graphOutputFilePath=PATH_TO_JSON_GRAPH,
showPicture=True, isNeedReloadHeaders=False)

# start_process_with(
# decisionID='КСРФ/1-П/2015', depth=10,
# firstDateForNodes='18.03.2014', lastDateForNodes='14.08.2018',
# nodesIndegreeRange=(0, 25), nodesOutdegreeRange=(0, 25),
# nodesTypes={'КСРФ/О', 'КСРФ/П'},
# includeIsolatedNodes=False,
# firstDateFrom='18.03.2011', lastDateFrom='14.08.2019',
# docTypesFrom={'КСРФ/О', 'КСРФ/П'},
# firstDateTo='18.03.2011', lastDateTo='14.08.2018',
# docTypesTo={'КСРФ/О', 'КСРФ/П'},
# weightsRange=(1, 5),
# graphOutputFilePath=PATH_TO_JSON_GRAPH,
# showPicture=True, isNeedReloadHeaders=False)
# source = web_crawler.Crawler.get_data_source('LocalFileStorage')
# text=source.get_data('КСРФ/19-П/2014', web_crawler.DataType.DOCUMENT_TEXT)

# process_period("18.09.2018", "18.07.2020", showPicture=True,
# isNeedReloadHeaders=False, includeIsolatedNodes=True)
import my_funs
my_funs.saving_all_clean_links()
print(f"Headers collection spent {time.time()-start_time} seconds.")
# get_only_unique_headers()
input('press any key...')
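
The same string-to-date guard is repeated for every date argument in `process_period` and `start_process_with`. A minimal sketch of how that pattern could be folded into helpers; the names `_parse_date` and `_check_date_order` are hypothetical and not part of this patch:

```python
# Sketch only: mirrors the repeated pattern in api_module.py, assuming python-dateutil is installed.
import dateutil.parser


def _parse_date(value):
    """Accept a 'DD.MM.YYYY' string, a datetime.date, or None; return a date or None."""
    if isinstance(value, str):
        return dateutil.parser.parse(value, dayfirst=True).date()
    return value


def _check_date_order(first, last):
    """Raise if both bounds are given and the first date is later than the last."""
    if first is not None and last is not None and first > last:
        raise ValueError("date error: The first date is later than the last date.")


# Usage mirroring the checks inside process_period:
firstDateFrom = _parse_date('18.03.2011')
lastDateFrom = _parse_date('14.08.2019')
_check_date_order(firstDateFrom, lastDateFrom)
```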
6 changes: 3 additions & 3 deletions link_analysis/converters.py
Expand Up @@ -3,7 +3,7 @@
import os
from typing import Dict, Iterable, TypeVar, Type, List, Union, Any

from models import Header, DuplicateHeader, DocumentHeader
from models import Header, DocumentHeader
from final_analysis import CleanLink

# Don't forget to add to this place new classes where implemented
@@ -66,7 +66,7 @@ def save_json(jsonSerializableData: object, pathToFile: str) -> bool:
dirname = os.path.dirname(pathToFile)
if dirname:
os.makedirs(dirname, exist_ok=True)
with open(pathToFile, 'w') as jsonFile:
with open(pathToFile, 'w', encoding='utf-8') as jsonFile:
json.dump(jsonSerializableData, jsonFile)
except OSError:
return False
@@ -75,7 +75,7 @@ def save_json(jsonSerializableData: object, pathToFile: str) -> bool:

def load_json(pathToFile: str) -> Union[object, None]:
try:
with open(pathToFile) as jsonFile:
with open(pathToFile, encoding='utf-8') as jsonFile:
data = json.load(jsonFile)
except OSError:
return None
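
The explicit `encoding='utf-8'` matters because the graph JSON carries Cyrillic document IDs such as 'КСРФ/1-П/2015'; without it, `open()` falls back to the platform default (e.g. cp1251 on Russian Windows). A minimal round-trip sketch under that assumption; `ensure_ascii=False` and the file name are illustrative additions, not part of the patch:

```python
# Sketch of the patched save/load behaviour with explicit UTF-8 encoding.
import json
import os


def save_json(data, path):
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as jsonFile:
        json.dump(data, jsonFile, ensure_ascii=False)  # keeps Cyrillic readable in the file


def load_json(path):
    with open(path, encoding='utf-8') as jsonFile:
        return json.load(jsonFile)


graph = {'nodes': ['КСРФ/1-П/2015'], 'edges': []}
save_json(graph, 'graph_demo.json')
assert load_json('graph_demo.json') == graph
```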
13 changes: 3 additions & 10 deletions link_analysis/final_analysis.py
@@ -1,5 +1,5 @@
import re
from models import Header, DuplicateHeader, CleanLink
from models import Header, CleanLink, Positions
from models import LinkGraph
from rough_analysis import RoughLink
from typing import Dict, Tuple, List, Union
@@ -10,7 +10,7 @@

def get_clean_links(
collectedLinks: Dict[Header, List[RoughLink]],
courtSiteContent: Dict[str, Union[Header, DuplicateHeader]],
courtSiteContent: Dict[str, Header],
courtPrefix: str='КСРФ/') -> Tuple[Dict[Header, List[CleanLink]],
Dict[Header, List[RoughLink]]]:
'''
@@ -35,17 +35,10 @@ def get_clean_links(
gottenID = (courtPrefix + number[0].upper() +
'/' + years.pop())
if gottenID in courtSiteContent:
try:
if isinstance(courtSiteContent[gottenID],
DuplicateHeader):
raise TypeError("It links on duplicating "
"document")
except TypeError:
break
eggs = True
years.clear()
headerTo = courtSiteContent[gottenID]
positionAndContext = (link.position, link.context)
positionAndContext = link.positions
cleanLink = None
for cl in checkedLinks[headerFrom]:
if cl.header_to == headerTo:
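
With `courtSiteContent` now typed as `Dict[str, Header]`, the duplicate-header guard disappears and the loop only has to merge repeated citations into one weighted link. A rough sketch of that aggregation step; `CleanLinkStub` is a stand-in, and the real `CleanLink`/`Positions` classes in models.py may differ:

```python
# Hypothetical illustration: the real CleanLink lives in models.py and may have a different API.
from dataclasses import dataclass, field
from typing import Any, List


@dataclass
class CleanLinkStub:
    header_from: Any
    header_to: Any
    citations_number: int
    positions: List[Any] = field(default_factory=list)

    def append(self, pos):
        self.positions.append(pos)


def merge_clean_link(checkedLinks, headerFrom, headerTo, pos):
    """Record a headerFrom -> headerTo citation, merging repeats into one weighted link."""
    for cl in checkedLinks.setdefault(headerFrom, []):
        if cl.header_to == headerTo:
            cl.citations_number += 1
            cl.append(pos)
            return cl
    newLink = CleanLinkStub(headerFrom, headerTo, 1, [pos])
    checkedLinks[headerFrom].append(newLink)
    return newLink


links = {}
merge_clean_link(links, 'КСРФ/1-П/2015', 'КСРФ/19-П/2014', (120, 180))
merge_clean_link(links, 'КСРФ/1-П/2015', 'КСРФ/19-П/2014', (400, 460))
assert links['КСРФ/1-П/2015'][0].citations_number == 2
```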