Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
5 changes: 2 additions & 3 deletions benchmarks/bench.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Pymorphy2 benchmark utility.

Expand All @@ -15,8 +13,9 @@

"""
import logging
import sys
import os
import sys

from docopt import docopt

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
Expand Down
1 change: 0 additions & 1 deletion benchmarks/shrink-unigrams.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import unicode_literals\n",
"import random\n",
"import math\n",
"random.seed(0)\n",
Expand Down
10 changes: 4 additions & 6 deletions benchmarks/speed.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals, division
import logging
import codecs
import os
import functools
import datetime
import functools
import logging
import os

from pymorphy2 import MorphAnalyzer
from benchmarks import utils
from pymorphy2 import MorphAnalyzer

logger = logging.getLogger('pymorphy2.bench')

Expand Down
5 changes: 2 additions & 3 deletions benchmarks/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals, division
import gc
import time
import timeit
import gc


def measure(func, inner_iterations=1, repeats=5):
"""
Expand Down
6 changes: 2 additions & 4 deletions docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-
#
# pymorphy2 documentation build configuration file, created by
# sphinx-quickstart on Sun Jul 29 04:34:30 2012.
#
Expand All @@ -10,9 +8,9 @@
#
# All configuration values have a default; values that are commented out
# serve to show the default.
from __future__ import unicode_literals

import sys, os
import os
import sys

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
Expand Down
3 changes: 1 addition & 2 deletions pymorphy2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
# -*- coding: utf-8 -*-
from .version import __version__
from .analyzer import MorphAnalyzer
from .version import __version__
24 changes: 10 additions & 14 deletions pymorphy2/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division
import os
import heapq
import collections
import heapq
import logging
import threading
import operator
import os
import threading
import warnings

import pymorphy2.lang
from pymorphy2 import opencorpora_dict
from pymorphy2.dawg import ConditionalProbDistDAWG
import pymorphy2.lang

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,7 +64,7 @@ def normalized(self):
# return self._dict.build_paradigm_info(self.para_id)


class ProbabilityEstimator(object):
class ProbabilityEstimator:
def __init__(self, dict_path):
cpd_path = os.path.join(dict_path, 'p_t_given_w.intdawg')
self.p_t_given_w = ConditionalProbDistDAWG().load(cpd_path)
Expand Down Expand Up @@ -140,14 +138,12 @@ def lang_dict_path(lang):
return lang_paths[lang]

raise ValueError(
"Can't find a dictionary for language %r. Installed languages: %r. "
"Try installing pymorphy2-dicts-%s package." % (
lang, list(lang_paths.keys()), lang
)
f"Can't find a dictionary for language {lang!r}. Installed languages: {list(lang_paths.keys())!r}. "
f"Try installing pymorphy2-dicts-{lang} package."
)


class MorphAnalyzer(object):
class MorphAnalyzer:
"""
Morphological analyzer for Russian language.

Expand Down Expand Up @@ -293,8 +289,8 @@ def choose_language(cls, dictionary, lang):
if dictionary.lang != lang:
# allow incorrect 'lang' values, but show a warning
warnings.warn(
"Dictionary language (%r) doesn't match "
"analyzer language (%r)." % (dictionary.lang, lang)
f"Dictionary language ({dictionary.lang!r}) doesn't match "
f"analyzer language ({lang!r})."
)

return lang
Expand Down
49 changes: 0 additions & 49 deletions pymorphy2/cache.py

This file was deleted.

43 changes: 14 additions & 29 deletions pymorphy2/cli.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals, print_function, division

import sys
import logging
import time
import codecs
import operator
import sys
import time
from functools import lru_cache

import pymorphy2
from pymorphy2.cache import lru_cache, memoized_with_single_argument
from pymorphy2.utils import get_mem_usage
from pymorphy2.tokenizers import simple_word_tokenize

PY2 = sys.version_info[0] == 2
from pymorphy2.utils import get_mem_usage

# Hacks are here to make docstring compatible with both
# docopt and sphinx.ext.autodoc.
Expand Down Expand Up @@ -80,10 +73,7 @@ def main(argv=None):
else:
score, lemmatize, tag = True, True, True

if PY2:
out_file = codecs.getwriter('utf8')(sys.stdout)
else:
out_file = sys.stdout
out_file = sys.stdout

return parse(
morph=morph,
Expand Down Expand Up @@ -112,14 +102,9 @@ def main(argv=None):
def _open_for_read(fn):
""" Open a file for reading """
if fn in ['-', '', None]:
if PY2:
return codecs.getreader('utf8')(sys.stdin)
else:
return sys.stdin
if PY2:
return codecs.open(fn, 'rt', encoding='utf8')
else:
return open(fn, 'rt', encoding='utf8')
return sys.stdin

return open(fn, 'rt', encoding='utf8')


# ============================ Commands ===========================
Expand Down Expand Up @@ -178,7 +163,7 @@ def parse(morph, in_file, out_file, tokenize, score, normal_form, tag,

_parse = parser.parse
if cache_size == 'unlim':
_parse = memoized_with_single_argument({})(_parse)
_parse = lru_cache(None)(_parse)
else:
cache_size = int(cache_size)
if cache_size:
Expand All @@ -189,7 +174,7 @@ def parse(morph, in_file, out_file, tokenize, score, normal_form, tag,
_write(_parse(token))


class _TokenParserFormatter(object):
class _TokenParserFormatter:
"""
This class defines its `parse` method based on arguments passed.
Some ugly code is to make all ifs work only once, not for each token.
Expand Down Expand Up @@ -218,14 +203,14 @@ def __init__(self, morph, score, normal_form, tag, newlines, thresh):
if score:
def _parse_token(tok):
seq = [
"%s:%0.3f=%s" % (p.normal_form, p.score, p.tag)
f"{p.normal_form}:{p.score:0.3f}={p.tag}"
for p in morph_parse(tok) if p.score >= thresh
]
return tpl % (tok, join(seq))
else:
def _parse_token(tok):
seq = [
"%s:%s" % (p.normal_form, p.tag)
f"{p.normal_form}:{p.tag}"
for p in morph_parse(tok) if p.score >= thresh
]
return tpl % (tok, join(seq))
Expand All @@ -241,7 +226,7 @@ def _parse_token(tok):
key=val, reverse=True
)
if score:
seq = ["%s:%0.3f" % (lemma, w) for (lemma, w) in items]
seq = [f"{lemma}:{w:0.3f}" for (lemma, w) in items]
else:
seq = [lemma for (lemma, w) in items]

Expand All @@ -250,7 +235,7 @@ def _parse_token(tok):
if score:
def _parse_token(tok):
seq = [
"%0.3f=%s" % (p.score, p.tag)
f"{p.score:0.3f}={p.tag}"
for p in morph_parse(tok) if p.score >= thresh
]
return tpl % (tok, join(seq))
Expand Down
21 changes: 9 additions & 12 deletions pymorphy2/dawg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division

try:
from dawg import DAWG, RecordDAWG, IntCompletionDAWG
EXTENSION_AVAILABLE = True
Expand All @@ -25,14 +22,14 @@ class WordsDawg(RecordDAWG):
# We are storing 2 unsigned short ints as values:
# the paradigm ID and the form index (inside paradigm).
# Byte order is big-endian (this makes word forms properly sorted).
DATA_FORMAT = str(">HH")
DATA_FORMAT = ">HH"

def __init__(self, data=None):
if data is None:
super(WordsDawg, self).__init__(self.DATA_FORMAT)
super().__init__(self.DATA_FORMAT)
else:
assert_can_create()
super(WordsDawg, self).__init__(self.DATA_FORMAT, data)
super().__init__(self.DATA_FORMAT, data)


class PredictionSuffixesDAWG(WordsDawg):
Expand All @@ -43,7 +40,7 @@ class PredictionSuffixesDAWG(WordsDawg):
# We are storing 3 unsigned short ints as values:
# count, the paradigm ID and the form index (inside paradigm).
# Byte order is big-endian (this makes word forms properly sorted).
DATA_FORMAT = str(">HHH")
DATA_FORMAT = ">HHH"


class ConditionalProbDistDAWG(IntCompletionDAWG):
Expand All @@ -52,17 +49,17 @@ class ConditionalProbDistDAWG(IntCompletionDAWG):

def __init__(self, data=None):
if data is None:
super(ConditionalProbDistDAWG, self).__init__()
super().__init__()
else:
assert_can_create()
dawg_data = (
("%s:%s" % (word, tag), int(prob*self.MULTIPLIER))
(f"{word}:{tag}", int(prob * self.MULTIPLIER))
for (word, tag), prob in data
)
super(ConditionalProbDistDAWG, self).__init__(dawg_data)
super().__init__(dawg_data)

def prob(self, word, tag):
dawg_key = "%s:%s" % (word, tag)
dawg_key = f"{word}:{tag}"
return self.get(dawg_key, 0) / self.MULTIPLIER


Expand All @@ -71,7 +68,7 @@ def is_prefixed(self, word):
return bool(self.prefixes(word))


class PythonPrefixMatcher(object):
class PythonPrefixMatcher:
def __init__(self, prefixes):
self._prefixes = tuple(prefixes)

Expand Down
2 changes: 0 additions & 2 deletions pymorphy2/lang/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import ru, uk
2 changes: 0 additions & 2 deletions pymorphy2/lang/ru/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .config import *
2 changes: 0 additions & 2 deletions pymorphy2/lang/ru/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
"""
Constants and configuration for Russian language.
"""
from __future__ import absolute_import, unicode_literals
from pymorphy2 import units

# paradigm prefixes used for dictionary compilation
Expand Down
2 changes: 0 additions & 2 deletions pymorphy2/lang/uk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .config import *
3 changes: 0 additions & 3 deletions pymorphy2/lang/uk/_prefixes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals

# Prefixes which don't change the word parse.
# The list is from
# https://github.com/languagetool-org/languagetool/blob/master/languagetool-language-modules/uk/src/main/resources/org/languagetool/resource/uk/dash_prefixes.txt
Expand Down
2 changes: 0 additions & 2 deletions pymorphy2/lang/uk/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
"""
Constants and configuration for Ukrainian language.
"""
from __future__ import absolute_import, unicode_literals
from pymorphy2 import units
from ._prefixes import KNOWN_PREFIXES

Expand Down
Loading