In [3]:
import re

In [5]:
import difflib

def get_fixes_diff(clean, text):
    """Build a per-character marker list from a character-level diff.

    Runs ``difflib.ndiff`` over the two strings and collects the leading tag
    of every emitted line whose tag is ``'-'`` or ``' '``; lines carrying any
    other tag (for example ``'+'``) are ignored.

    Args:
        clean: Reference string (first ``ndiff`` argument).
        text: String compared against ``clean`` (second ``ndiff`` argument).

    Returns:
        list[str]: The collected ``'-'`` / ``' '`` tags, in diff order.
    """
    kept_tags = ('-', ' ')
    return [
        line[0]
        for line in difflib.ndiff(clean, text)
        if line[0] in kept_tags
    ]

### Markdown

In [6]:
# Sample input for the link check: an HTML fragment whose visible text
# (outside the tags) contains a bare link, "www.sberbank.ru".
text = '''Узнать больше об условиях можно тут:<span>&#xa0;</span></span><a class="external-link" href="file:///C:/Users/Elizaveta/AppData/Roaming/Microsoft/Word/www.sberbank.ru" rel="nofollow" style="text-decoration: none;text-align: left;" title="">www.sberbank.ru</a></td><td class="highlight-blue confluenceTd" colspan="1" data-highlight-colour="blue" title="Background colour : Blue"><span style="color: rgb(23,43,77);" title="">Подробные условия смотрите<span>&#xa0;</span></span><a class="external-link" href="http://www.sberbank.ru/" rel="nofollow" style="text-decoration: none;text-align: left;" title="">на сайте СберБанка</a></td><td class="highlight-blue confluenceTd" colspan="1" data-highlight-colour="blue" title="Background colour : Blue"><span style="color: rgb(255,0,0);" title="">IN PROGRESS</span>'''

In [7]:
# Diffing the text against itself yields one ' ' marker per character:
# a blank mask that the checks below overwrite with '-' where they find issues.
diff = get_fixes_diff(text, text)

In [None]:
# Findings registry: check keyword -> list of messages (filled by check_markdown).
err = {}

In [8]:
def highlight_url(diff, text, url):
    """Mark the first occurrence of ``url`` in ``text`` with ``'-'`` in ``diff``.

    The lookup is case-insensitive on both sides, so a URL captured with its
    original casing is still located. ``diff`` is modified in place and its
    length never changes: when the URL is absent nothing is marked, and a
    match that runs past the end of ``diff`` is clipped to the available
    positions.

    Args:
        diff: List of per-character markers to update.
        text: Text in which the URL is searched.
        url: URL substring to highlight.

    Returns:
        list: The same ``diff`` object that was passed in.
    """
    if not url:
        return diff
    start = text.lower().find(url.lower())
    if start == -1:
        # A -1 offset used as a slice bound would insert markers near the
        # end of the list instead of overwriting anything.
        return diff
    stop = min(start + len(url), len(diff))
    if start < stop:
        diff[start:stop] = ['-'] * (stop - start)
    return diff

In [9]:
def check_markdown(text, diff, err):
    """Detect bare URLs left in the visible text and report them.

    Every ``<...>`` tag in ``text`` is replaced by a single space, the result
    is scanned with a URL regex, and each hit is passed to ``highlight_url``
    to be marked in ``diff``. When at least one URL is found, a single
    message listing all of them is appended under the ``'markdown'`` key of
    ``err`` (the key is created with an empty list if missing).

    NOTE(review): URL offsets are looked up in the tag-stripped text, while
    ``diff`` is whatever the caller supplies; confirm the caller expects
    ``diff`` to be indexed by the stripped text.

    Args:
        text: Raw text, possibly containing HTML tags.
        diff: List of per-character markers, forwarded to ``highlight_url``.
        err: Dict mapping check keyword to a list of messages; updated in place.

    Returns:
        tuple: ``(stripped_text, diff, err)``.
    """
    category = 'markdown'
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    stripped = re.sub(r'<.*?>', ' ', text)
    # findall returns one tuple per match; the first group is the whole URL.
    found = [groups[0] for groups in re.findall(url_pattern, stripped)]

    messages = err.setdefault(category, [])
    if found:
        messages.append('Неправильно оформлены ссылки: {}'.format(found))

    for link in found:
        diff = highlight_url(diff, stripped, link)
    return stripped, diff, err

In [10]:
# Run the link check. Note that `text` is rebound to the tag-stripped
# version returned by check_markdown, not the original HTML fragment.
text, diff, err = check_markdown(text, diff, err)

### Depersonalization

In [None]:
%%capture
!pip install pymorphy2

In [12]:
import pymorphy2
# Shared Russian morphological analyzer; get_verb reads this module-level name.
analyzer = pymorphy2.MorphAnalyzer(lang='ru')

In [64]:
# Sample sentence containing the pronoun "мы" followed by a verb;
# used as input for the depersonalization check.
text = '''В течении трех дней мы оформим вашу карту'''

In [65]:
# Start this section with an empty findings registry (keyword -> list of messages).
err = {}

In [66]:
# Blank marker mask for the sample sentence: one ' ' per character.
diff = get_fixes_diff(text, text)

In [67]:
def find_pronoun(text):
    """Return the first first-person pronoun token found in ``text``.

    The text is lower-cased and split on single spaces; each token is
    compared verbatim against ``'я'`` and ``'мы'``, so a pronoun with
    punctuation attached to it is not recognised.

    Args:
        text: Sentence to scan.

    Returns:
        str | bool: The matching lower-case token, or ``False`` when the
        text contains neither pronoun.
    """
    pronouns = ('я', 'мы')
    words = text.lower().split(' ')
    return next((word for word in words if word in pronouns), False)

In [68]:
def get_text_part(text, pronoun):
    """Return the lower-cased tail of ``text`` starting at ``pronoun``.

    The text is lower-cased and split on single spaces; the tail begins at
    the first token equal to ``pronoun`` and is re-joined with single spaces.

    Args:
        text: Sentence to cut.
        pronoun: Token to look for (compared against lower-cased tokens).

    Returns:
        str | bool: The tail of the sentence from the pronoun onwards, or
        ``False`` when no token equals ``pronoun``.
    """
    words = text.lower().split(' ')
    try:
        first_hit = words.index(pronoun)
    except ValueError:
        return False
    return ' '.join(words[first_hit:])

In [69]:
def get_verb(text):
    """Find the first inflectable verb in ``text`` and build its 3rd-person form.

    Tokens are produced by splitting on single spaces. For each token the
    first parse from the module-level ``analyzer`` is used; a token counts
    as a verb when its part of speech is ``VERB`` or ``INFN``.

    The verb is first inflected to perfective, transitive, singular,
    3rd person, future, indicative. ``inflect`` returns ``None`` when the
    word has no such form, so in that case only number and person are
    requested (``sing`` + ``3per``). A verb that cannot be inflected either
    way is skipped and the search continues with the next token.

    Args:
        text: Space-separated text to scan.

    Returns:
        tuple[str, str] | None: ``(original_token, inflected_word)`` for the
        first verb that could be inflected, or ``None`` when there is none.
    """
    target_form = {'perf', 'tran', 'sing', '3per', 'futr', 'indc'}
    fallback_form = {'sing', '3per'}
    for token in text.split(' '):
        parsed = analyzer.parse(token)[0]
        if parsed.tag.POS not in ('VERB', 'INFN'):
            continue
        inflected = parsed.inflect(target_form)
        if inflected is None:
            # No form with the full grammeme set exists for this verb;
            # keep its own aspect/tense and change only number and person.
            inflected = parsed.inflect(fallback_form)
        if inflected is not None:
            return token, inflected.word
    return None

In [70]:
def replace_pronoun(text, pronoun, verb_old, verb_new):
    """Replace the verb with its new form and the pronoun with "банк".

    Only whole words are replaced, so a short pronoun such as "я" no longer
    rewrites the same letters inside other words. Matching ignores case
    (the pronoun and verb are derived from lower-cased text, while ``text``
    keeps its original casing); when the replaced word started with a
    capital letter, the replacement is capitalised as well.

    Args:
        text: Original sentence.
        pronoun: Pronoun to replace with "банк".
        verb_old: Verb form currently present in the sentence.
        verb_new: Verb form to put in its place.

    Returns:
        str: The sentence with the verb and then the pronoun replaced.
    """
    def _replace_word(source, old, new):
        # Swap whole-word, case-insensitive occurrences of `old` for `new`.
        if not old:
            return source
        # Look-arounds instead of \b so the boundary also works when `old`
        # itself starts or ends with a non-word character.
        pattern = r'(?<!\w)' + re.escape(old) + r'(?!\w)'

        def _keep_capital(match):
            found = match.group()
            if found[:1].isupper():
                return new[:1].upper() + new[1:]
            return new

        return re.sub(pattern, _keep_capital, source, flags=re.IGNORECASE)

    # Same order as before: the verb first, then the pronoun.
    text = _replace_word(text, verb_old, verb_new)
    return _replace_word(text, pronoun, 'банк')

In [72]:
def depersonalization(text, err):
    """Rewrite a first-person sentence so that "банк" becomes the subject.

    Looks for a first-person pronoun with ``find_pronoun``, takes the first
    verb after it via ``get_verb`` and lets ``replace_pronoun`` apply both
    substitutions. Each applied rewrite is logged under the
    ``'personalization'`` key of ``err`` (the key is always created).

    When there is no pronoun, or no verb can be obtained after the pronoun,
    the text is returned unchanged and nothing is logged.

    Args:
        text: Sentence to process.
        err: Dict mapping check keyword to a list of messages; updated in place.

    Returns:
        tuple: ``(text, err)`` where ``text`` is the possibly rewritten sentence.
    """
    keyword = 'personalization'
    pronoun = find_pronoun(text)
    messages = err.setdefault(keyword, [])
    if not pronoun:
        return text, err

    tail = get_text_part(text, pronoun)
    # get_verb returns None when it finds no usable verb; unpacking that
    # result directly would raise TypeError.
    verb_pair = get_verb(tail) if tail else None
    if verb_pair is None:
        return text, err

    verb_old, verb_new = verb_pair
    text = replace_pronoun(text, pronoun, verb_old, verb_new)
    messages.append(pronoun + ' ' + verb_old + ' > ' + 'банк' + ' ' + verb_new)
    return text, err

In [74]:
# Apply the rewrite; `text` is rebound to the returned sentence and
# `err` receives the log of what was changed.
text, err = depersonalization(text, err)

### Dash 

In [None]:
# Same sample sentence as in the depersonalization section. No dash check
# is defined in the visible part of the notebook.
text = '''В течении трех дней мы оформим вашу карту'''

In [None]:
# Empty findings registry for this section (keyword -> list of messages).
err = {}

In [None]:
# Blank marker mask for the sample sentence: one ' ' per character.
diff = get_fixes_diff(text, text)

### Number decoration

### Currency designation 

### Lists

### Naming