### NLP of legal texts
Analysis of agreements between governments 
* Key words & key phrases extraction with TF-IDF and N-gramms
* NER for DATES with ([Natasha](https://natasha.github.io/demo/) (rule-based lib for Russian language). Sequence model, implemented in [AnaGo](https://anago.herokuapp.com/) and NER by [DeepMIPT](https://demo.ipavlov.ai/) have lower accuracy for this type of text.
* Dictionary method and morphological analysis for finding ORGANIZATIONS and COUNTRIES (accuracy is more important than the opportunity to expand the lists)

In [1]:
import re
import os
import docx
from datetime import date
import pymorphy2
import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
m = pymorphy2.MorphAnalyzer()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from natasha import (
    DatesExtractor,
)

[nltk_data] Downloading package punkt to /home/alissia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data preprocessing

In [2]:
def getText(document):
    if document.endswith(".txt"):
        with open(document, "r") as f:
            return f.readlines()
    elif document.endswith(".docx"):
        doc = docx.Document(document)
        fullText = []
        for para in doc.paragraphs:
            fullText.append(para.text)
        text = '\n'.join(fullText)
        file_lines = []
        for line in text.splitlines():
            if line != '':
                file_lines.append(line)
        return file_lines

In [3]:
def lemm(word):
    word = re.sub("(</?.*?>)|(<>)|(\\d|\\W)+", '', word).lower()
    return m.parse(word)[0].normal_form.strip()
def preprocess(file_readed_by_lines):
    return [[lemm(word) for word in word_tokenize(text) if ((lemm(word) not in stopWords) and len(word)>3)] for text in file_readed_by_lines]

In [4]:
stopWords = set(line.strip() for line in open('files/RUstopwords.txt', 'r'))
# can be expanded
print(len(stopWords))

151


In [5]:
countries = set(line.strip().lower() for line in open(os.path.abspath('files/countries.txt'), 'r'))
print(len(countries))

204


In [7]:
organizations = set(line.strip().lower() for line in open(os.path.abspath('files/organizations.txt'), 'r'))
# can be expanded
print(len(organizations))

156


In [8]:
# corpus is uploaded from https://xn--80abucjiibhv9a.xn--p1ai/%D0%BC%D0%B8%D0%BD%D0%B8%D1%81%D1%82%D0%B5%D1%80%D1%81%D1%82%D0%B2%D0%BE/68/%D1%84%D0%B0%D0%B9%D0%BB/916/%D0%9C%D0%A1_%D0%9D%D0%A2%D0%A1.pdf
# doesn't contain the test doc
corp = 'corpus.txt'
corpus = preprocess(getText(os.path.abspath('files/'+corp)))

In [9]:
filename = 'testdoc.docx'

In [26]:
#filename = 'testdoc.txt'

In [27]:
doc_lines = getText(os.path.abspath('files/'+filename))
doc = preprocess(doc_lines)

### Computing TF-IDF and extracting key words

In [28]:
def data_tfidf(text_lines, sep, space=""):
    data = ""
    for line in text_lines:
        data += sep.join(line) + space
    return data

#  sorts the values in the vector while preserving the column index
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key = lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=20):
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    for idx, score in sorted_items:
        fname = feature_names[idx]
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results
    
def get_keywords(doc, corpus_tfidf):
    
    cv = CountVectorizer(max_df=0.85, stop_words=stopWords)
    word_count_vector = cv.fit_transform(word_tokenize(corpus_tfidf))
    tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    feature_names = cv.get_feature_names()
    
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc])) # enumerates a vector of tf-idf scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    return extract_topn_from_vector(feature_names,sorted_items,20) 

In [29]:
corpus_tfidf = data_tfidf(data_tfidf(corpus, " ", space=" "), " ")
doc_tfidf = data_tfidf(data_tfidf(doc, " "), " ")

In [30]:
keywords = list(get_keywords(doc_tfidf, corpus_tfidf))

In [31]:
kw_euristics = [[word for word in line if (word[-3:]=="ция")] for line in doc]
for line in kw_euristics:
    for word in line:
        keywords.append(word)

### Key phrases with tf-idf using N-gramms

In [32]:
def words_to_bigramms(text, str_bigrams = ""):
    for line in text:
        bigrams = ngrams(line,2)
        for k1, k2 in Counter(bigrams):
            str_bigrams += k1+ "_" + k2+ "_" + " "
    return str_bigrams

In [33]:
def words_to_trigramms(text, str_trigrams = ""):
    for line in text:
        trigrams = ngrams(line,3)
        for k1, k2,k3 in Counter(trigrams):
            str_trigrams += k1+ "_" + k2+ "_" +k3 + " "
    return str_trigrams

In [34]:
keyphrases = list(get_keywords(words_to_bigramms(corpus), words_to_bigramms(doc)))
keyphrases += list(get_keywords(words_to_trigramms(corpus), words_to_bigramms(doc)))

### Output

In [35]:
# euristics and morphological analysis
title = str(doc_lines[0]).strip()
topic = title.split(' ')[-2:]   
noun = m.parse(topic[1])[0]
adj = m.parse(topic[0])[0].inflect({noun.tag.gender, 'sing', 'nomn'})
print("\nНазвание документа: %s%s" % (title[0].upper(), title[1:]))



Название документа: СОГЛАШЕНИЕ между Правительством Российской Федерации и Правительством Федеративной Республики Бразилии о научно-техническом сотрудничестве


In [36]:
# dictionary method 
orgs = []
orgs_ = [[word for word in line if (word in organizations)] for line in doc]
for line in orgs_:
    for word in line:
        orgs.append(word)
for word in set(orgs):
    print("\nОрганизации:", word[0].upper()+word[1:])

In [37]:
# dictionary method 
coun_euristics = []
coun = [[word for word in line if (word in countries)] for line in doc]
for line in coun:
    for word in line:
        coun_euristics.append(word)
for word in set(coun_euristics):
    print("\nСтраны:", word[0].upper()+word[1:])


Страны: Бразилия


In [38]:
# euristics and morphological analysis
act_type = ["в рамках", "содружества", "государств-участников", "межгосударственном"]
title_lemm = ' '.join([lemm(word) for word in word_tokenize(title)])

for word in act_type:
    if lemm(word) in title_lemm:
        type = "Многостороннее соглашение"
    else:
        type = "Двустороннее соглашение"
print("\nВид документа: %s" % type) 


Вид документа: Двустороннее соглашение


In [39]:
print("\nНаправление:", adj[0][0].upper()+adj[0][1:].lower(), lemm(noun[0]))
print("Область:", adj[0][0].upper()+adj[0][1:].lower(), lemm(noun[0]))


Направление: Научно-техническое сотрудничество
Область: Научно-техническое сотрудничество


In [40]:
# NER
dates = []
attention = set()
extractor = DatesExtractor()
for line in doc_lines:
    matches = extractor(line)
    for index, match in enumerate(matches):
        try:
            dates.append(date(match.fact.year, match.fact.month, match.fact.day))
        except TypeError as e:
            attention = match.fact.year, match.fact.month, match.fact.day
# usually acts with earlier dates are denied or are the ones which the current document is based on
data2 = dates.pop(dates.index(max(dates)))
data1 = max(dates);
print("\nДата заключения:", data1)
print("\nДата вступления в силу:", data2)
if len(attention) > 0:
    print("\n*Документ также содержит даты в неполном формате", attention)


Дата заключения: 1997-11-21

Дата вступления в силу: 1999-09-30

*Документ также содержит даты в неполном формате (2002, 12, None)


In [41]:
print("\nКлючевые слова:")
for word in set(keywords):
    print(word)

print("\nНаиболее часто встречающиеся выражения (n-gramms):")
for phrase in set(keyphrases):
    print(phrase)


Ключевые слова:
сотрудничество
бразилия
правительство
комиссия
реализация
страна
каждый
организация
сила
информация
проект
сторона
научнотехнический
настоящее
федеративный
соглашение
действие
республика
отношение
рекомендация
ассоциация
координация
настоящий
свой
бразилиа
федерация

Наиболее часто встречающиеся выражения (n-gramms):
договариваться_сторона_
научнотехнический_информация_
сотрудничество_область_
прекращение_действие_
соглашение_научнотехнический_
декабрь__
наука_техника_
федерация_правительство_
область_наука_
интеллектуальный_собственность_
бразилия_апрель_
рамка_настоящий_
_год_
правительство_российский_
сотрудничество_рамка_
бразилиа_ноябрь_
российский_федерация_
результат_сотрудничество_
распределение_право_
научнотехнический_сотрудничество_
настоящий_соглашение_
настоящее_соглашение_
вступать_сила_
оба_страна_
