1. Install ElasticSearch (ES).
2. Install an ES plugin for Polish https://github.com/allegro/elasticsearch-analysis-morfologik 
3. Define an ES analyzer for Polish texts containing:
   1. standard tokenizer
   1. synonym filter with the following definitions:
      1. kpk - kodeks postępowania karnego
      1. kpc - kodeks postępowania cywilnego
      1. kk - kodeks karny
      1. kc - kodeks cywilny
   1. Morfologik-based lemmatizer
   1. lowercase filter

In [537]:
import re
import tarfile
import os
import matplotlib.pyplot as plt
import numpy as np
import regex
import pandas as pd
import morfeusz2
%matplotlib inline



from elasticsearch import Elasticsearch
from elasticsearch_dsl import analyzer, tokenizer

es = Elasticsearch("http://localhost:9200")
resp = es.info()
resp

{'name': 'x-dell',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'hrJwCHZtSGm_1kBHAztvag',
 'version': {'number': '7.10.1',
  'build_flavor': 'default',
  'build_type': 'deb',
  'build_hash': '1c34507e66d7db1211f66f3513706fdf548736aa',
  'build_date': '2020-12-05T01:00:33.671820Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [593]:
from elasticsearch_dsl import  *
from elasticsearch import *
from datetime import datetime

from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost'])
es = Elasticsearch()

synonym = token_filter('my_name', type='synonym',
                       synonyms =
                       ["kpk => kodeks postępowania karnego",
                        "kpc => kodeks postępowania cywilnego",
                        "kk => kodeks karny",
                        "kc => kodeks cywilny"
                       ])

my_analyzer = analyzer('my_analyzer',
    tokenizer=tokenizer('standard'),
    filter=["lowercase",synonym,"morfologik_stem",]
)


# es.delete_by_query(index=['ustawa'], body={"query": {"match_all": {}}})


class Ustawa(Document):
    title = Text()
    created_at = Integer()
    content = Text(analyzer=my_analyzer)
    
    class Index:
        name = 'ustawa'
        
        

Ustawa.init()


path="../data/ustawy"
i=0
for filename in os.listdir(path):
    with open(os.path.join(path, filename), 'r',encoding='utf-8') as f: # open in readonly mode
        
        # title = 
        year = int(re.search(r'.*\/([^\/]{4})',f.name).group(1))
        first = Ustawa(title=f.name,content=f.read(),created_at = year)
        
        first.save(using=es)
        i+=1
        # if i==50:
        #     break
        # break
print(i) 








1179


In [594]:
from elasticsearch_dsl import *
from elasticsearch import *

In [599]:
from elasticsearch import MatchPhrase


ImportError: cannot import name 'MatchPhrase' from 'elasticsearch' (/home/x/anaconda3/envs/ml/lib/python3.8/site-packages/elasticsearch/__init__.py)

In [542]:
elasticsearch_dsl.connections.add_connection('python_client', es)

print(es.cat.plugins(params={'v': '', 'h': 'name,component,version,description'}))


name   component           version description
x-dell analysis-morfologik 7.10.1  Morfologik Polish Lemmatizer plugin for Elasticsearch



In [543]:
search = Search(index='ustawa',using=es).query('match', content='ustawa').count()
print(search)
# r =search.execute()
# print(len(r))

1178


In [547]:

s = Search(using=es, index="ustawa").count()
print(s)

1179


In [334]:
s = Search(using=es, index="ustawa").query("match_phrase", title='../data/ustawy/2001_906.txt').count()
print(s)

1


4. Define an ES index for storing the contents of the legislative acts.
5. Load the data to the ES index.
6. Determine the number of legislative acts containing the word **ustawa** (in any form).

In [330]:
s = Search(using=es, index="ustawa").filter("match_phrase", content='ustawa').count()
print(f'There are {s} documents with the word "ustawa" in them')

There are 1178 documents with the word "ustawa" in them


7. Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms.
8. Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms.

9. Determine the number of legislative acts containing the words **kodeks postępowania cywilnego** 
   in the specified order, but in any inflection form.

In [546]:
s = Search(using=es, index="ustawa").query("match_phrase", content='kodeks postępowania cywilnego').count()
print(f'The number of files containing the words "kodeks postępowania cywilnego" is: {s}')

s = Search(using=es, index="ustawa").query("match_phrase", content='kpc').count()
print(f'The number of files containing the words "kpc" is: {s}')

The number of files containing the words "kodeks postępowania cywilnego" is: 99
The number of files containing the words "kpc" is: 99


10. Determine the number of legislative acts containing the words **wchodzi w życie** 
   (in any form) allowing for up to 2 additional words in the searched phrase.

In [631]:
q = elasticsearch_dsl.query.MatchPhrase(content={'query' : "wchodzi w życie",'slop' :2})
s = Search(using=es, index="ustawa").query(q).count()

print(s)

2348


In [143]:
s = Search(using=es, index="ustawa").query("multi_match", query='skreśla się').execute()
# print()
# for i in s.scan():
#     print(i)

print(s[0]['content'])





Dz.U. z 2000 r. Nr 122, poz. 1324
                                                                                
                                                                                
                                                                                
                                                                                
                                     USTAWA
                            z dnia 22 grudnia 2000 r.
                                        
    o zmianie ustawy o powszechnym ubezpieczeniu zdrowotnym, ustawy o podatku
   dochodowym od osób fizycznych, ustawy o podatku dochodowym od osób prawnych
    oraz ustawy o zryczałtowanym podatku dochodowym od niektórych przychodów
                         osiąganych przez osoby fizyczne
                                        
                                        
                                     Art. 1.
W ustawie z dnia 6 lutego 1997 r. o powszechnym ubezpieczeniu zdrowotnym (Dz.U.
N

In [84]:
body = {
    # "from":0,
    # "size":1,
    "query": {
        "match_phrase": {
            "sentence":"Ustawa"
        }
    }
}

res = es.search(index="ustawa", body=body)

res

  res = es.search(index="ustawa", body=body)


{'took': 0,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [96]:
import os
path="../data/ustawy"
for filename in os.listdir(path):
    with open(os.path.join(path, filename), 'r') as f: # open in readonly mode
        print(f.read())
        text=f.read()
        break
      # do your stuff






Dz.U. z 1998 r. Nr 117, poz. 759                                      
                                        
                                        
                                        
                                        
                                        
                                     USTAWA
                             z dnia 25 lipca 1998 r.
                                        
                       o zmianie ustawy o systemie oświaty
                                        
                                        
                                     Art. 1.
W ustawie z dnia 7 września 1991 r. o systemie oświaty (Dz.U. z 1996 r. Nr 67,
poz. 329, Nr 106, poz. 496 oraz z 1997 r. Nr 28, poz. 153 i Nr 141, poz. 943)
wprowadza się następujące zmiany:
1) w art. 1:
   a) po pkt 5 dodaje się pkt 5a w brzmieniu:
      "5a) opiekę nad uczniami ze znacznymi lub sprzężonymi dysfunkcjami
         poprzez umożliwianie realizowania indywidualnych form i programó

In [4]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

doc = {
    'author': 'kimchy',
    'text': 'Elasticsearch: cool. bonsai cool.',
    'timestamp': datetime.now(),
}
res = es.index(index="test-index", id=1, document=doc)
print(res['result'])

res = es.get(index="test-index", id=1)
print(res['_source'])

es.indices.refresh(index="test-index")

res = es.search(index="test-index", query={"match_all": {}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])



created
{'author': 'kimchy', 'text': 'Elasticsearch: cool. bonsai cool.', 'timestamp': '2021-10-30T22:11:21.389803'}
Got 1 Hits:
2021-10-30T22:11:21.389803 kimchy: Elasticsearch: cool. bonsai cool.


In [2]:
from elasticsearch_dsl.query import MultiMatch, Match

# {"multi_match": {"query": "python django", "fields": ["title", "body"]}}
MultiMatch(query='python django', fields=['title', 'body'])

# {"match": {"title": {"query": "web framework", "type": "phrase"}}}
Match(title={"query": "web framework", "type": "phrase"})


Match(title={'query': 'web framework', 'type': 'phrase'})

11. Determine the 10 documents that are the most relevant for the phrase **konstytucja**.
12. Print the excerpts containing the word **konstytucja** (up to three excerpts per document) 
   from the previous task.

In [588]:
from pprint import pprint

s = Search(using=es, index="ustawa").filter("match_phrase", content='konstytucja').sort()
s.highlight(fields='content').execute()
for i,hit in enumerate(s[:10]):
    pprint(vars(hit))

    # print(i, hit.title,hit)
    break
# print(s[:10])



{'_d_': {'content': '\n'
                    '\n'
                    '\n'
                    '\n'
                    'Dz.U. z 1998 r. Nr 91, poz. 578 \n'
                    '                                        \n'
                    '                                                                                \n'
                    ' \n'
                    ' \n'
                    ' \n'
                    '                                     USTAWA\n'
                    '                            z dnia 5 czerwca 1998 r.\n'
                    '                                        \n'
                    '                            o samorządzie powiatowym\n'
                    '                                        \n'
                    '                                        \n'
                    '                                   Rozdział 1\n'
                    '                                 Przepisy ogólne\n'
                    '              

In [586]:
s = Search(using=es, index="ustawa").query('nested', query=Q('match', content='konstytucja'), inner_hits={"highlight": {"fields": {"content": {}}}})

for hit in s:
    for comment in hit.meta.inner_hits.content.hits:
        # not 100% sure about the path, but look into comment.meta
        for fragment in comment.meta.highlight['content']:
            print(fragment)


RequestError: RequestError(400, 'illegal_argument_exception', "[nested] requires 'path' field")

In [454]:
s = Search().query('nested', path='comments', query=Q('match_phrase', content='konstytucja'), inner_hits={"highlight": {"fields": {"comments.content": {}}}})

for hit in s:
    for comment in hit.meta.inner_hits.comments.hits:
        # not 100% sure about the path, but look into comment.meta
        for fragment in comment.meta.highlight['comment.content']:
            print(fragment)


[]
