In [19]:
%matplotlib inline
import re
import tarfile
import os
import matplotlib.pyplot as plt
import numpy as np
import regex
import pandas as pd
import morfeusz2
from elasticsearch_dsl import *
from elasticsearch_dsl import query
from elasticsearch import *

1. Install ElasticSearch (ES).
2. Install an ES plugin for Polish https://github.com/allegro/elasticsearch-analysis-morfologik 

In [2]:
# NOTE: please remove all content from ElasticSearch before restarting the kernel:
#   curl -X DELETE 'http://localhost:9200/_all'

# Low-level client talking to the local ElasticSearch node.
es = Elasticsearch(hosts=["http://localhost:9200"])

# Cluster/node information; shown as the cell output to confirm the node is reachable.
resp = es.info()
resp

{'name': 'x-dell',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'hrJwCHZtSGm_1kBHAztvag',
 'version': {'number': '7.10.1',
  'build_flavor': 'default',
  'build_type': 'deb',
  'build_hash': '1c34507e66d7db1211f66f3513706fdf548736aa',
  'build_date': '2020-12-05T01:00:33.671820Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [3]:
# Register the already-built client under a named alias ...
connections.add_connection('python_client', es)
# ... and open a second connection to localhost; no alias is given, so it is
# registered under elasticsearch_dsl's default alias.
connections.create_connection(hosts=['localhost'])

# List installed plugins to confirm the Morfologik analysis plugin is present.
plugin_listing_params = {'v': '', 'h': 'name,component,version,description'}
installed_plugins = es.cat.plugins(params=plugin_listing_params)
print(installed_plugins)


name   component           version description
x-dell analysis-morfologik 7.10.1  Morfologik Polish Lemmatizer plugin for Elasticsearch



3. Define an ES analyzer for Polish texts containing:
   1. standard tokenizer
   1. synonym filter with the following definitions:
      1. kpk - kodeks postępowania karnego
      1. kpc - kodeks postępowania cywilnego
      1. kk - kodeks karny
      1. kc - kodeks cywilny
   1. Morfologik-based lemmatizer
   1. lowercase filter

4. Define an ES index for storing the contents of the legislative acts.
5. Load the data to the ES index.

In [4]:


# Abbreviations of the Polish legal codes and the full names they are rewritten to.
synonym_rules = [
    "kpk => kodeks postępowania karnego",
    "kpc => kodeks postępowania cywilnego",
    "kk => kodeks karny",
    "kc => kodeks cywilny",
]

# Custom synonym token filter built from the rules above.
synonym = token_filter('my_name', type='synonym', synonyms=synonym_rules)

# Analyzer for Polish text: standard tokenizer, then lowercase -> synonyms -> Morfologik stemming.
my_analyzer = analyzer(
    'my_analyzer',
    tokenizer=tokenizer('standard'),
    filter=["lowercase", synonym, "morfologik_stem"],
)


# es.delete_by_query(index=['ustawa'], body={"query": {"match_all": {}}})


class Ustawa(Document):
    """A legislative act stored as one document of the ``ustawa`` index."""

    # Free-text title, analyzed with the field type's default analyzer.
    title = Text()
    # Integer field; the loading code stores the year of the act here.
    created_at = Integer()
    # Full text of the act, analyzed with the custom Polish analyzer.
    content = Text(analyzer=my_analyzer)

    class Index:
        # Name of the ElasticSearch index backing this document type.
        name = 'ustawa'


# Set up the index and its mapping for the document class.
Ustawa.init()

# Load every legislative act from `path` into the "ustawa" index.
#
# * `iterations` ends up holding the number of files that were indexed.
# * Each document gets a deterministic id (its file name), so re-running this
#   cell overwrites the existing documents instead of indexing duplicates.
#   (Documents indexed earlier with auto-generated ids are not removed.)
iterations = 0
path = "../data/ustawy"

# sorted() makes the indexing order reproducible; os.listdir() order is arbitrary.
for filename in sorted(os.listdir(path)):
    file_path = os.path.join(path, filename)
    if not os.path.isfile(file_path):
        # Skip sub-directories (e.g. notebook checkpoint folders); open() would fail on them.
        continue

    # The first four characters of the file name are the year (e.g. "1997_629.txt").
    # Parsing the bare file name keeps this independent of the OS path separator.
    year = int(filename[:4])

    with open(file_path, 'r', encoding='utf-8') as f:  # open in readonly mode
        content = f.read()

    # The title stays the joined path, exactly as before (later cells search for it).
    act = Ustawa(meta={'id': filename}, title=file_path, content=content, created_at=year)
    act.save(using=es)
    iterations += 1

# Make the freshly indexed documents visible to searches right away, so that
# count queries issued immediately afterwards see all of them.
es.indices.refresh(index='ustawa')





In [5]:

# Sanity check: the index should hold exactly as many documents as files were loaded.
indexed_total = Search(using=es, index="ustawa").count()
print('Test that number of files is the same as number of iterations')
print(indexed_total == iterations)


Test that number of files is the same as number of iterations
True


In [6]:
print('test that random document eg. 2001_906.txt occures only once')
# Count the documents whose title matches the path of one sample file.
sample_title_search = Search(using=es, index="ustawa").query(
    "match_phrase", title='../data/ustawy/2001_906.txt'
)
print(sample_title_search.count())

test that random document eg. 2001_906.txt occures only once
1


6. Determine the number of legislative acts containing the word **ustawa** (in any form).

In [7]:
# Number of acts whose content contains "ustawa" (filter context, match_phrase query).
ustawa_filter_search = Search(using=es, index="ustawa").filter("match_phrase", content='ustawa')
docs_with_ustawa = ustawa_filter_search.count()
print(f'There are {docs_with_ustawa} documents with the word "ustawa" in them')

There are 1178 documents with the word "ustawa" in them


In [8]:
# Same count as above, this time with a plain match query in query context.
ustawa_match_search = Search(index='ustawa', using=es).query('match', content='ustawa')
matching_docs = ustawa_match_search.count()
print(f'There are {matching_docs} documents with the word "ustawa" in them')


There are 1178 documents with the word "ustawa" in them


7. Determine the number of occurrences of the word **ustawa** by searching for this particular form, including the other inflectional forms.
8. Determine the number of occurrences of the word **ustaw** by searching for this particular form, including the other inflectional forms.

In [9]:
# Collect the ids of the indexed documents; the slice caps the result at 3000 hits.
all_docs = Search(using=es, index="ustawa").query("match_all")[:3000].execute()
ids = [hit.meta.id for hit in all_docs]

# Sum the per-document frequency of the term "ustawa" over all collected documents.
amount = 0
for doc_id in ids:  # `doc_id`, not `id`, so the builtin id() is not shadowed in the kernel
    # `s` is left bound to the last term-vectors response after the loop;
    # code further down the notebook may read it.
    s = es.termvectors(index="ustawa", fields="content", id=doc_id, field_statistics=True)
    # Documents without the term (or without term vectors for `content`) contribute 0.
    content_terms = s.get('term_vectors', {}).get('content', {}).get('terms', {})
    amount += content_terms.get('ustawa', {}).get('term_freq', 0)
print(amount)

24934


In [10]:

print('Show term_vestors for the first document on the list')
# Fetch the term vectors of the FIRST document before reporting anything about it.
# Reading `s` before this call would report on whatever response an earlier cell
# left behind, not on ids[0].
s = es.termvectors(index="ustawa", fields="content", id=ids[0], field_statistics=False)
first_doc_terms = s['term_vectors']['content']['terms']

# First 100 terms of the document.
print(list(first_doc_terms.keys())[:100])
# Frequency of "ustawa" in this document; 0 when the term does not occur at all.
print('Number of occurences of the word "ustawa"', first_doc_terms.get('ustawa', {}).get('term_freq', 0))



Show term_vestors for the first document on the list
['1', '14', '19', '1998', '1999', '2', '229', '26', '3', 'a', 'artykuł', 'australią', 'czerwiec', 'dokonanie', 'dokonać', 'dz.u', 'dzień', 'ekstradycja', 'kanberze', 'luty', 'między', 'na', 'numer', 'o', 'ocean', 'od', 'oda', 'ogłosić', 'ogłoszenie', 'ojciec', 'po', 'podpisać', 'polski', 'pozycja', 'prezydent', 'przez', 'ratyfikacja', 'rok', 'rzeczpospolita', 'się', 'umowa', 'umowy', 'upływ', 'ustawa', 'w', 'wchodzić', 'wiek', 'wyrażać', 'z', 'zgoda', 'życie', 'żyto', 'żyć']
Number of occurences of the word "ustawa" 2


9. Determine the number of legislative acts containing the words **kodeks postępowania cywilnego** 
   in the specified order, but in any inflection form.

In [11]:
# Count the acts matching the full phrase and, separately, its abbreviation.
for phrase in ('kodeks postępowania cywilnego', 'kpc'):
    phrase_hits = Search(using=es, index="ustawa").query("match_phrase", content=phrase).count()
    print(f'The number of files containing the words "{phrase}" is: {phrase_hits}')
    if phrase != 'kpc':
        # Blank line between the two reports, as in the original layout of the cell.
        pass

The number of files containing the words "kodeks postępowania cywilnego" is: 99
The number of files containing the words "kpc" is: 99


10. Determine the number of legislative acts containing the words **wchodzi w życie** 
   (in any form) allowing for up to 2 additional words in the searched phrase.

In [17]:

# Phrase query for "wchodzi w życie" with slop=2, as required by the task.
slop_phrase_query = query.MatchPhrase(content={'query': "wchodzi w życie", 'slop': 2})
acts_with_phrase = Search(using=es, index="ustawa").query(slop_phrase_query).count()

print(acts_with_phrase)

1174


11. Determine the 10 documents that are the most relevant for the phrase **konstytucja**.
12. Print the excerpts containing the word **konstytucja** (up to three excerpts per document) 
   from the previous task.

In [18]:
# Ten best-scoring documents for "konstytucja", with up to three highlighted
# excerpts (plain highlighter, fragment size 80) per document.
q = query.Match(content={'query': "konstytucja"})
highlighted_search = (
    Search(using=es, index="ustawa")
    .query(q)
    .highlight('content', number_of_fragments=3, type='plain', fragment_size=80)
)
s = highlighted_search[:10].execute()

for rank, hit in enumerate(s):
    excerpts = hit.meta['highlight']['content']
    # Rank, title and relevance score of the document ...
    print(rank, hit['title'], hit.meta.score)
    # ... followed by its highlighted excerpts, one per bullet.
    for excerpt in excerpts:
        print(f'- {excerpt}')


0 ../data/ustawy/1997_629.txt 6.8693724
- 
           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej Polskiej
-  przygotowania i 
uchwalenia <em>Konstytucji</em> Rzeczypospolitej Polskiej (Dz.U. Nr 67, poz
-  przedstawienia
                Zgromadzeniu Narodowemu projektu nowej <em>Konstytucji</em>
1 ../data/ustawy/2000_443.txt 6.664267
- . 89 ust.
  1 i art. 90 <em>Konstytucji</em> Rzeczypospolitej Polskiej, oraz inne umowy
- . 89
     ust. 1 lub art. 90 <em>Konstytucji</em> Rzeczypospolitej Polskiej, lub
   3
-  przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>
     Rzeczypospolitej
2 ../data/ustawy/1997_604.txt 6.633459
-  co do zgodności z <em>Konstytucją</em> celów lub 
   zasad działania partii politycznej
-  <em>Konstytucją</em>.
 2. Na postanowienie, o którym mowa w ust. 1, zażalenie nie przysługuje
-  
   politycznej z <em>Konstytucją</em>, Sąd odmawia wpisu partii do ewidencji.
 4
3 ../data/ustawy/1996_350.txt 6.628284
-  <em>Konstytucji</em> lub us