# Ex. 2.1

In [65]:
import csv
import sys
import collections
import math
import itertools
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from scipy.stats import linregress

# Some fields overflow default CSV buffer so we set incredibly huge default
csv.field_size_limit(sys.maxsize)
# Read file
def openTab(path, skip_lines=0, create_counters=True):
    with open(path, newline = '') as file:
        lowerer = lambda x: x.lower()
        
        r = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        r = r if skip_lines == 0 else itertools.islice(r, skip_lines, None)
        [F, L] = map(list, zip(*sorted([(e[0], e[1]) for e in r])))
        [f, l] = [list(map(lowerer, list(F))), list(map(lowerer, list(L)))]
        
        if not create_counters:
            return (
                set(F), set(L), set(f), set(l),
                None, None, None, None)
        
        f_counts = collections.Counter(f)
        l_counts = collections.Counter(l)
        F_counts = collections.Counter(F)
        L_counts = collections.Counter(L)
        
        def create_counter(o):
            def counter(x):
                nonlocal o
                return o[x]
            return counter
        
        return (
            set(F), set(L), set(f), set(l),
            create_counter(F_counts), create_counter(L_counts),
            create_counter(f_counts), create_counter(l_counts))

In [66]:
(Y_F, Y_L, Y_f, Y_l, f_F, f_L, f_f, f_l) = openTab('1_NKJP1M-frequency.tab')

In [25]:
(X_F, X_L, X_f, X_l, _, _, _, _) = openTab('sgjp.tab', 29, False)

In [11]:
class ListTable(list):
    """ Overridden list class which takes a 2-dimensional list of 
        the form [[1,2,3],[4,5,6]], and renders an HTML Table in 
        IPython Notebook. """
    
    def _repr_html_(self):
        html = ["<table>"]
        for row in self:
            html.append("<tr>")
            
            for col in row:
                html.append("<td>{0}</td>".format(col))
            
            html.append("</tr>")
        html.append("</table>")
        return ''.join(html)

# Ex. 2.2

In [90]:
from IPython.core.display import display
rf = lambda v: str(round(v, 4))

for i in ["F", "f", "L", "l"]:
    
    vs = dict(globals(), **locals())
    X = vs[f"X_{i}"]
    Y = vs[f"Y_{i}"]
    f = vs[f"f_{i}"]
    XY = X.intersection(Y)
    X_len = len(X)
    Y_len = len(Y)
    XY_len = len(XY)

    F1 = sum([f(s) for s in XY])
    F2 = sum([f(s) for s in Y])
    
    A1 = XY_len / X_len
    A2 = XY_len / Y_len
    A3 = F1 / F2
    
    l = ListTable()
    l.append(["property", "value"])
    l.append(["case", f"X_{i}, Y_{i}"])
    l.append(["|X and Y| / |X|", rf(A1)])
    l.append(["|X and Y| / |Y|", rf(A2)])
    l.append(["sum[s in XY] f_i(s) / sum[s in Y] f_i(s)", rf(A3)])
    print("\n")
    display(l)
l
    





0,1
property,value
case,"X_F, Y_F"
|X and Y| / |X|,0.0231
|X and Y| / |Y|,0.77
sum[s in XY] f_i(s) / sum[s in Y] f_i(s),0.7829






0,1
property,value
case,"X_f, Y_f"
|X and Y| / |X|,0.0239
|X and Y| / |Y|,0.89
sum[s in XY] f_i(s) / sum[s in Y] f_i(s),0.902






0,1
property,value
case,"X_L, Y_L"
|X and Y| / |X|,0.0977
|X and Y| / |Y|,0.6441
sum[s in XY] f_i(s) / sum[s in Y] f_i(s),0.7817






0,1
property,value
case,"X_l, Y_l"
|X and Y| / |X|,0.1002
|X and Y| / |Y|,0.6769
sum[s in XY] f_i(s) / sum[s in Y] f_i(s),0.8023


0,1
property,value
case,"X_l, Y_l"
|X and Y| / |X|,0.1002
|X and Y| / |Y|,0.6769
sum[s in XY] f_i(s) / sum[s in Y] f_i(s),0.8023


In [19]:
# Read file
from collections import defaultdict
import csv
import math
import itertools
import numpy as np
def getLemmatizationsVariantCounts(path, lem_mapper=None, form_mapper=None, skip_lines=0):
    fn_id = lambda x: x
    if not lem_mapper:
        lem_mapper = fn_id
    if not form_mapper:
        form_mapper = fn_id
    with open(path, newline = '') as file:
        r = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        r = r if skip_lines == 0 else itertools.islice(r, skip_lines, None)
       
        d = defaultdict(set)
        for e in r:
            d[form_mapper(e[0])].add(lem_mapper(e[1]))
        r = defaultdict(int)
        for lems in d.items():
            r[lems[0]] = len(lems[1])
        return r

ambig_count = sum([ 1 for c in getLemmatizationsVariantCounts('sgjp.tab', None, None, 29).values() if c > 1 ])
ambig_count_comma_cutoff = sum([ 1 for c in getLemmatizationsVariantCounts('sgjp.tab', lambda x: x.split(":")[0], None, 29).values() if c > 1])

l = ListTable()
l.append(["property", "value"])
l.append(["amb. lemmatizations (sgjp)", ambig_count])
l.append(["amb. lemmatizations (sgjp, comma cutoff)", ambig_count_comma_cutoff])
print("\n")
display(l)





0,1
property,value
amb. lemmatizations (sgjp),333459
"amb. lemmatizations (sgjp, comma cutoff)",186818


# Ex. 2.3

In [80]:
def checkLemmatizationUsing(path_src, path_dict, lem_mappers=[None, None], form_mappers=[None, None], skips=[0, 0]):
    s = getLemmatizationsVariantCounts(path_src, lem_mappers[0], form_mappers[0], skips[0])
    d = getLemmatizationsVariantCounts(path_dict, lem_mappers[1], form_mappers[1], skips[1])
    return (sum([1 for form in s.keys() if d[form] == 1]), len(s.keys()))
        
lowerer = lambda x: x.lower()
clu_without_lowers, clu_count = checkLemmatizationUsing('1_NKJP1M-frequency.tab', 'sgjp.tab', [None, None], [None, None], [0, 29])
clu_with_lowers, _ = checkLemmatizationUsing('1_NKJP1M-frequency.tab', 'sgjp.tab', [None, None], [lowerer, lowerer], [0, 29])

l = ListTable()
l.append(["property", "percents", "count", "all forms"])
l.append(["Lemmatization NKJP1M vs SGJP", clu_without_lowers/clu_count, clu_without_lowers, clu_count])
l.append(["Lemmatization NKJP1M vs SGJP (lower letters)", clu_with_lowers/clu_count, clu_with_lowers, clu_count])
print("\n")
display(l)





0,1,2,3
property,percents,count,all forms
Lemmatization NKJP1M vs SGJP,0.6092989866042181,87421,143478
Lemmatization NKJP1M vs SGJP (lower letters),0.5698573997407268,81762,143478


# Additional tasks

In [78]:
#
# This is a code that queries policarp (additional tasks no. 1)
#
import requests
import time
from bs4 import BeautifulSoup

# Query online poliqarp API
def queryPoliqarp(query, corpus):
    s = requests.Session()
    s.get(f"http://nkjp.pl/poliqarp/{corpus}/")
    cond = True
    rep = 0
    while cond:
        s.post("http://nkjp.pl/poliqarp/query/",  {"query": query, "corpus": corpus})
        t = s.get(f"http://www.nkjp.pl/poliqarp/{corpus}/query/").text
        ret = BeautifulSoup(t)
        results = []
        trs = ret.select(".query-results table tr")
        for tr in trs:
            tds = tr.select("td")
            text_items = []
            original_texts = []
            highlight_texts = []
            for td in tds:
                strongs = td.select("strong")
                if len(strongs) == 0:
                    for span in td.select("span"):
                        highlight_texts.append(span.text)
                        original_texts.append(span.text)
                        text_items.append({
                            "text": span.text.strip(),
                            "anno": span["title"].replace(span.text, "").replace("\n", "").replace("[", "").replace("]", "").strip().split(":"),
                        })
                else:
                    original_texts.append(strongs[0].text)
                    highlight_texts.append(f" [{strongs[0].text.strip()}]")
                    text_items.append({
                        "text": strongs[0].text.strip(),
                        "anno": td.text.replace(strongs[0].text, "").replace("\n", "").replace("[","").replace("]","").strip().split(":")
                    })
            results += [{
                "highlighted": ("".join(highlight_texts).strip()),
                "text": ("".join(original_texts).strip()),
                "items": text_items,
            }]
        cond = False
        if len(results) <= 0 and rep < 2:
            cond = True
        if cond:
            time.sleep(1)
        rep = rep + 1
    return results

# queryPoliqarp("każdy .* pewien", "nkjp1M")
queryPoliqarp("każdy", "nkjp1M")


[{'highlighted': 'dopóki nie udowodniono winy to [każdy] jest niewinny – jeszcze Jarosław',
  'text': 'dopóki nie udowodniono winy to każdy jest niewinny – jeszcze Jarosław',
  'items': [{'text': 'dopóki', 'anno': ['dopóki', 'comp']},
   {'text': 'nie', 'anno': ['nie', 'qub']},
   {'text': 'udowodniono', 'anno': ['udowodnić', 'imps', 'perf']},
   {'text': 'winy', 'anno': ['wina', 'subst', 'sg', 'gen', 'f']},
   {'text': 'to', 'anno': ['to', 'conj']},
   {'text': 'każdy', 'anno': ['każdy', 'adj', 'sg', 'nom', 'm1', 'pos']},
   {'text': 'jest', 'anno': ['być', 'fin', 'sg', 'ter', 'imperf']},
   {'text': 'niewinny', 'anno': ['niewinny', 'adj', 'sg', 'nom', 'm1', 'pos']},
   {'text': '–', 'anno': ['–', 'interp']},
   {'text': 'jeszcze', 'anno': ['jeszcze', 'qub']},
   {'text': 'Jarosław', 'anno': ['Jarosław', 'subst', 'sg', 'nom', 'm1']}]},
 {'highlighted': 'fajnie by było. – [każdy] swoją połóweczkę jakąś wyciągnie.',
  'text': 'fajnie by było. – każdy swoją połóweczkę jakąś wyciągnie.',
