# Multiword expressions identification and extraction

The task shows two simple methods useful for identifying multiword expressions (MWE) in corpora.

## Tasks

1. Use SpaCy [tokenizer API](https://spacy.io/api/tokenizer) to tokenize the text from the law corpus.

In [1]:
import os
import pickle
import locale
# python -m spacy download en_core_web_sm
# python -m spacy download pl_core_news_sm
import re
import string
import tarfile
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import morfeusz2
import numpy as np
import pandas as pd
import regex
import spacy
from elasticsearch import *
from elasticsearch.helpers import *
from elasticsearch_dsl import *
from elasticsearch_dsl import query
from spacy.tokenizer import *

matplotlib.style.use("ggplot")
import time
import math
import Levenshtein
import operator


%matplotlib inline
import pandas as pd
locale.setlocale(locale.LC_COLLATE, 'pl_PL.UTF-8')


'pl_PL.UTF-8'

2. Compute **bigram** counts of downcased tokens.  Given the sentence: "The quick brown fox jumps over the
   lazy dog.", the bigram counts are as follows:
   
   * "the quick": 1
   * "quick brown": 1
   * "brown fox": 1
   * . ...
   * "dog .": 1

In [2]:
nlp = spacy.load("pl_core_news_sm")
tokenizer = Tokenizer(nlp.vocab)

tokens = {}
tokens_list = []
i = 0
path = "../data/ustawy"
for filename in os.listdir(path):
    with open(os.path.join(path, filename), "r", encoding="utf-8") as file: 
        act = file.read()
        act = regex.sub(r"\s+", " ", act)
        act = regex.sub(r"­","",act)
        act = act.lower()
        words = [token.text for token in tokenizer(act)]
        tokens[file.name] = words
        tokens_list = tokens_list + words
        i += 1
        if i % 200 == 0:
            print(i)
            
old_tokens_list = tokens_list

200
400
600
800
1000


In [3]:
tokens_list[0:10]

[' ', 'dz.u.', 'z', '1998', 'r.', 'nr', '117,', 'poz.', '759', 'ustawa']

In [4]:
def separate_puctuations(tokens):
    new_tokens = []
    for token in tokens:
        splitted = regex.findall(r"[\w']+|[.,!?;]", token)  #https://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
        new_tokens +=splitted
    return new_tokens

tokens = ['new,','fast,','expensive'] 
separate_puctuations(tokens)

['new', ',', 'fast', ',', 'expensive']

In [5]:
def bigrams(words):
    words = list(map(lambda x: x.strip(),words))
    words = zip(words, words[1:])
    return [' '.join(pair) for pair in words]

text = "The quick brown fox jumps over the lazy dog."
words = [token.text for token in tokenizer(text)]
print(bigrams(words))

['The quick', 'quick brown', 'brown fox', 'fox jumps', 'jumps over', 'over the', 'the lazy', 'lazy dog.']


In [6]:
tokens_list = separate_puctuations(tokens_list)
gram2 = bigrams(tokens_list)

In [7]:
Counter(gram2).most_common(5)


[('art .', 83779),
 ('ust .', 53552),
 ('poz .', 45222),
 ('. 1', 43484),
 (', poz', 43192)]

   
3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries **after**
   computing the bigram counts.
    

In [8]:
# data = gram2.filter()
gram2 =[token for token in gram2 if all(char not in string.punctuation and not char.isdigit() for char in token)]
gram2[0:5]
Counter(gram2).most_common(5)

[('w art', 32045),
 ('mowa w', 28471),
 ('w ust', 23557),
 ('o których', 13885),
 ('których mowa', 13858)]

4. Use [pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) to compute the measure 
   for all pairs of words. 

In [9]:

def to_probabilities(tokens):
    tokens_freq = Counter(tokens)
    count = sum(tokens_freq.values())
    return {k: v/count for k, v in tokens_freq.items()}

p_bigram = to_probabilities(gram2)


p_token = to_probabilities(tokens_list) 
# map(lambda x: x/count,bigram_freq)
# word_freq = Counter(tokens_list

# reduce(list(bigram_freq.values))

In [10]:
def pmi(x,y): #pointwise_mutual_information
    result = p_bigram[x+" "+y] / (p_token[x] * p_token[y])
    return math.log2(result)
    

gram2_pmis =  {}
for key in gram2:
    if len(key.split())>2:
        print(key)
    gram2_pmis[key] = pmi(*key.split())


    

In [11]:
pmis = dict(sorted(gram2_pmis.items(), key=operator.itemgetter(1),reverse=True))
list(pmis.items())[:5]


[('korzy stający', 23.024484997199306),
 ('gałki ocznej', 23.024484997199306),
 ('przedemery talne', 23.024484997199306),
 ('organa uchwałodawcze', 23.024484997199306),
 ('kropki wstawić', 23.024484997199306)]

In [12]:

"""             środków trwałych, jeżeli w umowie leasingu zastrzeżono, że korzy
             stający będzie ponosił ciężar tych podatków i składek niezależnie"""
old_tokens_list.index('korzy')
old_tokens_list[22536:22560]

['w',
 'umowie',
 'leasingu',
 'zastrzeżono,',
 'że',
 'korzy',
 'stający',
 'będzie',
 'ponosił',
 'ciężar',
 'tych',
 'podatków',
 'i',
 'składek',
 'niezależnie',
 'od',
 'opłat',
 'za',
 'używanie,',
 '3)',
 'kaucji',
 'określonej',
 'w',
 'umowie']

In [13]:
# tokens_list.index('zrze')
# tokens_list[4421639:4421650]

In [14]:
separate_puctuations(["tery­torialnego"])

['tery', 'torialnego']

5. Sort the word pairs according to that measure in the descending order and determine top 10 entries.

6. Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5
   occurrences).

In [15]:
gram2_filtered = {k: v for k, v in Counter(gram2).items() if v>=5}
pmis = dict(sorted(gram2_filtered.items(), key=operator.itemgetter(1),reverse=True))
list(pmis.items())[:5]


[('w art', 32045),
 ('mowa w', 28471),
 ('w ust', 23557),
 ('o których', 13885),
 ('których mowa', 13858)]

7. Use [log likelihood ratio](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html) (LLR) to compute the measure
   for all pairs of words.

In [16]:
gram2_freq = Counter(gram2)

In [17]:
def H(k):
    # print(k)

    N = np.sum(k)
    # print(N)
    # print(np.sum(k/N * np.ma.log(k/N).filled(0)))
    return np.sum(k/N * np.ma.log(k/N).filled(0))


def llr(a,b):

    k11 = gram2_freq[a+' '+b]
    k12 = sum([count for key, count in gram2_freq.items() if not a in key and b in key])
    k21 = sum([count for key, count in gram2_freq.items() if a in key and not b in key])
    k22 = sum([count for key, count in gram2_freq.items() if not a in key and not b in key])
    k = np.array([[k11,k12],[k21,k22]])
    rowSums = np.sum(k, axis=1).tolist()
    colSums = np.sum(k, axis=0).tolist()
    
    return 2* np.sum(k) * (H(k) - H(rowSums) - H(colSums))    


llr('w','art2')

# bigram_llr =  {}
# length = len(gram2)
# i=0 
# for key in gram2:
#     if len(key.split())>2:
#         print(key)
#     bigram_llr[key] = llr(*key.split())
#     if i%10==0:
#         print(f'{i}/{length}')
#     print(key,bigram_llr[key])
#     i+=1
    


0.0

In [20]:
from collections import defaultdict

token_count = defaultdict(int)

for bigram, count in gram2_freq.items():
    (first_token,second_token) = bigram.split()
    token_count[first_token] += count
    token_count[second_token] += count
    
total = sum(gram2_freq.values())

In [21]:
def H(k):
    # print(k)

    N = np.sum(k)
    # print(N)
    # print(np.sum(k/N * np.ma.log(k/N).filled(0)))
    return np.sum(k/N * np.ma.log(k/N).filled(0))


def llr(a,b):

    k11 = gram2_freq[a+' '+b]
    k12 = token_count[b] - k11
    k21 = token_count[a] - k11
    k22 = total - k21 - k12 - k11
    k = np.array([[k11,k12],[k21,k22]])
    rowSums = np.sum(k, axis=1).tolist()
    colSums = np.sum(k, axis=0).tolist()
    
    return 2* np.sum(k) * (H(k) - H(rowSums) - H(colSums))    


llr('w','art2')

0.0

In [27]:
gram2_llr =  {}
length = len(gram2)
i=0 
for key in gram2:
    if len(key.split())>2:
        print(key)
    gram2_llr[key] = llr(*key.split())
    if i%(int(length/10))==0:
        print(f'{i}/{length}')
    # print(key,gram2_llr[key])
    i+=1
    



0/2837496
283749/2837496
567498/2837496
851247/2837496
1134996/2837496
1418745/2837496
1702494/2837496
1986243/2837496
2269992/2837496
2553741/2837496
2837490/2837496


In [28]:
gram2_llr = dict(sorted(gram2_llr.items(), key=operator.itemgetter(1),reverse=True))
list(gram2_llr.items())[:5]

[('otrzymuje brzmienie', 102885.48395536352),
 ('w w', 88950.24561342891),
 ('w art', 72556.59014900832),
 ('których mowa', 65874.30552844425),
 ('w ust', 59140.47968532207)]

In [29]:
gram2_llr

{'otrzymuje brzmienie': 102885.48395536352,
 'w w': 88950.24561342891,
 'w art': 72556.59014900832,
 'których mowa': 65874.30552844425,
 'w ust': 59140.47968532207,
 'o których': 52416.33194280648,
 'mowa w': 51071.7654550929,
 'drodze rozporządzenia': 45996.84967469449,
 'dodaje się': 43483.15738019904,
 'którym mowa': 42425.906420601474,
 'i nr': 41886.19348378814,
 'minister właściwy': 39539.04214363478,
 'w i': 36793.85822369282,
 'o którym': 33843.66988989214,
 'rzeczypospolitej polskiej': 33569.22897909207,
 'stosuje się': 32448.263758649955,
 'z dnia': 30621.884528580806,
 'do spraw': 30501.65240743863,
 'z w': 30177.393943712257,
 'w z': 30037.615909239837,
 'w do': 28805.913704274455,
 'na podstawie': 27579.657281771495,
 'i w': 26897.02148290106,
 'co najmniej': 25028.95432194936,
 'której mowa': 24790.016318889237,
 'a także': 24561.116407116402,
 'od dnia': 24115.328533295917,
 'na w': 23957.390332709227,
 'w na': 23922.080774459813,
 'w o': 23129.485454435428,
 'o w': 2300

8. Sort the word pairs according to that measure in the descending order and display top 10 entries.

9. Compute **trigram** counts for the whole corpus and perform the same filtering.

In [34]:
def trigrams(words):
    words = list(map(lambda x: x.strip(),words))
    words = zip(words, words[1:],words[2:])
    return [' '.join(pair) for pair in words]

text = "The quick brown fox jumps over the lazy dog."
words = [token.text for token in tokenizer(text)]
print(trigrams(words))




['The quick brown', 'quick brown fox', 'brown fox jumps', 'fox jumps over', 'jumps over the', 'over the lazy', 'the lazy dog.']


In [35]:
gram3 = trigrams(tokens_list)
gram3 =[token for token in gram3 if all(char not in string.punctuation and not char.isdigit() for char in token)]
gram3[0:5]
gram3_freq = Counter(gram3)

In [36]:

def to_probabilities(tokens):
    tokens_freq = Counter(tokens)
    count = sum(tokens_freq.values())
    return {k: v/count for k, v in tokens_freq.items()}

p_gram3 = to_probabilities(gram3)


In [37]:
def pmi3(x,y,z): #pointwise_mutual_information
    result = p_gram3[x+" "+y+" "+z] / (p_token[x] * p_token[y] * p_token[z])
    return math.log2(result)
    

gram3_pmis =  {}
for key in gram3:
    if len(key.split())!=3:
        print(key)
    gram3_pmis[key] = pmi3(*key.split())

    



In [38]:
list(gram3_pmis.items())[:5]


[('ustawa z dnia', 13.618171861614767),
 ('o zmianie ustawy', 15.025339449632657),
 ('zmianie ustawy o', 14.622148193201921),
 ('ustawy o systemie', 12.351357582605694),
 ('o systemie oświaty', 17.44810027180696)]

In [None]:
def H(k):
    # print(k)

    N = np.sum(k)
    # print(N)
    # print(np.sum(k/N * np.ma.log(k/N).filled(0)))
    return np.sum(k/N * np.ma.log(k/N).filled(0))


def llr(a,b,c):

    k11 = gram2_freq[a+' '+b+' '+c]
    k12 = sum([count for key, count in gram3_freq.items() if not a in key and b in key])
    k21 = sum([count for key, count in gram3_freq.items() if a in key and not b in key])
    k22 = sum([count for key, count in gram3_freq.items() if not a in key and not b in key])
    k = np.array([[k11,k12],[k21,k22]])
    rowSums = np.sum(k, axis=1).tolist()
    colSums = np.sum(k, axis=0).tolist()
    
    return 2* np.sum(k) * (H(k) - H(rowSums) - H(colSums))    


llr('w','art2')

10. Use PMI (with 5 occurrence threshold) and LLR to compute top 10 results for the trigrams. Devise a method for computing the values, based on the
   results for bigrams.

11. Create a table comparing the methods (separate table for bigrams and trigrams).

12. Answer the following questions:

   a. Why do we have to filter the bigrams, rather than the token sequence?
   
   b. Which measure (PMI, PMI with filtering, LLR) works better for the bigrams and which for the trigrams?
   
   c. What types of expressions are discovered by the methods.
   
   d. Can you devise a different type of filtering that would yield better results?