In [1]:
import sys
import numpy as np

from collections import defaultdict
from tqdm import tqdm
from pprint import pprint

# Configuration

In [2]:
USE_APPEND_MYMODULES = True

if USE_APPEND_MYMODULES is True:
    sys.path.append('./mypackage/')

In [3]:
corpus_fname = "./data/2016-10-28_article_all_normed.txt"

In [4]:
max_l_length = 6
max_r_length = 5
min_count = 30

In [5]:
from corpus import Corpus

In [6]:
corpus = Corpus(corpus_fname, iter_sent=True)
len(corpus)

176597

In [7]:
L = defaultdict(lambda: defaultdict(lambda: 0))
R = defaultdict(lambda: defaultdict(lambda: 0))

In [8]:
for num_sent, sent in enumerate(tqdm(corpus)):
    
    # if num_sent % 5000 == 0:
        # sys.stdout.write("\rinserting %d sents..." % num_sent)
    
    for token in sent.split():
        
        if len(token)<2:
            continue
            
        for e in range(2, min(max_l_length, len(token)) + 1):
            subword_from = token[:e-1]
            subword_to = token[:e]
            L[subword_from][subword_to] += 1
            
        for b in range(2, min(max_r_length + 1, len(token))):
            subword_from = token[:-b+1]
            subword_to = token[-b:]
            R[subword_from][subword_to] += 1

100%|██████████| 176597/176597 [00:20<00:00, 8426.78it/s] 


In [9]:
L.get("트와이스")

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'트와이스가': 36,
             '트와이스까': 1,
             '트와이스는': 40,
             '트와이스다': 1,
             '트와이스와': 6,
             '트와이스의': 34,
             '트와이스코': 140,
             '트와이스타': 1})

In [10]:
R.get("트와이스")

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'스가': 36,
             '스까지': 1,
             '스는': 40,
             '스다': 1,
             '스와': 6,
             '스의': 34,
             '스코스터': 140,
             '스타로': 1})

# Accessor Variety (Feng et ai., 2004)

http://www.mitpressjournals.org/doi/abs/10.1162/089120104773633394

In [11]:
def get_accessor_variety(word):
    
    # av_l: ?-린이
    # av_r: 어린-?
    
    av_l = 0 if not word in R else len(R[word])
    av_r = 0 if not word in L else len(L[word])
    return (av_l, av_r)


for word in ['박근', '박근혜', '국방', '국방부', '국방부는', '국방장', '국방장관', '트와이', '트와이스']:
    av = get_accessor_variety(word)
    print('%s\t(%d, %d)' % (word, av[0], av[1]))

박근	(50, 7)
박근혜	(41, 23)
국방	(66, 24)
국방부	(26, 13)
국방부는	(0, 0)
국방장	(9, 1)
국방장관	(8, 7)
트와이	(9, 1)
트와이스	(8, 8)


# Branching Entropy (Jin & Tanaka, 2006)
http://dl.acm.org/citation.cfm?id=1273129

In [12]:
def get_branching_entropy(word):
    
    def entropy(extensions):
        '''extensions: dict[str]: int
        '''
        sum_ = sum(extensions.values())
        if sum_ == 0:
            return 0
        
        entropy = 0
        for v in extensions.values():
            if v == 0: continue
            prob = v / sum_
            entropy += (prob * np.log(prob)) # important
        return -1 * entropy

    # be_l: ?-린이
    # be_r: 어린-?
    
    be_l = 0 if not word in R else entropy(R[word])
    be_r = 0 if not word in L else entropy(L[word])    
    return (be_l, be_r)


for word in ['박근', '박근혜', '국방', '국방부', '국방부는', '국방장', '국방장관', '트와이', '트와이스']:
    be = get_branching_entropy(word)
    print('%s\t(%.3f, %.3f)' % (word, be[0], be[1]))

박근	(0.594, 0.082)
박근혜	(2.707, 2.305)
국방	(2.580, 1.097)
국방부	(1.776, 1.601)
국방부는	(0.000, 0.000)
국방장	(1.809, -0.000)
국방장관	(1.631, 1.575)
트와이	(1.142, -0.000)
트와이스	(1.313, 1.313)
