In [4]:
import networkx
import re
import pandas as pd

In [5]:
#출처 : https://bab2min.tistory.com/570
class RawTaggerReader:
    def __init__(self, plot, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Komoran
            self.tagger = Komoran()
        self.plot = plot
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        ch = self.rgxSplitter.split(self.plot)
        if len(ch) == 1: ch.append('.')
        for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
            if not s: continue
            yield self.tagger.pos(s)

class TextRank:
    def __init__(self, **kargs):
        self.graph = None
        self.window = kargs.get('window', 5)
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}
        self.dictNear = {}
        self.nTotal = 0

    def load(self, sentenceIter, wordFilter = None):
        def insertPair(a, b):
            if a > b: a, b = b, a
            elif a == b: return
            self.dictBiCount[a, b] = self.dictBiCount.get((a, b), 0) + 1
 
        def insertNearPair(a, b):
            self.dictNear[a, b] = self.dictNear.get((a, b), 0) + 1
 
        for sent in sentenceIter:
            for i, word in enumerate(sent):
                if wordFilter and not wordFilter(word): continue
                self.dictCount[word] = self.dictCount.get(word, 0) + 1
                self.nTotal += 1
                if i - 1 >= 0 and (not wordFilter or wordFilter(sent[i-1])): insertNearPair(sent[i-1], word)
                if i + 1 < len(sent) and (not wordFilter or wordFilter(sent[i+1])): insertNearPair(word, sent[i+1])
                for j in range(i+1, min(i+self.window+1, len(sent))):
                    if wordFilter and not wordFilter(sent[j]): continue
                    if sent[j] != word: insertPair(word, sent[j])

    def getPMI(self, a, b):
        import math
        co = self.dictNear.get((a, b), 0)
        if not co: return None
        return math.log(float(co) * self.nTotal / self.dictCount[a] / self.dictCount[b])
 
    def getI(self, a):
        import math
        if a not in self.dictCount: return None
        return math.log(self.nTotal / self.dictCount[a])
 
    def build(self):
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight=n*self.coef + (1-self.coef))
 
    def rank(self):
        return networkx.pagerank(self.graph, weight='weight')
 
    def extract(self, ratio = 0.1):
        ranks = self.rank()

        if int(len(ranks) * ratio) > 10:
            cand = sorted(ranks, key=ranks.get, reverse=True)[:int(len(ranks) * ratio)]
        else:
            cand = sorted(ranks, key=ranks.get, reverse=True)[:10]
        pairness = {}
        startOf = {}
        tuples = {}

        for k in cand:
            tuples[(k,)] = self.getI(k) * ranks[k]
            for l in cand:
                if k == l: continue
                pmi = self.getPMI(k, l)
                if pmi: pairness[k, l] = pmi
 
        for (k, l) in sorted(pairness, key=pairness.get, reverse=True):
            if k not in startOf: startOf[k] = (k, l)
 
        for (k, l), v in pairness.items():
            pmis = v
            rs = ranks[k] * ranks[l]
            path = (k, l)
            tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
            last = l
            while last in startOf and len(path) < 7:
                if last in path: break
                pmis += pairness[startOf[last]]
                last = startOf[last][1]
                rs *= ranks[last]
                path += (last,)
                tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)

        used = set()
        both = {}
        for k in sorted(tuples, key=tuples.get, reverse=True):
            if len(k) == 1:
                if used.intersection(set(k)): continue
                both[k] = tuples[k]
                for w in k:
                    used.add(w)
        
        return both

In [6]:
df = pd.read_csv('./data/naver_movie_dataset.csv')

In [None]:
results = []
for idx in df.index:
    keywords = []
    tr = TextRank(window=5, coef=1)
    tr.load(RawTaggerReader(df.loc[idx,'plot']))
            #and w[0] in model_kmr.wv.vocab and (w[1] in ('NNG', 'NNP', 'VV')))
    tr.build()
    kw = tr.extract(0.2)
    for k in sorted(kw, key=kw.get, reverse=True):
        keywords.append(k[0][0])
        #print("%s\t%g" % (k, kw[k]))
    if idx %100 == 0:
        print(idx,'/10255',keywords)
    results.append(keywords)

0 /10255 ['의', 'ㄴ', '아', '을', '이', '하', '를', '는', '가문', '어', '프레디', '늑대인간', '에', '는', '!']
100 /10255 ['을', '하', '는', 'ㄴ', '고', '의', '가', '에', '를', '는', '하', '박사', '세계', '나치', '로저', '대위', '에리스', '있', '.', '부대', '로', '부대원', '에서']
200 /10255 [',', 'ㄴ', '를', '는', '은', '도', '.', '며', '하', '에서', '하루', '지', '날', '의', '어느', '꿈', '이', '카페', 'ㄴ다']
300 /10255 ['ㄴ', '들', '하', '되', '을', '이', '에', '섬', ',', '은', '가', '기', '고', '하']
400 /10255 ['’', '‘', 'ㄴ', '하', '과', '은', '의', '석영', '에게', '남', '여자', '중', '들', '를', '자리', '는', '.']
500 /10255 ['을', '하', '아', '은', '만', '경', ')', '는', '를', 'ㄴ', '(', '은', '게', '에', '며', '도', '행사', '의', '팔순', '기', '.']
