In [1]:
def read_data(filename):
    with open(filename,'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:]
    return data

train_data = read_data('dff.csv')
test_data = read_data('dff.csv')

In [2]:
from konlpy.tag import Twitter
pos_tagger = Twitter()

def tokenize(doc):
    return['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

train_docs = [(tokenize(row[0]), row[0]) for row in train_data]
test_docs = [(tokenize(row[0]), row[0]) for row in test_data]

In [3]:
from pprint import pprint
pprint(train_docs[100])

(['101/Number', ',/Punctuation', '병리/Noun', '과/Josa'], '101,병리과')


In [4]:
tokens = [ t for d in train_docs for t in d[0]]
print(len(tokens))

import nltk
text = nltk.Text(tokens, name ='NMSC')
print(text)
print(len(text.tokens))
print(len(set(text.tokens)))
pprint(text.vocab().most_common(10))

26854
<Text: NMSC>
26854
7445
[(',/Punctuation', 6825),
 ('진료/Noun', 1426),
 ('안내/Noun', 1037),
 ('과/Josa', 712),
 ('위치/Noun', 684),
 ('예약/Noun', 583),
 ('병원/Noun', 537),
 ('예/Noun', 494),
 ('가정의학/Noun', 441),
 ('대학/Noun', 430)]


In [5]:
text.collocations()

,/Punctuation 진료/Noun; 진료/Noun 예약/Noun; 가정의학/Noun 과/Josa; 대학/Noun
안내/Noun; ,/Punctuation 예/Noun; A/Alpha 병동/Noun; ,/Punctuation
가정의학/Noun; 이/Determiner 덕철/Noun; ,/Punctuation 대학/Noun; 처음/Noun
으로/Josa; 병원/Noun 안내/Noun; 식당/Noun 시간/Noun; ,/Punctuation 식당/Noun;
경쟁률/Noun 보기/Noun; 강희/Noun 철/Noun; 2018/Number -/Punctuation; 클리/Noun
닉/Noun; 진료/Noun 과/Josa; 의료/Noun 진/Noun; 식당/Noun 위치/Noun


In [6]:
selected_words = [f[0] for f in text.vocab().most_common(2000)]

def term_exists(doc):
    return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}

train_docs = train_docs[:10000]

train_xy = [(term_exists(d), c) for d, c in train_docs]
test_xy = [(term_exists(d), c) for d, c in test_docs]

In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_xy) #Naive Bayes classifier 적용

In [8]:
#print(nltk.classify.accuracy(classifier, test_xy))

In [9]:
classifier.show_most_informative_features(10)

Most Informative Features
         exists(원목/Noun) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
     exists(1123/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
        exists(알려줭/Noun) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(484/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(159/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(389/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(591/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(702/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
     exists(1001/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0
      exists(958/Number) = False          전문분야 : : 1,눈이 아 =      1.2 : 1.0


In [12]:
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle


TaggedDocument = namedtuple('TaggedDocument','words tags')

tagged_train_docs = [TaggedDocument(d,[c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(d,[c]) for d, c in test_docs]



In [13]:
#사전구축
doc_vectorizer = doc2vec.Doc2Vec(size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs) 


# Train document vectors! 
for epoch in range(10): 
    doc_vectorizer.train(tagged_train_docs)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate 
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay 
    
    
#To save
# doc_vectorizer.save('doc2vec.model')






ValueError: You must specify either total_examples or total_words, for proper job parameters updationand progress calculations. The usual value is total_examples=model.corpus_count.

In [17]:
pprint(doc_vectorizer.most_similar('암/Noun'))

[('21/Number', 0.14951092004776),
 ('시험/Noun', 0.1463509052991867),
 ('4/Number', 0.13269299268722534),
 ('챗봇/Noun', 0.12864935398101807),
 ('과/Josa', 0.12334549427032471),
 ('신촌역/Noun', 0.11548204720020294),
 ('정형외과/Noun', 0.11526899039745331),
 ('구성/Noun', 0.11221339553594589),
 ('AM/Alpha', 0.10999355465173721),
 ('?/Punctuation', 0.10643953084945679)]


  """Entry point for launching an IPython kernel.
