In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('text/article.csv', usecols=['pic_num', 'article'], engine='python', encoding='euc-kr')

In [4]:
def read_article(filepath):
    '''
    helper function to read in the file and yield each line at a time.
    '''
    with open(filepath) as f:
        for article in f:
            yield article

#### 텍스트의 열마다 카데고리 명을 가진 텍스트 파일을 만든다
1. word2vec에서 관련성을 높일 수 있다.
2. 만약에 본문에서 텍스트가 없던 경우 빈 칸을 채워줄 수 있다.

In [5]:
%%time

categories = ['bathroom', 'bedroom', 'kitchen', 'livingroom']
category_text = {'bathroom':'욕실', 'bedroom':'침실', 'kitchen':'주방', 'livingroom':'거실'}
by_category_path = 'text/by_category/'
article_categorized_path = 'text/article_categorized.txt'

with open(article_categorized_path, 'w') as fw:
    for category in categories:
        print(category)
        with open(by_category_path + category + '.csv', 'r') as fr:
            for content in data.article.values:
                content = content.split('\n')[0]
                if 'pic_num,article' in content:
                    continue
                if not content:
                    break

                if category_text[category] not in content:
                    
                    categorized_text = content + ' ' + category_text[category]
                else:
                    categorized_text = content
                #한 파일에 다 쓴다    
                fw.write(categorized_text + '\n')

bathroom
bedroom
kitchen
livingroom
Wall time: 77.1 ms


In [6]:
import konlpy
import nltk
import json

konlpy.download('komoran-dic')

KoNLPy downloader
[konlpy_data] Downloading package 'komoran-dic'...
[konlpy_data] Download finished
[konlpy_data] Unzipping file C:\Users\smingdisco\AppData\Roaming\konlpy_data\dictionaries/komoran-dic.zip
Done


In [7]:
filepath = 'text/article_categorized.txt'

with open(filepath, 'r') as f:
    article = f.read()

In [10]:
%%time

json_data = open('text/ko.json', 'rt', encoding='UTF8').read()
stop_words = json.loads(json_data)
words = konlpy.tag.Twitter().pos(article)
parser = nltk.RegexpParser("NP: {<Adjective>*<Noun>*}")
chunks = parser.parse(words)

print("\n# Writing Noun Adjective only")
noun_adjective_only_file_path = 'text/noun_adj_article.txt'

with open(noun_adjective_only_file_path, 'w') as f:
    for subtree in chunks.subtrees():
        if subtree.label()=='NP':
            parse = ' '.join((e[0] for e in list(subtree) if e not in stop_words and len(e) > 1 ))
            f.write(parse + ' ')


# Writing Noun Adjective only
Wall time: 7min 40s


In [11]:
import codecs
BLOCKSIZE = 1048576 # or some other, desired size in bytes
sourceFileName = "text/noun_adj_article.txt"
targetFileName = "text/noun_adj_article_utf.txt"
with codecs.open(sourceFileName, "r") as sourceFile:
    with codecs.open(targetFileName, "w", "utf-8") as targetFile:
        while True:
            contents = sourceFile.read(BLOCKSIZE)
            if not contents:
                break
            targetFile.write(contents)

In [12]:
%%time
sentences_normalized_filepath = 'text/noun_adj_article_utf.txt'
bigram_model_filepath = 'text/bigram_model'
sentences_for_word2vec_filepath = 'text/sentences_for_word2vec.txt'

# gensim's LineSentence provies a convenient way to iterate over lines in a text file.
# it outputs one line at a time, so you can save memory space. it works well with other gensim components.
from gensim.models.word2vec import LineSentence
from gensim.models import Phrases
# we take normalized sentences as unigram sentences, which means we didn't apply any phrase modeling yet.
unigram_sentences = LineSentence(sentences_normalized_filepath)

bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)

with open(sentences_for_word2vec_filepath, 'w') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = bigram_model[unigram_sentence]
        f.write(' '.join(bigram_sentence) + '\n')



Wall time: 3.99 s


### Word vector modeling with Word2Vec

In [13]:
from gensim.models import Word2Vec

In [14]:
import codecs
BLOCKSIZE = 1048576 # or some other, desired size in bytes
sourceFileName = 'text/sentences_for_word2vec.txt'
targetFileName = 'text/sentences_for_word2vec_utf.txt'
with codecs.open(sourceFileName, "r") as sourceFile:
    with codecs.open(targetFileName, "w", "utf-8") as targetFile:
        while True:
            contents = sourceFile.read(BLOCKSIZE)
            if not contents:
                break
            targetFile.write(contents)

In [15]:
%%time

sentences_for_word2vec_filepath = 'text/sentences_for_word2vec_utf.txt'
word2vec_filepath = 'text/word2vec_model'

sentences_for_word2vec = LineSentence(sentences_for_word2vec_filepath)

# initiate the model with 100 dimensions of vectors, 5 words to look before and after each focus word, etc.
# perform another 10 epochs of training
#embedding_model  = Word2Vec(sentences_for_word2vec, size=100, window=5, min_count=5, workers = 4, sg=1)

embedding_model  = Word2Vec(size=100, window=7, min_count=60, workers=4, sg=1, iter=100)
embedding_model.build_vocab(sentences_for_word2vec)
embedding_model.train(sentences_for_word2vec, epochs=embedding_model.iter, total_examples=embedding_model.corpus_count)
    
embedding_model.save(word2vec_filepath)


Wall time: 8.99 s


In [16]:
#unload unnecessary memory
embedding_model.init_sims(replace=True)

In [17]:
print(embedding_model.most_similar(positive=["욕실"], topn=50))

[('주방', 0.7823333740234375), ('공간', 0.5755470991134644), ('디자인', 0.5070724487304688), ('침실', 0.4837484061717987), ('분위기', 0.4634815454483032), ('조명', 0.3515093922615051), ('아이디어', 0.3448489308357239), ('거실', 0.33103805780410767), ('있는', 0.3224629759788513), ('타일', 0.31537044048309326), ('화이트', 0.28660720586776733), ('가구', 0.2810284197330475), ('인테리어', 0.27817273139953613), ('수납', 0.2742149233818054), ('벽', 0.24577267467975616), ('색상', 0.24403813481330872), ('모던', 0.240724578499794), ('건', 0.23260971903800964), ('욕조', 0.23255866765975952), ('바닥', 0.23086263239383698), ('작은', 0.22819387912750244), ('거울', 0.22213870286941528), ('나무', 0.22101475298404694), ('같은', 0.20993520319461823), ('느낌', 0.20883585512638092), ('개성', 0.20637747645378113), ('고급스러운', 0.1933993697166443), ('파티션', 0.1905672252178192), ('스타일', 0.19001181423664093), ('재질', 0.18473412096500397), ('과감', 0.18002989888191223), ('유리', 0.179835706949234), ('세련된', 0.17974317073822021), ('컬러', 0.17939430475234985), ('수_있는', 0.1783550

In [18]:
print(embedding_model.most_similar(positive=["주방"], topn=50))

[('욕실', 0.7823333740234375), ('공간', 0.5823670029640198), ('침실', 0.5461084246635437), ('디자인', 0.4935576319694519), ('거실', 0.45514625310897827), ('조명', 0.3707321286201477), ('분위기', 0.36931049823760986), ('아이디어', 0.31567156314849854), ('가구', 0.3110201954841614), ('수납', 0.310720294713974), ('있는', 0.281094491481781), ('인테리어', 0.261688232421875), ('타일', 0.25824031233787537), ('화이트', 0.23392754793167114), ('벽', 0.22018681466579437), ('기능', 0.20708784461021423), ('것', 0.20648670196533203), ('컬러', 0.19701643288135529), ('모던', 0.18771231174468994), ('스타일', 0.17789939045906067), ('및_분위기', 0.17472344636917114), ('아일랜드', 0.17223945260047913), ('흰색', 0.16664552688598633), ('테라스', 0.16511251032352448), ('따뜻한', 0.16439273953437805), ('개성', 0.16330109536647797), ('목재', 0.16326820850372314), ('나무', 0.1614684909582138), ('감', 0.15774337947368622), ('작은', 0.1556023806333542), ('천장', 0.15243366360664368), ('포인트', 0.1511402428150177), ('깔끔한', 0.15101829171180725), ('감각', 0.14984872937202454), ('자', 0.148850

In [19]:
num_words = 2000
word_embeddings = pd.DataFrame(embedding_model.wv.syn0[:num_words, :], index=embedding_model.wv.index2word[:num_words])
word_embeddings.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
주방,0.063515,0.035828,-0.163579,0.072392,0.127996,0.045399,-0.205755,0.221592,0.117537,0.020764,...,0.005597,0.036799,0.131786,-0.185329,0.066229,0.027013,-0.077444,0.057848,-0.009024,-0.177999
욕실,0.056751,-0.001797,-0.136228,-0.055955,0.027535,0.035236,-0.18552,0.207014,0.136846,0.05865,...,0.016614,-0.087505,0.102614,-0.065265,0.036491,0.011621,0.003837,0.056054,-0.025624,-0.058417
침실,0.236241,0.035015,-0.24054,0.010719,-0.016849,0.113503,0.039309,0.20846,0.20504,-0.107345,...,-0.040094,-0.051208,0.080476,-0.285389,-0.097936,0.116151,-0.018927,0.171645,0.010793,-0.153641
거실,0.179722,0.028995,-0.131144,-0.043775,0.005347,0.049651,0.03987,0.175661,0.132975,-0.023586,...,-0.036328,-0.048867,0.16558,-0.290612,0.033515,0.032168,0.034656,0.127358,0.032019,-0.069061
공간,0.117706,-0.00127,-0.003223,0.056807,0.060596,0.117467,-0.051614,0.224069,0.083199,0.102337,...,-0.056284,-0.113975,0.179174,-0.260941,-0.015147,0.173605,0.074017,0.082512,0.095347,-0.184487
디자인,0.046204,0.076524,-0.105821,0.07514,0.00374,0.063069,0.134921,-0.003537,0.131503,0.096796,...,-0.0461,-0.08858,0.201027,-0.222572,-0.049944,0.054295,0.01055,-0.066318,0.015942,-0.222374
인테리어,-0.069292,-0.032781,-0.258591,-0.029989,-0.043954,0.055436,0.106581,-0.002393,0.087048,0.104503,...,-0.10855,-0.184765,-0.080518,-0.217894,-0.152116,-0.019068,-0.045112,-0.073587,-0.008797,-0.117964
분위기,-0.109292,0.075498,-0.300119,-0.078531,0.088136,0.0874,-0.005362,0.217306,0.05084,0.017885,...,-0.04964,-0.017742,0.010883,-0.009842,0.032099,0.093488,-0.020878,0.092507,-0.028831,0.008885
수납,0.100399,0.026037,-0.106327,-0.03949,-0.135616,0.07239,0.013723,0.181978,0.023805,-0.031632,...,-0.077501,-0.109568,0.285844,-0.034071,0.092282,0.008141,-0.194086,-0.022568,-0.006358,-0.122346
스타일,0.034095,0.012481,-0.197035,-0.179153,0.13145,0.149684,-0.054042,0.079016,-0.035133,-0.06478,...,0.108111,0.129002,0.035491,0.019178,-0.047771,0.147068,-0.041609,-0.076965,-0.045102,-0.125439
