In [1]:
import gensim
import gensim.downloader as api
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary
from gensim import similarities
from gensim.test.utils import common_corpus, common_dictionary

In [4]:
def custom_tokenizer(strings, max_tokens=200, tags=('noun', 'alpha', 'number')):
    """Keep the leading whitespace-separated tokens whose text ends with a wanted tag.

    Args:
        strings: One document as a single string of whitespace-separated
            tokens (the sample output in this notebook shows tokens such as
            '증시/noun' or '3/number').
        max_tokens: Upper bound on the number of tokens returned; scanning
            stops as soon as this many tokens have been collected.
            Defaults to 200, the value that was previously hard-coded.
        tags: Suffix, or iterable of suffixes, a token must end with to be
            kept. Defaults to the previously hard-coded
            ('noun', 'alpha', 'number').

    Returns:
        A list with at most ``max_tokens`` matching tokens, in document order.
    """
    # str.endswith accepts a tuple of suffixes; normalise a lone string so it
    # is not exploded into single characters by tuple().
    if isinstance(tags, str):
        tags = (tags,)
    else:
        tags = tuple(tags)

    result = []
    for word in strings.split():
        # Stop early once the cap is reached so long documents are not scanned further.
        if len(result) >= max_tokens:
            break
        if word.endswith(tags):
            result.append(word)

    return result

In [6]:
# Tab-separated training file; the document text that is tokenized later is
# taken from the field at index 3 of every line.
fname = './train_data_1012/trainingSet/excluded_total_dataset.txt'
TEXT_COLUMN = 3

with open(fname, "r", encoding = 'utf-8') as fp :
    # Iterate over the file object directly instead of fp.readlines(): the
    # lines are identical, but no second full copy of the file is kept alive
    # in the kernel next to `sentences`.
    # A line with fewer than TEXT_COLUMN + 1 fields still raises IndexError.
    sentences = [line.split("\t")[TEXT_COLUMN] for line in fp]

print(len(sentences))

798339


In [7]:
%time tokens = [custom_tokenizer(sentence) for sentence in sentences]
print(type(tokens))
print(len(tokens))

CPU times: user 2min 18s, sys: 12.3 s, total: 2min 30s
Wall time: 2min 30s
<class 'list'>
798339


In [8]:
%time corpus_id2word = Dictionary(tokens)
print(type(corpus_id2word))
print(len(corpus_id2word.keys()))

CPU times: user 2min 47s, sys: 94.9 ms, total: 2min 47s
Wall time: 2min 47s
<class 'gensim.corpora.dictionary.Dictionary'>
410417


In [9]:
# When min_df is 10 the number of words is 116816, so keep_n is set to that vocabulary size.
# NOTE(review): only keep_n is passed, so filter_extremes' other thresholds stay at
# their library defaults; confirm this matches the intended min_df = 10 filtering.
# The dictionary is modified in place; the print shows the number of keys left afterwards.
%time corpus_id2word.filter_extremes(keep_n = 116816) 
print(len(corpus_id2word.keys()))

CPU times: user 1.12 s, sys: 84.4 ms, total: 1.21 s
Wall time: 1.21 s
116816


In [10]:
corpus_id2word.save("./lsi_dictionary.dict") # save the dictionary after it has been cut down

In [11]:
# Sanity check: show the tokens custom_tokenizer kept for the first document.
print(tokens[0])

['증시/noun', '3/number', '악재/noun', '연말/noun', '산타랠리/noun', '긴축/noun', '우려/noun', '유럽/noun', '재정/noun', '위기/noun', '북한/noun', '리스크/noun', '등/noun', '돌/noun', '서울/noun', '연합뉴스/noun', '한지훈/noun', '기자/noun', '12월/number', '국내/noun', '증시/noun', '중국/noun', '긴축/noun', '우려/noun', '유럽/noun', '재정/noun', '위기/noun', '북한/noun', '리스크/noun', '등/noun', '3/number', '악재/noun', '연말/noun', '계절/noun', '강세/noun', '의미/noun', '산타랠리/noun', '를/noun', '만끽/noun', '전망/noun', '다수/noun', '증시/noun', '전문가/noun', '코스피지수/noun', '올해/noun', '안/noun', '2,000/number', '선/noun', '다시/noun', '비행/noun', '것/noun', '예측/noun', '28일/number', '증권/noun', '업계/noun', '중국/noun', '긴축/noun', '우려/noun', '유럽/noun', '재정/noun', '위기/noun', '등/noun', '기존/noun', '지수/noun', '압박/noun', '해외/noun', '변수/noun', '불확실/noun', '띠/noun', '가운데/noun', '북한/noun', '기습/noun', '포격/noun', '겹치/noun', '시장/noun', '투자/noun', '심리/noun', '위축/noun', '한층/noun', '가중/noun', '모습/noun', '아일랜드/noun', '금융/noun', '유로존/noun', '대한/noun', '불안감/noun', '일부/noun', '해소/noun', '스페인/nou

In [12]:
# custom_tokenizer를 거친 token들을 TF-IDF model을 이용할 수 있도록 bag of words 형태로 바꿈
%time corpus = [corpus_id2word.doc2bow(token_set) for token_set in tokens]
print(type(corpus))

CPU times: user 2min 18s, sys: 5.88 s, total: 2min 24s
Wall time: 2min 24s
<class 'list'>


In [14]:
# Build the TF-IDF model from the bag-of-words corpus (timed).
%time tfidf = TfidfModel(corpus)

CPU times: user 29.9 s, sys: 14.9 ms, total: 29.9 s
Wall time: 29.9 s


In [15]:
# Persist the TF-IDF model built above to ./tfidf_training.model.
tfidf.save("./tfidf_training.model")

In [16]:
# Wrap the bag-of-words corpus with the TF-IDF model.
# The recorded timing for this cell is in the microsecond range, so no
# per-document work appears to happen here; presumably the weighting is
# applied lazily when corpus_tfidf is iterated.
%time corpus_tfidf = tfidf[corpus]

CPU times: user 30 µs, sys: 3 µs, total: 33 µs
Wall time: 34.1 µs


In [15]:
# Train an LSI model with 500 topics on the TF-IDF-weighted corpus, using
# corpus_id2word as the id -> word mapping. This is the slowest timed step
# visible in this notebook (recorded wall time of roughly 48 minutes).
%time lsi = LsiModel(corpus_tfidf, id2word = corpus_id2word, num_topics = 500)

CPU times: user 2h 44min 51s, sys: 2h 29min 40s, total: 5h 14min 31s
Wall time: 48min 21s


In [41]:
# Wrap the TF-IDF corpus with the LSI model; the recorded microsecond timing
# suggests the projection is deferred until the result is iterated.
# NOTE(review): corpus_lsi is not referenced by the other cells visible here;
# the similarity-index cell wraps lsi[corpus_tfidf] again on its own.
%time corpus_lsi = lsi[corpus_tfidf]

CPU times: user 39 µs, sys: 0 ns, total: 39 µs
Wall time: 42.9 µs


In [22]:
# Persist the trained LSI model to ./LSI_training.model.
lsi.save("./LSI_training.model")

In [42]:
# Show topic number 499 (the model was trained with num_topics = 500) as a
# weighted combination of its terms.
print(lsi.print_topic(499))

0.346*"stx/alpha" + -0.182*"위안화/noun" + 0.153*"중산층/noun" + 0.143*"승률/noun" + -0.132*"셋값/noun" + 0.115*"매각/noun" + 0.109*"상/noun" + -0.107*"국민은행/noun" + -0.103*"신세계/noun" + 0.097*"강/noun"


In [51]:
%time index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

CPU times: user 24min 18s, sys: 2.7 s, total: 24min 20s
Wall time: 24min 20s


In [52]:
# Persist the similarity index to ./lsi_similarity.index.
index.save('./lsi_similarity.index')

In [54]:
# Show the tokens of the last document; the query cell below uses this same
# document (tokens[-1]) for the similarity lookup.
print(tokens[-1])

['미국/noun', '이란/noun', '정면/noun', '충돌/noun', '시나리오/noun', '가나/noun', '신문/noun', '300/number', '개/noun', '공격/noun', '목표/noun', '세/noun', '방향/noun', '타격/noun', '계획/noun', '미국/noun', '부인/noun', '불구/noun', '이란/noun', '공격/noun', '시나리오/noun', '제기/noun', '등/noun', '미국/noun', '이란/noun', '정면/noun', '충돌/noun', '위기/noun', '고조/noun', '이란/noun', '공격/noun', '시나리오/noun', '조지/noun', '부시/noun', '대통령/noun', '로버트/noun', '게이츠/noun', '미국/noun', '방/noun', '장관/noun', '차례/noun', '부인/noun', '불구/noun', '미국/noun', '이란/noun', '공격/noun', '계획/noun', '수립/noun', '실제/noun', '이란/noun', '핵/noun', '시설/noun', '폭격/noun', '가능성/noun', '영국/noun', 'bbc/alpha', '방송/noun', '등/noun', '언론보도/noun', '또/noun', '유엔/noun', '안보리/noun', '결의/noun', '이란/noun', '핵/noun', '활동/noun', '중단/noun', '요구/noun', '시한/noun', '21일/number', '정면/noun', '대치/noun', '국면/noun', '긴장감/noun', '부시/noun', '행정부/noun', '이라크/noun', '사태/noun', '때문/noun', '당장/noun', '이란/noun', '손보기/noun', '수/noun', '입장/noun', '경고/noun', '메시지/noun', '무력/noun', '시위/noun', '이란/noun', '공격

In [58]:
# Push the query document (the last tokenized document) through the same
# pipeline as the training data: bag-of-words -> TF-IDF -> LSI.
query_tokens = tokens[-1]
vec_bow = corpus_id2word.doc2bow(query_tokens)
vec_bow_tfidf = tfidf[vec_bow]
vec_lsi = lsi[vec_bow_tfidf]

# Peek at the first ten (topic id, weight) pairs of the query's LSI vector.
print(vec_lsi[:10])

[(0, 0.09934064389292278), (1, -0.026067307670026265), (2, -0.029848312668925046), (3, -0.012031228107555789), (4, -0.03752097065740627), (5, -0.1004015180737048), (6, 0.006419642968160668), (7, -0.010956147071747227), (8, -0.01437437330475245), (9, 0.0018304173738976053)]


In [59]:
# Query the similarity index with the LSI vector of the query document; the
# result is enumerated and ranked by score in the next cell.
sims = index[vec_lsi]

In [60]:
# Pair every similarity score with its document index, rank the pairs from
# most to least similar (ties keep their original index order), and show the
# five best matches.
sims = list(enumerate(sims))
sims.sort(key = lambda pair : -pair[1])
print(sims[:5])

[(744743, 1.0), (798338, 1.0), (796632, 0.951585), (675309, 0.93731624), (787467, 0.93731624)]


In [88]:
# For each of the five top-ranked documents print its index, its tokens and
# its similarity score, followed by an extra newline.
for rank in range(5) :
    doc_id, score = sims[rank]
    print(doc_id, tokens[doc_id], score, "\n")

744743 ['미국/noun', '이란/noun', '정면/noun', '충돌/noun', '시나리오/noun', '가나/noun', '신문/noun', '300/number', '개/noun', '공격/noun', '목표/noun', '세/noun', '방향/noun', '타격/noun', '계획/noun', '미국/noun', '부인/noun', '불구/noun', '이란/noun', '공격/noun', '시나리오/noun', '제기/noun', '등/noun', '미국/noun', '이란/noun', '정면/noun', '충돌/noun', '위기/noun', '고조/noun', '이란/noun', '공격/noun', '시나리오/noun', '조지/noun', '부시/noun', '대통령/noun', '로버트/noun', '게이츠/noun', '미국/noun', '방/noun', '장관/noun', '차례/noun', '부인/noun', '불구/noun', '미국/noun', '이란/noun', '공격/noun', '계획/noun', '수립/noun', '실제/noun', '이란/noun', '핵/noun', '시설/noun', '폭격/noun', '가능성/noun', '영국/noun', 'bbc/alpha', '방송/noun', '등/noun', '언론보도/noun', '또/noun', '유엔/noun', '안보리/noun', '결의/noun', '이란/noun', '핵/noun', '활동/noun', '중단/noun', '요구/noun', '시한/noun', '21일/number', '정면/noun', '대치/noun', '국면/noun', '긴장감/noun', '부시/noun', '행정부/noun', '이라크/noun', '사태/noun', '때문/noun', '당장/noun', '이란/noun', '손보기/noun', '수/noun', '입장/noun', '경고/noun', '메시지/noun', '무력/noun', '시위/noun', '이란/nou