In [1]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Number of clusters (k); passed as n_clusters to KMeans further down.
NUM_CLUSTERS = 20 # Define the number of clusters to be found

In [3]:
def tfidf_extractor(corpus):
    """Fit a TF-IDF vectorizer on ``corpus`` and build its document-term matrix.

    Parameters
    ----------
    corpus : iterable of str
        One string per document; it is handed straight to
        ``TfidfVectorizer.fit_transform``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted ``TfidfVectorizer`` and the
        matrix returned by ``fit_transform``.
    """
    # Unigram features, smoothed IDF weighting, L2-normalised rows, and no
    # minimum document-frequency cut-off (min_df=1).
    tfidf_options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': (1, 1),
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    return vectorizer, vectorizer.fit_transform(corpus)

In [4]:
def get_Noun_words(morphs):
    """Return the words whose tag is exactly 'Noun', in their original order.

    Parameters
    ----------
    morphs : iterable of (word, tag) pairs
        Output of a morphological analyser, e.g. ``[('서울', 'Noun'), ...]``.

    Returns
    -------
    list
        The ``word`` element of every pair tagged ``'Noun'``.
    """
    return [token for token, pos in morphs if pos == 'Noun']

In [5]:
# Load the pre-computed morphological analysis of the collected articles.
# The text data was collected and analysed with the Twitter morphological
# analyser beforehand; the result was saved to politics_2016_morphs.p and this
# notebook only consumes that file.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load a
# file that comes from a trusted source.
with open('politics_2016_morphs.p', 'rb') as morphs_file:  # 'with' closes the handle even if unpickling fails
    total_morphs = pickle.load(morphs_file)
# dict type
# key -> article id (e.g., '081-0002735239')

In [6]:
# Number of entries in the loaded dict (one entry per article id).
len(total_morphs)

2632

In [7]:
# Inspect the (token, POS tag) pairs stored under 'content' for one sample article.
print(total_morphs['081-0002735239']['content'])

[('[', 'Punctuation'), ('서울', 'Noun'), ('신문', 'Noun'), ('나우', 'Noun'), ('뉴스', 'Noun'), (']', 'Punctuation'), ('한국', 'Noun'), ('형', 'Suffix'), ('전투기', 'Noun'), ('KFX', 'Alpha'), ('(', 'Punctuation'), ('왼쪽', 'Noun'), (')', 'Punctuation'), ('와', 'Noun'), ('일본', 'Noun'), ('F', 'Alpha'), ('-', 'Punctuation'), ('3', 'Number'), ('전투기', 'Noun'), ('(', 'Punctuation'), ('오른쪽', 'Noun'), (')', 'Punctuation'), ('사진', 'Noun'), ('=', 'Punctuation'), ('한국', 'Noun'), ('KAI', 'Alpha'), ('및', 'Noun'), ('일본', 'Noun'), ('방위성', 'Noun'), ('자료', 'Noun'), ('사진', 'Noun'), ('흔히', 'Adjective'), ('우리나라', 'Noun'), ('를', 'Josa'), ('‘', 'Foreign'), ('일본', 'Noun'), ('을', 'Josa'), ('우습', 'Verb'), ('게', 'Eomi'), ('보는', 'Verb'), ('세계', 'Noun'), ('에서', 'Josa'), ('유일', 'Noun'), ('한', 'Josa'), ('민족', 'Noun'), ('’', 'Foreign'), ('이라고', 'Josa'), ('들', 'Verb'), ('한', 'Verb'), ('다', 'Eomi'), ('일본', 'Noun'), ('은', 'Josa'), ('GDP', 'Alpha'), ('순위', 'Noun'), ('세계', 'Noun'), ('3', 'Number'), ('위로', 'Noun'), ('세계', 'Noun'), ('경제', '

In [8]:
# Per-article record that keeps only the nouns of the body text (the nouns are
# the only words used for clustering) together with the article's paper id and
# title.
total_Noun_words = {
    article_key: {
        'words': get_Noun_words(article['content']),
        'paper_id': article['paper_id'],
        'text_title': article['text_title'],
    }
    for article_key, article in total_morphs.items()
}



In [9]:
# Split the per-article records into three parallel lists: article id, title,
# and article body (nouns only). Index i refers to the same article in all three.
noun_records = list(total_Noun_words.values())
article_ids = [record['paper_id'] for record in noun_records]
text_titles = [record['text_title'] for record in noun_records]
documents = [record['words'] for record in noun_records]  # list of docs: a doc is a list of words

In [10]:
# Read the stop-word list: one word per line, surrounding whitespace stripped.
# The 'with' block closes the file even if reading raises.
with open('stop_words.txt', 'r', encoding='utf-8') as f_stop:
    stop_words = [line.strip() for line in f_stop]

# A set gives constant-time membership tests; the original list lookup was
# repeated for every token of every document. The stop_words list is kept
# for any later cell that reads it.
stop_word_set = set(stop_words)
docs_filtered = [[term for term in doc if term not in stop_word_set] for doc in documents]
# Additional preprocessing step: remove stop words.
# NOTE(review): the original comment also mentioned dropping very frequent or
# very rare words, but this cell only removes the words listed in stop_words.txt.

In [11]:
# After the unwanted words are removed, turn every token list back into a
# single string, because the DTM conversion expects a list of strings.
# Each token is prefixed with one space, so a non-empty document starts with ' '
# and an empty token list becomes ''.
documents_filtered = [
    ''.join(' ' + word for word in doc)
    for doc in docs_filtered
]  # list of strings

In [None]:
vectorizer, doc_term_mat = tfidf_extractor(documents_filtered) # convert the documents into a TF-IDF matrix

# KMeans chooses its initial centroids randomly, so without a fixed seed the
# cluster ids assigned to the articles change on every run of the notebook.
RANDOM_SEED = 42
kmeans = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10, random_state=RANDOM_SEED) # create the KMeans estimator
# NOTE(review): max_iter=10 limits each run to 10 iterations -- confirm this is
# intentional and that the algorithm converges on this corpus.
# More about the parameters: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

clusters = kmeans.fit_predict(doc_term_mat) # fit KMeans and get one cluster index per document

In [40]:
import pandas as pd
# Empty frame that fixes the column layout of the clustering results table.
results_df = pd.DataFrame(columns=['article_id', 'title', 'cluster_id'])

In [41]:
# Display the frame (rich notebook output) to check its current contents.
results_df

Unnamed: 0,article_id,title,cluster_id


In [42]:
# Build the results table in one constructor call: one row per document with
# its article id, title and assigned cluster id.
# The original appended row by row with DataFrame.append, which was removed in
# pandas 2.0, copies the whole frame on every iteration, and adds duplicate
# rows when the cell is re-run. Building the frame from the three parallel
# sequences makes the cell idempotent; pandas raises if their lengths differ.
results_df = pd.DataFrame(
    {
        'article_id': article_ids,
        'title': text_titles,
        'cluster_id': clusters,
    },
    columns=['article_id', 'title', 'cluster_id'],
)

In [43]:
# Preview the first rows of the clustering results.
results_df.head()

Unnamed: 0,article_id,title,cluster_id
0,081-0002735239,[이일우의 밀리터리 talk] 韓스텔스기vs日스텔스…,1
1,056-0010325528,현역장병이 매긴 군 급식·피복 점수는?…100점 만…,1
2,001-0008474442,국민투표 시한까지 제시되는 개헌론…이번엔 현실…,1
3,001-0008538459,"北, 美 김정은 제재에 ""부시 정부도 안 한 행동""",3
4,008-0003710916,수도권 공백은 어떻게?...패트리엇 미사일 증강 배…,11


In [44]:
# Save the clustering results to an Excel workbook (path is relative to the working directory).
results_df.to_excel('k_means_results.xlsx')