### TF-IDF 예시
성경욱 검토 완료, 현재 소스코드는 정상 동작함

In [2]:
from newspaper import Article # crawl newapaper
from konlpy.tag import Komoran # parse title
from konlpy.tag import Twitter # tf-idf
from collections import Counter
from operator import eq
import math # calcurate
import numpy as np
import feedparser # crawl rss
# Module-level Komoran tagger; the main script uses it to extract nouns from article titles.
komoran = Komoran()

# RSS feeds to crawl: JoongAng Ilbo "most viewed news" lists.
urls = (
    "https://rss.joins.com/sonagi/joins_sonagi_sports_list.xml",  # sports
    "https://rss.joins.com/sonagi/joins_sonagi_star_list.xml",  # entertainment
    "https://rss.joins.com/sonagi/joins_sonagi_life_list.xml",  # society
    None,  # placeholder entry, not a feed URL
)

In [3]:
#=========================
# Extract article titles and article links from RSS feeds
#=========================
def crawl_rss(urls):
    """Collect the title and link of every entry in the given RSS feeds.

    Args:
        urls: Iterable of feed URLs. Falsy entries (``None``, ``""``) are
            skipped, so a placeholder in the list is never handed to
            ``feedparser.parse``.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, in the order the
        feeds and their entries were read.
    """
    articles = []
    for url in urls:
        if not url:
            # Not a feed address (e.g. a None placeholder); nothing to parse.
            continue
        print("[crawl rss] ", url)
        feed = feedparser.parse(url)
        articles.extend(
            {'title': entry.title, 'link': entry.link} for entry in feed.entries
        )
    return articles


In [4]:
#=========================
# Extract the article title and article body from a URL
#=========================
def crawl_article(url, language='ko'):
    """Download and parse one article and return ``(title, text)``.

    Args:
        url: Address of the article page.
        language: Language code passed to ``Article``; defaults to ``'ko'``
            because the crawled articles are Korean.

    Returns:
        A ``(title, text)`` tuple taken from the parsed article.
    """
    print("[crawl article] ", url)
    article = Article(url, language=language)
    article.download()
    article.parse()
    title, body = article.title, article.text
    return title, body


In [5]:
# ================================ get_tags =======================================
def get_tags(text, ntags=50):
    """Extract the nouns of ``text`` and report the most frequent ones.

    Nouns are extracted with ``Twitter().nouns`` and counted with a
    ``Counter``; only the ``ntags`` most common nouns are kept.
    NOTE(review): konlpy warns that "Twitter" was renamed "Okt" in v0.4.5.

    Args:
        text: Document text to analyse.
        ntags: Maximum number of nouns to keep (passed to ``most_common``).

    Returns:
        A tuple ``(num_words, num_most_frequent, tags)`` where
        ``num_words`` is the number of nouns kept (so at most ``ntags``),
        ``num_most_frequent`` is the count of the most frequent noun
        (0 when no noun was found), and ``tags`` is a list of
        ``{'tag': noun, 'count': n}`` dicts ordered from most to least
        frequent.
    """
    tagger = Twitter()
    noun_counts = Counter(tagger.nouns(text))

    # most_common returns (noun, count) pairs, highest count first.
    top_nouns = noun_counts.most_common(ntags)
    return_list = [{'tag': noun, 'count': freq} for noun, freq in top_nouns]

    # The first pair, if any, belongs to the most frequent noun.
    num_most_frequent = top_nouns[0][1] if top_nouns else 0

    return len(top_nouns), num_most_frequent, return_list

In [6]:
# ================================ TF calculation =================================
def TF(request, most_freq, tag):
    """Return the augmented term frequency of ``request`` in one document.

    Computes ``tf(t, d) = 0.5 + 0.5 * f(t, d) / most_freq(d)``, where
    ``f(t, d)`` is looked up with ``Howmanywords``.

    Args:
        request: The word whose term frequency is wanted.
        most_freq: Count of the most frequent word in the document.
        tag: The document, as a list of ``{'tag': word, 'count': n}`` dicts.

    Returns:
        The term frequency as a float. When ``most_freq`` is 0 (a document
        in which no word was counted) the baseline 0.5 is returned instead
        of dividing by zero.
    """
    if not most_freq:
        # Nothing was counted in the document, so f(t, d) is 0 as well.
        return 0.5
    return 0.5 + 0.5 * Howmanywords(request, tag) / most_freq


In [7]:
# =============================== IDF calculation =================================
def IDF(request, List_tags):
    """Return the inverse document frequency of ``request`` over a corpus.

    Computes ``idf(t, D) = log10(N / (0.001 + df))`` where ``N`` is the
    number of documents and ``df`` is the number of documents that contain
    ``request``. The 0.001 term keeps the division defined when no document
    contains the word.

    Args:
        request: The word whose importance is wanted.
        List_tags: The corpus; one list of ``{'tag': word, 'count': n}``
            dicts per document.

    Returns:
        The IDF value as a float, or 0.0 for an empty corpus.
    """
    num_docs = len(List_tags)
    if num_docs == 0:
        # log10(0) is undefined; an empty corpus carries no information.
        return 0.0

    # Each document counts at most once, however many entries match.
    docs_with_request = sum(
        1
        for doc in List_tags
        if any(entry['tag'] == request for entry in doc)
    )

    return math.log10(num_docs / (0.001 + docs_with_request))

In [21]:
# ================================ Howmanywords ===================================
def Howmanywords(request, tag):
    """Return how many times ``request`` occurs in a document.

    Args:
        request: The word to look up.
        tag: The document, as a list of ``{'tag': word, 'count': n}`` dicts.

    Returns:
        The ``'count'`` of the first entry whose ``'tag'`` equals
        ``request``, or 0 when the document has no such entry.
    """
    # Lazily walk the entries as (word, count) pairs; stop at the first match.
    pairs = ((entry['tag'], entry['count']) for entry in tag)
    return next((freq for word, freq in pairs if word == request), 0)


In [9]:
#=========================
# Cosine Similarity
#=========================
def cos_sim(a, b):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        similarity is undefined, and 0.0 is returned instead of dividing
        by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # At least one vector is all zeros; avoid the 0/0 division.
        return 0.0
    return np.dot(a, b) / norm_product


## Main

In [22]:
# 1. Fetch the list of article titles and links from the RSS feeds.
article_list = crawl_rss(urls) #[:10]

# 2. Download each linked article and attach its body text to the entry.
for entry in article_list:
    _title, body = crawl_article(entry['link'])
    entry['text'] = body

# 3. Parse the article titles: extract the nouns of every title with Komoran.
print('[parse title]')
noun_title = [komoran.nouns(entry['title']) for entry in article_list]


[crawl rss]  https://rss.joins.com/sonagi/joins_sonagi_sports_list.xml
[crawl rss]  https://rss.joins.com/sonagi/joins_sonagi_star_list.xml
[crawl rss]  https://rss.joins.com/sonagi/joins_sonagi_life_list.xml
[crawl rss]  None
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071830&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071586&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071411&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071306&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071734&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071703&cloc=rss|most_view|sports
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071394&cloc=rss|most_view|spo

[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23072310&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071954&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071641&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071689&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23072475&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071871&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23072138&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23072020&cloc=rss|most_view|society
[crawl article]  http://article.joins.com/news/article/article.asp?total_id=23071684&cloc=rss|most_view|

In [23]:
# Preview the noun lists extracted from the first ten titles.
noun_title[:10]

[['트럼프', '비판', '논란', '로버츠', '용병', '술'],
 ['폭설', '파란', '잔디밭', '축구', '장현수', '봉사', '활동', '자료', '조작', '인정'],
 ['레스터', '시티', '구단주', '소유', '헬기', '추락', '사고', '수습', '중'],
 ['가을', '남자', '박정권', '힘'],
 ['다저스', '승', '패', '벼랑', '끝', '류현진', '등판', '불투명'],
 ['괴물', '장', '타자', '카', '메로', '챔프', '입성', '경기만', '우승', '눈앞'],
 ['호날두', '멀티', '골', '연속', '무패'],
 ['김연아', '후', '메달', '차준환', '태극기', '세리머니', '이유'],
 ['넥센', '샌즈', '김성현', '충돌', '손가락', '욕', '진실'],
 ['차전', '벤', '치클', '리어', '링', '플레이오프']]

In [13]:
# Preview the first two crawled article records.
article_list[:2]

[{'link': 'http://article.joins.com/news/article/article.asp?total_id=23071830&cloc=rss|most_view|sports',
  'text': '트럼프도 공개적으로 용병술에 대한 지적을 날렸다. 월드시리즈 1승3패로 몰린 데이브 로버츠 LA 다저스 감독의 투수 기용이 큰 비판을 받고 있다.도덜드 트럼프 미국 대통령은 28일(한국시간) 소셜 미디어에 "다저스와 보스턴의 월드시리즈 마지막 이닝을 보고 있다. 거의 7회를 압도적으로 막은 선발투수 리치 힐을 내리고 흠씬 두들겨 맞은 구원 투수(라이언 매드슨 또는 켄리 잰슨)를 올리다니 정말 놀랍다. 4점 리드가 날아갔다. 감독이 큰 실수를 했다"고 적었다.트럼프 대통령은 보스턴의 라이벌 팀인 뉴욕 양키스 팬이다. 자신의 SNS에 "나는 오랫동안 양키스의 팬이다(I am a long time Yankee fan)"라고 드러내기도 했다. 마리아노 리베라, 폴 오닐 등 레전드 스타들에 대한 애정들 드러낸 적도 있다. 양키스가 마이애미 말린스 출신 거포 지안카를로 스탠턴을 영입했을 때도 "그는 정말 잘 해낼 것"이라고 반가워했다.다저스는 이날 월드시리즈 4차전 6회 말 선제점을 뽑은 데 이어 야시엘 푸이그가 3점 홈런을 쳐 4-0으로 앞서갔다. 선발투수 리치 힐은 6회까지 안타 1개, 볼넷 2개만 주며 무실점으로 막았다. 7회 선두타자 잰더 보가츠에게 볼넷을 줬지만 다음 타자 에두아르도 누네스를 삼진으로 잡았다. 하지만 마운드에 오른 로버츠 감독은 힐을 내리고 투수를 스캇 알렉산더로 교체했다.알렉산더는 볼넷을 줬고, 로버츠 감독은 다시 매드슨으로 투수를 교체했다. 매드슨은 재키 브래들리 주니어를 2루수 플라이로 처리했으나 미치 모어랜드에게 3점포를 맞았다. 8회엔 마무리 잰슨을 조기등판시켰으나 스티브 피어스에게 동점포를 내줬다. 마무리 투수가 월드시리즈에서 2경기 연속 홈런을 내준 건 2001년 김병현(당시 애리조나) 이후 17년 만이다. 다저스는 결국 9회

In [14]:
# 4. Parse the article bodies: keep each document's noun statistics.
print('[parse text]')
noun_text = []
for entry in article_list:
    unique_count, top_count, tags = get_tags(entry['text'])
    noun_text.append({
        'num_unique_words': unique_count,
        'num_most_frequent': top_count,
        'tags': tags,
    })

[parse text]


  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [15]:
# 5. Compute TF-IDF for the nouns of every article title.
#    TF is measured against the article's own body, IDF against all bodies.
tf_idf_title = []
tf_idf_mean = []
tag_list = [doc['tags'] for doc in noun_text]
for i, nouns in enumerate(noun_title):
    doc = noun_text[i]
    _tfidf = []
    for req in nouns:
        tf = TF(req, doc['num_most_frequent'], doc['tags'])
        idf = IDF(req, tag_list)
        _tfidf.append(tf * idf)
    tf_idf_title.append(_tfidf)
    tf_idf_mean.append(np.mean(_tfidf))

print("===[TF-IDF values]==========================================================================================")
print(tf_idf_title)

print("===[TF-IDF mean]==========================================================================================")
for mean_value, entry in zip(tf_idf_mean, article_list):
    print("TF-IDF : ", mean_value, " , title : ", entry['title'])

[[1.563046745568005, 0.9769042159800031, 0.6275928274458286, 1.322396336642491, 2.4771212547196626, 2.4771212547196626], [2.4771212547196626, 2.4771212547196626, 2.4771212547196626, 0.9413892411687428, 1.2000434176367587, 2.4771212547196626, 0.7350118016707676, 0.8450462237877048, 0.9231103212590451, 0.9769042159800031], [1.8316954049625058, 1.7095823779650055, 1.239746565602335, 2.4771212547196626, 1.3432432969725043, 1.3432432969725043, 1.4769765140144722, 2.4771212547196626, 0.5545412158936895], [1.056307779734631, 0.7350118016707676, 1.3520739580603276, 0.5468906710579737], [1.4769765140144722, 0.6760369790301638, 0.6161569065485439, 0.7384882570072361, 0.6275928274458286, 0.9642473288018163, 0.8264977104015567, 2.4771212547196626], [1.1722850591760037, 0.6760369790301638, 0.5880094413366141, 0.7384882570072361, 2.4771212547196626, 1.9538084319600062, 2.4771212547196626, 2.4771212547196626, 0.7056113296039369, 2.4771212547196626], [1.9538084319600062, 0.9769042159800031, 1.81425068

In [16]:

# 6. Build a word dictionary from five tf-idf-selected nouns of each title
#    and the five most frequent nouns of each document.
# NOTE(review): np.argsort sorts ascending, so [:5] selects the five LOWEST
# tf-idf scores although this step is described as "top 5" -- confirm intent.
wordset = set()
for scores, nounlist in zip(tf_idf_title, noun_title):
    wordset.update(nounlist[w] for w in np.argsort(np.array(scores))[:5])

for doc in noun_text:
    wordset.update(entry['tag'] for entry in doc['tags'][:5])

# Ids start at 1; step 7 uses 0 as padding.
word_dict = {word: idx for idx, word in enumerate(wordset, start=1)}
print("===[Word dictionary]==========================================================================================")
print(word_dict)


{'이민기': 1, '씨': 431, '상대': 2, '분위기': 3, '수급': 282, '더': 4, '의': 283, '장자연': 284, '복': 5, '의혹': 6, '제외': 7, '사이판': 286, '사': 287, '우승': 436, '우리': 8, '공공기관': 451, '장동민': 9, '첫': 10, '미모': 290, '경북': 11, '아빠': 12, '득점': 291, '힐': 13, '속': 14, '체험': 15, '신부': 16, '제이': 17, '보육료': 516, '것': 534, '강제': 351, '준비': 199, '핑크': 18, '못': 19, '스릴러': 408, '관광객': 295, '하우스': 53, '형님': 492, '오늘': 441, '시인': 297, '의료': 329, '홈런': 298, '징계': 20, '감사': 22, '회': 300, '역전': 301, '케미': 23, '공포': 303, '역사': 24, '팬': 25, '축구': 305, '여명': 28, '귀국': 29, '유족': 306, '뿔': 150, '방향': 333, '자매': 30, '세계': 307, '숨바꼭질': 31, '샌즈': 331, '모비스': 32, '안이': 308, '발생': 309, '연결': 310, '세': 34, '이벤트': 311, '시청자': 312, '강병철': 35, '보트': 313, '전략': 314, '회장': 37, '골': 38, '찬스': 153, '연기': 316, '괴물': 39, '참가': 317, '요원': 40, '켈리': 288, '중국': 505, '시티': 319, '시리즈': 101, '조사': 41, '지사': 320, '김규민': 42, '손병호': 289, '년': 43, '평가': 44, '판결': 321, '레인보우': 45, '이닝': 46, '최혜용': 322, '송승헌': 323, '발전': 47, '두': 324, '장애인': 526, '생가': 326

In [17]:
# 7. Express the tf-idf-selected title nouns and each document's five most
#    frequent nouns as vectors of word-dictionary ids.
# NOTE(review): np.argsort sorts ascending, so [:5] selects the five LOWEST
# tf-idf scores although this step is described as "top 5" -- confirm intent.
VEC_SIZE = 5  # fixed vector length; shorter vectors are padded with 0

title_vec = []
for n, nounlist in enumerate(noun_title):
    picked = np.argsort(np.array(tf_idf_title[n]))[:VEC_SIZE]
    _tmp = [word_dict[nounlist[w]] for w in picked]
    _tmp += [0] * (VEC_SIZE - len(_tmp))
    title_vec.append(_tmp)
print("===[Word vector : title]==========================================================================================")
print(title_vec)

text_vec = []
for doc in noun_text:
    _tmp = [word_dict[entry['tag']] for entry in doc['tags'][:VEC_SIZE]]
    # Pad exactly like title_vec so both vectors always have the same length;
    # otherwise a document with fewer than five nouns breaks np.dot later.
    _tmp += [0] * (VEC_SIZE - len(_tmp))
    text_vec.append(_tmp)
print("===[Word vector : text]==========================================================================================")
print(text_vec)


[[79, 394, 279, 274, 445], [465, 404, 535, 305, 244], [449, 145, 179, 532, 216], [222, 420, 443, 186, 0], [181, 207, 463, 77, 252], [118, 452, 436, 423, 39], [350, 219, 232, 38, 112], [496, 176, 56, 375, 332], [104, 272, 248, 84, 331], [414, 368, 509, 217, 133], [327, 172, 215, 369, 64], [181, 146, 436, 301, 402], [211, 146, 140, 56, 105], [362, 95, 268, 30, 30], [288, 198, 176, 300, 162], [414, 230, 298, 470, 0], [347, 104, 256, 415, 478], [152, 173, 122, 368, 400], [498, 518, 260, 376, 526], [181, 101, 207, 77, 161], [465, 88, 535, 131, 297], [169, 167, 107, 107, 186], [500, 44, 7, 241, 484], [256, 101, 117, 331, 345], [356, 206, 507, 381, 460], [500, 104, 288, 443, 46], [165, 176, 443, 186, 196], [454, 104, 190, 328, 121], [124, 439, 480, 294, 334], [152, 207, 298, 77, 403], [496, 533, 176, 35, 491], [441, 189, 529, 424, 424], [135, 442, 419, 377, 127], [138, 361, 366, 388, 269], [424, 189, 441, 370, 43], [138, 354, 366, 388, 269], [433, 456, 31, 51, 191], [424, 97, 43, 437, 43], [2

In [18]:
# 8. Compute the cosine similarity between each title vector and the
#    vector of the matching document.
cosine_similarities = list(map(cos_sim, title_vec, text_vec))
print("===[cosine similarities]==========================================================================================")
print(cosine_similarities)


[0.7798837517576797, 0.9328443673041937, 0.7716549600272621, 0.8592112031606849, 0.794851092928134, 0.6784230744585701, 0.8097472126215994, 0.7082873448285695, 0.8933540512513954, 0.7843373119199372, 0.9664603567362667, 0.7679581001068876, 0.9103472460767658, 0.6026900634755926, 0.9108618321202863, 0.7292016164329012, 0.8501359965973685, 0.8965232770649266, 0.9889295437093775, 0.8489207030861788, 0.8848305971263593, 0.8075796788201802, 0.7044451266601084, 0.879914830145652, 0.7713372799790672, 0.6780502164325031, 0.8516372622530838, 0.5973983850455629, 0.9435629179403634, 0.9420798462596506, 0.8291868081424435, 0.8305353716845278, 0.6438385410019697, 0.9085354205417842, 0.9454729521072196, 0.8297160459658559, 0.7746582445317203, 0.8752925059525524, 0.6979798572883765, 0.8947006791587476, 0.7796235332951206, 0.8695046601206158, 0.6255832492061518, 0.6142184688951633, 0.9120091690500083, 0.7859195152181807, 0.7210943788621873, 0.4550930311459055, 0.8469365348065956, 0.6709883466176765, 0

In [19]:
# 9. Print the articles with the largest and the smallest mean TF-IDF.
idx_tfidf = np.argsort(np.array(tf_idf_mean))  # article indices, ascending by mean

print("\n===[Top3 : Largest TF-IDF mean]==========================================================================================")
for rank, i in enumerate(idx_tfidf[::-1][:3], start=1):
    entry = article_list[i]
    print("(", rank, ") : ", tf_idf_mean[i], " , title : ", entry['title'])
    print("\t link : ", entry['link'])

print("===[Top3 : Smallest TF-IDF mean]==========================================================================================")
for rank, i in enumerate(idx_tfidf[:3], start=1):
    entry = article_list[i]
    print("(", rank, ") : ", tf_idf_mean[i], " , title : ", entry['title'])
    print("\t link : ", entry['link'])



( 1 ) :  2.4771212547196626  , title :  [화보] 오윤아, 치마 걷어 올리며...'섹시미 폭발'
	 link :  http://article.joins.com/news/article/article.asp?total_id=23072597&cloc=rss|most_view|ent_sports
( 2 ) :  2.227085081596386  , title :  수학 가르친다며 여학생 허벅지에 분필로 낙서한 선생님
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071408&cloc=rss|most_view|society
( 3 ) :  2.1544083298410843  , title :  "사랑한다면 이들처럼" '뷰티 인사이드' 서현진♥이민기, 물오른 꽁냥꽁냥
	 link :  http://article.joins.com/news/article/article.asp?total_id=23072205&cloc=rss|most_view|ent_sports
( 1 ) :  0.922571052630925  , title :  '가을 남자' 박정권 "힘 빼고 즐기면 됩니다"
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071306&cloc=rss|most_view|sports
( 2 ) :  0.9730057759438733  , title :  유상무 결혼식 참석한 장동민 "혼자 남았다…결혼 너무 하고파"
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071683&cloc=rss|most_view|ent_sports
( 3 ) :  1.05038972224616  , title :  다저스 1승3패 벼랑 끝...류현진 등판 불투명
	 link :  http://article.joins.com/news

In [20]:
# 10. Print the articles with the largest and the smallest cosine similarity.
# The index array reuses the name idx_tfidf, but here it orders the articles
# by cosine similarity (ascending).
idx_tfidf = np.argsort(np.array(cosine_similarities))

print("\n===[Top3 : Largest Cosine Similarity]==========================================================================================")
for rank, i in enumerate(idx_tfidf[::-1][:3], start=1):
    entry = article_list[i]
    print("(", rank, ") : ", cosine_similarities[i], " , title : ", entry['title'])
    print("\t link : ", entry['link'])

print("===[Top3 : Smallest Cosine Similarity]==========================================================================================")
for rank, i in enumerate(idx_tfidf[:3], start=1):
    entry = article_list[i]
    print("(", rank, ") : ", cosine_similarities[i], " , title : ", entry['title'])
    print("\t link : ", entry['link'])



( 1 ) :  0.9889295437093775  , title :  장애인체육회, 개발도상국 초청 장애인스포츠 개발캠프 개최
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071939&cloc=rss|most_view|sports
( 2 ) :  0.9664603567362667  , title :  5명 연장에 실수 연발... 집 가려다 피말리게 'KPGA 첫 정상' 오른 박성국
	 link :  http://article.joins.com/news/article/article.asp?total_id=23072309&cloc=rss|most_view|sports
( 3 ) :  0.9613196233914822  , title :  재판거래 의혹 핵심 ‘일제 강제징용’ 소송, 김명수 대법원 판단은?
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071378&cloc=rss|most_view|society
( 1 ) :  0.23008655596609331  , title :  "예쁜애X예쁜애" 구구단 개인+단체 개성만점 오피셜 포토 공개
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071505&cloc=rss|most_view|ent_sports
( 2 ) :  0.3336026700946537  , title :  "사랑해요, 영원히요" 80세 할아버지의 50년 만의 프러포즈
	 link :  http://article.joins.com/news/article/article.asp?total_id=23071954&cloc=rss|most_view|society
( 3 ) :  0.4336351270711635  , title :  이재명의 한탄 “촛불 정부의 경찰 맞나…국민 법정 맡긴다”
	 link :  htt