In [5]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:100% !important;}
div.CodeMirror {font-family:Consolas; font-size:10pt;}
div.output {font-size:10pt;}
div.input {font-family:Consolas; font-size:10pt; font-weight:bold;}
div.prompt {min-width:70px;}
</style>
"""))

**<font size='6' color='darkred'>ch02. 한글 형태소 분석</font>**

# 연관분석 개요 
- 데이터들 사이에서 자주 발생하는 속성을 찾고 
  그 속성들 사이에 어느 정도 연관이 있는지를 분석하는 방법
- 활용분야: 상품 진열, 사기보험 적발, 카탈로그 디자인, 신상품 카테고리 구성 ....
- 연관성 분석 관련 지표: 
    - 지지도(support) = 조건 결과 항목 수 / 전체 수
    - 신뢰도(confidence) = 조건 결과 항목 수 / 조건 항목 수
    - 향상도(lift) = 조건 결과 지지도 / (조건 지지도) * (결과 지지도)
                    (1: 상관관계 없음, >1: 양의 상관관계, <1: 음의 상관관계)

<pre>
[조건] => [결과] 지지도 신뢰도 향상도
[주스] => [생수]  0.4    1   0.4 / (0.4)*1 =1 
[소주] => [맥주]  0.2   0.233.. 0.833... 
</pre>

In [6]:
# 트랜잭션 데이터 가져오기 
import csv

transaction = []
with open('data/cf_basket.csv', 'r', encoding='utf-8') as cf:
    csvdata = csv.reader(cf)
    
    # transaction = list(csvdata) 리스트로 형변환 
    for row in csvdata: # 혹은 for문 돌리기
        transaction.append(row)
transaction

[['소주', '콜라', '와인'],
 ['소주', '오렌지주스', '콜라'],
 ['콜라', '맥주', '와인'],
 ['소주', '콜라', '맥주'],
 ['오렌지주스', '와인']]

# 연관 분석 

## 연관 규칙 생성

In [7]:
# pip install apyori
from apyori import apriori # 연관성 규칙 생성 

rules = apriori(transaction,
                min_support=0.2, 
                # 지지도가 0.2 이상일 경우만 연관분석해
                min_confidence=0.1)
rules = list(rules)
len(rules)

18

In [8]:
rules[1]

RelationRecord(items=frozenset({'소주'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'소주'}), confidence=0.6, lift=1.0)])

In [11]:
print('조건 => 결과 \t 지지도 \t 신뢰도 \t 향상도')

for row in rules:
    support = row[1]
    ordered_st = row[2]
    for item in ordered_st:
        # 조건 (left hand side)
        lhs = ', '.join(x for x in item[0])
        lhs = [x for x in item[0]]
        # 결과 (right hand side)
        rhs = ', '.join(x for x in item[1])
        rhs = [x for x in item[1]]
        confidence = item[2]
        lift = item[3]
        
        if lift != 1:
            print(lhs, '=>', rhs, 
                  '\t{:5.3f}\t{:5.3f}\t{:5.3f}'.format(
                      support, confidence, lift))

조건 => 결과 	 지지도 	 신뢰도 	 향상도
['맥주'] => ['소주'] 	0.200	0.500	0.833
['소주'] => ['맥주'] 	0.200	0.333	0.833
['맥주'] => ['와인'] 	0.200	0.500	0.833
['와인'] => ['맥주'] 	0.200	0.333	0.833
['맥주'] => ['콜라'] 	0.400	1.000	1.250
['콜라'] => ['맥주'] 	0.400	0.500	1.250
['소주'] => ['오렌지주스'] 	0.200	0.333	0.833
['오렌지주스'] => ['소주'] 	0.200	0.500	0.833
['소주'] => ['와인'] 	0.200	0.333	0.556
['와인'] => ['소주'] 	0.200	0.333	0.556
['소주'] => ['콜라'] 	0.600	1.000	1.250
['콜라'] => ['소주'] 	0.600	0.750	1.250
['오렌지주스'] => ['와인'] 	0.200	0.500	0.833
['와인'] => ['오렌지주스'] 	0.200	0.333	0.833
['오렌지주스'] => ['콜라'] 	0.200	0.500	0.625
['콜라'] => ['오렌지주스'] 	0.200	0.250	0.625
['와인'] => ['콜라'] 	0.400	0.667	0.833
['콜라'] => ['와인'] 	0.400	0.500	0.833
['맥주'] => ['콜라', '소주'] 	0.200	0.500	0.833
['소주'] => ['콜라', '맥주'] 	0.200	0.333	0.833
['콜라'] => ['소주', '맥주'] 	0.200	0.250	1.250
['소주', '맥주'] => ['콜라'] 	0.200	1.000	1.250
['콜라', '맥주'] => ['소주'] 	0.200	0.500	0.833
['콜라', '소주'] => ['맥주'] 	0.200	0.333	0.833
['맥주'] => ['콜라', '와인'] 	0.200	0.500	1.250
['와인'] => ['콜

# 뉴스기사 연관 분석 실습 

## 뉴스 RSS를 이용하여 기사 검색 후 연관분석 

In [12]:
import requests 
from bs4 import BeautifulSoup

rss_url = 'https://rss.joins.com/joins_money_list.xml'
money_response = requests.get(rss_url)
money_soup = BeautifulSoup(money_response.content, 'xml')
link_list = money_soup.select('item > link')
# link_list = [l.text for l in link_list]
len(link_list)

30

In [13]:
from konlpy.tag import Kkma

kkma = Kkma()
news = []

for link in link_list:
    news_response = requests.get(link.text)
    news_soup = BeautifulSoup(news_response.content, 'html.parser')
    content = news_soup.select_one('div#article_body').text
    nouns_list = list(filter(
        lambda word : len(word)>1, kkma.nouns(content)))
    news.append(nouns_list)
news

[['비트',
  '비트코',
  '이미지',
  '중앙',
  '중앙포토',
  '포토',
  '암호화',
  '암호화폐',
  '비트코인',
  '사상',
  '처음',
  '개당',
  '7500',
  '7500만원',
  '업비트',
  '3일',
  '오후',
  '7502',
  '7502만원',
  '거래',
  '기관',
  '기관투자',
  '투자',
  '가세',
  '연초',
  '대비',
  '일부',
  '투기',
  '투기자산',
  '자산',
  '피해',
  '비관론',
  '암호',
  '화폐',
  '거래소',
  '4시',
  '1비트코인은',
  '인은',
  '7502만2000원',
  '2000',
  '1.58',
  '상승',
  '1비트코',
  '가격',
  '이번',
  '시간',
  '비트코인은',
  '7477',
  '7477만8000원',
  '8000',
  '최고가',
  '기록',
  '주식',
  '주식시장',
  '시장',
  '단위',
  '때문',
  '종류',
  '암호화폐라도',
  '라도',
  '차이',
  '미국',
  '5만9373달러',
  '9373',
  '달러',
  '6703',
  '6703만원',
  '데스크',
  '1주일',
  '주일',
  '15',
  '가량',
  '최근',
  '30',
  '비교',
  '상승세',
  '글로벌',
  '기관투자가',
  '투자가',
  '유입',
  '풀이',
  '최대',
  '투자은행',
  '은행',
  '골드',
  '골드만삭',
  '만삭',
  '지난달',
  '31',
  '31일',
  '현지',
  '현지시간',
  '6월',
  '투자상품',
  '상품',
  '발표',
  '메리',
  '리치',
  '신임',
  '골드만',
  '지털자산',
  '대표',
  '인터뷰',
  '투자자산',
  '2500',
  '2500만',
  '282',
  '282억원',
  '이상',
  '개인',
  '고객

In [14]:
from apyori import apriori

rules = apriori(news, 
                min_support=0.3,
                min_confidence=0.2)
result = list(rules)

In [15]:
for r in result:
    print(r, end='\n\n')

RelationRecord(items=frozenset({'10'}), support=0.4, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'10'}), confidence=0.4, lift=1.0)])

RelationRecord(items=frozenset({'1000'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1000'}), confidence=0.3333333333333333, lift=1.0)])

RelationRecord(items=frozenset({'19'}), support=0.36666666666666664, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'19'}), confidence=0.36666666666666664, lift=1.0)])

RelationRecord(items=frozenset({'1일'}), support=0.3, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1일'}), confidence=0.3, lift=1.0)])

RelationRecord(items=frozenset({'2019'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'2019'}), confidence=0.3333333333333333, lift=1.0)])

RelationRecord(items=frozenset({'2019년'}

In [16]:
result[3]

RelationRecord(items=frozenset({'1일'}), support=0.3, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'1일'}), confidence=0.3, lift=1.0)])

In [20]:
# 위의 결과를 데이터프레임으로 
import pandas as pd 
result_df = pd.DataFrame(None, 
                         columns=['lhs', 'rhs', 
                                  'support', 'confidence', 'lift'])
index = 0
for row in result:
    support = row[1]
    ordered_st = row[2]
    
    for item in ordered_st:
        lhs = ','.join(x for x in item[0])
        rhs = ','.join(x.strip() for x in item[1])
        confidence = item[2]
        lift = item[3]
        
        if lift != 1:
            result_df.loc[index] = [lhs, rhs, support, confidence, lift]
            index += 1
result_df.head(10)

Unnamed: 0,lhs,rhs,support,confidence,lift
0,10,관련,0.333333,0.833333,1.25
1,관련,10,0.333333,0.5,1.25
2,10,기사,0.3,0.75,1.5
3,기사,10,0.3,0.6,1.5
4,10,기자,0.3,0.75,1.022727
5,기자,10,0.3,0.409091,1.022727
6,10,사진,0.3,0.75,1.40625
7,사진,10,0.3,0.5625,1.40625
8,10,지난해,0.3,0.75,1.40625
9,지난해,10,0.3,0.5625,1.40625


In [23]:
pd.options.display.max_rows = 530
result_df

Unnamed: 0,lhs,rhs,support,confidence,lift
0,10,관련,0.333333,0.833333,1.25
1,관련,10,0.333333,0.5,1.25
2,10,기사,0.3,0.75,1.5
3,기사,10,0.3,0.6,1.5
4,10,기자,0.3,0.75,1.022727
5,기자,10,0.3,0.409091,1.022727
6,10,사진,0.3,0.75,1.40625
7,사진,10,0.3,0.5625,1.40625
8,10,지난해,0.3,0.75,1.40625
9,지난해,10,0.3,0.5625,1.40625


In [24]:
result_df.loc[(result_df.lhs.str.contains('기자'))].sort_values(
    by=['lift'],
    ascending=False)                                                         

Unnamed: 0,lhs,rhs,support,confidence,lift
228,"기자,2019",2019년,0.3,1.0,3.0
229,"기자,2019년",2019,0.3,1.0,3.0
466,"기자,코로나19","19,코로나",0.3,1.0,3.0
468,"기자,19,코로나",코로나19,0.3,1.0,3.0
199,"기자,코로나19",19,0.3,1.0,2.727273
471,"기자,코로나19,코로나",19,0.3,1.0,2.727273
462,"기자,19","코로나19,코로나",0.3,0.9,2.7
197,"기자,19",코로나19,0.3,0.9,2.7
465,"기자,코로나","코로나19,19",0.3,0.818182,2.454545
419,"기자,코로나",코로나19,0.3,0.818182,2.454545


In [25]:
result_df.loc[(result_df.lhs.str.contains('기자')) &
              result_df.rhs.str.contains('한국')].sort_values(
by=['lift'], 
ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
401,"기자,때문",한국,0.3,0.642857,1.607143
398,기자,"한국,때문",0.3,0.409091,1.363636
134,기자,한국,0.333333,0.454545,1.136364


In [27]:
# 연습해보기 
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Kkma

kkma = Kkma()

test_url = 'https://search.daum.net/search?nil_suggest=btn&w=tot&DA=SBC&q=%EC%BD%94%EB%A1%9C%EB%82%98'
test_response = requests.get(test_url)
test_soup = BeautifulSoup(test_response.content, 'html.parser')
text_content = test_soup.select('.f_eb')

news = []

for text in text_content:
    news.append(list(filter(
        lambda word : len(word)>1 and word != '기자',
        kkma.nouns(text.text))))

In [28]:
news_nouns = set()

for new in news:
    for n in new:
        news_nouns.add(n)
        
print('코로나 기사에 나오는 단어들: ', news_nouns)
print(len(news_nouns))

코로나 기사에 나오는 단어들:  {'한번', '국외발생현황', '아픔', '가운데', '3월', '생계', '공개', '참조', '센터', '현황', '이달', '번호', '지역콜센터120', '스감염증-19', '29일', '38', '기체', '10', '400', '할리우드', '1년', '오늘', '사실', '자료', '제공', '고온', '증상완화', '길거리', '1339콜센터', '관할보건소', '실시간', '바이러스', '다음달', '38도', '진료', '코로나19', '부곰', '사람', '태양', '박창', '기네스', '대상', '반응', '전리', '용어', '국외', '2020년', '400명대', '단순', '방문자', '완치', '콜센터', '노출', '차례', '존스', '아빠', '성공', '2천', '발생', '자제', '나눔', '이후', '강도', '수칙', '영역', '남자', '청년', '수입', '신규', '의료기관', '검색어', '초고온', '유행성', '예방수칙', '사회적', '결과', '고공행진', '의식', '진자', '반품', '스트레스', '전세', '가능하다', '백신접종', '광고노출기준', '무엇', '이상', '2천7백', '쇼핑', '7377', '질병관리청', '영국', '공지사항', '동기', '성혜미', '건강보험', '눈길', '이동', '상담', '네덜란드', '생각', '유세', '병원', '관심', '상태', '2019년', '코로나바', '00시', '당일', '감염증', '이러스', '끝나도', '29', '2019', '1월', '토요일', '동반', '스감', '중국산', '남성', '상위', '증상', '지역국번', '12', '염증', '전계', '양성', '논란', '폐렴', '문제', '한국', '로이터', '가지', '고함', '한시', '60', '캠페인', '경과', '증강', '앵커', '6월', '닷새', '마스크', '건강', '지역', '가스', '연관성'

In [29]:
rules = apriori(news, 
                min_support=0.2,
                min_confidence=0.2)
result = list(rules)
result

[RelationRecord(items=frozenset({'코로나'}), support=0.5263157894736842, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'코로나'}), confidence=0.5263157894736842, lift=1.0)])]

In [33]:
from gensim.models import Word2Vec

model = Word2Vec(news, size=100, 
                 window=5, min_count=2, 
                 workers=-1)

TypeError: __init__() got an unexpected keyword argument 'size'

In [32]:
model.wv.most_similar('감염증')

NameError: name 'model' is not defined