### LDA로 기사 내용을 바탕으로 주제 찾기 ( 비지도학습)

In [None]:
import pandas as pd

In [2]:
npr = pd.read_csv('./data/npr.csv')

In [3]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# npr['Article'][0]
len(npr)

11992

### 전처리

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# CountVectorizer(문서에서 제외되는 비율, 문서 개수, 불용어 자동 제거 설정)
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english') 

In [9]:
dtm = cv.fit_transform(npr['Article']) # 행렬작업

In [10]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
# LatentDirichletAllocation(주성분개수, 랜덤지정 , 
# 주성분개수는 카테고리를 분류하는 것을 의미 더 세분화를 원한다면 숫자를 키워도 된다
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [13]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [None]:
# Grab the vocabulary of words

In [14]:
len(cv.get_feature_names())

54777

In [15]:
type(cv.get_feature_names())

list

In [19]:
import random

randomr_word_id = random.randint(0,54777)

cv.get_feature_names()[10]

'01'

In [None]:
# Grab the topics

In [20]:
len(LDA.components_)

7

In [21]:
type(LDA.components_)

numpy.ndarray

In [22]:
LDA.components_.shape

(7, 54777)

In [23]:
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [24]:
single_topic = LDA.components_[0]

In [25]:
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [26]:
import numpy as np

In [27]:
arr = np.array([10,200,1])

In [28]:
arr

array([ 10, 200,   1])

In [29]:
arr.argsort()

array([2, 0, 1], dtype=int64)

In [31]:
# ARGSORT ---> INDEX POSITINS SORTED FROM LEAST --> GREATEST
# TOP 10 VALUES ( 10 GREATEST VALUES )
# LAST 10 VALUES of ARGSORT()
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993], dtype=int64)

In [32]:
top_ten_words = single_topic.argsort()[-10:]

In [33]:
for index in top_ten_words:
    print(cv.get_feature_names()[index])

new
percent
government
company
million
care
people
health
said
says


In [None]:
# Grab the highest probability words per topic

In [37]:
for i,topic in enumerate(LDA.components_):
    print(f"THE TOP IS WORDS FOR TOPIC #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')

THE TOP IS WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




THE TOP IS WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




THE TOP IS WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




THE TOP IS WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




THE TOP IS WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




THE TOP IS WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know'

In [38]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [39]:
topic_results = LDA.transform(dtm)

In [41]:
topic_results[0].argmax()

1

In [42]:
npr['Topic'] = topic_results.argmax(axis=1)

In [43]:
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2
