In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle = True, random_state=1, remove = ('headers','footers','quotes'))
documents = dataset.data
len(documents)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [3]:
documents

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n",
 "\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap

In [5]:
news_df = pd.DataFrame({'document':documents})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]"," ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x:x.lower())

In [7]:
news_df['clean_doc'][0]

'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# token to idx

In [10]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [11]:
import gensim
NUM_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word = dictionary, passes = 15)
topics = ldamodel.print_topics(num_words= 4)
for topic in topics:
    print(topic)

(0, '0.033*"window" + 0.010*"font" + 0.010*"picture" + 0.010*"xterm"')
(1, '0.016*"israel" + 0.014*"jews" + 0.013*"turkish" + 0.010*"israeli"')
(2, '0.014*"said" + 0.011*"people" + 0.007*"armenian" + 0.006*"went"')
(3, '0.008*"government" + 0.007*"would" + 0.006*"president" + 0.005*"people"')
(4, '0.011*"pain" + 0.008*"doctor" + 0.008*"disease" + 0.007*"gordon"')
(5, '0.012*"would" + 0.011*"know" + 0.011*"like" + 0.011*"thanks"')
(6, '0.012*"jesus" + 0.007*"bible" + 0.007*"christian" + 0.006*"believe"')
(7, '0.012*"health" + 0.010*"university" + 0.008*"april" + 0.008*"medical"')
(8, '0.013*"game" + 0.012*"team" + 0.010*"year" + 0.008*"play"')
(9, '0.014*"sale" + 0.012*"shipping" + 0.011*"offer" + 0.011*"condition"')
(10, '0.006*"plane" + 0.004*"frequency" + 0.004*"edge" + 0.004*"points"')
(11, '0.012*"file" + 0.010*"available" + 0.008*"files" + 0.007*"information"')
(12, '0.033*"space" + 0.013*"nasa" + 0.008*"launch" + 0.006*"earth"')
(13, '0.010*"would" + 0.006*"time" + 0.006*"like" +

In [12]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  """
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [13]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(1, 0.28665507), (2, 0.06539841), (8, 0.091876484), (9, 0.073227376), (19, 0.4707459)]
1 번째 문서의 topic 비율은 [(0, 0.02601102), (6, 0.20595236), (8, 0.029873868), (10, 0.12317073), (19, 0.5966993)]
2 번째 문서의 topic 비율은 [(1, 0.24323331), (13, 0.10208541), (19, 0.6409716)]
3 번째 문서의 topic 비율은 [(3, 0.52073103), (13, 0.4658361)]
4 번째 문서의 topic 비율은 [(8, 0.72035784), (17, 0.08195089), (19, 0.16620979)]


In [14]:
def make_topictable_per_doc(ldamodel, corpus, texts):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [15]:
topictable = make_topictable_per_doc(ldamodel, corpus, tokenized_doc)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,19.0,0.4707,"[(1, 0.28665355), (2, 0.06540131), (8, 0.09187..."
1,1,19.0,0.5967,"[(0, 0.02601104), (6, 0.20594878), (8, 0.02989..."
2,2,19.0,0.641,"[(1, 0.24323463), (13, 0.10205364), (19, 0.641..."
3,3,3.0,0.5207,"[(3, 0.5207168), (13, 0.4658504)]"
4,4,8.0,0.7204,"[(8, 0.7203827), (17, 0.08195822), (19, 0.1661..."
5,5,6.0,0.2681,"[(6, 0.26814297), (7, 0.24355154), (13, 0.1702..."
6,6,4.0,0.7582,"[(0, 0.011780929), (4, 0.7582356), (5, 0.06903..."
7,7,19.0,0.5228,"[(1, 0.20672245), (4, 0.11136618), (6, 0.10025..."
8,8,15.0,0.4864,"[(3, 0.08350716), (15, 0.48641747), (19, 0.407..."
9,9,13.0,0.4927,"[(5, 0.23401336), (8, 0.06831878), (11, 0.0284..."
