In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
# Fetch the NLTK resources used below: "stopwords" feeds the en_stop list and
# "wordnet" backs the wn.morphy lookups.
# NOTE(review): "punkt" is not referenced by any visible cell — confirm it is
# still needed.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Load the preprocessed 20 Newsgroups corpus (semicolon-separated CSV).
dataset = pd.read_csv('/content/20newsgroup_preprocessed.csv', delimiter=';')
# Only 'target' and 'text_cleaned' are used downstream, so drop rows that are
# missing one of those. A bare dropna() would also throw away documents whose
# only missing value sits in a column this notebook never reads.
dataset = dataset.dropna(subset=['target', 'text_cleaned'])

In [4]:
# Keep only the label column and the cleaned text; nothing else is used below.
dataset = dataset.loc[:, ['target', 'text_cleaned']]


In [5]:
# Tokenization: pull the cleaned texts out of the frame as a plain Python list.
text_cleaned = dataset['text_cleaned'].tolist()

def tokenize(text):
    """Split a text into whitespace-separated tokens.

    Args:
        text: the string to tokenize.

    Returns:
        A list of non-empty token strings; an empty list for blank input.
    """
    # str.split() without a separator splits on any whitespace run, so
    # repeated spaces no longer produce empty-string tokens and tabs or
    # newlines no longer glue two words together (split(' ') did both).
    return text.split()

# One token list per document, in the same order as text_cleaned.
text_tokenized = list(map(tokenize, text_cleaned))

In [6]:
# Extra high-frequency words to filter out, placed in front of NLTK's English
# stop-word list.
custom_stop_words = [
    "use", "one", "would", "write", "get", "also", "know", "make", "see",
    "may", "well", "work", "come", "even", "much", "must",
    "article", "take", "say", "like", "want", "could", "go", "dont", "think",
]
en_stop = custom_stop_words + nltk.corpus.stopwords.words('english')


In [7]:
from nltk.corpus import wordnet as wn
# Lemmatization and stop-word filtering
def steming_lemmatiation(word, stopwords):
    """Normalize one token, or reject it.

    The token is lower-cased, then looked up with ``wn.morphy``.

    Args:
        word: the token string to normalize.
        stopwords: container of words to reject; tested with ``in``.

    Returns:
        ``None`` when the lower-cased token has at most one character, is a
        stop word, or its ``wn.morphy`` result is a stop word. Otherwise the
        ``wn.morphy`` result, or the lower-cased token itself when
        ``wn.morphy`` returns ``None``.
    """
    lowered = word.lower()

    # Reject empty/single-character tokens and stop words before any lookup.
    if len(lowered) <= 1 or lowered in stopwords:
        return None

    base_form = wn.morphy(lowered)
    if base_form is None:
        # No base form was found; keep the lower-cased token unchanged.
        return lowered

    # The base form can itself be a stop word even if the surface form is not.
    return None if base_form in stopwords else base_form
    

In [8]:
def process_document(document, stopwords):
    """Normalize every token of a document and drop the rejected ones.

    Args:
        document: iterable of token strings.
        stopwords: iterable of hashable words to discard (list, set, ...).

    Returns:
        A list with the non-``None`` results of ``steming_lemmatiation`` for
        each token, in the original token order.
    """
    # Each token triggers up to two membership tests. On a list those are
    # linear scans, so convert to a set once per document for O(1) lookups.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    normalized = (steming_lemmatiation(word, stopword_set) for word in document)
    return [word for word in normalized if word is not None]
    

In [9]:
# Normalize and filter every tokenized text with the en_stop stop-word list.
documents = [process_document(tokens, en_stop) for tokens in text_tokenized]


### LDA準備

In [10]:
import gensim
from gensim import corpora

In [11]:
# Build the token <-> id vocabulary from the processed documents, then encode
# each document as a bag of words with dictionary.doc2bow.
dictionary = corpora.Dictionary(documents)
my_corpus = list(map(dictionary.doc2bow, documents))

In [12]:
# Inspect the vocabulary without dumping the whole token -> id mapping, which
# holds one entry per unique token and floods the cell output.
print("vocabulary size: {}".format(len(dictionary)))
print(dict(list(dictionary.token2id.items())[:20]))



In [13]:
# Show the bag-of-words encoding of the second document as a sanity check.
print(dictionary.doc2bow([w.lower() for w in documents[1]]))

[(11, 3), (18, 1), (22, 1), (26, 4), (29, 1), (35, 3), (36, 2), (41, 44), (43, 90), (49, 3), (63, 2), (66, 3), (67, 3), (68, 40), (69, 27), (73, 3), (74, 2), (76, 6), (81, 1), (83, 2), (99, 2), (100, 3), (103, 2), (109, 2), (110, 15), (111, 2), (112, 10), (117, 2), (126, 1), (127, 2), (130, 5), (157, 1), (162, 4), (171, 1), (173, 4), (174, 1), (179, 1), (180, 1), (181, 2), (183, 1), (190, 2), (195, 1), (197, 1), (204, 2), (210, 10), (211, 29), (212, 7), (215, 8), (224, 1), (230, 2), (235, 7), (236, 1), (239, 1), (241, 1), (242, 5), (244, 1), (253, 59), (257, 2), (258, 2), (270, 1), (274, 1), (281, 6), (289, 9), (291, 1), (299, 1), (315, 3), (318, 1), (319, 1), (328, 4), (329, 1), (335, 13), (338, 1), (341, 5), (344, 4), (352, 1), (357, 2), (360, 40), (364, 2), (367, 1), (371, 2), (372, 1), (377, 8), (378, 7), (380, 2), (388, 3), (396, 3), (403, 5), (405, 1), (408, 1), (409, 5), (413, 2), (414, 1), (425, 6), (426, 1), (429, 21), (434, 9), (438, 6), (441, 1), (449, 1), (450, 4), (451, 5)

In [14]:
# Indices of ten documents whose topic distributions are printed for each
# model further down. A fixed seed keeps the selection identical across
# kernel restarts, and replace=False prevents picking the same document twice
# (np.random.randint samples with replacement and was unseeded).
ten_documents = np.random.default_rng(0).choice(
    len(documents), size=min(10, len(documents)), replace=False
)

### LDA学習
トピック数20

In [15]:
# Train LDA with 20 topics (the dataset has 20 topics).
ldamodel = gensim.models.ldamodel.LdaModel(corpus=my_corpus,
                                           num_topics=20,
                                           id2word=dictionary,
                                           alpha=0.1,
                                           eta=0.1,
                                           # Fixed seed: without it the learned topics
                                           # differ on every run of the notebook.
                                           random_state=0,
                                           #minimum_probability=0.0
                                          )

In [16]:
# Formatted (topic id, weighted-terms string) pairs, ten terms per topic.
topics = ldamodel.print_topics(num_words=10)

for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))

(0, '0.011*"relations" + 0.005*"withdraw" + 0.004*"vehicle" + 0.004*"explosion" + 0.004*"propulsion" + 0.003*"spencer" + 0.003*"object" + 0.003*"guerilla" + 0.003*"seizure" + 0.003*"refuge"')
(1, '0.009*"people" + 0.008*"government" + 0.008*"right" + 0.007*"law" + 0.006*"us" + 0.005*"state" + 0.004*"god" + 0.004*"koresh" + 0.004*"israeli" + 0.004*"turkish"')
(2, '0.046*"armenian" + 0.032*"arab" + 0.019*"turkey" + 0.008*"armenia" + 0.006*"lebanon" + 0.006*"istanbul" + 0.005*"ottoman" + 0.004*"covenant" + 0.004*"azerbaijan" + 0.003*"azeri"')
(3, '0.021*"ra" + 0.006*"pt" + 0.006*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.006*"clh" + 0.005*"ar" + 0.005*"ne" + 0.005*"nhl" + 0.005*"de" + 0.004*"spy" + 0.004*"je"')
(4, '0.009*"bike" + 0.005*"gb" + 0.005*"humanitarian" + 0.003*"torch" + 0.003*"bayonet" + 0.003*"drive" + 0.003*"louis" + 0.003*"possess" + 0.003*"muscle" + 0.003*"highway"')
(5, '0.014*"dee" + 0.008*"msg" + 0.007*"mamma" + 0.005*"fox" + 0.004*"skin" + 0.004*"lethal" + 0.004*"faith" + 0

In [17]:
# Adjust the output for the report: one line per topic, each term rendered as
# `"word" (probability), `.
for topic in topics:
    # topic[1] is a string of `probability*"word"` terms joined by " + ".
    pairs = [term.split("*") for term in topic[1].split(" + ")]
    print("".join("{} ({}), ".format(pair[1], pair[0]) for pair in pairs))

"relations" (0.011), "withdraw" (0.005), "vehicle" (0.004), "explosion" (0.004), "propulsion" (0.004), "spencer" (0.003), "object" (0.003), "guerilla" (0.003), "seizure" (0.003), "refuge" (0.003), 
"people" (0.009), "government" (0.008), "right" (0.008), "law" (0.007), "us" (0.006), "state" (0.005), "god" (0.004), "koresh" (0.004), "israeli" (0.004), "turkish" (0.004), 
"armenian" (0.046), "arab" (0.032), "turkey" (0.019), "armenia" (0.008), "lebanon" (0.006), "istanbul" (0.006), "ottoman" (0.005), "covenant" (0.004), "azerbaijan" (0.004), "azeri" (0.003), 
"ra" (0.021), "pt" (0.006), "maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" (0.006), "clh" (0.006), "ar" (0.005), "ne" (0.005), "nhl" (0.005), "de" (0.005), "spy" (0.004), "je" (0.004), 
"bike" (0.009), "gb" (0.005), "humanitarian" (0.005), "torch" (0.003), "bayonet" (0.003), "drive" (0.003), "louis" (0.003), "possess" (0.003), "muscle" (0.003), "highway" (0.003), 
"dee" (0.014), "msg" (0.008), "mamma" (0.007), "fox" (0.005), "skin" (0.004), "let

In [18]:
# For each sampled document, list its topics from most to least probable.
for i in ten_documents:
    topic_distribution = ldamodel.get_document_topics(my_corpus[i])
    ranked_topics = sorted(topic_distribution, key=lambda pair: pair[1], reverse=True)
    print("document ID {}: {}".format(i, ranked_topics))


document ID 13252: [(16, 0.4273232), (8, 0.3202367), (17, 0.10877795), (19, 0.07297442), (1, 0.039432287)]
document ID 583: [(1, 0.35932994), (17, 0.3518394), (7, 0.24949825), (6, 0.015448269)]
document ID 13143: [(16, 0.47389945), (12, 0.2375425), (10, 0.14166427), (1, 0.094243735), (18, 0.02722323)]
document ID 1338: [(12, 0.41420206), (9, 0.22576697), (8, 0.020008089), (0, 0.020005696), (14, 0.02000503), (7, 0.020002376), (6, 0.020001825), (17, 0.020001445), (18, 0.020001262), (11, 0.020001246), (10, 0.020001091), (16, 0.020000938), (1, 0.020000806), (3, 0.02000035), (13, 0.020000335), (5, 0.0200002), (2, 0.020000089), (19, 0.020000089), (4, 0.020000061), (15, 0.020000044)]
document ID 1706: [(14, 0.39628842), (16, 0.3423063), (19, 0.11971604)]
document ID 3594: [(8, 0.66303587), (9, 0.14700867), (12, 0.13132578)]
document ID 12142: [(16, 0.44729504), (7, 0.19095881), (19, 0.15575399), (8, 0.09576512), (1, 0.057660054), (9, 0.021188905), (3, 0.014491826)]
document ID 12428: [(0, 0.1

トピック数11

In [19]:
# Train a second LDA model with 11 topics for comparison with the 20-topic one.
ldamodel_11 = gensim.models.ldamodel.LdaModel(corpus=my_corpus,
                                              num_topics=11,
                                              id2word=dictionary,
                                              alpha=0.1,       # optional: LDA hyperparameter alpha
                                              eta=0.1,         # optional: LDA hyperparameter beta
                                              # Fixed seed: without it the learned topics
                                              # differ on every run of the notebook.
                                              random_state=0,
                                              #minimum_probability=0.0    # optional: lower bound on the topic/word probabilities kept in the result
                                             )

In [20]:
# Formatted (topic id, weighted-terms string) pairs, ten terms per topic.
topics_11 = ldamodel_11.print_topics(num_words=10)

for topic_id, topic_terms in topics_11:
    print((topic_id, topic_terms))

(0, '0.004*"space" + 0.004*"system" + 0.004*"key" + 0.004*"file" + 0.004*"include" + 0.004*"information" + 0.003*"message" + 0.003*"address" + 0.003*"number" + 0.003*"program"')
(1, '0.006*"us" + 0.006*"government" + 0.005*"people" + 0.005*"state" + 0.004*"god" + 0.004*"law" + 0.004*"president" + 0.004*"israel" + 0.004*"group" + 0.003*"world"')
(2, '0.008*"israeli" + 0.007*"myers" + 0.006*"ms" + 0.006*"system" + 0.006*"village" + 0.004*"space" + 0.004*"enforcement" + 0.004*"university" + 0.003*"phone" + 0.003*"data"')
(3, '0.006*"atf" + 0.005*"tragedy" + 0.004*"nsa" + 0.004*"clipper" + 0.003*"extermination" + 0.003*"wiretap" + 0.003*"crypto" + 0.002*"kent" + 0.002*"launcher" + 0.002*"hst"')
(4, '0.008*"people" + 0.006*"time" + 0.004*"back" + 0.004*"good" + 0.004*"first" + 0.004*"going" + 0.003*"fbi" + 0.003*"try" + 0.003*"start" + 0.003*"way"')
(5, '0.019*"homosexual" + 0.017*"jehovah" + 0.015*"god" + 0.013*"jesus" + 0.011*"elohim" + 0.011*"christian" + 0.011*"christ" + 0.010*"ra" + 0.

In [21]:
# One line per topic, each term rendered as `"word" (probability), `.
for topic in topics_11:
    # topic[1] is a string of `probability*"word"` terms joined by " + ".
    pairs = [term.split("*") for term in topic[1].split(" + ")]
    print("".join("{} ({}), ".format(pair[1], pair[0]) for pair in pairs))

"space" (0.004), "system" (0.004), "key" (0.004), "file" (0.004), "include" (0.004), "information" (0.004), "message" (0.003), "address" (0.003), "number" (0.003), "program" (0.003), 
"us" (0.006), "government" (0.006), "people" (0.005), "state" (0.005), "god" (0.004), "law" (0.004), "president" (0.004), "israel" (0.004), "group" (0.004), "world" (0.003), 
"israeli" (0.008), "myers" (0.007), "ms" (0.006), "system" (0.006), "village" (0.006), "space" (0.004), "enforcement" (0.004), "university" (0.004), "phone" (0.003), "data" (0.003), 
"atf" (0.006), "tragedy" (0.005), "nsa" (0.004), "clipper" (0.004), "extermination" (0.003), "wiretap" (0.003), "crypto" (0.003), "kent" (0.002), "launcher" (0.002), "hst" (0.002), 
"people" (0.008), "time" (0.006), "back" (0.004), "good" (0.004), "first" (0.004), "going" (0.004), "fbi" (0.003), "try" (0.003), "start" (0.003), "way" (0.003), 
"homosexual" (0.019), "jehovah" (0.017), "god" (0.015), "jesus" (0.013), "elohim" (0.011), "christian" (0.011), "

In [22]:
# For each sampled document, list its topics from most to least probable.
for i in ten_documents:
    topic_distribution = ldamodel_11.get_document_topics(my_corpus[i])
    ranked_topics = sorted(topic_distribution, key=lambda pair: pair[1], reverse=True)
    print("document ID {}: {}".format(i, ranked_topics))

document ID 13252: [(2, 0.41433716), (4, 0.28378814), (1, 0.19932705), (8, 0.08768264)]
document ID 583: [(1, 0.5688382), (9, 0.22260456), (8, 0.19645245)]
document ID 13143: [(4, 0.5280931), (8, 0.17423673), (0, 0.15407218), (6, 0.13154821)]
document ID 1338: [(10, 0.51343477), (4, 0.2670157), (2, 0.024403865), (0, 0.02439755), (3, 0.024395596), (1, 0.024394577), (7, 0.024393618), (8, 0.024391463), (9, 0.02439104), (6, 0.024390938), (5, 0.024390886)]
document ID 1706: [(0, 0.47741747), (8, 0.44148627)]
document ID 3594: [(3, 0.5954526), (2, 0.3490617), (7, 0.027008735)]
document ID 12142: [(4, 0.68210995), (2, 0.27444836), (3, 0.017733814), (7, 0.01650746)]
document ID 12428: [(2, 0.33517218), (8, 0.31685078), (0, 0.2886805), (4, 0.04901603)]
document ID 366: [(8, 0.80620164), (1, 0.09721485), (9, 0.093044475)]
document ID 4176: [(2, 0.5326966), (8, 0.46096903)]


トピック数7

In [23]:
# Train a third LDA model with 7 topics for comparison with the larger models.
ldamodel_7 = gensim.models.ldamodel.LdaModel(corpus=my_corpus,
                                             num_topics=7,
                                             id2word=dictionary,
                                             alpha=0.1,       # optional: LDA hyperparameter alpha
                                             eta=0.1,         # optional: LDA hyperparameter beta
                                             # Fixed seed: without it the learned topics
                                             # differ on every run of the notebook.
                                             random_state=0,
                                             #minimum_probability=0.0    # optional: lower bound on the topic/word probabilities kept in the result
                                            )

In [24]:
# Formatted (topic id, weighted-terms string) pairs, ten terms per topic.
topics_7 = ldamodel_7.print_topics(num_words=10)

for topic_id, topic_terms in topics_7:
    print((topic_id, topic_terms))

(0, '0.008*"people" + 0.005*"us" + 0.004*"state" + 0.003*"right" + 0.003*"good" + 0.003*"government" + 0.003*"im" + 0.003*"time" + 0.003*"president" + 0.003*"child"')
(1, '0.011*"ra" + 0.004*"iranian" + 0.004*"pt" + 0.004*"tragedy" + 0.004*"pp" + 0.004*"thy" + 0.004*"de" + 0.003*"clh" + 0.003*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.003*"iran"')
(2, '0.005*"system" + 0.004*"bit" + 0.004*"email" + 0.004*"data" + 0.003*"chip" + 0.003*"db" + 0.003*"file" + 0.003*"address" + 0.003*"information" + 0.003*"please"')
(3, '0.010*"armenian" + 0.008*"turkish" + 0.007*"arab" + 0.007*"greek" + 0.006*"israel" + 0.005*"christian" + 0.005*"sin" + 0.005*"homosexual" + 0.004*"muslim" + 0.004*"source"')
(4, '0.006*"myers" + 0.005*"ms" + 0.004*"system" + 0.003*"new" + 0.003*"turk" + 0.003*"time" + 0.002*"space" + 0.002*"theory" + 0.002*"village" + 0.002*"science"')
(5, '0.005*"system" + 0.004*"israeli" + 0.004*"try" + 0.004*"drive" + 0.004*"need" + 0.003*"program" + 0.003*"time" + 0.003*"key" + 0.003*"im" + 

In [25]:
# One line per topic, each term rendered as `"word" (probability), `.
for topic in topics_7:
    # topic[1] is a string of `probability*"word"` terms joined by " + ".
    pairs = [term.split("*") for term in topic[1].split(" + ")]
    print("".join("{} ({}), ".format(pair[1], pair[0]) for pair in pairs))

"people" (0.008), "us" (0.005), "state" (0.004), "right" (0.003), "good" (0.003), "government" (0.003), "im" (0.003), "time" (0.003), "president" (0.003), "child" (0.003), 
"ra" (0.011), "iranian" (0.004), "pt" (0.004), "tragedy" (0.004), "pp" (0.004), "thy" (0.004), "de" (0.004), "clh" (0.003), "maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" (0.003), "iran" (0.003), 
"system" (0.005), "bit" (0.004), "email" (0.004), "data" (0.004), "chip" (0.003), "db" (0.003), "file" (0.003), "address" (0.003), "information" (0.003), "please" (0.003), 
"armenian" (0.010), "turkish" (0.008), "arab" (0.007), "greek" (0.007), "israel" (0.006), "christian" (0.005), "sin" (0.005), "homosexual" (0.005), "muslim" (0.004), "source" (0.004), 
"myers" (0.006), "ms" (0.005), "system" (0.004), "new" (0.003), "turk" (0.003), "time" (0.003), "space" (0.002), "theory" (0.002), "village" (0.002), "science" (0.002), 
"system" (0.005), "israeli" (0.004), "try" (0.004), "drive" (0.004), "need" (0.004), "program" (0.003), "time" (0.0

In [26]:
# For each sampled document, list its topics from most to least probable.
for i in ten_documents:
    topic_distribution = ldamodel_7.get_document_topics(my_corpus[i])
    ranked_topics = sorted(topic_distribution, key=lambda pair: pair[1], reverse=True)
    print("document ID {}: {}".format(i, ranked_topics))


document ID 13252: [(4, 0.46784708), (6, 0.26192757), (2, 0.26165858)]
document ID 583: [(0, 0.48012245), (6, 0.46218377), (3, 0.051604245)]
document ID 13143: [(0, 0.8923604), (4, 0.09897228)]
document ID 1338: [(3, 0.83779275), (2, 0.027046034), (6, 0.027035912), (5, 0.02703298), (4, 0.027032131), (1, 0.027030513), (0, 0.02702969)]
document ID 1706: [(5, 0.8121849), (2, 0.1410698)]
document ID 3594: [(2, 0.71239716), (5, 0.15682285), (1, 0.116336316)]
document ID 12142: [(4, 0.4809768), (5, 0.29073754), (0, 0.20870365), (1, 0.0156180775)]
document ID 12428: [(4, 0.5093836), (5, 0.38668838), (0, 0.098018415)]
document ID 366: [(6, 0.9544186), (0, 0.043365523)]
document ID 4176: [(5, 0.6721873), (2, 0.3242834)]


### 以下可視化

In [None]:
!pip install pyLDAvis

In [28]:
import pyLDAvis.gensim_models
# Turn on pyLDAvis's notebook display mode before preparing the visualizations.
pyLDAvis.enable_notebook()

  from collections import Iterable


In [29]:
# Prepare and show the pyLDAvis view of the 20-topic model.
# sort_topics=False presumably keeps the model's own topic numbering, so the
# panel lines up with the printed topic ids — TODO confirm.
lda_display = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=my_corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [30]:
# Prepare and show the pyLDAvis view of the 11-topic model.
# sort_topics=False presumably keeps the model's own topic numbering, so the
# panel lines up with the printed topic ids — TODO confirm.
lda_display_11 = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel_11,
    corpus=my_corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display_11)

In [31]:
# Prepare and show the pyLDAvis view of the 7-topic model.
# sort_topics=False presumably keeps the model's own topic numbering, so the
# panel lines up with the printed topic ids — TODO confirm.
lda_display_7 = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel_7,
    corpus=my_corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(lda_display_7)