# 필요한 패키지 가져오기

In [2]:
import os
import numpy as np

import spacy
from spacy import displacy

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet

import matplotlib.pyplot as plt
import sklearn
import tensorflow.keras as keras

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

# 데이터 Gathering하기

In [6]:
# Locate the Lee background corpus inside the installed gensim package
# (<gensim package dir>/test/test_data/lee_background.cor).
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
print(test_data_dir)
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
print(lee_train_file)

# Read the whole corpus into a single string. The context manager closes the
# file handle as soon as the read finishes. The encoding is given explicitly so
# the result does not depend on the platform's default; ISO-8859-1 maps every
# byte to a character, so decoding cannot fail.
# NOTE(review): presumably the file is plain ASCII/Latin-1 -- confirm.
with open(lee_train_file, encoding='iso-8859-1') as corpus_file:
    text = corpus_file.read()

c:\ProgramData\Anaconda3\envs\py37\lib\site-packages\gensim\test\test_data
c:\ProgramData\Anaconda3\envs\py37\lib\site-packages\gensim\test\test_data\lee_background.cor


In [7]:
# Preview only the beginning of the corpus; echoing the whole string would
# flood the cell output.
text[:500]



# 데이터 Cleaning하기

In [17]:
# Work around spacy.load('en') not working: download the English model.
# NOTE(review): `!python` runs whichever interpreter is first on PATH -- confirm
# it is the same environment as the notebook kernel.
!python -m spacy download en

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047113 sha256=b263b50086db2423cd8a32ad7db5eb89047c8eebe913d461ae93718c5af54e4d
  Stored in directory: C:\Users\soohan\AppData\Local\Temp\pip-ephem-wheel-cache-4qxmrmvb\wheels\b7\0d\f0\7ecae8427c515065d75410989e15e5785dd3975fe06e795cd9
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.3.1
c:\ProgramData\Anaconda3\envs\py37\lib\site-packages\spacy\data\en <<===>> c:\ProgramData\Anaconda3\envs\py37\lib\site-packages\en_core_web_sm에 대한 기호화된 링크를 만들었습니다.
[!] Skipping model package depende

In [18]:
# Load the small English pipeline by its package name. The 'en' shortcut is
# only a link to the en_core_web_sm package, and shortcut links no longer exist
# in spaCy 3, so the package name works across versions.
nlp = spacy.load('en_core_web_sm')

In [19]:
# Stop-word handling: register extra stop words on top of spaCy's built-in list.
# "'s" is the possessive/clitic token; the previous entry '\s' was an invalid
# escape sequence (a backslash followed by "s"), which is not a token that
# ordinary prose produces.
my_stop_words = ['say', "'s", 'mr', 'said', 'says', 'saying', 'today', 'be']
for stopword in my_stop_words:
    # Flag the vocabulary entry so that tokens with this text report is_stop.
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [20]:
# Lower-case the entire corpus, then run it through the spaCy pipeline as a
# single document.
doc = nlp(text.lower())

In [21]:
# Echo the parsed document to check the lower-cased text.
doc

hundreds of people have been forced to vacate their homes in the southern highlands of new south wales as strong winds today pushed a huge bushfire towards the town of hill top. a new blaze near goulburn, south-west of sydney, has forced the closure of the hume highway. at about 4:00pm aedt, a marked deterioration in the weather as a storm cell moved east across the blue mountains forced authorities to make a decision to evacuate people from homes in outlying streets at hill top in the new south wales southern highlands. an estimated 500 residents have left their homes for nearby mittagong. the new south wales rural fire service says the weather conditions which caused the fire to burn in a finger formation have now eased and about 60 fire units in and around hill top are optimistic of defending all properties. as more than 100 blazes burn on new year's eve in new south wales, fire crews have been called to new fire at gunning, south of goulburn. while few details are available at this

In [22]:
# Additional filtering: split the parsed corpus into one list of lemmas per
# article, where a newline token marks the end of an article. Stop words,
# punctuation, number-like tokens and whitespace-only tokens are dropped.
# (The corpus was lower-cased before parsing, so no token can equal 'I';
# that check is therefore omitted.)
texts, article = [], []

for word in doc:
    if word.text == '\n':
        # End of the current article: store it and start a new one.
        texts.append(article)
        article = []
    elif not (word.is_space or word.is_stop or word.is_punct or word.like_num):
        article.append(word.lemma_)

# Keep the final article even when the corpus does not end with a newline.
if article:
    texts.append(article)

In [39]:
# Learn frequent word pairs from the tokenised articles, then rewrite every
# article with the detected pairs merged into single "first_second" tokens.
# NOTE(review): `texts` is overwritten here, so running this cell a second time
# feeds already-merged tokens back into the phrase model.
bigram = gensim.models.Phrases(texts)
texts = [bigram[article_tokens] for article_tokens in texts]

In [41]:
# Inspect the tokens of the first article.
texts[0]

['hundred',
 'people',
 'force',
 'vacate',
 'home',
 'southern',
 'highland',
 'new_south',
 'wales',
 'strong',
 'wind',
 'push',
 'huge',
 'bushfire',
 'town',
 'hill',
 'new',
 'blaze',
 'near',
 'goulburn',
 'south',
 'west',
 'sydney',
 'force',
 'closure',
 'hume',
 'highway',
 '4:00pm',
 'aedt',
 'marked',
 'deterioration',
 'weather',
 'storm',
 'cell',
 'move',
 'east',
 'blue_mountain',
 'force',
 'authority',
 'decision',
 'evacuate',
 'people',
 'home',
 'outlying',
 'street',
 'hill',
 'new_south',
 'wales',
 'southern',
 'highland',
 'estimated',
 'resident',
 'leave',
 'home',
 'nearby',
 'mittagong',
 'new_south',
 'wales',
 'rural_fire',
 'service',
 'weather_condition',
 'cause',
 'fire_burn',
 'finger',
 'formation',
 'ease',
 'fire',
 'unit',
 'hill',
 'optimistic',
 'defend',
 'property',
 'blaze',
 'burn',
 'new',
 'year',
 'eve',
 'new_south',
 'wales',
 'fire',
 'crew',
 'call',
 'new',
 'fire',
 'gunning',
 'south',
 'goulburn',
 'detail',
 'available',
 'stag

In [42]:
# Assign an integer id to every distinct token, then encode each article as a
# bag of words: a list of (token id, token count) pairs.
dictionary = Dictionary(documents=texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

In [43]:
# Inspect the bag-of-words encoding of the second article.
corpus[1]

[(58, 1),
 (73, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (104, 3),
 (105, 1),
 (106, 2),
 (107, 2),
 (108, 1),
 (109, 1),
 (110, 1),
 (111, 1),
 (112, 1),
 (113, 1),
 (114, 1),
 (115, 1),
 (116, 1),
 (117, 1),
 (118, 1),
 (119, 1),
 (120, 3),
 (121, 1),
 (122, 1),
 (123, 1),
 (124, 2),
 (125, 1),
 (126, 4),
 (127, 1),
 (128, 1),
 (129, 1),
 (130, 1),
 (131, 1),
 (132, 1),
 (133, 1),
 (134, 2),
 (135, 1),
 (136, 1),
 (137, 1),
 (138, 1),
 (139, 3),
 (140, 3),
 (141, 1),
 (142, 1),
 (143, 1),
 (144, 1),
 (145, 3),
 (146, 2),
 (147, 1),
 (148, 2),
 (149, 2),
 (150, 1),
 (151, 1),
 (152, 1),
 (153, 1),
 (154, 1),
 (155, 1),
 (156, 1),
 (157, 1),
 (158, 1),
 (159, 1),
 (160, 1),
 (161, 2),
 (162, 1),
 (163, 1),
 (164, 1),
 (165, 2),
 (166, 1),
 (167, 1),
 (168, 1),
 (169, 1),
 (170, 1),
 (171, 1)]

# Topic Modeling

In [47]:
# Latent Semantic Indexing: fit a 10-topic model on the bag-of-words corpus,
# then display the first five topics.
lsi_model = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)
lsi_model.show_topics(num_topics=5)

[(0,
  '-0.217*"israeli" + -0.213*"palestinian" + -0.197*"arafat" + -0.179*"force" + -0.150*"official" + -0.149*"kill" + -0.143*"attack" + -0.141*"people" + -0.128*"government" + -0.128*"australian"'),
 (1,
  '0.320*"palestinian" + 0.306*"israeli" + 0.299*"arafat" + -0.174*"australia" + -0.171*"australian" + 0.157*"israel" + -0.154*"afghanistan" + 0.137*"sharon" + 0.134*"hamas" + 0.123*"west_bank"'),
 (2,
  '0.263*"afghanistan" + 0.232*"force" + -0.203*"fire" + 0.187*"al_qaeda" + 0.174*"bin_laden" + 0.147*"pakistan" + -0.141*"sydney" + -0.134*"test" + 0.129*"tora_bora" + 0.128*"afghan"'),
 (3,
  '-0.401*"fire" + -0.268*"area" + -0.194*"sydney" + 0.189*"australia" + -0.172*"firefighter" + -0.156*"south" + -0.155*"north" + -0.144*"wind" + -0.129*"new_south" + 0.126*"australian"'),
 (4,
  '0.232*"company" + -0.212*"test" + -0.212*"day" + 0.204*"union" + 0.174*"qantas" + 0.170*"australian" + -0.146*"match" + 0.138*"government" + 0.126*"worker" + -0.125*"wicket"')]

In [None]:
# Hierarchical Dirichlet Process: fit the model on the bag-of-words corpus and
# display its topics. The seed is fixed so that a Restart & Run All yields the
# same topics each time.
hdp_model = HdpModel(corpus=corpus, id2word=dictionary, random_state=42)
hdp_model.show_topics()