In [1]:
import re
import os
import glob
import numpy as np
import pandas as pd
import nltk
import gensim
import gensim.corpora as corpora

from timeit import default_timer as timer
from datetime import timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swpark\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swpark\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
os.getcwd(), os.listdir()

('C:\\Users\\swpark\\JupyterLab\\연구자료_모델예제',
 ['.ipynb_checkpoints',
  'clustering.ipynb',
  'collaborative_filtering.ipynb',
  'data',
  'decision_tree.ipynb',
  'iris_tree_model.dot',
  'KNN.ipynb',
  'LDA.ipynb',
  'logistic_regression.ipynb',
  'similarity.ipynb',
  'SVM.ipynb'])

In [5]:
os.chdir('./data/bbc')

In [6]:
data = []

print("**START")
for i, theme in enumerate(os.listdir()):
    file_path = glob.glob(os.path.join(os.getcwd(), theme, "*.txt"))
    # reading text files from each directory
    print("-------------------------------")
    print("Collecting bbc {} news dataset".format(theme))
    start = timer()
    for files in file_path:
        try:
            with open(files, "r", encoding="utf-8") as f:
                data.append(f.read())
        except UnicodeDecodeError as e:
            print(e)
    end = timer()
    print("execution time: {} ".format(timedelta(seconds=end-start)))
    print("-------------------------------")
    print()
print("**END")

**START
-------------------------------
Collecting bbc business news dataset
execution time: 0:00:00.096936 
-------------------------------

-------------------------------
Collecting bbc entertainment news dataset
execution time: 0:00:00.066664 
-------------------------------

-------------------------------
Collecting bbc politics news dataset
execution time: 0:00:00.075731 
-------------------------------

-------------------------------
Collecting bbc README.TXT news dataset
execution time: 0:00:00 
-------------------------------

-------------------------------
Collecting bbc sport news dataset
execution time: 0:00:00.088593 
-------------------------------

-------------------------------
Collecting bbc tech news dataset
execution time: 0:00:00.069273 
-------------------------------

**END


In [7]:
len(data), data[0]

(2225,
 'Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.\n\nThe firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.\n\nTime Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will tr

In [8]:
# 그냥 나중에 편하게 쓰려고 csv 파일로 저장
df = pd.DataFrame(data, columns = ['contents'])
print(df)

                                               contents
0     Ad sales boost Time Warner profit\n\nQuarterly...
1     Dollar gains on Greenspan speech\n\nThe dollar...
2     Yukos unit buyer faces loan claim\n\nThe owner...
3     High fuel prices hit BA's profits\n\nBritish A...
4     Pernod takeover talk lifts Domecq\n\nShares in...
...                                                 ...
2220  BT program to beat dialler scams\n\nBT is intr...
2221  Spam e-mails tempt net shoppers\n\nComputer us...
2222  Be careful how you code\n\nA new European dire...
2223  US cyber security chief resigns\n\nThe man mak...
2224  Losing yourself in online gaming\n\nOnline rol...

[2225 rows x 1 columns]


In [9]:
df.to_csv(os.getcwd()+'news_total.csv', index = True)

In [10]:
data = [re.sub('[^a-zA-Z_]', ' ', doc) for doc in data]
data = [re.sub('\s+', ' ', doc) for doc in data]

In [11]:
tokenized_document = [word_tokenize(d) for d in data]
print(tokenized_document[0][:30])

['Ad', 'sales', 'boost', 'Time', 'Warner', 'profit', 'Quarterly', 'profits', 'at', 'US', 'media', 'giant', 'TimeWarner', 'jumped', 'to', 'bn', 'm', 'for', 'the', 'three', 'months', 'to', 'December', 'from', 'm', 'year', 'earlier', 'The', 'firm', 'which']


In [12]:
stop_words = stopwords.words('english')
stop_words.extend(['said', 'says', 'year', 'also', 'would', 'mr', 'bn', 'could', 'first', 'second', 'one', 'two',
                   'use', 'used', 'last', 'time', 'make', 'new'])
stop_words = set(stop_words)

In [13]:
def cleansing(document):
    corpus = []
    for d in document:
        doc = []
        for word in d:
            low_word = word.lower()
            if (low_word not in stop_words) and (len(low_word)!=1):
                doc.append(word)
        corpus.append(doc)
    return corpus

In [14]:
cleaned_document = cleansing(tokenized_document)
print(cleaned_document[0][:30])

['Ad', 'sales', 'boost', 'Warner', 'profit', 'Quarterly', 'profits', 'US', 'media', 'giant', 'TimeWarner', 'jumped', 'three', 'months', 'December', 'earlier', 'firm', 'biggest', 'investors', 'Google', 'benefited', 'sales', 'high', 'speed', 'internet', 'connections', 'higher', 'advert', 'sales', 'TimeWarner']


In [15]:
dict_ = corpora.Dictionary(cleaned_document)
print(dict_)

Dictionary<31719 unique tokens: ['AOL', 'Ad', 'Alexander', 'Bertelsmann', 'Bros']...>


In [16]:
doc_term_matrix = [dict_.doc2bow(i) for i in cleaned_document]
print(doc_term_matrix[0])

[(0, 7), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 7), (22, 3), (23, 4), (24, 2), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 2), (53, 2), (54, 1), (55, 1), (56, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 3), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 4), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1),

In [18]:
Lda = gensim.models.ldamodel.LdaModel
K = 5
passes = 30
iterations = 600

ldamodel = Lda(doc_term_matrix, 
               num_topics=K, 
               id2word = dict_, 
               passes=passes, 
               iterations=iterations, 
               random_state=123)

In [19]:
ldamodel.print_topics()

[(0,
  '0.006*"people" + 0.006*"government" + 0.004*"Labour" + 0.003*"UK" + 0.003*"Blair" + 0.003*"told" + 0.003*"BBC" + 0.003*"public" + 0.003*"film" + 0.002*"election"'),
 (1,
  '0.013*"people" + 0.010*"mobile" + 0.009*"music" + 0.009*"phone" + 0.007*"TV" + 0.006*"digital" + 0.006*"technology" + 0.006*"services" + 0.006*"service" + 0.005*"UK"'),
 (2,
  '0.007*"games" + 0.005*"technology" + 0.005*"people" + 0.005*"game" + 0.005*"computer" + 0.004*"Mac" + 0.004*"video" + 0.004*"best" + 0.004*"PC" + 0.004*"show"'),
 (3,
  '0.005*"US" + 0.003*"England" + 0.003*"world" + 0.003*"back" + 0.003*"win" + 0.002*"three" + 0.002*"game" + 0.002*"years" + 0.002*"company" + 0.002*"economy"'),
 (4,
  '0.006*"software" + 0.005*"people" + 0.004*"users" + 0.004*"US" + 0.004*"net" + 0.004*"security" + 0.004*"Microsoft" + 0.004*"technology" + 0.004*"virus" + 0.004*"system"')]