In [2]:
# basic data analytics
import pandas as pd
import numpy as np
import sklearn
import pickle

# nlp modules
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import multiprocessing
import string

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; switch the filter to 'once' when debugging.
warnings.simplefilter('ignore')

# Use one process fewer than the number of cores, but never fewer than one:
# on a single-core machine `cpu_count() - 1` is 0, which is not a usable
# worker count for the multicore LDA trainer.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')

Using 7 workers


In [3]:
# Location of the news-article CSV, kept in one named constant so the notebook
# can be pointed at another copy of the data without editing the call below.
NEWS_CSV_PATH = '/home/jupyter/df_news.csv'

# lineterminator='\n' tells the parser that only "\n" ends a row.
# NOTE(review): presumably needed because article text contains other
# line-break characters -- confirm against the file.
df = pd.read_csv(NEWS_CSV_PATH, lineterminator='\n')

In [4]:
# Drop the 'Unnamed: 0' column (presumably an index saved into the CSV -- confirm).
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Drop the columns that are not used in this notebook.
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['text_notitle', 'split', 'split_len'], errors='ignore')

In [6]:
# Preview the first three rows to check which columns remain.
df.head(n=3)

Unnamed: 0,url,date,language,title,text,text_clean
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,en,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,thanks to the application of an artificial int...
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,en,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,scientists who designed an artificially clever...
2,http://www.dataweek.co.za/12835r,2021-03-26,en,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...",the world entered a new era of accelerated tra...


In [None]:
%%time

import time
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Number of latent topics the LDA model is asked to find.
num_topics = 10

# Raw article texts: the `text_clean` column of the first 100,000 rows.
doc_complete = df['text_clean'].head(100000).values.tolist()

# Shared resources used by `clean`: a WordNet lemmatizer, the set of ASCII
# punctuation characters, and NLTK's English stop-word list as a set.
lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stop = set(stopwords.words('english'))

def clean(doc):
    """Normalize one article into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Split on whitespace and keep tokens that are 2 to 19 characters long.
      2. Lower-case each token and drop it if it is in ``stop``.
      3. Delete every character found in ``exclude`` from the token, then
         check ``stop`` a second time so that words which only become stop
         words once punctuation is gone (e.g. "the," or "(and") are removed.
      4. Run ``word_tokenize`` over the surviving text and lemmatize each
         resulting piece with ``lemma``.
      5. Delete the curly quote characters ’ “ ” from the result.

    Parameters
    ----------
    doc : object
        The raw article text. Anything that is not a ``str`` (for example a
        NaN from a missing DataFrame cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``doc`` is not a string.
    """
    if not isinstance(doc, str):
        return ''

    # Built on each call so the function always reflects the current `exclude`.
    punctuation_table = str.maketrans('', '', ''.join(exclude))

    kept = []
    for token in doc.split():
        # Length limits apply to the raw token, before any normalization.
        if not 1 < len(token) < 20:
            continue
        token = token.lower()
        if token in stop:
            continue
        token = token.translate(punctuation_table)
        # A token can be reduced to nothing, or to a stop word, once its
        # punctuation is stripped; neither should reach the model.
        if token and token not in stop:
            kept.append(token)

    normalized = " ".join(lemma.lemmatize(word) for word in word_tokenize(" ".join(kept)))
    # The curly quotes are handled separately from `exclude`.
    return normalized.translate(str.maketrans('', '', '’“”'))

# Stage 1: run `clean` over every raw article and report how long it took.
start_time = time.time()
cleaned_documents = list(map(clean, doc_complete))
clean_time = time.time() - start_time
print(f"Cleaning done in {clean_time} seconds")

# Stage 2: split each cleaned article on whitespace into a list of tokens.
start_time = time.time()
tokenized_documents = list(map(str.split, cleaned_documents))
tok_time = time.time() - start_time
print(f"Tokenization done in {tok_time} seconds")

# Stage 3: count how many times each token occurs across all articles.
start_time = time.time()
word_freq = defaultdict(int)
for document in tokenized_documents:
    for word in document:
        word_freq[word] = word_freq[word] + 1
freq_time = time.time() - start_time
print(f"Frequency Calculation Time: {freq_time} seconds")

# Corpus-specific stop words: every token whose total occurrence count divided
# by the number of documents exceeds `threshold`.
# NOTE(review): `word_freq` holds total occurrences, not the number of
# documents containing the word, so this is an average-per-document rate
# rather than a document frequency -- confirm that is the intended definition.
threshold = 0.5  # Adjust this value according to your needs

# Create a list of stop words based on the threshold
start_time = time.time()
num_documents = len(tokenized_documents)
stop_words = [word for word, freq in word_freq.items() if freq / num_documents > threshold]
stop_words_time = time.time() - start_time
print("Stop Words Creation Time:", stop_words_time, "seconds")

# Filter out stop words from the tokenized documents.
# Membership is tested against a set: testing against the `stop_words` list
# would rescan the whole list for every token of every document.
start_time = time.time()
stop_word_set = set(stop_words)
filtered_documents = [[word for word in document if word not in stop_word_set] for document in tokenized_documents]
filtering_time = time.time() - start_time
print("Stop Words Filtering Time:", filtering_time, "seconds")

# Build the gensim vocabulary from the filtered, tokenized documents.
start_time = time.time()
dictionary = corpora.Dictionary(filtered_documents)
dictionary_creation_time = time.time() - start_time
print(f"Dictionary Creation Time: {dictionary_creation_time} seconds")

# Encode every document as a bag of words using that vocabulary.
start_time = time.time()
corpus = list(map(dictionary.doc2bow, filtered_documents))
corpus_creation_time = time.time() - start_time
print(f"Corpus Creation Time: {corpus_creation_time} seconds")

print("Working on LDA modeling")

# Training hyper-parameters for the multicore LDA trainer. These were defined
# but never handed to the model, so training silently used the library
# defaults; they are now passed explicitly below.
iterations = 100
passes = 20
# NOTE(review): None is meant to switch off periodic perplexity evaluation
# during training -- confirm against the gensim documentation.
eval_every = None
# Fixed seed so repeated runs start from the same initial state.
random_state = 42

start_time = time.time()
lda_model = LdaMulticore(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    workers=workers,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
    random_state=random_state,
)
lda_time = time.time() - start_time
print("LDA Training Time:", lda_time, "seconds")

# Print the most contributing words for each topic
for topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
    print(topic)

Cleaning done in 314.88526582717896 seconds
Tokenization done in 5.4517035484313965 seconds
Frequency Calculation Time: 9.400272846221924 seconds
Stop Words Creation Time: 0.14632725715637207 seconds
Stop Words Filtering Time: 24.386829614639282 seconds
Dictionary Creation Time: 41.951565742492676 seconds
Corpus Creation Time: 23.72555375099182 seconds
Working on LDA modeling


In [46]:
%%time

# Prepare the pyLDAvis data for the fitted model and render it in the notebook.
# sort_topics=False and mds='mmds' are passed through to pyLDAvis unchanged.
lda_display = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary,
    mds='mmds',
    sort_topics=False,
)
pyLDAvis.display(lda_display)

CPU times: user 4.54 s, sys: 1.11 s, total: 5.66 s
Wall time: 9.22 s


In [None]:
from gensim.models import CoherenceModel
from multiprocessing import Pool

def calculate_coherence(chunk):
    """Return the c_v coherence of ``lda_model`` measured on ``chunk``.

    Parameters
    ----------
    chunk : list of list of str
        Tokenized documents to score the model's topics against.

    Returns
    -------
    float
        The coherence value reported by ``CoherenceModel.get_coherence()``.
    """
    # The coherence model is built here, inside the function, because
    # get_coherence() takes no arguments: the texts must be supplied when the
    # CoherenceModel is constructed.
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=chunk,
        dictionary=dictionary,
        coherence='c_v',
        processes=workers,
    )
    return coherence_model.get_coherence()


# Score the model on the same tokenized documents the dictionary was built from.
# CoherenceModel takes a `processes` argument for parallelism, so the documents
# are scored in one call instead of being split into chunks and averaged
# through a manually managed multiprocessing.Pool.
avg_coherence_score = calculate_coherence(filtered_documents)

# Print the average coherence score
print("Average Coherence Score:", avg_coherence_score)