In [1]:
import json
import os

# Load the JSON data
json_file_path = os.path.join('wiki_corpus.json')

with open(json_file_path, 'r') as file:
    data = json.load(file)

# We'll inspect the first record to understand its structure
first_record = data[0] if isinstance(data, list) and len(data) > 0 else {}

first_record

[['? (film)'],
 '? (also written Tanda Tanya, meaning Question Mark) is a 2011 Indonesian drama film directed by Hanung Bramantyo. It stars Revalina Sayuthi Temat, Reza Rahadian, Agus Kuncoro, Endhita, Rio Dewanto, and Hengky Sulaeman. The film focuses around Indonesia\'s religious pluralism, which often results in conflict between different beliefs, represented in a plot that revolves around the interactions of three families, one Buddhist, one Muslim, and one Catholic. After undergoing numerous hardships and the deaths of several family members in religious violence, they are reconciled.\nBased on Bramantyo\'s experiences as a mixed-race child, ? was meant to counter the portrayal of Islam as a "radical religion". Owing to the film\'s theme of religious pluralism and controversial subject matter, Bramantyo had difficulty finding backing. Eventually, Mahaka Pictures put forth Rp 5 billion ($600,000) to fund the production. Filming began on 5 January 2011 in Semarang.\nReleased on 7 Ap

In [2]:
article_contents = [article[1] for article in data if len(article) > 1]

In [3]:
from gensim import corpora, models
import gensim
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Assuming article_contents is a list of strings, where each string is an article
# Preprocess the text data
stop_words = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in article_contents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

# View the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

###process was slow

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dreampy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.007*"two" + 0.005*"first" + 0.004*"new" + 0.004*"would"')
(1, '0.005*"isbn" + 0.004*"also" + 0.004*"may" + 0.004*"used"')
(2, '0.011*"first" + 0.008*"film" + 0.007*"game" + 0.006*"world"')
(3, '0.005*"would" + 0.005*"also" + 0.004*"new" + 0.004*"first"')
(4, '0.010*"army" + 0.010*"battle" + 0.009*"german" + 0.009*"fleet"')


Using MultiCore module from gensin.models

In [4]:
from gensim import corpora
from gensim.models import LdaMulticore
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Preprocess the text data
stop_words = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in article_contents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA using LdaMulticore
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, chunksize=100, passes=10, workers=None, alpha='symmetric', per_word_topics=True)

# View the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dreampy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.021*"isbn" + 0.007*"press." + 0.007*"university" + 0.006*"new"')
(1, '0.004*"would" + 0.003*"first" + 0.003*"new" + 0.003*"also"')
(2, '0.004*"also" + 0.004*"one" + 0.003*"first" + 0.003*"two"')
(3, '0.010*"first" + 0.006*"team" + 0.005*"two" + 0.005*"second"')
(4, '0.004*"two" + 0.004*"north" + 0.003*"river" + 0.003*"south"')


In [5]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Extend the stopwords list with custom uninformative words
custom_stopwords = ['also', 'one', 'two', 'first', 'new', 'would', 'many', 'may', 'in', 'the','used']
stop_words.update(custom_stopwords)

In [6]:
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in article_contents]

In [7]:
from gensim import corpora

# Create a new dictionary representation of the documents
dictionary = corpora.Dictionary(texts)

# Filter out extremes to remove additional noise
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Convert the updated dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(text) for text in texts]

In [8]:
from gensim.models import LdaMulticore

# Apply LDA using LdaMulticore with the refined corpus
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, chunksize=100, passes=10, workers=None, alpha='symmetric', per_word_topics=True)

# View topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.006*"united" + 0.005*"state" + 0.005*"government" + 0.005*"president" + 0.005*"states" + 0.004*"national" + 0.003*"storm" + 0.003*"party" + 0.003*"federal" + 0.003*"tropical"')
(1, '0.006*"species" + 0.005*"found" + 0.004*"known" + 0.003*"large" + 0.003*"around" + 0.002*"small" + 0.002*"similar" + 0.002*"years" + 0.002*"although" + 0.002*"often"')
(2, '0.004*"isbn" + 0.003*"work" + 0.003*""the" + 0.003*"wrote" + 0.003*"later" + 0.003*"john" + 0.003*"published" + 0.003*"university" + 0.003*"became" + 0.002*"book"')
(3, '0.008*"army" + 0.007*"force" + 0.007*"air" + 0.006*"german" + 0.006*"japanese" + 0.006*"battle" + 0.005*"british" + 0.005*"forces" + 0.005*"attack" + 0.005*"division"')
(4, '0.004*"king" + 0.004*"war" + 0.004*"french" + 0.003*"royal" + 0.003*"british" + 0.003*"military" + 0.002*"became" + 0.002*"de" + 0.002*"english" + 0.002*"later"')
(5, '0.009*"league" + 0.009*"club" + 0.007*"team" + 0.007*"second" + 0.007*"cup" + 0.006*"football" + 0.006*"match" + 0.005*"season