In [None]:
import pandas as pd

# Load the train and test splits and stack them into one frame for exploration.
df_train = pd.read_json("../data/news_train.json", orient='records')
df_test = pd.read_json("../data/news_test.json", orient='records')

# Each split is read with its own 0..n-1 row index; ignore_index=True renumbers
# the combined frame so row labels stay unique instead of repeating per split.
df = pd.concat([df_train, df_test], ignore_index=True)

# Quick overview: column dtypes / null counts, histograms of numeric columns,
# and the first rows.
df.info()
df.hist()
df.head()

In [None]:
# Keep only the "text" column; the following cells operate on this Series.
news = df.loc[:, "text"]
news.head()

In [None]:
import nltk

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Resources used below: the stop-word list, the Punkt models behind
# word_tokenize, and the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download("stopwords")
nltk.download('punkt')
nltk.download("punkt_tab")  # NOTE(review): newer NLTK releases load the tokenizer from this package — confirm for the pinned version
nltk.download("wordnet")

stemmer = PorterStemmer()
w_n_lemmatizer = WordNetLemmatizer()

stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps: lower-case, tokenise, drop English stop words and tokens that are
    not purely alphabetic, lemmatise each remaining token, then stem it.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        The processed tokens joined by single spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    # The stemmer works on single words, so it must be called per token;
    # passing the whole document treats the entire string as one word.
    return " ".join(
        stemmer.stem(w_n_lemmatizer.lemmatize(token)) for token in kept_tokens
    )


# A single pass per document replaces the previous chain of repeated
# tokenise/join passes.
news = news.apply(preprocess_text)

news.head()

In [None]:
# Install the wordcloud package into the running kernel's environment
# (no version is pinned here).
%pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Merge every processed document into one comma-separated string for the cloud.
long_string = ','.join(news)

# Rendering options, collected in one place before building the cloud.
cloud_options = dict(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
)

# generate() hands back the same WordCloud instance, so it can be chained.
wordcloud = WordCloud(**cloud_options).generate(long_string)

# Last expression of the cell: display the rendered image.
wordcloud.to_image()

In [None]:
import gensim.corpora as corpora

# Split each processed document on whitespace into a list of tokens.
data_words = list(map(str.split, news))

# Build the gensim Dictionary from the tokenised documents.
id2word = corpora.Dictionary(data_words)

print(id2word)

In [None]:
# Convert every tokenised document to its bag-of-words form via the dictionary.
corpus = list(map(id2word.doc2bow, data_words))

# Peek at the first 30 entries of the first document.
print(corpus[0][:30])

In [None]:
import gensim

# Number of topics to fit; also used later to name the pyLDAvis output files.
num_topics = 10

# Pass the variable instead of repeating the literal, so the trained model and
# the "lda_<num_topics>" artefacts cannot drift apart when the value is changed.
# random_state seeds the model so reruns start from the same initialisation
# (NOTE(review): multi-worker training may still introduce small run-to-run
# differences — confirm if exact reproducibility matters).
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)

# Last expression of the cell: show the top words of each topic.
lda_model.print_topics()

In [None]:
# Install pyLDAvis into the running kernel's environment
# (no version is pinned here).
%pip install pyLDAvis 

In [None]:
import os
import pickle
import pyLDAvis
import pyLDAvis.gensim_models


# Let pyLDAvis render its visualisation inline in the notebook.
pyLDAvis.enable_notebook()

# Pickle path for the prepared data; the HTML export uses the same stem.
LDAvis_data_filepath = f"../data/lda_{num_topics}"
html_report_path = f"{LDAvis_data_filepath}.html"

# Build the visualisation data from the trained model and persist it.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
with open(LDAvis_data_filepath, 'wb') as pickle_out:
    pickle.dump(LDAvis_prepared, pickle_out)

# Read the pickle written just above back in, then export it as standalone HTML.
with open(LDAvis_data_filepath, 'rb') as pickle_in:
    LDAvis_prepared = pickle.load(pickle_in)

pyLDAvis.save_html(LDAvis_prepared, html_report_path)

# Last expression of the cell: display the interactive visualisation.
LDAvis_prepared