In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the NPR articles once; `dfori` stays untouched as the raw reference
# and every later cell works on the independent copy `df`.
npr_csv_path = '/kaggle/input/npr-data/npr.csv'
dfori = pd.read_csv(npr_csv_path)
df = dfori.copy()
df  # last expression of the cell: renders the DataFrame

# Preprocessing the Text using Count Vectorizer
This way, we can build the document term matrix (DTM) for further LDA modeling.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words='english')

# Learn the vocabulary from the articles and build the document-term matrix
# (one row per article, one column per vocabulary term).
articles = df['Article']
dtm = cv.fit_transform(articles)

# LDA Computation
Now let's perform the LDA modelling with five topics (the initial guess was seven; see the notes at the end for why it was reduced).

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

ntopic = 5  # number of topics the model is asked to find

# random_state is fixed so that repeated runs produce the same topics.
# fit() returns the estimator itself, so construction and fitting can be chained.
lda = LatentDirichletAllocation(random_state=1, n_components=ntopic).fit(dtm)
lda  # last expression of the cell: shows the fitted estimator

# Identify the Top 20 Words of Each Topic

In [None]:
# lda.components_ has shape (n_topics, n_vocabulary_terms): one row per topic,
# and each entry is the pseudo-count weight of a vocabulary term in that topic.
# It is printed explicitly because a bare expression in the middle of a cell
# is not displayed.
print(lda.components_.shape)

n_top_words = 20  # how many of the highest-weighted terms to show per topic

# Build the vocabulary lookup ONCE instead of once per printed word.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. The fallback keeps the cell
# working on older installations that only provide the legacy method.
_get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_names = _get_names()

for topic_idx, topic_weights in enumerate(lda.components_):
    # argsort is ascending, so the last `n_top_words` indices are the
    # highest-weighted terms (listed from weakest to strongest).
    top_term_ids = topic_weights.argsort()[-n_top_words:]
    print(f"Topic #{topic_idx}")
    print([feature_names[term_id] for term_id in top_term_ids])
    print(" ")

So perhaps we can infer the following:
* Topic 0 = World Politics
* Topic 1 = Healthcare
* Topic 2 = Education
* Topic 3 = US Election
* Topic 4 = Culture

In [None]:
# Mapping

# Human-readable label for each LDA topic index.
topicdict = {
    0: 'World Politics',
    1: 'Healthcare',
    2: 'Education',
    3: 'US Election',
    4: 'Culture',
}

# lda.transform gives one row of topic weights per document; argmax along
# axis 1 picks the index of the highest-weighted topic for each article.
doc_topic_weights = lda.transform(dtm)
df['Topic'] = doc_topic_weights.argmax(axis=1)

# Swap the numeric topic index for its label.
df['Topic'] = df['Topic'].replace(topicdict)
df  # last expression of the cell: renders the DataFrame

Let's spot-check a randomly chosen document to see whether its text matches the topic that LDA assigned to it.

In [None]:
import random  # to pick a random row

# random.randint(0, len(df)) includes BOTH endpoints, so it could return
# len(df), which is one past the last row and raises KeyError.
# random.randrange(len(df)) yields 0 .. len(df) - 1, i.e. only valid rows.
a = random.randrange(len(df))

# Positional lookup, so the check does not depend on the index labels.
row = df.iloc[a]

print(f"Topic: {row['Topic']}")
print(" ")
print(f"Text: {row['Article']}")

Notes: 
* Initially, I tried seven topics, but in my opinion two of the resulting topics were not really meaningful. I therefore reduced the number to six, and finally to five.
* We can still do better by extending the stop-word list so that it also removes digits, special symbols (if any), and other uninformative words.

Despite these issues, I think we can conclude from this basic code that the given data contains five topics.