In [29]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF


In [30]:

# Load dataset: read the CSV into a one-column frame. The file's first line
# is skipped and the column is labelled 'Article' explicitly.
file_path = 'npr.csv'
df = pd.read_csv(
    file_path,
    names=['Article'],
    skiprows=1,
)


In [31]:

def preprocess_text(text):
    """Normalize raw text to lowercase, letters-only, single-spaced form.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (digits and punctuation go in the same pass), and
    runs of whitespace are collapsed to one space with both ends trimmed.

    Args:
        text: The raw text. ``None`` and float NaN (a missing value) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The cleaned text, or ``''`` when nothing remains.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Digits are not ASCII letters, so this one pass removes numbers as well
    # as punctuation; a separate digit-stripping pass is unnecessary.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Deleting characters can leave gaps such as "at  pm"; squeeze them.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [32]:

# Clean the text.
# Missing articles are replaced with '' before the string cast; otherwise
# astype(str) would turn NaN into the literal word "nan", which the
# vectorizers would then count as a real token.
df['clean_text'] = df['Article'].fillna('').astype(str).apply(preprocess_text)


In [33]:

# Vectorization: a TF-IDF encoder and a raw-count encoder configured
# identically (built-in English stop-word list, max_features=1000).
vectorizer_tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
vectorizer_count = CountVectorizer(
    max_features=1000,
    stop_words='english',
)


In [34]:

# Fit each vectorizer on the cleaned articles and encode them in one step.
# The two calls are independent of each other.
X_count = vectorizer_count.fit_transform(df['clean_text'])
X_tfidf = vectorizer_tfidf.fit_transform(df['clean_text'])


In [35]:

# LDA Model: 10 components fitted on the count matrix, seeded for
# repeatable results. fit_transform returns the per-document topic weights.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=10,
)
lda_topics = lda.fit_transform(X_count)


In [40]:
# Tag each article with the index of its highest-weighted LDA topic,
# keeping only the original text and the assigned topic.
df_lda = df[['Article']].assign(Topic=lda_topics.argmax(axis=1))
df_lda.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",9
1,Donald Trump has used Twitter — his prefe...,9
2,Donald Trump is unabashedly praising Russian...,9
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",6


In [38]:
# NMF Model: 10 components fitted on the TF-IDF matrix, seeded for
# repeatable results. fit_transform returns the per-document topic weights.
nmf = NMF(
    random_state=42,
    n_components=10,
)
nmf_topics = nmf.fit_transform(X_tfidf)

# Tag each article with the index of its highest-weighted NMF topic,
# keeping only the original text and the assigned topic.
df_nmf = df[['Article']].assign(Topic=nmf_topics.argmax(axis=1))

df_nmf.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
