In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Load every 20-newsgroups post, with headers, footers and quoted replies stripped
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Turn the raw text into a TF-IDF document-term matrix (English stop words dropped)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(newsgroups.data)

# Factorize the matrix with NMF; each component is treated as one topic
n_topics = 5  # You can adjust the number of topics based on your preference
nmf = NMF(n_components=n_topics, random_state=1)
nmf.fit(X)

# Print the 10 highest-weighted terms of every topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so reverse it and keep the first 10 indices
    top_words_idx = topic.argsort()[::-1][:10]
    top_words = [feature_names[i] for i in top_words_idx]
    joined_words = ', '.join(top_words)
    print(f"Topic #{topic_idx + 1}: {joined_words}")




Topic #1: don, people, just, think, like, good, know, time, did, right
Topic #2: windows, thanks, file, dos, program, does, know, files, mail, use
Topic #3: god, jesus, bible, believe, christ, christian, faith, christians, does, sin
Topic #4: drive, scsi, card, ide, disk, hard, controller, drives, bus, floppy
Topic #5: key, chip, encryption, clipper, government, keys, escrow, use, law, algorithm


In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
import string
import nltk
# Tokenizer data required by nltk.word_tokenize. Recent NLTK releases (3.9+)
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so both
# are fetched to keep this cell runnable on old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Fetch the 20 newsgroups dataset (all posts; headers, footers and quotes removed)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Text normalization: tokenize, keep alphabetic tokens, lowercase, drop stop words, stem
def preprocess_text(text):
    """Return `text` as a single space-separated string of Porter stems.

    Each token produced by ``nltk.word_tokenize`` is kept only if it is purely
    alphabetic, then lowercased, discarded if it is in scikit-learn's
    ``ENGLISH_STOP_WORDS``, and finally stemmed.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in nltk.word_tokenize(text):
        # Tokens containing digits or punctuation are dropped entirely
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in ENGLISH_STOP_WORDS:
            continue
        stems.append(stemmer.stem(lowered))
    return ' '.join(stems)

# Run the preprocessing pipeline over every document in the corpus
preprocessed_data = list(map(preprocess_text, newsgroups.data))

# Build the TF-IDF matrix from the already-cleaned text
# (stop words were removed during preprocessing, so none are passed here)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# Factorize the matrix with NMF; each component is treated as one topic
n_topics = 5  # You can adjust the number of topics based on your preference
nmf = NMF(n_components=n_topics, random_state=1)
nmf.fit(X)

# Print the 10 highest-weighted stems of every topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so reverse it and keep the first 10 indices
    top_words_idx = topic.argsort()[::-1][:10]
    top_words = [feature_names[i] for i in top_words_idx]
    joined_words = ', '.join(top_words)
    print(f"Topic #{topic_idx + 1}: {joined_words}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic #1: peopl, god, say, think, just, christian, believ, did, know, like
Topic #2: window, file, thank, program, use, run, know, doe, do, help
Topic #3: game, team, play, year, player, win, hockey, basebal, fan, score
Topic #4: drive, card, disk, scsi, hard, control, ide, use, monitor, floppi
Topic #5: key, chip, encrypt, use, clipper, govern, phone, secur, escrow, algorithm


In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
import string
import nltk
# Tokenizer data required by nltk.word_tokenize. Recent NLTK releases (3.9+)
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so both
# are fetched to keep this cell runnable on old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Fetch the 20 newsgroups dataset (all posts; headers, footers and quotes removed)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Text normalization: tokenize, keep alphabetic tokens, lowercase, drop stop words, stem
def preprocess_text(text):
    """Return `text` as a single space-separated string of Porter stems.

    Each token produced by ``nltk.word_tokenize`` is kept only if it is purely
    alphabetic, then lowercased, discarded if it is in scikit-learn's
    ``ENGLISH_STOP_WORDS``, and finally stemmed.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in nltk.word_tokenize(text):
        # Tokens containing digits or punctuation are dropped entirely
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in ENGLISH_STOP_WORDS:
            continue
        stems.append(stemmer.stem(lowered))
    return ' '.join(stems)

# Run the preprocessing pipeline over every document in the corpus
preprocessed_data = list(map(preprocess_text, newsgroups.data))

# Build a raw term-count (bag-of-words) matrix from the cleaned text
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_data)

# Factorize the count matrix with NMF; each component is treated as one topic
n_topics = 5  # You can adjust the number of topics based on your preference
nmf = NMF(n_components=n_topics, random_state=1)
nmf.fit(X)

# Print the 10 highest-weighted stems of every topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf.components_):
    # argsort is ascending, so reverse it and keep the first 10 indices
    top_words_idx = topic.argsort()[::-1][:10]
    top_words = [feature_names[i] for i in top_words_idx]
    joined_words = ', '.join(top_words)
    print(f"Topic #{topic_idx + 1}: {joined_words}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic #1: max, bhj, giz, gk, bj, wm, qax, kn, ax, nrhj
Topic #2: use, file, program, includ, avail, window, inform, server, data, run
Topic #3: peopl, say, did, know, said, think, god, just, time, presid
Topic #4: db, mov, cs, bh, byte, bit, al, si, di, bl
Topic #5: imag, jpeg, file, gif, format, color, version, use, program, display


In [5]:
import nltk
# Fetch the NLTK 'stopwords' corpus into the local nltk_data directory.
# As the last expression of the cell, the call's return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import nltk

# Make sure the NLTK stop-word lists are available locally
nltk.download('stopwords')

# Load the dataset with headers, footers and quoted replies stripped.
# No `subset` argument is passed here, unlike the subset='all' calls
# earlier in this notebook.
data = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
)

# Bag-of-words counts over the raw (not yet preprocessed) text,
# with scikit-learn's English stop words excluded
count_vectorizer = CountVectorizer(
    stop_words='english',
)
features = count_vectorizer.fit_transform(data.data)

# Define additional preprocessing steps

# Built once, up front. Previously `stopwords.words('english')` was evaluated
# again for every single token inside the filter, and a new stemmer was created
# for every document; a frozenset also makes each membership test O(1).
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Convert one raw document into a list of stemmed, lowercased tokens.

    Steps, applied per whitespace-separated token:
      1. keep the token only if it is purely alphabetic,
      2. lowercase it,
      3. drop it if it is an NLTK English stop word,
      4. reduce it to its Porter stem.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving stems, in their original order (empty for empty input).
    """
    stems = []
    for word in text.split():
        # NOTE: because tokens come from a plain whitespace split, a word with
        # attached punctuation (e.g. "day." or "name,") fails isalpha() and is
        # discarded as a whole -- the punctuation is not stripped off.
        if not word.isalpha():
            continue
        lowered = word.lower()
        if lowered in _STOP_WORDS:
            continue
        stems.append(_STEMMER.stem(lowered))
    return stems

# Replace every raw document with its list of stemmed tokens
data.data = list(map(preprocess_text, data.data))

# Preview the first 5 processed documents, one space-joined line each
for i in range(5):
    doc_tokens = data.data[i]
    print(" ".join(doc_tokens))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


wonder anyon could enlighten car saw sport look late earli call door realli front bumper separ rest anyon tellm model engin year car whatev info funki look pleas
fair number brave soul upgrad si clock oscil share experi pleas send brief messag detail experi top speed cpu rate add card heat hour usag per floppi disk function floppi especi summar next two pleas add network knowledg base done clock upgrad answer
well mac plu final gave ghost weekend start life way back market new machin bit sooner intend look pick powerbook mayb bunch question somebodi anybodi know dirt next round powerbook introduct heard suppos make appear heard anymor sinc access wonder anybodi anybodi heard rumor price drop powerbook line like one went impress display could probabl swing got disk rather realli feel much display look great realli could solicit opinion peopl use worth take disk size money hit get activ realiz real subject play around machin comput store breifli figur opinion somebodi actual use machin d