In [4]:
import os

html_file_paths = ['20newsgroups.data.html']

# Function to read and print a snippet of the HTML file
def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        return content

# Print a snippet of each HTML file
for file_path in html_file_paths:
    content = read_html_file(file_path)
    print(f"Content snippet from {file_path}:")
    print(content[:1000])  # Print the first 1000 characters for inspection
    print("\n\n")


Content snippet from 20newsgroups.data.html:
<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><meta http-equiv="Content-Style-Type" content="text/css" /><meta name="generator" content="Aspose.Words for .NET 24.2.0" /><title>20 Newsgroups</title><style type="text/css">body { font-family:'Times New Roman'; font-size:12pt }h1, h2, h3, h4, h5, h6, p { margin:0pt }h1 { margin-top:12pt; margin-bottom:0pt; page-break-inside:avoid; page-break-after:avoid; font-family:'Times New Roman'; font-size:24pt; font-weight:bold; font-style:normal; color:#2f5496 }h2 { margin-top:2pt; margin-bottom:0pt; page-break-inside:avoid; page-break-after:avoid; font-family:'Times New Roman'; font-size:18pt; font-weight:bold; font-style:normal; color:#2f5496 }h3 { margin-top:2pt; margin-bottom:0pt; page-break-inside:avoid; page-break-after:avoid; font-family:'Times New Roman'; font-size:14pt; font-weight:bold; font-style:normal; color:#1f3763 }h4 { margin-top:2pt; margin-bottom:0pt; p

In [5]:
from bs4 import BeautifulSoup

def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        text = ' '.join([p.get_text() for p in soup.find_all('p')])  # Adjust the tag if necessary
        return text

documents = []
for file_path in html_file_paths:
    documents.append(extract_text_from_html(file_path))

# Combine all extracted documents into one list
documents = [doc for sublist in documents for doc in sublist.split('\n') if doc.strip()]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

nltk.download('stopwords')

# Initialize the stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and stem the text
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

preprocessed_documents = [preprocess(doc) for doc in documents]


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
pip install nltk


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2024.7.24-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.5/776.5 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2024.7.24
Note: you may need to restart the kernel to use updated packages.


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re

def preprocess(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and lowercase the text
    words = text.lower().split()
    return ' '.join(words)

preprocessed_documents = [preprocess(doc) for doc in documents]

# Print the number of documents
print(f"Number of preprocessed documents: {len(preprocessed_documents)}")

# Adjust the parameters
vectorizer = CountVectorizer(max_df=1.0, min_df=1, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(preprocessed_documents)
print(f"Document-term matrix shape: {doc_term_matrix.shape}")

# Apply LDA
lda = LatentDirichletAllocation(n_components=20, random_state=42)
lda.fit(doc_term_matrix)

# Display the top words in each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
display_topics(lda, tf_feature_names, no_top_words)


Number of preprocessed documents: 1
Document-term matrix shape: (1, 119)
Topic 0:
www hardware donated edu educational electronics file files following forsale
Topic 1:
www hardware donated edu educational electronics file files following forsale
Topic 2:
www hardware donated edu educational electronics file files following forsale
Topic 3:
www hardware donated edu educational electronics file files following forsale
Topic 4:
www hardware donated edu educational electronics file files following forsale
Topic 5:
www hardware donated edu educational electronics file files following forsale
Topic 6:
www hardware donated edu educational electronics file files following forsale
Topic 7:
www hardware donated edu educational electronics file files following forsale
Topic 8:
www hardware donated edu educational electronics file files following forsale
Topic 9:
www hardware donated edu educational electronics file files following forsale
Topic 10:
www hardware donated edu educational electronic

In [13]:
# Verify the number of preprocessed documents
print(f"Number of preprocessed documents: {len(preprocessed_documents)}")


Number of preprocessed documents: 1


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Convert the preprocessed documents to TF-IDF feature vectors
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Number of documents
n_documents = tfidf_matrix.shape[0]

# Set number of clusters
n_clusters = min(20, n_documents)

# Apply K-means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Display the cluster labels
labels = kmeans.labels_
for i, label in enumerate(labels[:10]):  # Display first 10 documents' clusters
    print(f"Document {i} is in cluster {label}")




Document 0 is in cluster 0


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Adjust the parameters and preprocess the documents
vectorizer = CountVectorizer(max_df=1.0, min_df=1, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(preprocessed_documents)
print(f"Document-term matrix shape: {doc_term_matrix.shape}")

# Apply LDA
lda = LatentDirichletAllocation(n_components=n_clusters, random_state=42)
lda.fit(doc_term_matrix)

# Display the top words in each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
display_topics(lda, tf_feature_names, no_top_words)


Document-term matrix shape: (1, 119)
Topic 0:
comp sci talk articles rec misc text cmu science computer


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re

def preprocess(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and lowercase the text
    words = text.lower().split()
    return ' '.join(words)

# Replace this with your list of documents
documents = [
    "Sample document 1 text.",
    "Sample document 2 text.",
    # Add all your documents here
]

preprocessed_documents = [preprocess(doc) for doc in documents]

# Print the number of documents
print(f"Number of preprocessed documents: {len(preprocessed_documents)}")

# Adjust the parameters
vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Set the number of clusters to be the minimum of 20 or the number of documents
n_clusters = min(20, tfidf_matrix.shape[0])

# Apply K-means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Display the cluster labels
labels = kmeans.labels_
for i, label in enumerate(labels):  # Display all documents' clusters
    print(f"Document {i} is in cluster {label}")



Number of preprocessed documents: 2
TF-IDF matrix shape: (2, 3)
Document 0 is in cluster 0
Document 1 is in cluster 0


  kmeans.fit(tfidf_matrix)
