In [None]:
pip install datasets

In [None]:
pip install nltk datasets

In [None]:
pip install gensim pyLDAvis

# Import Proper Libraries

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from gensim.models import CoherenceModel
from datasets import Dataset, DatasetDict
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

# Load Dataset

In [None]:
# Fetch the 20 Newsgroups dataset from the Hugging Face Hub
ds = load_dataset(path='SetFit/20_newsgroups')

# Data Pre-Processing Function

In [None]:
# Convert the train and test splits into pandas DataFrames
df_train, df_test = (ds[split_name].to_pandas() for split_name in ('train', 'test'))

In [None]:
# Wrap each pandas split back into a Hugging Face Dataset
train, test = Dataset.from_pandas(df_train), Dataset.from_pandas(df_test)
# Bundle both splits into a single DatasetDict
new_ds = DatasetDict({'train': train, 'test': test})
# Display the resulting DatasetDict
new_ds

In [None]:
# Download the NLTK resources used by the preprocessing function below.
# Recent NLTK releases load the 'punkt_tab' tokenizer tables for word_tokenize,
# while older releases use 'punkt', so both are requested. nltk.download()
# reports an unknown resource and returns False rather than raising.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# pre processing function

# Matches runs of anything that is not a letter: non-word characters, digits
# and underscores (the underscore counts as a word character for \W, so it has
# to be listed explicitly).
_NON_LETTER_RE = re.compile(r'[\W\d_]+')

# Lazily filled cache so the stop-word list and lemmatizer are built once
# instead of once per document.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw document into a space-joined string of lemmatized tokens.

    Steps: lowercase, replace every run of non-letter characters (punctuation,
    digits, underscores) with a space, tokenize, drop single-character tokens
    (except 'a' and 'i') and English stop words, then lemmatize.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a NaN from pandas) is treated as an empty document.

    Returns:
        The cleaned text, or ``None`` when nothing is left after cleaning so
        that callers can drop the row with ``notnull()``.
    """
    # Missing values would otherwise fail on .lower()
    if not isinstance(text, str):
        return None

    # Lowercase, then blank out punctuation, digits and underscores
    text = _NON_LETTER_RE.sub(' ', text.lower())

    # Nothing but separators: skip tokenization entirely
    if not text.strip():
        return None

    stop_words, lemmatizer = _nlp_resources()

    # Filter and lemmatize in a single pass over the tokens
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if (len(word) > 1 or word in ('a', 'i')) and word not in stop_words
    ]

    # Join tokens back into a single string; an empty result becomes None
    cleaned_text = ' '.join(tokens)
    return cleaned_text if cleaned_text.strip() else None


# Apply Pre-Processing

In [None]:
# Apply preprocessing to the text data in train and test sets
df_train['cleaned_text'] = df_train['text'].apply(preprocess_text)
df_test['cleaned_text'] = df_test['text'].apply(preprocess_text)

# Remove empty (None) documents. The explicit .copy() makes each result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
df_train = df_train[df_train['cleaned_text'].notnull()].copy()
df_test = df_test[df_test['cleaned_text'].notnull()].copy()


In [None]:
# Preview the first five rows of the cleaned training frame
df_train.head(n=5)

# Representation

Prepare Data: Transforming data into a format that LDA can work with

In [None]:
# Tokenize the cleaned text on whitespace. assign() returns a new frame, so
# the column is not written onto the filtered frame from the preprocessing
# step (which would trigger pandas' SettingWithCopyWarning).
df_train = df_train.assign(tokenized_text=df_train['cleaned_text'].str.split())

# Create dictionary for LDA (maps each unique token to an integer id)
dictionary = corpora.Dictionary(df_train['tokenized_text'])

# Create Bag of Words corpus: one list of (token id, count) pairs per document
corpus = [dictionary.doc2bow(text) for text in df_train['tokenized_text']]

# View a sample from the corpus (word ID and count)
print(corpus[:1])


Apply the LDA model and view its topics

In [None]:
# Fit an 8-topic LDA model on the bag-of-words corpus
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=8,
    random_state=42,
    chunksize=100,
    passes=15,
    per_word_topics=True,
)
lda_model = gensim.models.LdaModel(**lda_params)

# View the topics, each described by its 10 highest-weighted words
lda_model.print_topics(num_words=10)

Evaluate the model with a coherence score

In [None]:
# Score the fitted LDA topics with the c_v coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df_train['tokenized_text'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print(f'\nCoherence Score: {coherence_lda}')

# Try an NMF (Non-Negative Matrix Factorization) Model

In [None]:
# TfidfVectorizer is already imported in the notebook's import cell.
# Configure the vectorizer: drop English stop words, terms that appear in
# fewer than 2 documents, and terms that appear in more than 95% of documents.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary from the cleaned text and build the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df_train['cleaned_text'])

# tfidf_matrix can now be fed to models such as LSA or NMF

In [None]:
from sklearn.decomposition import NMF

# Number of highest-weighted terms to show for each topic
n_top_words = 10

# Reuse tfidf_vectorizer and tfidf_matrix from the previous cell instead of
# fitting an identical vectorizer a second time.
# Get the feature names (terms) from the TF-IDF vectorizer
terms = tfidf_vectorizer.get_feature_names_out()

# Apply NMF on the TF-IDF matrix
nmf_model = NMF(n_components=10, random_state=42)
nmf_model.fit(tfidf_matrix)

# Print the top words for each topic
for i, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so reverse it to get the heaviest terms first
    top_term_ids = topic.argsort()[::-1][:n_top_words]
    terms_in_topic = [terms[x] for x in top_term_ids]
    print(f"Topic {i}: {', '.join(terms_in_topic)}")

# Apply a Pre-Trained LLM

Install Transformers

In [None]:
pip install transformers

Import pipeline and use pre-trained model focused on news category classification

In [None]:
import torch
from transformers import pipeline

# Run on the first GPU when one is available; -1 tells the pipeline to use the
# CPU, so the cell also works on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1

# Set up the pipeline for topic classification with a pre-trained model
classifier = pipeline(
    "text-classification",
    model="Yueh-Huan/news-category-classification-distilbert",
    device=device,
)

# NOTE(review): the classifier is given the stop-word-stripped, lemmatized
# text; a pre-trained transformer may do better on the raw 'text' column -
# confirm which input is intended.
dataset_texts = df_train['cleaned_text'].tolist()

# Get predictions from the classifier using the actual dataset
# (truncation=True cuts documents that exceed the model's maximum length)
predictions = classifier(dataset_texts, batch_size=16, truncation=True)

# Add predictions to the dataframe. assign() returns a new frame, so the
# column is not written onto a filtered frame (SettingWithCopyWarning).
df_train = df_train.assign(predicted_topic=[pred['label'] for pred in predictions])

# Inspect the first few rows of the updated dataframe
df_train.head()

Download the dataset with the predicted topics so we can inspect it without running the notebook again.

In [None]:
# Write df_train, including the predicted topics, to a CSV file (row index omitted)
df_train.to_csv(path_or_buf='df_train_with_predicted_topics.csv', index=False)

In [None]:
# Save the CSV to the local desktop. google.colab is only importable inside a
# Colab runtime, so the import is guarded to keep the cell from failing when
# the notebook runs elsewhere.
try:
    from google.colab import files
except ImportError:
    print('google.colab is not available; skipping browser download of df_train_with_predicted_topics.csv')
else:
    files.download('df_train_with_predicted_topics.csv')