# Installing the required libraries

In [None]:
!pip install pandas numpy matplotlib nltk scikit-learn

# Importing the required libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import spacy

In [None]:
# Fetch the NLTK data used by the preprocessing cells: stopword list,
# tokenizer models and the WordNet lemmatizer dictionary.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases make word_tokenize look up 'punkt_tab' instead of the
# pickled 'punkt' models; downloading both keeps the notebook working on either.
nltk.download('punkt_tab')
nltk.download('wordnet')
# Small English spaCy pipeline used for the entity-masking preprocessing.
nlp = spacy.load('en_core_web_sm')

# Loading the text data

In [None]:
def load_corpus(folder_path):
    """Read every ``.txt`` file in *folder_path* as UTF-8 text.

    Files are visited in sorted filename order so that document indices are
    reproducible across runs and platforms (``os.listdir`` gives no ordering
    guarantee). A file that cannot be opened or decoded is reported and
    skipped instead of aborting the whole load.

    Args:
        folder_path: Directory containing the text files.

    Returns:
        A ``(corpus, filenames)`` tuple of two parallel lists: the file
        contents and the corresponding file names.
    """
    corpus = []
    filenames = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        try:
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, UnicodeDecodeError) as err:
            # Only I/O and decoding problems are expected here; anything else
            # (e.g. KeyboardInterrupt) should propagate.
            print(f'Error reading file: {filename} ({err})')
            continue
        corpus.append(text)
        filenames.append(filename)
    return corpus, filenames

In [None]:
# Directory that holds the .txt documents to analyse.
folder_path = 'books'
corpus, filenames = load_corpus(folder_path)

In [None]:
# Sanity check: how many documents were actually loaded.
print(f"Number of documents: {len(corpus)}")

# Topic Modeling 1

## Vectorizing the text data

In [None]:
# TF-IDF features on the raw text: terms occurring in more than 95% of the
# documents or in fewer than 3 documents are ignored, and the vocabulary is
# capped at 2500 terms.
tfidf_vectorizer = TfidfVectorizer(encoding='utf-8', lowercase=True, max_df=0.95, min_df=3, max_features=2500)
corpus_vectorized = tfidf_vectorizer.fit_transform(corpus)

In [None]:
# Vocabulary terms corresponding to the columns of corpus_vectorized.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Inspect the vocabulary size and both ends of the feature list.
print(f"Number of features: {len(tfidf_feature_names)}")
print(f"First 10 features: {tfidf_feature_names[:10]}")
print(f"Last 10 features: {tfidf_feature_names[-10:]}")

## Training the NMF model

In [None]:
# Factorize the TF-IDF matrix into 4 topics; the fixed random_state keeps
# the result repeatable between runs.
nmf = NMF(n_components=4, random_state=1)
nmf.fit(corpus_vectorized)

## Visualizing the topics

In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart of the highest-weighted terms per topic.

    Args:
        model: Fitted decomposition model exposing ``components_`` with one
            row of term weights per topic.
        feature_names: Vocabulary terms, indexable by the column positions of
            ``model.components_`` (array or plain list).
        n_top_words: Number of top-weighted terms to show for each topic.
        title: Figure-level title.
    """
    n_topics = len(model.components_)
    # np.asarray lets a plain list of terms be indexed with an index array,
    # exactly like the array returned by get_feature_names_out().
    feature_names = np.asarray(feature_names)

    # One panel per topic instead of a fixed 1x4 grid. squeeze=False keeps
    # `axes` two-dimensional even for a single topic, and the width scales so
    # that the four-topic case keeps the original 30x15 figure.
    fig, axes = plt.subplots(
        1, n_topics, figsize=(7.5 * n_topics, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last n_top_words indices are the
        # heaviest terms; barh then draws the heaviest term at the top.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # The figure title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
n_top_words = 30
plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model')

In [None]:
# document topic distribution
doc_topic_dist = nmf.transform(corpus_vectorized)
df = pd.DataFrame(doc_topic_dist, columns=["Topic 1", "Topic 2", "Topic 3", "Topic 4"])
df['filename'] = filenames
print(df)

# Topic Modeling 2: Removing stopwords

## Preprocessing the text data

In [None]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase the tokens of *text* and drop English stopwords.

    Returns the surviving tokens joined by single spaces.
    """
    lowered_tokens = map(str.lower, word_tokenize(text))
    return ' '.join(token for token in lowered_tokens if token not in stop_words)

In [None]:
# Apply the stopword-removal preprocessing to every document.
corpus_preprocessed = [preprocess_text(text) for text in corpus]

In [None]:
# Character length of the second preprocessed document (quick sanity check).
len(corpus_preprocessed[1])

## Vectorizing the preprocessed text data

In [None]:
# Re-fit the same vectorizer (same settings) on the stopword-free text;
# this replaces the previously learned vocabulary and matrix.
corpus_vectorized = tfidf_vectorizer.fit_transform(corpus_preprocessed)

In [None]:
# Refresh the vocabulary terms to match the re-fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Inspect the vocabulary size and both ends of the feature list.
print(f"Number of features: {len(tfidf_feature_names)}")
print(f"First 10 features: {tfidf_feature_names[:10]}")
print(f"Last 10 features: {tfidf_feature_names[-10:]}")

## Training the NMF model

In [None]:
nmf.fit(corpus_vectorized)

In [None]:
plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model')

In [None]:
# document topic distribution
doc_topic_dist = nmf.transform(corpus_vectorized)
df = pd.DataFrame(doc_topic_dist, columns=["Topic 1", "Topic 2", "Topic 3", "Topic 4"])
df['filename'] = filenames
print(df)

# Topic Modeling 3: + Lemmatization

## Preprocessing the text data

In [None]:
def preprocess_text(text):
    """Tokenize *text*, drop stopwords, and lemmatize what remains.

    Each token is lowercased, skipped if it appears in the module-level
    ``stop_words`` set, and otherwise passed through the WordNet lemmatizer.
    The results are joined by single spaces.
    """
    tokens = word_tokenize(text)
    lemmatize = WordNetLemmatizer().lemmatize
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lemmatize(lowered))
    return ' '.join(kept)

In [None]:
# Apply the stopword-removal + lemmatization preprocessing to every document.
corpus_preprocessed = [preprocess_text(text) for text in corpus]

In [None]:
# Character length of the second preprocessed document (quick sanity check).
len(corpus_preprocessed[1])

## Vectorizing the preprocessed text data

In [None]:
# Re-fit the same vectorizer (same settings) on the lemmatized text;
# this replaces the previously learned vocabulary and matrix.
corpus_vectorized = tfidf_vectorizer.fit_transform(corpus_preprocessed)

In [None]:
# Refresh the vocabulary terms to match the re-fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Inspect the vocabulary size and both ends of the feature list.
print(f"Number of features: {len(tfidf_feature_names)}")
print(f"First 10 features: {tfidf_feature_names[:10]}")
print(f"Last 10 features: {tfidf_feature_names[-10:]}")

## Training the NMF model

In [None]:
nmf.fit(corpus_vectorized)

In [None]:
plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in NMF model')

In [None]:
# document topic distribution
doc_topic_dist = nmf.transform(corpus_vectorized)
df = pd.DataFrame(doc_topic_dist, columns=["Topic 1", "Topic 2", "Topic 3", "Topic 4"])
df['filename'] = filenames
print(df)

# Topic Modeling 4: + Entities Masking

## Preprocessing the text data

In [None]:
def preprocess_text(text):
    """Normalize *text* with the spaCy pipeline, masking names and numbers.

    Stop words, punctuation, whitespace and tokens that are not purely
    alphabetic are discarded. Of the remaining tokens:

    * those whose entity type is PERSON, ORG or GPE, or whose part-of-speech
      tag is PROPN, NUM or SYM, are replaced by the literal ``MASKED``;
    * those tagged DET, ADP, CCONJ, PRON, AUX, PART, PUNCT or INTJ are dropped;
    * everything else is kept as its lowercased lemma.

    Returns the resulting pieces joined by single spaces.
    """
    masked_entity_types = ('PERSON', 'ORG', 'GPE')
    masked_pos_tags = ('PROPN', 'NUM', 'SYM')
    dropped_pos_tags = ('DET', 'ADP', 'CCONJ', 'PRON', 'AUX', 'PART', 'PUNCT', 'INTJ')

    pieces = []
    for token in nlp(text):
        # Guard clause: skip anything that is not a content-bearing word.
        if token.is_stop or token.is_punct or token.is_space or not token.is_alpha:
            continue
        if token.ent_type_ in masked_entity_types or token.pos_ in masked_pos_tags:
            pieces.append('MASKED')
        elif token.pos_ not in dropped_pos_tags:
            pieces.append(token.lemma_.lower())
    return ' '.join(pieces)

In [None]:
# Raise spaCy's maximum accepted text length to 4,000,000 characters
# (presumably because some documents exceed the default limit).
nlp.max_length = 4000000

In [None]:
# Apply the spaCy-based masking preprocessing to every document.
corpus_preprocessed = [preprocess_text(text) for text in corpus]

In [None]:
# Character length of the second preprocessed document (quick sanity check).
len(corpus_preprocessed[1])

## Vectorizing the preprocessed text data

In [None]:
# Re-fit the same vectorizer (same settings) on the entity-masked text;
# this replaces the previously learned vocabulary and matrix.
corpus_vectorized = tfidf_vectorizer.fit_transform(corpus_preprocessed)

In [None]:
# Refresh the vocabulary terms to match the re-fitted vectorizer.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Inspect the vocabulary size and both ends of the feature list.
print(f"Number of features: {len(tfidf_feature_names)}")
print(f"First 10 features: {tfidf_feature_names[:10]}")
print(f"Last 10 features: {tfidf_feature_names[-10:]}")

## Training the NMF model

In [None]:
nmf.fit(corpus_vectorized)

In [None]:
plot_top_words(nmf, tfidf_feature_names, n_top_words, 'Topics in LDA model')

In [None]:
# document topic distribution
doc_topic_dist = nmf.transform(corpus_vectorized)
df = pd.DataFrame(doc_topic_dist, columns=["Topic 1", "Topic 2", "Topic 3", "Topic 4"])
df['filename'] = filenames
print(df)