In [5]:
import os
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already downloaded
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Function to process a .docx file
def process_docx(file_path):
    doc = Document(file_path)
    sentences = []

    for paragraph in doc.paragraphs:
        sentences.extend(paragraph.text.split('.'))
    
    preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences if sentence]

    return preprocessed_sentences

# Path to the .docx file
docx_file_path = "datasets/116950326-December-2012-Rental-Agreement.pdf - Copy.docx"

# Process the .docx file
preprocessed_corpus = process_docx(docx_file_path)

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(preprocessed_corpus)

# Create a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]

# Build the LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)

# Print the topics
(lda_model.print_topics())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shivam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0,
  '0.046*"agreement" + 0.046*"epa" + 0.024*"follows" + 0.024*"utility" + 0.024*"apportioned" + 0.024*"bills" + 0.024*"payable" + 0.024*"lh" + 0.024*"day" + 0.024*"length"'),
 (1,
  '0.066*"tenant" + 0.053*"monthly" + 0.053*"bill" + 0.053*"pays" + 0.027*"household" + 0.027*"decision" + 0.027*"copy" + 0.027*"pdf" + 0.014*"party" + 0.014*"long"'),
 (2,
  '0.029*"tenant" + 0.029*"deposit" + 0.029*"deductions" + 0.029*"made" + 0.015*"cleaning" + 0.015*"security" + 0.015*"rent" + 0.015*"provide" + 0.015*"expenses" + 0.015*"receipts"'),
 (3,
  '0.089*"tenant" + 0.031*"shall" + 0.031*"landlord" + 0.031*"rental" + 0.016*"law" + 0.016*"copy" + 0.016*"cleaning" + 0.016*"moves" + 0.016*"inform" + 0.016*"needed"'),
 (4,
  '0.046*"may" + 0.035*"notice" + 0.024*"enter" + 0.024*"days" + 0.024*"agreement" + 0.024*"written" + 0.024*"rent" + 0.023*"tenant" + 0.023*"normal" + 0.012*"must"'),
 (5,
  '0.059*"tenant" + 0.036*"landlord" + 0.024*"home" + 0.024*"die" + 0.013*"pamphlet" + 0.013*"clarifying"