In [5]:
import os
import string
from docx import Document
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pprint import pprint

# Download stopwords if not already downloaded
import nltk
nltk.download('stopwords')

# Preprocessing function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation and numbers
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return tokens

# Function to process a .docx file
def process_docx(file_path):
    doc = Document(file_path)
    sentences = []

    for paragraph in doc.paragraphs:
        # Tokenize sentences
        sentences.extend(paragraph.text.split('.'))
    
    # Preprocess each sentence
    preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences if sentence]

    return preprocessed_sentences

# Function to perform LDA
def perform_lda(corpus):
    # Create a dictionary representation of the documents
    dictionary = corpora.Dictionary(corpus)

    # Create a document-term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

    # Build the LDA model
    lda_model = models.LdaModel(doc_term_matrix, num_topics=3, id2word=dictionary, passes=10)

    # Print the topics
    pprint(lda_model.print_topics())

# Path to the .docx file
docx_file_path = "datasets/116950326-December-2012-Rental-Agreement.pdf - Copy.docx"

# Process the .docx file
preprocessed_corpus = process_docx(docx_file_path)

# Perform LDA on the preprocessed corpus
perform_lda(preprocessed_corpus)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shivam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[(0,
  '0.048*"tenant" + 0.037*"landlord" + 0.020*"shall" + 0.014*"may" + '
  '0.014*"pays" + 0.014*"bill" + 0.014*"monthly" + 0.014*"deposit" + '
  '0.013*"cleaning" + 0.013*"deductions"'),
 (1,
  '0.035*"tenant" + 0.018*"deposit" + 0.014*"security" + 0.012*"pursuant" + '
  '0.010*"may" + 0.010*"shall" + 0.010*"days" + 0.010*"repairs" + 0.010*"make" '
  '+ 0.010*"last"'),
 (2,
  '0.025*"tenant" + 0.019*"agreement" + 0.014*"paint" + 0.010*"copy" + '
  '0.010*"either" + 0.010*"household" + 0.010*"offender" + 0.010*"resides" + '
  '0.010*"information" + 0.010*"homes"')]


In [1]:
!pip install python-docx gensim nltk


Collecting FuzzyTM>=0.4.0 (from gensim)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
                                              0.0/67.1 kB ? eta -:--:--
                                              0.0/67.1 kB ? eta -:--:--
                                              0.0/67.1 kB ? eta -:--:--
     ------------------------                 41.0/67.1 kB 2.0 MB/s eta 0:00:01
     ------------------------                 41.0/67.1 kB 2.0 MB/s eta 0:00:01
     -------------------------------------- 67.1/67.1 kB 521.9 kB/s eta 0:00:00
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading simpful-2.11.1-py3-none-any.whl (32 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting miniful (from fst-pso->pyf

In [8]:
import os
import string
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords if not already downloaded
import nltk
nltk.download('stopwords')

# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Function to process a .docx file
def process_docx(file_path):
    doc = Document(file_path)
    sentences = []

    for paragraph in doc.paragraphs:
        sentences.extend(paragraph.text.split('.'))
    
    preprocessed_sentences = [preprocess_text(sentence) for sentence in sentences if sentence]

    return preprocessed_sentences

# Path to the .docx file
docx_file_path = "datasets/142106117-Rental-Agreement.pdf.docx"

# Process the .docx file
preprocessed_corpus = process_docx(docx_file_path)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_corpus)

# Train an Isolation Forest model
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
isolation_forest.fit(tfidf_matrix)

# Predict anomaly scores for each document
anomaly_scores = isolation_forest.decision_function(tfidf_matrix)

# Identify anomalies based on threshold (you may need to adjust the threshold)
threshold = -0.3
anomalies = [i for i, score in enumerate(anomaly_scores) if score < threshold]

# Print the identified anomalies
print("Anomalies:", anomalies)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shivam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Anomalies: []
