# DOWNLOADING THE DEPENDENCIES

In [None]:
!pip install sentence-transformers python-docx PyPDF2

# LIBRARIES

In [None]:
import nltk
import numpy as np
import spacy
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from transformers import pipeline
from docx import Document
from PIL import Image
import cv2
import pytesseract
from matplotlib import pyplot as plt
import re
from PyPDF2 import PdfReader
import json
import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# NLTK resources needed further down in this file:
#   punkt                      -> nltk.word_tokenize (get_keywords)
#   stopwords                  -> stopwords.words('english') (preprocess_text)
#   averaged_perceptron_tagger -> nltk.pos_tag (get_keywords)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# SENTENCE TRANSFORMERS

In [None]:
# Name of the sentence-transformers model used to embed statements.
model_name = "all-mpnet-base-v2"
# Module-level encoder instance, created once when this cell runs.
sentence_encoder = SentenceTransformer(model_name)
# spaCy English pipeline.
# NOTE(review): `nlp` is not referenced in the visible part of this file — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

# PREPROCESSING

In [None]:
def preprocess_text(text):
  """
  Lowercase, tokenize and filter a text for anomaly detection.

  The text is lowercased and split on whitespace. Leading and trailing
  punctuation (``. , ! ? ; :``) is stripped from every token, so "budget,"
  and "budget" become the same word. Tokens that end up empty and English
  stop words are dropped.

  Args:
      text: The text string to be preprocessed.

  Returns:
      A list of preprocessed tokens (words), in their original order.
  """
  print("\nPreprocessing the text . . . ")

  punctuation = ".,!?;:"

  # stopwords.words() returns a list; a set makes each membership test O(1).
  stop_words = set(stopwords.words('english'))

  words = []
  for token in text.lower().split():
    # Whitespace splitting leaves punctuation glued to words ("end."), so
    # strip it from both ends instead of only dropping standalone marks.
    token = token.strip(punctuation)
    if token and token not in stop_words:
      words.append(token)

  # NOTE: lemmatization (e.g. with WordNetLemmatizer) is not applied here.
  return words

## ENCODING THE SENTENCES 

In [None]:
def encode_statement(statement):
  """
  Preprocess a statement and generate a single sentence embedding for it.

  Args:
      statement: The text of the statement.

  Returns:
      The embedding produced by ``sentence_encoder.encode`` for the
      preprocessed statement (one vector for the whole statement).
  """
  print("\nEncoding each statement . . . ")

  # Preprocess the statement into a list of tokens.
  tokens = preprocess_text(statement)

  # Encode the tokens as ONE string. Passing the token list itself makes
  # encode() treat every word as a separate sentence and return a matrix of
  # per-word embeddings rather than one vector for the statement.
  preprocessed_text = " ".join(tokens)

  # Reuse the module-level encoder: building a SentenceTransformer on every
  # call reloads the whole model for each statement.
  statement_vector = sentence_encoder.encode(preprocessed_text)
  return statement_vector

## DETECTING TOPIC KEYWORDS

In [None]:
def detect_topics_keywords(statements, num_topics):
  """
  Detect topics and their keywords in a corpus of statements using LDA.

  Each statement is tokenized with ``preprocess_text``, converted to a
  bag-of-words over a gensim ``Dictionary``, and an ``LdaModel`` with
  ``num_topics`` topics is trained on the result.

  Args:
      statements: A list of statement text strings.
      num_topics: The number of topics to identify.

  Returns:
      A list of topic keywords, where each element is a list of up to 10
      keywords representing a topic. An empty list is returned when there
      are no statements or when preprocessing leaves no words to model.
  """
  print("\nFinding out the topics using the LDA { Latent Dirichlet Algorithm } among the provided set of content . . . ")

  # Nothing to model: avoid handing an empty corpus to LdaModel.
  if not statements:
    return []

  # Preprocess statements into token lists
  preprocessed_statements = [preprocess_text(statement) for statement in statements]
  print("\nThe statements were successfully preprocessed")

  # Create a dictionary (token <-> id mapping) from preprocessed statements
  dictionary = Dictionary(preprocessed_statements)
  print("\nThe preprocessed text was succesffully converted into a dictionary")

  # Every statement was reduced to nothing (e.g. only stop words): there is
  # no vocabulary to train on.
  if len(dictionary) == 0:
    return []

  # Convert statements to bag-of-words format
  corpus = [dictionary.doc2bow(statement) for statement in preprocessed_statements]

  # Train the LDA model
  print("\nThe LDA Model is running to detect the topics")
  lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

  # Extract the top 10 keywords of every topic; show_topic yields
  # (word, probability) pairs and only the word is kept.
  topic_keywords = [
      [word for word, _prob in lda_model.show_topic(topic_id, topn=10)]
      for topic_id in range(lda_model.num_topics)
  ]

  return topic_keywords


# GETTING TOPIC KEYWORDS

Extracting the keywords from a single statement

In [None]:
def get_keywords(statement):
  """
  Extract keywords from a statement using NLTK part-of-speech tags.

  The statement is lowercased, tokenized with ``nltk.word_tokenize`` and
  tagged with ``nltk.pos_tag``. Tokens tagged NN, NNP, VB or VBP are kept.

  Args:
      statement: The text of the statement.

  Returns:
      A list of extracted keywords (lowercase), in order of appearance.
  """
  print("\nFinding out the keywords . . . ")
  words = nltk.word_tokenize(statement.lower())

  # Tags treated as keywords (adjust based on your needs).
  keyword_tags = {"NN", "NNP", "VB", "VBP"}

  # Tag the whole token sequence in one call: the tagger uses neighbouring
  # words as context, which is lost when each word is tagged on its own,
  # and a single call avoids invoking the tagger once per word.
  tagged_words = nltk.pos_tag(words)

  keywords = [word for word, tag in tagged_words if tag in keyword_tags]
  return keywords

# MAIN ANOMALY DETECTION FUNCTION

In [None]:
def _cosine_similarity(vec_a, vec_b):
  """Return the cosine similarity of two embeddings (0.0 if either is empty)."""
  a = np.asarray(vec_a, dtype=float)
  b = np.asarray(vec_b, dtype=float)

  if a.size == 0 or b.size == 0:
    return 0.0

  # If an encoder returned a matrix of per-token embeddings, mean-pool it
  # into one vector so the dot product below is well defined.
  if a.ndim > 1:
    a = a.reshape(-1, a.shape[-1]).mean(axis=0)
  if b.ndim > 1:
    b = b.reshape(-1, b.shape[-1]).mean(axis=0)

  # The small epsilon guards against division by zero for all-zero vectors.
  norm_product = np.linalg.norm(a) * np.linalg.norm(b)
  return float(np.dot(a, b) / (norm_product + 1e-8))


def detect_anomalies_hybrid(statements, your_keywords):
  """
  Detect potentially contradicting statement pairs using keyword overlap
  and cosine similarity.

  A pair (i, j) with i < j is reported when the two statements share at
  least ``min_keywords`` keywords (as found by ``get_keywords``) and the
  cosine similarity of their embeddings is below ``threshold``.

  Args:
      statements: A list of statement text strings.
      your_keywords: A list of keywords relevant to the topic. It is
          currently not used by the implementation and is kept so that
          existing callers keep working.

  Returns:
      A list of tuples, where each tuple contains the indices of two
      statements with potential contradictions.
  """
  print("\nDetecting the anomalies on the statements using the keywords . . . ")

  # Minimum number of shared keywords for a pair to be considered at all.
  min_keywords = 2  # Adjust threshold based on your data
  # Pairs whose similarity is below this value are reported.
  threshold = 0.7  # Adjust threshold based on your data and desired strictness

  # Encode each statement
  statement_vectors = [encode_statement(statement) for statement in statements]

  # Extract keywords once per statement rather than once per pair.
  keyword_sets = [set(get_keywords(statement)) for statement in statements]

  anomaly_pairs = []
  for i in range(len(statement_vectors)):
    for j in range(i + 1, len(statement_vectors)):
      # Only statements about the same thing are compared.
      if len(keyword_sets[i] & keyword_sets[j]) < min_keywords:
        continue

      # Cosine similarity is dot(a, b) / (|a| * |b|). Summing np.outer(a, b)
      # would give sum(a) * sum(b) instead, which is not a dot product.
      similarity = _cosine_similarity(statement_vectors[i], statement_vectors[j])
      if similarity < threshold:
        anomaly_pairs.append((i, j))

  print("The Anomaly Detection is succesfully completed!\n")
  print("************************************************\n")
  return anomaly_pairs

# EXTRACTING STATEMENTS FROM A DOCUMENT

### Preprocessing the text extracted from each page to remove escape sequences

In [None]:
def preprocess_page_content(page_content: str) -> str:
    """
    Strip escaping backslashes from text extracted from a PDF page.

    Each backslash that is immediately followed by a character other than a
    newline is dropped and the character after it is kept. A backslash at
    the very end of the text, or directly before a newline, is left as is.

    Args:
        page_content (str): The text content extracted from a PDF page.

    Returns:
        str: The text with those backslashes removed.
    """
    escaped_char = re.compile(r'\\(.)')
    unescaped = escaped_char.sub(r'\1', page_content)
    return unescaped


### EXTRACTING THE STATEMENTS PAGE WISE AND STORING THE CONTENT IN RESPECTIVE PAGE INDEX

In [None]:
def preprocess_pdf(pdf_path: str) -> dict:
    """
    Extract the text of every page of a PDF and clean it with
    ``preprocess_page_content``.

    Errors are reported with ``print`` rather than raised. If processing
    fails part-way through, the pages handled so far are still returned.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        dict: A dictionary containing 1-based page numbers as keys and
        preprocessed text as values. Empty if the file could not be read.
    """

    preprocessed_data = {}
    try:
        # Open the PDF file in binary mode
        with open(pdf_path, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)

            # Process each page; keys are 1-based page numbers.
            for page_num, page in enumerate(reader.pages, start=1):
                # Fall back to "" so a page without extractable text does
                # not abort the whole document.
                page_text = (page.extract_text() or "").strip()

                # Preprocess text and store it under its page number
                preprocessed_data[page_num] = preprocess_page_content(page_text)

    except FileNotFoundError:
        # Report the requested path: `pdf_file` is never bound when open()
        # itself fails, so it cannot be used in this handler.
        print(f"Error: PDF file '{pdf_path}' not found.")
    except Exception as e:
        print(f"An error occurred while processing the PDF: {e}")

    return preprocessed_data


# # Example usage:
# pdf_path = "60009210051_CL_Assignment1.pdf"  # Replace with your actual PDF path
# preprocessed_pdf_data = preprocess_pdf(pdf_path)

# # Print the JSON string if preprocessed data is available
# if preprocessed_pdf_data:
#     json_string = json.dumps(preprocessed_pdf_data, indent=4)
#     print(json_string)
# else:
#     print("No content found in the PDF.")

# OCR FUNCTION

In [None]:
def grayscale(image):
    """Convert a BGR colour image to grayscale using OpenCV."""
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray_image

### NOISE REMOVAL

In [None]:
def noise_removal(image):
    """
    Clean up an image with a fixed chain of OpenCV filters.

    The steps are, in order: dilate, erode, morphological close (all with a
    1x1 kernel of ones) and a median blur with aperture size 3.

    Args:
        image: The image array to filter.

    Returns:
        The filtered image.
    """
    unit_kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(image, unit_kernel, iterations=1)
    eroded = cv2.erode(dilated, unit_kernel, iterations=1)
    closed = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, unit_kernel)
    blurred = cv2.medianBlur(closed, 3)
    return blurred

### REMOVING BORDERS

In [None]:
def remove_borders(image):
    """
    Crop an image to the bounding box of its largest external contour.

    Args:
        image: The image array passed to ``cv2.findContours``.

    Returns:
        The cropped image, or the input image unchanged when no contour is
        found (there is nothing to crop to in that case).
    """
    contours, _hierarchy = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Without this guard, picking the largest contour fails on a blank image.
    if len(contours) == 0:
        return image

    # Only the largest contour is needed, so take the maximum instead of
    # sorting the whole list.
    largest = max(contours, key=cv2.contourArea)

    x, y, w, h = cv2.boundingRect(largest)
    crop = image[y:y + h, x:x + w]
    return crop

### THINNING THE FONT

In [None]:
def thin_font(image):
    """
    Thin the strokes in an image.

    The image is inverted, eroded once with a 2x2 kernel of ones, and
    inverted back. For dark glyphs on a light background this shrinks the
    glyph strokes.

    Args:
        image: The image array to process.

    Returns:
        The processed image.
    """
    kernel = np.ones((2, 2), np.uint8)
    inverted = cv2.bitwise_not(image)
    eroded = cv2.erode(inverted, kernel, iterations=1)
    return cv2.bitwise_not(eroded)

### THICKENING THE FONT

In [None]:
def thick_font(image):
    """
    Thicken the strokes in an image.

    The image is inverted, dilated once with a 2x2 kernel of ones, and
    inverted back. For dark glyphs on a light background this grows the
    glyph strokes.

    Args:
        image: The image array to process.

    Returns:
        The processed image.
    """
    kernel = np.ones((2, 2), np.uint8)
    inverted = cv2.bitwise_not(image)
    dilated = cv2.dilate(inverted, kernel, iterations=1)
    return cv2.bitwise_not(dilated)

### BORDER COLOR AND PADDING FOR THE IMAGE

In [None]:
# Fill colour for an added border, one value per channel (all 255).
color = [255, 255, 255]
# Border thickness on each side: 150 for top, bottom, left and right.
top = bottom = left = right = 150

### PREPROCESSING THE IMAGE

In [None]:
def preprocess_image(im_file):
    """
    Load an image file and prepare it for OCR.

    The image is read with OpenCV, converted to grayscale, thresholded
    (pixels above 210 become 230, all others 0) and passed through
    ``noise_removal``.

    Args:
        im_file: Path of the image file to load.

    Returns:
        The thresholded, de-noised image.
    """
    source = cv2.imread(im_file)
    gray = grayscale(source)

    # cv2.threshold returns (threshold_used, image); only the image is needed.
    _, black_and_white = cv2.threshold(gray, 210, 230, cv2.THRESH_BINARY)
    cleaned = noise_removal(black_and_white)

    # Optional extra steps, currently disabled:
    # thin = thin_font(no_noise)
    # no_borders = remove_borders(no_noise)
    # image_with_border = cv2.copyMakeBorder(no_borders, top, bottom , left , right, cv2.BORDER_CONSTANT , value=color)
    return cleaned

### DISPLAYING THE IMAGES

In [None]:
# Display an image at its actual size in a matplotlib figure
# (see: displaying-different-images-with-actual-size-in-matplotlib-subplot)
def display(im_path):
    """
    Show an image file in a matplotlib figure sized to the image's pixels.

    The figure size in inches is the image width and height divided by a
    fixed DPI of 80. A single axes fills the whole figure, its decorations
    are hidden, and the image is drawn with the gray colormap.

    Args:
        im_path: Path of the image file to show.
    """
    dpi = 80
    pixels = plt.imread(im_path)

    # shape is (rows, columns[, channels]); only the first two are needed.
    rows, cols = pixels.shape[:2]

    # Figure size in inches that fits the image at the chosen DPI.
    size_in_inches = (cols / float(dpi), rows / float(dpi))

    # One axes spanning the full figure, with spines and ticks hidden.
    figure = plt.figure(figsize=size_in_inches)
    axes = figure.add_axes([0, 0, 1, 1])
    axes.axis('off')

    axes.imshow(pixels, cmap='gray')
    plt.show()

### PERFORMING THE MAIN OCR

In [None]:
def ocr_image(image_path):
    """
    Preprocess an image and extract its text with Tesseract.

    The preprocessed image is saved to ``output/final_test.jpg`` and that
    file is passed to ``pytesseract.image_to_string``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A tuple ``(final_image, extracted_text)``: the preprocessed image
        and the text recognised in it.

    Raises:
        OSError: If the preprocessed image could not be written to disk.
    """
    import os  # local import: only needed to prepare the output directory

    output_path = "output/final_test.jpg"
    final_image = preprocess_image(image_path)

    # Make sure the target directory exists before writing into it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # cv2.imwrite reports failure through its return value; checking it
    # prevents running OCR on a missing file or a stale one from an earlier run.
    if not cv2.imwrite(output_path, final_image):
        raise OSError(f"Could not write preprocessed image to '{output_path}'")

    extracted_text = pytesseract.image_to_string(output_path)
    return final_image, extracted_text

# SEARCHING FOR A SPECIFIC LINE IN A GIVEN DOCUMENT

In [None]:
def find_line_in_json(preprocessed_data: dict, target_line: str) -> list[int]:
    """
    Find the pages whose text contains a given line.

    The check is a plain substring test against each page's text, and pages
    are reported in the dictionary's iteration order.

    Args:
        preprocessed_data (dict): A dictionary containing page numbers as keys
                                  and preprocessed text as values.
        target_line (str): The line to search for.

    Returns:
        list[int]: A list of page numbers where the target line is found.
    """
    return [
        page_num
        for page_num, page_content in preprocessed_data.items()
        if target_line in page_content
    ]


#  Example usage:
# pdf_path = "60009210051_CL_Assignment1.pdf"  # Replace with your actual PDF path
# target_line = "Given a corpus C2, the Maximum Likelihood Estimation (MLE) for the bigram"

# preprocessed_pdf_data = preprocess_pdf(pdf_path)

# if preprocessed_pdf_data:
#     found_pages = find_line_in_json(preprocessed_pdf_data, target_line)
#     if found_pages:
#         print(f"The target line found on pages: {found_pages}")
#     else:
#         print("The target line was not found in the document.")
# else:
#     print("No content found in the PDF.")
