In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# AI DocParser

An application framework developed using the latest AI technologies to extract the values of specific pre-defined keys from a given PDF document. Also generating a document summary using the key & values extracted in the while doing so. 

**Application Input:**

A set of PDF documents, the domain of the document, and a set of field names (also called keys) for which the values need to be extracted.

**Application output:**

The values against the keys in a document extracted and stored in a CSV file with key & value columns.
Summary of the document

## Our Example PDF

We will use the recent contracts happened between Disney and Reliance.

### Keys to extract:

Name of the 1st Party

Name of the 2nd Party

Data of announcement

Investment amount

Transaction value

Exclusive rights

# Setup

### PDF file path

In [3]:
pdf_path = '/kaggle/input/reliance-hotstar-ocr/reliance_disney_agreement.pdf'

### Pretty table for printing key-value like dictionary pairs

In [4]:
from prettytable import PrettyTable 
def print_pretty_table(extracted_values):
    """
    Prints the extracted key-value pairs in a formatted table.

    Args:
        extracted_values (dict): Dictionary of extracted key-value pairs.
    """
    # Create a PrettyTable instance
    table = PrettyTable()

    # Set column names
    table.field_names = ["Key", "Value"]

    # Add rows
    for key, value in extracted_values.items():
        table.add_row([key, value])

    # Print the table
    print(table)

# Step 1: Data Processing and Preprocessing

**What we are doing**

* Extract text content from PDFs.
* Clean and preprocess text (remove headers, footers, and noise).
* Normalize text to ensure uniformity (e.g., removing special characters, handling encoding issues).

## 1.1 Extract text content from PDFs

### Option 1 - PyPDF2, for simple PDF extraction
- **Lightweight** and easy to use for basic text extraction.
- Works well for simple PDFs with **linear text** (no complex layouts, images, or tables).
- Stable and actively maintained.

### Option 2 - pdfplumber, for handling layouts better
- Excellent for extracting text from PDFs with **complex layouts**.
- Can extract tables, images, and even coordinates of text within a page.
- Provides a **high degree of customization**, such as selecting specific page areas or identifying text within bounding boxes.
- Supports well-structured text extraction for scanned documents (if OCR is pre-applied).

### Option 3 - PyMuPDF library (via fitz), 
- Extremely fast and efficient.
- Extracts text, images, and metadata with high accuracy.
- **Handles multi-column PDFs**, tables, and even rotated text better than PyPDF2.
- Can extract text from specific regions of a page (bounding box selection).
- **Supports embedded fonts and handles PDFs with complex structures more robustly**.



### Let's opt for option 3 as default, but we can use 1 and 2 by changing argument

In [5]:
!pip install PyPDF2 PyMuPDF pdfplumber

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0

In [6]:
import fitz  # PyMuPDF
import pdfplumber
from PyPDF2 import PdfReader
import re

def extract_text_from_pdf(pdf_path, method="fitz"):
    """
    Extracts text from a PDF using the specified library (fitz, pdfplumber, or PyPDF2).

    Args:
        pdf_path (str): Path to the PDF file.
        method (str): The library to use for extraction ('fitz', 'pdfplumber', 'pypdf2').
                      Defaults to 'fitz' (PyMuPDF).

    Returns:
        str: Extracted and cleaned text from the PDF.
    """
    try:
        raw_text = ""
        
        # pdfplumber method
        if method == "pdfplumber":
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    raw_text += page.extract_text() or ""

        # PyPDF2 method
        elif method == "pypdf2":
            reader = PdfReader(pdf_path)
            for page in reader.pages:
                raw_text += page.extract_text() or ""

        # PyMuPDF (fitz) method
        else:
            document = fitz.open(pdf_path)
            for page_num in range(len(document)):
                page = document[page_num]
                raw_text += page.get_text()
            document.close()

        if not raw_text.strip():
            raise ValueError("No extractable text found in the PDF.")
        else:
            return raw_text

    except Exception as e:
        return f"Error processing the PDF with {method}: {str(e)}"

## 1.2 Accuracy check

In [8]:
file_path = '/kaggle/input/reliance-hotstar-ocr/reliance_disney_ocr.txt'

**Loading the OCR file text**

In [9]:
# Function to read ground truth from a file
def read_ground_truth(file_path):
    try:
        with open(file_path, 'r') as file:
            ground_truth = file.read().strip() 
        return ground_truth
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return None

**Preprocessing text before checking accuracy**

In [10]:
import re

def preprocess_text(text):
    """
    Preprocesses the input text by:
    - Removing extra whitespaces
    - Normalizing newlines
    - Converting to lowercase
    - Optional: Removing punctuation or stopwords
    """
    # Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces and trim
    text = text.replace('\n', ' ')  # Replace newlines with a space

    # Convert to lowercase
    text = text.lower()

    # Optional: Remove punctuation (if needed)
    # text = re.sub(r'[^\w\s]', '', text)  # Uncomment if punctuation removal is desired

    return text

**Word accuracy code**

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score

def word_match_accuracy(extracted_text, ground_truth):
    # Preprocess both texts
    extracted_text = preprocess_text(extracted_text)
    ground_truth = preprocess_text(ground_truth)

    # Convert the texts into sets of words
    extracted_words = set(extracted_text.split())
    ground_truth_words = set(ground_truth.split())

    # Calculate precision, recall, and F1-score
    intersection = len(extracted_words & ground_truth_words)
    precision = intersection / len(extracted_words) if len(extracted_words) > 0 else 0
    recall = intersection / len(ground_truth_words) if len(ground_truth_words) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

**Similiarity check code**

In [12]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


In [13]:
import Levenshtein

def levenshtein_accuracy(extracted_text, ground_truth):
    # Preprocess both texts
    extracted_text = preprocess_text(extracted_text)
    ground_truth = preprocess_text(ground_truth)
    
    # Compute Levenshtein distance and return the similarity ratio
    distance = Levenshtein.distance(extracted_text, ground_truth)
    max_len = max(len(extracted_text), len(ground_truth))
    
    # Similarity ratio
    similarity_ratio = 1 - (distance / max_len)
    return similarity_ratio

### Accuracy check - 1. Using fitz model

In [14]:
ground_truth = read_ground_truth(file_path)

In [15]:
extracted_text = extract_text_from_pdf(pdf_path, method="fitz")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 99.77%
Precision: 0.998, Recall: 0.998, F1-Score: 0.998


### Accuracy check - 2. Using pdfplumber

In [16]:
extracted_text = extract_text_from_pdf(pdf_path, method="pdfplumber")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 99.95%
Precision: 0.993, Recall: 0.996, F1-Score: 0.995


### Accuracy check - 3. Using pypdf2

In [17]:
extracted_text = extract_text_from_pdf(pdf_path, method="pypdf2")

# Check Levenshtein similarity
similarity_ratio = levenshtein_accuracy(extracted_text, ground_truth)
print(f"Levenshtein Similarity: {similarity_ratio * 100:.2f}%")

# Check word match accuracy
precision, recall, f1 = word_match_accuracy(extracted_text, ground_truth)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-Score: {f1:.3f}")

Levenshtein Similarity: 99.22%
Precision: 0.914, Recall: 0.937, F1-Score: 0.925


## 1.3 Clean and normalise text

extra preprocesing of extracted text for our main objectives

In [18]:
# Clean the text
def clean_txt(raw_text):
    cleaned_text = re.sub(r"(page \d+ of \d+)", "", raw_text, flags=re.IGNORECASE)  # Remove page numbers
    cleaned_text = re.sub(r"(\n\s*\n)|(\r\n|\r|\n)", "\n", cleaned_text)  # Remove extra newlines
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Normalize whitespace
    cleaned_text = re.sub(r"[^\x00-\x7F₹$\s]+", " ", cleaned_text) # Remove non-ASCII characters
    normalized_text = cleaned_text.strip()

    return normalized_text

# Step 2: Key-Value Pair Extraction

**What we are doing**


* Annotate domain-specific training data (e.g., contracts, finance documents).
* Identify and extract values corresponding to predefined keys using AI models.

**Libraries:**

NER models: SpaCy

re: Regular Expression

###  Optional : Fine-Tune a Pre-Trained NER Model (Custom NER Model)

* **Domain-Specific Adaptation**: Fine-tuning allows the model to recognize entities specific to the domain, improving accuracy.
* **Reusability**: Pre-trained models like BERT can be adapted for multiple domains with minimal effort.
* **Scalability**: The approach scales well for different domains as long as labeled data is available

⚠️ More data of PDFs will be needed for this!

### Or we can use Hugging Face's pretrained models

## Using the NER Model + Regular Expressions (re) for Key Extraction

* Load the model and tokenizer to predict keys from the text
* For better accuracy, will use Regular Expressions for dates and amount
* Saving extracted keys in CSV file

In [19]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.4

**Extraction of key-value pairs**

In [21]:
import spacy
import re

# Loading the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

def key_value_extraction(text, keys):
    """
    Extract key values from a cleaned contract text based on the provided keys.

    Parameters:
    - text (str): Cleaned and preprocessed contract text.
    - keys (list, optional): A list of keys (strings) to extract. If None, all keys will be extracted.

    Returns:
    - dict: A dictionary containing the extracted key-value pairs.
    """
    # Process the text with spaCy for Named Entity Recognition (NER)
    doc = nlp(text)

    # Extract company names using NER (ORG - Organizations)
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    # Extract dates using regex pattern for dates
    date_pattern = r"\d{1,2}[a-z]{2}\s[A-Za-z]+\s\d{4}"
    dates = re.findall(date_pattern, text)

    # Extract financial values (e.g., investment amounts, transaction values) using regex
    amount_pattern = r"₹[\d,]+(?:\s*crore|\s*\(.*\))"
    amounts = re.findall(amount_pattern, text)

    # Define helper function to check for exclusive rights
    def check_exclusive_rights(text):
        """
        Function to check if the contract mentions exclusive rights.
        Returns True if exclusive rights are granted, otherwise False.
        """
        # Keywords or phrases that indicate exclusive rights
        exclusive_rights_keywords = ["exclusive rights", "granted exclusive rights", "exclusive distribution rights"]
        
        # Search for any of the keywords in the text
        for keyword in exclusive_rights_keywords:
            if re.search(rf"{keyword}", text, re.IGNORECASE):
                return True
        
        # If no keywords are found, return False
        return False

    # Check if the contract mentions exclusive rights
    exclusive_rights = check_exclusive_rights(text)

    # Prepare the default extracted information, note that the below config depends upon predefined
    # keywords and it can be done better for more domains
    extracted_info = {
        keys[0]: companies[0] if len(companies) > 0 else None,  # First company (Party 1)
        keys[1]: companies[1] if len(companies) > 1 else None,  # Second company (Party 2)
        keys[2]: dates[0] if dates else None,  # First date found
        keys[3]: amounts[0] if amounts else None,  # First investment amount found
        keys[4]: amounts[2] if len(amounts) > 1 else None,  # Second transaction value found
        keys[5]: "Yes" if exclusive_rights else "No"  # Boolean value indicating exclusive rights
    }

    # Return the filtered dictionary of extracted information
    return extracted_info


**Save to CSV**

In [22]:
import csv

In [23]:
def save_to_csv(key_values, output_file):
    """
    Saves the extracted key-value pairs to a CSV file.

    Args:
        extracted_values (dict): Dictionary of extracted key-value pairs.
        output_file (str): Path to the output CSV file.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Key", "Value"])
        for key, value in key_values.items():
            writer.writerow([key, value])

## Problem - Keys are to be exact?

Handling Flexible Keys for Extraction:
Domain-Specific Synonyms:

For each domain, I used some standard variations of keys (e.g., for contracts: "First Party," "Party One," "Party 1" all refer to the same entity).
This is done by specifying multiple synonyms in the domain-specific patterns.
Case Insensitivity:

The regex patterns in the code are case-insensitive (e.g., (?i)), so "Start Date" will match "start date" or "START DATE" in the text.




### Solution - Custom Key Mapping:

You can have a custom key mapping system where a user can provide a list of keys (e.g., "Start Date", "Start Date of Contract"), and these would map to predefined extraction patterns in your system.
Enhancement for Flexible Matching:
Here’s how you can modify the code to allow for more flexible or fuzzy key matching:

Allow for Multiple Variants of Keys (Domain-Specific Variations):

Add different forms of keys within the same domain rules so that the same key can be captured even if it appears in different formats.
Allow Users to Define Their Own Keys:

Users can input the keys they need, and you can check for matches in the text regardless of slight variations.


**Example of Custom Key Mapping:**

In [None]:
# Domain-specific patterns
domain_rules = {
    "Contracts": {
        "Name of the 1st Party": r"(?:First Party|Party 1|Party One):?\s*([^\n,]+)",
        "Name of the 2nd Party": r"(?:Second Party|Party 2|Party Two):?\s*([^\n,]+)",
        "Contract Start Date": r"(?:Effective Date|Start Date|Commencement):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Contract End Date": r"(?:End Date|Termination Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Scope of Work": r"(?:Scope of Work|Services):?\s*([^\n]+)",
        "Penalty Amount": r"(?:Penalty|Fine):?\s*\$?([\d,]+)"
    },
    "Finance": {
        "Transaction Amount": r"(?:Amount|Transaction):?\s*\$?([\d,]+)",
        "Date of Transaction": r"(?:Transaction Date|Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Account Number": r"(?:Account Number|Acc No):?\s*([^\n,]+)",
        "Bank Name": r"(?:Bank|Financial Institution):?\s*([^\n,]+)"
    },
    "Legal": {
        "Case Number": r"(?:Case Number|Case ID):?\s*([^\n,]+)",
        "Filing Date": r"(?:Filing Date|Date Filed):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Petitioner": r"(?:Petitioner|Claimant):?\s*([^\n,]+)",
        "Respondent": r"(?:Respondent|Defendant):?\s*([^\n,]+)",
        "Court Name": r"(?:Court|Jurisdiction):?\s*([^\n,]+)"
    },
    "HR": {
        "Employee Name": r"(?:Employee Name|Name):?\s*([^\n,]+)",
        "Employee ID": r"(?:Employee ID|ID):?\s*([^\n,]+)",
        "Joining Date": r"(?:Joining Date|Start Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Department": r"(?:Department|Team):?\s*([^\n,]+)",
        "Salary": r"(?:Salary|Compensation):?\s*\$?([\d,]+)"
    },
    "Invoices": {
        "Invoice Number": r"(?:Invoice Number|Invoice ID):?\s*([^\n,]+)",
        "Invoice Date": r"(?:Invoice Date|Date):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
        "Client Name": r"(?:Client Name|Customer Name):?\s*([^\n,]+)",
        "Total Amount": r"(?:Total Amount|Total):?\s*\$?([\d,]+)",
        "Due Date": r"(?:Due Date|Payment Due):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})"
    }
}

## Step 3: Document Summarization

**What are we doing**

* Models: facebook/bart-large-cnn
* Utilize extracted keys and values to contextualize summaries using the model (hugging-face)
* Key-Value Pairs: The model will prioritize and emphasize these key points (e.g., party names, contract start and end dates, penalty amount) in the summary.

Input: The model will take in:

- Text: The raw document text.
- Keys: A list of predefined keys that are relevant to the domain or document type. 
- Values: A list of corresponding values that describe the content or information related to the keys.
- Word Limit: A maximum word count for the summary (default to 1000 words).

In [24]:
from transformers import pipeline
import re

# Step 1: Load the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Step 2: Custom Summarization Function based on Keys and Values
def custom_summarization(text, keys, values):
    """
    Summarize text by focusing on specific keys and values.
    
    Args:
    - text (str): The full text to summarize.
    - keys (list): List of keys (such as 'Investment Amount', 'Party 1', etc.)
    - values (list): List of corresponding values (such as 'Reliance', '₹11,500 crore', etc.)
    
    Returns:
    - summary (str): A summary that focuses on the provided keys and values.
    """
    
    relevant_text = ""
    
    # Try to match key-value pairs and extract related text
    for key, value in zip(keys, values):
        # Find sentences containing the key and its value (this is a simple heuristic)
        pattern = rf"([^.]*{re.escape(value)}[^.]*\.)"
        matches = re.findall(pattern, text)
        
        # Combine the relevant sentences into the relevant_text
        relevant_text += " ".join(matches)
    
    # If no relevant text is found, use the whole text for summarization
    if not relevant_text:
        relevant_text = text
    
    # Summarize the extracted relevant text
    summary = summarizer(relevant_text, max_length=150, min_length=50, do_sample=False)
    return summary[0]['summary_text']

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



# Compiling Results

In [25]:
extracted_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_txt(extracted_text)
text = cleaned_text

keys = ['Party 1', 'Party 2', 'Date of Announcement', 'Investment', 'Transaction Value', 'Exclusive Rights']

key_values = key_value_extraction(text, keys)

values = [i for i in key_values.values()]

In [26]:
print_pretty_table(key_values)

+----------------------+--------------------------+
|         Key          |          Value           |
+----------------------+--------------------------+
|       Party 1        |         Reliance         |
|       Party 2        | the Joint Venture Disney |
| Date of Announcement |    28th February 2024    |
|      Investment      |      ₹11,500 crore       |
|  Transaction Value   |      ₹70,352 crore       |
|   Exclusive Rights   |           Yes            |
+----------------------+--------------------------+


In [27]:
save_to_csv(key_values, "key_values.csv")

In [28]:
media_text = cleaned_text

custom_summary = custom_summarization(media_text, keys, values)

print(custom_summary)

Reliance Industries Limited, Viacom 18 Media Private Limited and The Walt Disney Company (NYSE:DIS) ( Disney ) today announced the signing of binding definitive agreements to form a joint venture ( JV) Reliance is India s largest private sector company, with a consolidated revenue of Rs 9,74,864 crore (US$118.5 billion) Reliances to invest ₹11,500 crore in the Joint Venture Disney to provide Content License to the Joint venture Mumbai / Burbank, Calif.


## Step 4: Continuous Learning Mechanism

**Objective**


* Log user corrections to enhance traceability and accuracy.
* Fine-tune your summarization model using real-world feedback.
* Employ active learning to iteratively improve model predictions.
* Periodically retrain with labeled data to keep the model updated.

## 4.1 Logging User Corrections in CSV

In [29]:
import csv
from datetime import datetime

# Function to log user corrections
def log_correction(original_summary, user_correction, feedback, keys, values):
    """
    Log corrections made by the user into a CSV file.

    Args:
    - original_summary (str): The summary generated by the model.
    - user_correction (str): The corrected summary provided by the user.
    - feedback (str): Additional user feedback or comments.
    - keys (list): List of keys used in the extraction process.
    - values (list): List of values used in the extraction process.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "original_summary": original_summary,
        "user_correction": user_correction,
        "feedback": feedback,
        "keys": ", ".join(keys),
        "values": ", ".join(map(str, values))
    }
    
    # Append to a CSV file
    with open("corrections_log.csv", mode="a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=log_entry.keys())
        
        # Write the header if the file is new
        if file.tell() == 0:
            writer.writeheader()
        
        writer.writerow(log_entry)

In [31]:
keys

['Party 1',
 'Party 2',
 'Date of Announcement',
 'Investment',
 'Transaction Value',
 'Exclusive Rights']

In [None]:
original_summary = custom_summary
user_correction = input()
feedback = input()

# keys are predefined
keys = keys

values = input()

log_correction(original_summary, user_correction, feedback, keys, values)

## 4.2 Incorporating Feedback Loops

In [None]:
import pandas as pd

# Load corrections from the CSV file
def load_corrections(file_path="corrections_log.csv"):
    """
    Load corrections logged by users into a DataFrame.

    Args:
    - file_path (str): Path to the corrections log CSV file.

    Returns:
    - DataFrame: User corrections and feedback.
    """
    return pd.read_csv(file_path)

In [None]:
corrections_df = load_corrections()
print(corrections_df.head())

## 4.3 Retraining Models with Corrections (for keys and values)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments

# Load pre-trained model and tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# Prepare data for fine-tuning
def prepare_data(df):
    """
    Prepare input-output pairs for fine-tuning the summarization model.

    Args:
    - df (DataFrame): DataFrame containing original and corrected summaries.

    Returns:
    - dict: Tokenized input and labels for fine-tuning.
    """
    inputs = tokenizer(list(df["original_summary"]), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    labels = tokenizer(list(df["user_correction"]), truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Load corrections and prepare data
corrections_df = load_corrections()
data = prepare_data(corrections_df)

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_bart",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10,
    save_total_limit=2,
    logging_dir="./logs"
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data
)

# Start fine-tuning
trainer.train()