In [None]:
!pip install pdf2image
!pip install streamlit firebase-admin requests
!npm install -g localtunnel
!pip install pyngrok
!pip install paddlepaddle paddleocr
!pip install cohere
!pip install numpy
!pip install transformers
!pip install pillow
!pip install torch torchvision torchaudio
!pip install gradio
!pip install openpyxl


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m83.7 MB/

In [None]:
%%writefile app.py
import streamlit as st
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image, ImageEnhance, ImageFilter, ImageDraw, UnidentifiedImageError
from pdf2image import convert_from_bytes
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from firebase_admin import credentials, firestore
import firebase_admin
import cohere
import matplotlib.pyplot as plt
from collections import defaultdict
import re
import io
import os
import traceback

@st.cache_resource  # Cache the PaddleOCR model
def load_ocr_model():
    return PaddleOCR(use_angle_cls=True, lang='en')

@st.cache_data  # Cache the Cohere API call results
def cached_search_with_cohere(extracted_text, prompting_type, custom_keywords, document_type):
    return search_with_cohere(extracted_text, prompting_type, custom_keywords, document_type)

# Initialize Firebase
cred = credentials.Certificate("serviceAccountKey.json")  # Replace with your Firebase service account key
#firebase_admin.initialize_app(cred)

# Initialize Cohere API with your API key
co = cohere.Client("aeYHJcrlPDDgIuO6w0EeOFtyQ0gkdV2jj3cHQT1G")  # Replace with your actual Cohere API key
db = firestore.client()

# Function to preprocess the image
def preprocess_image(image, contrast, sharpen, median_filter):
    image = image.convert("RGB")
    image = ImageEnhance.Contrast(image).enhance(contrast)  # Enhance contrast
    for _ in range(sharpen):
        image = image.filter(ImageFilter.SHARPEN)  # Sharpen the image
    if median_filter > 1:
        image = image.filter(ImageFilter.MedianFilter(size=median_filter))  # Denoise
    return image

# Function to query Firestore and get image URLs from a specified collection
def get_image_urls_from_firestore(collection_name, num_images):
    try:
        collection_ref = db.collection(collection_name)
        docs = collection_ref.stream()
        count = 0

        image_urls = []
        for doc in docs:
            data = doc.to_dict()
            url = data.get("url")  # Assuming the URL is stored under the 'url' field
            if url:
                if count < num_images:
                    image_urls.append(url)
                    count += 1
        return image_urls
    except Exception as e:
        st.error(f"Error retrieving data from Firestore for {collection_name}: {e}")
        return []

# ============================
# Configuration Section
# ============================

# Update this path based on your system
# For Windows:
# Global variable to store the uploaded file
uploaded_file = None
image = None

# Predefined list of 5 keywords to always search for
predefined_keywords = ["Amount", "Date", "Transaction", "Invoice", "Balance"]

# Initialize OCR models
paddle_ocr = PaddleOCR(use_angle_cls=True, lang='en')

def process_single_image(image, file_name, prompting_type, custom_keywords=None, document_type="Invoice"): #Process one image at a time
    try:
        params = {
            "use_angle_cls": True
        }
        result = process_image_with_cohere(image, prompting_type, custom_keywords, document_type, params)
        if isinstance(result, dict) and "error" in result:
            st.error(result["error"])
            return None, None
        response_values = parse_response_to_dict(result.split("\n\nSearch Result:\n")[-1])
        plot = plot_values(response_values, file_name)
        plot1 = plot_pie_chart(response_values, file_name)
        return result, [plot, plot1]
    except Exception as e:
        st.error(f"Error processing image: {e}")
        st.error(traceback.format_exc())
        return None, None

def plot_pie_chart(values_by_keyword, image_name):
    total_values = {}
    for k, v_list in values_by_keyword.items():
        total_values[k] = len(v_list)

    if not total_values:
        # Debugging: Print message if no values are found
        print(f"No values found for pie chart for image: {image_name}")
        return Image.new('RGB', (300, 200), (255, 255, 255))

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(total_values.values(), labels=total_values.keys(), autopct='%1.1f%%', startangle=90)
    ax.set_title(f'Pie Chart of Keyword Values from {image_name}')
    plt.tight_layout()

    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    return img

# Modify the 'process_multiple_images' function to generate and save results as an Excel file
def save_results_to_excel(results_by_keyword, output_dir="output"):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Flatten the results_by_keyword dictionary to create rows for each keyword and its values
    flattened_data = []
    for keyword, values in results_by_keyword.items():
        for value in values:
            flattened_data.append({"Keyword": keyword, "Value": value})

    # Create a pandas DataFrame
    df = pd.DataFrame(flattened_data)

    # Generate the output Excel file path
    output_file = os.path.join(output_dir, "extracted_results.xlsx")

    # Write the DataFrame to an Excel file
    df.to_excel(output_file, index=False, engine='openpyxl')

    return output_file

def plot_values(values_by_keyword, image_name):
    avg_values = {}
    for k, v_list in values_by_keyword.items():
        numeric_values = []
        for v in v_list:
            if isinstance(v, (int, float)):
                numeric_values.append(float(v))
            elif isinstance(v, str):
                # Remove non-numeric characters except for '-' and '.'
                cleaned_value = re.sub(r'[^\d.-]', '', v)
                if re.match(r'^-?\d+(\.\d+)?$', cleaned_value):  # Check if the string represents a float
                    numeric_values.append(float(cleaned_value))

        if numeric_values:
            avg_values[k] = np.mean(numeric_values)

    if not avg_values:
        return Image.new('RGB', (300, 200), (255, 255, 255))  # Return a blank image if no valid values are found

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(avg_values.keys(), avg_values.values())
    ax.set_xlabel('Keyword')
    ax.set_ylabel('Average Value')
    ax.set_title(f'Bar Graph of Average Numeric Values from {image_name}')
    ax.set_xticklabels(avg_values.keys(), rotation=45)
    plt.tight_layout()

    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    return img

def detect_with_ppocr(image, params):
    ocr = PaddleOCR(use_angle_cls=params["use_angle_cls"], lang='en')  # Use the English model for OCR
    np_image = np.array(image)
    result = ocr.ocr(np_image, cls=params["use_angle_cls"])

    # Draw bounding boxes
    image_with_boxes = image.copy()
    draw = ImageDraw.Draw(image_with_boxes)
    boxes = []
    for line in result[0]:
        points = line[0]
        top_left = tuple(map(int, points[0]))
        bottom_right = tuple(map(int, points[2]))
        text = line[1][0]
        confidence = line[1][1] * 100  # Scale confidence to 100
        boxes.append((top_left, bottom_right, text, confidence))
        draw.rectangle([top_left, bottom_right], outline="blue", width=2)

    return image_with_boxes, boxes

def extract_dynamic_keywords(document_type):
    if document_type == "Payslips":
        return ["Salary", "Date", "Pay", "Tax", "Name", "Pension", "Insurance"]
    elif document_type == "Bank Statement":
        return ["Balance", "Transaction", "Withdrawals", "Charge", "Payments"]
    elif document_type == "Balance Sheet":
        return ["Date", "Assets", "Cash", "Capital", "Liabilities", "Equity"]
    else:
        return predefined_keywords

def process_multiple_images(files, prompting_type, custom_keywords=None, document_type="Invoice"):
    results = []
    all_plots = []
    all_values_by_keyword = defaultdict(list)
    progress_bar = st.progress(0)
    params = {
        "use_angle_cls": True
    }

    for idx, file in enumerate(files):
        try:
            progress_bar.progress((idx + 1) / len(files))
            if isinstance(file, dict):
                img_byte_arr = file.get("data")
                if img_byte_arr is None:
                    raise ValueError("No image data found in the file")
                if isinstance(img_byte_arr, io.BytesIO):
                    image = Image.open(img_byte_arr)
                else:
                    image = Image.open(io.BytesIO(img_byte_arr))
            else:
                image = Image.open(file)

            result = process_image_with_cohere(image, prompting_type, custom_keywords, document_type, params)
            if isinstance(result, dict) and "error" in result:
                st.error(result["error"])
                continue

            if isinstance(result, str):
                response_values = parse_response_to_dict(result.split("\n\nSearch Result:\n")[-1])
                # Debugging: Print response values for each image
                print(f"Response values for image {file['name'] if isinstance(file, dict) else file.name}: {response_values}")
            else:
                st.error("Unexpected result format")
                continue

            st.subheader(f"Plots for {file['name'] if isinstance(file, dict) else file.name}")
            bar_plot = plot_values(response_values, file['name'] if isinstance(file, dict) else file.name)
            pie_plot = plot_pie_chart(response_values, file['name'] if isinstance(file, dict) else file.name)
            all_plots.append((bar_plot, pie_plot, file['name'] if isinstance(file, dict) else file.name))

            if isinstance(response_values, dict):
                for keyword, values in response_values.items():
                    all_values_by_keyword[keyword].extend(values)
            else:
                st.error(f"Expected response_values to be a dictionary, got {type(response_values)}")

            results.append(f"--- Result for Image {idx + 1} ({file['name'] if isinstance(file, dict) else file.name}) ---\n{result}")
        except Exception as e:
            results.append(f"--- Error processing Image {idx + 1} ({file['name'] if isinstance(file, dict) else file.name}) ---\n{e}")
            st.error(f"Error: {e}")
            traceback.print_exc()
    excel_file = save_results_to_excel(all_values_by_keyword)
    return "\n\n".join(results), all_plots, excel_file

def parse_response_to_dict(response_text):
    values_by_keyword = defaultdict(list)
    lines = response_text.split('\n')
    for line in lines:
        match = re.match(r"- (.+): Found \d+ time\(s\)\. Values: (.+)", line.strip())
        if match:
            keyword = match.group(1).strip()
            values_str = match.group(2).strip()
            if values_str.lower() != "not applicable":
                values = [v.strip() for v in values_str.split(',')]
                for value in values:
                    try:
                        # Clean the value by removing non-numeric characters except for '-' and '.'
                        cleaned_value = re.sub(r'[^\d.-]', '', value)
                        num_value = float(cleaned_value)
                        values_by_keyword[keyword].append(num_value)
                    except ValueError:
                        values_by_keyword[keyword].append(value)
            else:
                values_by_keyword[keyword] = []
    return dict(values_by_keyword)

# Function to perform exploratory data analysis
def exploratory_data_analysis(df):
    st.header("Exploratory Data Analysis (EDA)")

    # Show basic statistics
    st.subheader("Basic Statistics")
    st.write(df.describe())

    # Plot distribution of numeric values
    st.subheader("Distribution of Numeric Values")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        fig, ax = plt.subplots()
        df[col].plot(kind='hist', bins=30, ax=ax, title=f'Distribution of {col}')
        st.pyplot(fig)

    # Plot time series if there are date values
    if 'Date' in df.columns:
        st.subheader("Time Series Analysis")
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.set_index('Date', inplace=True)
        for col in numeric_cols:
            fig, ax = plt.subplots()
            df[col].plot(ax=ax, title=f'Time Series of {col}')
            st.pyplot(fig)
        df.reset_index(inplace=True)

    # Correlation matrix
    st.subheader("Correlation Matrix")
    corr_matrix = df.corr()
    fig, ax = plt.subplots()
    cax = ax.matshow(corr_matrix, cmap='coolwarm')
    fig.colorbar(cax)
    plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
    plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
    st.pyplot(fig)

    # Display correlation matrix
    st.write(corr_matrix)

# Function to search with Cohere API and ensure the response is correctly parsed
def search_with_cohere(extracted_text, prompting_type, custom_keywords=None, document_type="Invoice"):
    try:
        # Limit the extracted text length if it's too long
        max_length = 2000
        if len(extracted_text) > max_length:
            extracted_text = extracted_text[:max_length]

        # If custom_keywords are provided, join them into a string
        keywords = ", ".join(custom_keywords) if custom_keywords else ""
        prompt = f"""
        Analyze the following text and extract the values associated with these keywords: {keywords}.
        For each keyword, do the following:

        1. Search for any substring of the keyword within the text (case-insensitive). This includes the full keyword or any part of it (e.g., for 'Salary', also find 'Total Salary', 'Salary Amount', etc.).
        2. For each match found, provide:
            - The number of occurrences of the keyword or its substring.
            - The corresponding value(s) found right after or near the keyword. If a value is numeric, it might follow directly after the keyword or in the form of a unit (e.g., 'Salary: 120 USD', 'Tax 50').
            - If a keyword is not found or does not have a corresponding value, state 'Not applicable'.

        Text:
        {extracted_text}

        Return results in this format:
        - <Keyword>: Found <Number> time(s). Values: <Extracted values or 'Not applicable'>.
        - If no values are found or the keyword is not present, return 'Not applicable'.

        If multiple matches are found for the same keyword/subkeyword, list all corresponding values.
        """

        # Make the API call to Cohere
        response = co.generate(
            model='command',
            prompt=prompt,
            max_tokens=500
        )

        # Extract the text result from the response
        result_text = response.generations[0].text.strip()
        parse_response_to_dict(result_text)

        # Debugging: Print the response from Cohere API
        st.write("Cohere API response:", result_text)

        return result_text

    except Exception as e:
        return {"error": f"Error during keyword search with Cohere: {str(e)}\n{traceback.format_exc()}"}

def extract_text_with_paddleocr(image, params):
    ocr = PaddleOCR(use_angle_cls=params["use_angle_cls"], lang='en')
    image = np.array(image)
    result = ocr.ocr(image, cls=params["use_angle_cls"])
    extracted_text = [line[1][0] for line in result[0]]
    if not extracted_text:
        return "No text detected in the image."
    return extracted_text

def process_image_with_cohere(image, prompting_type, custom_keywords=None, document_type="Invoice", params=None):
    try:
        extracted_data = extract_text_with_paddleocr(image, params)
        if isinstance(extracted_data, str) and extracted_data.startswith("Error"):
            return {"error": f"Error during text extraction: {extracted_data}"}

        extracted_text = " ".join(extracted_data)

        result = search_with_cohere(extracted_text, prompting_type, custom_keywords, document_type)
        if 'error' in result:
            return result  # Return the error dictionary directly

        # Ensure the result is a string before splitting
        if isinstance(result, str):
            return f"Document Type: {document_type}\n\nExtracted Text:\n{extracted_text}\n\nSearch Result:\n{result}"
        else:
            return {"error": "Unexpected result format from search_with_cohere"}

    except Exception as e:
        return {"error": f"Error: {e}"}

def convert_df_to_text(df):
    return "\n".join(df['Text'].tolist())

def extract_value(prompt, result):
    keyword = prompt.lower()  # Convert keyword to lowercase for case-insensitive comparison

    # Iterate through each line in the OCR result
    for line in result:
        if len(line) > 1 and len(line[1]) > 0:  # Ensure the line contains text
            text = line[1][0]  # Extract the recognized text from the OCR result
            print("Detected Text Line:", text)  # Log the detected text for debugging

            # Split the text into words and store them in a list
            words = text.split()  # Convert the text into a list of words

            # Iterate over the words list to find the keyword
            for i, word in enumerate(words):
                if keyword in word.lower():  # Check for the keyword in the word (case-insensitive)
                    # If the keyword is found, check the next word as the value
                    if i + 1 < len(words):
                        return f"The value for '{keyword}' is: {words[i + 1]}"

    return f"'{keyword}' not found in the image."

# --------------------------- Streamlit App ---------------------------

# Streamlit UI
st.title("Integrating whole pipeline")
# Option to upload or fetch from Firebase
option = st.radio(
    "Choose an option:",
    ("Upload an Image", "Fetch Random Images from Firebase")
)

# Initialize a list to store the fetched images
uploaded_files = []
firebase_images = []

if option == "Fetch Random Images from Firebase":
    option1 = st.radio(
        "Choose the collection name for fetching images :",
        ("balance_sheets", "bank_statements", "payslips")
    )

    # Taking a number input from the user
    num_images = st.number_input(
        label="Enter the number of images to be fetched from Firebase",
        min_value=0,           # Minimum value
        max_value=100,         # Maximum value
        value=10,              # Default value
        step=1,                # Step size for increments
        format="%d"            # Number format ("%d" for integers, "%.2f" for floats)
    )
    collection_name = option1
    if not collection_name:
        st.warning("Please enter a Firestore collection name.")
        st.stop()

    if st.button("Fetch Images"):
        image_urls = get_image_urls_from_firestore(collection_name, num_images)
        if not image_urls:
            st.error("No images found in the specified collection.")
        else:
            st.success(f"Fetched {len(image_urls)} images.")
            for i, image_url in enumerate(image_urls):
                st.write(f"Image {i + 1}")
                st.image(image_url, caption=f"Fetched Image {i + 1} from Firebase")

                # Fetch the image from the URL
                response = requests.get(image_url)

                if response.status_code != 200:
                    st.error(f"Failed to fetch image from {image_url}. HTTP Status: {response.status_code}")
                    continue

                # Check Content-Type
                if "image" not in response.headers.get("Content-Type", ""):
                    st.error(f"URL {image_url} does not point to a valid image.")
                    continue

                # Attempt to load the image
                try:
                    image = Image.open(BytesIO(response.content))  # Load image from URL
                except UnidentifiedImageError:
                    st.error(f"Could not identify the image from {image_url}. Skipping...")
                    continue

                # Convert the image data to a BytesIO object
                img_byte_arr = BytesIO(response.content)
                img_byte_arr.seek(0)

                # Simulate an uploaded file
                uploaded_file = {
                    "name": f"image_{i + 1}.png",
                    "type": "image/png",
                    "data": img_byte_arr
                }

                # Append the simulated uploaded file to the list
                firebase_images.append(uploaded_file)

            # Now uploaded_files holds all the images as simulated uploaded file objects
            st.write(f"Total images stored as simulated UploadedFile objects: {len(firebase_images)}")

elif option == "Upload an Image":
    uploaded_files = st.file_uploader("📂 Upload multiple images or PDFs", type=["jpg", "jpeg", "png", "pdf"], accept_multiple_files=True)
    # Check file type based on the uploaded file's name or MIME type
    if uploaded_files:
        for uploaded_file in uploaded_files:
            try:
                # Determine file type
                file_name = uploaded_file.name
                if file_name.endswith(".pdf"):
                    images = convert_from_bytes(uploaded_file.read())
                    image = images[0]  # Use the first page for OCR
                elif file_name.endswith((".jpg", ".jpeg", ".png")):
                    image = Image.open(uploaded_file)  # Load uploaded image
                else:
                    st.error(f"Unsupported file type for {file_name}. Please upload PDFs or images.")
                    continue

                # Display the image or the first page of the PDF
                st.image(image, caption=f"🖼️ Uploaded Image: {file_name}", use_column_width=True)

            except Exception as e:
                st.error(f"❌ Error with {file_name}: {e}")

if firebase_images:
    uploaded_files = firebase_images

if uploaded_files:
    st.sidebar.header("Global Preprocessing Defaults")
    contrast = st.sidebar.slider("Default Contrast Enhancement", 1.0, 5.0, 2.0, 0.1)
    sharpen = st.sidebar.slider("Default Sharpen Filter", 0, 5, 1, 1)
    median_filter = st.sidebar.slider("Default Median Filter Size", 1, 5, 1, 1)

    # Add sliders for hyperparameters
    st.sidebar.subheader("PaddleOCR Hyperparameters")
    paddle_conf_threshold = st.sidebar.slider("Confidence Threshold", 0, 100, 70)
    use_angle_cls = st.sidebar.checkbox("Use Angle Classification", True)
    ocr_results = []

    st.write("Performing OCR with PaddleOCR")

    for idx, uploaded_file in enumerate(uploaded_files):
        try:
            # Check if the file is from Firebase (dict) or uploaded (UploadedFile)
            with st.expander(f"Set Parameters for Image {idx + 1}"):
                if isinstance(uploaded_file, dict):  # Simulated Firebase file
                    file_name = uploaded_file["name"]
                    img_byte_arr = uploaded_file["data"]
                    image = Image.open(img_byte_arr)  # Load the image from BytesIO
                else:  # UploadedFile from Streamlit
                    file_name = uploaded_file.name
                    if uploaded_file.type == "application/pdf":
                        images = convert_from_bytes(uploaded_file.read())
                        image = images[0]  # Use the first page for OCR
                    elif uploaded_file.type in ["image/jpeg", "image/png", "image/jpg"]:
                        image = Image.open(uploaded_file)  # Load uploaded image
                    else:
                        st.error(f"Unsupported file type: {uploaded_file.type}. Please upload PDFs or images.")
                        continue

                # Display the uploaded image
                st.image(image, caption=f"Uploaded Image {idx + 1}", use_column_width=True)

                # Preprocess the image
                preprocessed_image = preprocess_image(image, contrast, sharpen, median_filter)

                np_image = np.array(preprocessed_image)
                params = {
                    "min_size": st.sidebar.slider("Min Size", 1, 20, 10, 1, key=f"min_size_{idx}"),
                    "text_threshold": st.sidebar.slider("Text Threshold", 0.1, 1.0, 0.7, 0.05, key=f"text_threshold_{idx}"),
                    "low_text": st.sidebar.slider("Low Text Threshold", 0.1, 1.0, 0.4, 0.05, key=f"low_text_{idx}"),
                    "link_threshold": st.sidebar.slider("Link Threshold", 0.1, 1.0, 0.4, 0.1, key=f"link_threshold_{idx}"),
                    "canvas_size": st.sidebar.slider("Canvas Size", 2000, 5000, 2560, 10, key=f"canvas_size_{idx}"),
                    "mag_ratio": st.sidebar.slider("Magnitude Ratio", 0.1, 50.0, 1.0, 5.0, key=f"mag_ratio_{idx}"),
                    "slope_ths": st.sidebar.slider("Slope Threshold", 0.01, 1.0, 0.1, 0.01, key=f"slope_ths_{idx}"),
                    "ycenter_ths": st.sidebar.slider("Y Center Threshold", 0.1, 1.0, 0.5, 0.1, key=f"ycenter_ths_{idx}"),
                    "height_ths": st.sidebar.slider("Height Threshold", 0.1, 1.0, 0.5, 0.1, key=f"height_ths_{idx}"),
                    "width_ths": st.sidebar.slider("Width Threshold", 0.1, 1.0, 0.5, 0.1, key=f"width_ths_{idx}"),
                    "use_angle_cls": use_angle_cls,
                }

                # Detect with PaddleOCR
                paddle_image_with_boxes, paddle_boxes = detect_with_ppocr(preprocessed_image, params)
                paddle_texts = [box[2] for box in paddle_boxes]
                paddle_confidences = [box[3] for box in paddle_boxes]

                # Display the OCR result image with bounding boxes
                st.image(paddle_image_with_boxes, caption=f'PaddleOCR - Image {idx + 1} with Bounding Boxes', use_column_width=True)

                # Display extracted texts and confidence scores
                st.subheader(f"PaddleOCR Results for Image {idx + 1}")
                paddle_df = pd.DataFrame({'Text': paddle_texts, 'Confidence': paddle_confidences})
                st.dataframe(paddle_df)

                # Add the OCR results to the list
                ocr_results.append({'image_idx': idx + 1, 'texts': paddle_texts, 'confidence': paddle_confidences})

                prompting_type = st.radio(
                    f"Type of Prompting for Image {idx + 1}",
                    options=["Manual", "Prompts keywords"],
                    index=0,
                    key=f"radio_{idx + 1}"
                )

                if prompting_type == "Manual":
                    custom_keywords = st.text_input(
                        f"Enter Custom Keywords for Image {idx + 1} (comma-separated)",
                        placeholder="e.g., Amount, Date, Balance",
                        key=f"text_input_{idx + 1}"
                    )
                else:
                    custom_keywords = ""

                document_type = st.radio(
                    f"Type of Financial Document for Image {idx + 1}",
                    options=["Payslips", "Bank Statement", "Balance Sheet", "Other"],
                    index=0,
                    key=f"doc_type_{idx + 1}"
                )

                results, keyword_value_plots, excel_file = process_multiple_images(
                    [uploaded_file], prompting_type, custom_keywords, document_type
                )

                st.text("Extracted Text:")
                st.write(results)
                st.subheader("Keyword Value Plots:")
                if keyword_value_plots:
                    for bar_plot, pie_plot, file_name in keyword_value_plots:  # Unpack the tuple
                        if bar_plot:
                            st.image(bar_plot, caption=f"Bar Graph - {file_name}", use_container_width=True)
                        else:
                            st.warning(f"No bar plot available for {file_name}")

                        if pie_plot:
                            st.image(pie_plot, caption=f"Pie Chart - {file_name}", use_container_width=True)
                        else:
                            st.warning(f"No pie chart available for {file_name}")
                else:
                    st.warning("No plots were generated.")

                # Provide download link for Excel file
                st.text("Download Extracted Results:")
                st.download_button(
                    label="Download Excel",
                    data=open(excel_file, 'rb').read(),
                    file_name="extracted_results.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    key=f"button_{idx + 1}"
                )

        except Exception as e:
            st.error(f"❌ Error processing file {file_name}: {e}")

    # Optionally: Provide a download option for the OCR results
    if ocr_results:
        # Convert the OCR results into text
        ocr_text = "\n\n".join([f"Image {result['image_idx']}:\n" + "\n".join(result['texts']) for result in ocr_results])
        st.download_button(
            label="Download OCR Texts",
            data=ocr_text,
            file_name="paddleocr_extracted_texts.txt",
            mime="text/plain"
        )
    # Perform EDA on the extracted results
    st.header("Perform Exploratory Data Analysis (EDA)")
    if 'excel_file' in locals() and excel_file:
        df = pd.read_excel(excel_file)
        exploratory_data_analysis(df)

Overwriting app.py


In [None]:
!pkill ngrok

from pyngrok import ngrok

!ngrok authtoken 2pL239b5flKDjRaU5JJEIh4R9vl_6FVYELEE3XsoePDxjJBEF
!streamlit run app.py &>/dev/null&
url = ngrok.connect(8501, "http")

print(f"Access your app here: {url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Access your app here: NgrokTunnel: "https://ea34-34-82-156-202.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!git clone https://github.com/pranavrockz/OCR-Of-Bank-Statements-.git

Cloning into 'OCR-Of-Bank-Statements-'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 1), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 32.18 KiB | 784.00 KiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
!wget "https://colab.research.google.com/drive/1cZl2t7JSe59FJ1jH6T3VPeJOJu0o8PR5/export?format=ipynb" -O Pipeline.ipynb



--2025-01-06 04:24:34--  https://colab.research.google.com/drive/1cZl2t7JSe59FJ1jH6T3VPeJOJu0o8PR5/export?format=ipynb
Resolving colab.research.google.com (colab.research.google.com)... 216.239.36.180, 216.239.34.180, 216.239.38.180, ...
Connecting to colab.research.google.com (colab.research.google.com)|216.239.36.180|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘Pipeline.ipynb’

Pipeline.ipynb          [<=>                 ]       0  --.-KB/s               Pipeline.ipynb          [ <=>                ]  88.65K  --.-KB/s    in 0.005s  

2025-01-06 04:24:35 (18.0 MB/s) - ‘Pipeline.ipynb’ saved [90782]



In [None]:
!cp /content/Pipeline.ipynb /content/OCR-Of-Bank-Statements-/

In [None]:
%cd /content/OCR-Of-Bank-Statements-/

/content/OCR-Of-Bank-Statements-


In [16]:

!git remote set-url origin https://pranavrockz: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@github.com/pranavrockz/OCR-Of-Bank-Statements-.git


In [17]:
!git remote -v


origin	https://pranavrockz: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@github.com/pranavrockz/OCR-Of-Bank-Statements-.git (fetch)
origin	https://pranavrockz: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@github.com/pranavrockz/OCR-Of-Bank-Statements-.git (push)


In [20]:

!git pull origin main --rebase
!git push origin main

From https://github.com/pranavrockz/OCR-Of-Bank-Statements-
 * branch            main       -> FETCH_HEAD
Current branch main is up to date.
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 28.96 KiB | 4.83 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/pranavrockz/OCR-Of-Bank-Statements-.git
   f78e7f1..f421b59  main -> main


In [21]:
!pip install nbformat
!python -m nbformat validate /content/Pipeline.ipynb


/usr/bin/python3: No module named nbformat.__main__; 'nbformat' is a package and cannot be directly executed


In [23]:
!jupyter nbconvert --to notebook --inplace /content/OCR-Of-Bank-Statements-/Pipeline.ipynb


[NbConvertApp] Converting notebook /content/OCR-Of-Bank-Statements-/Pipeline.ipynb to notebook
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/nbformat/reader.py", line 19, in parse_json
    nb_dict = json.loads(s, **kwargs)
  File "/usr/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/bin/jupyter-nbconvert", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/jupyter_core/application.py", line 283, in launch_instance
    supe

In [None]:
!git config --global user.email "pbhatnagar07@gmail.com"
!git config --global user.name "pranavrockz"


In [None]:
!git config --global --list


user.email=pbhatnagar07@gmail.com
user.name=pranavrockz
