# Import Required Libraries
Import necessary libraries such as OpenCV, pytesseract, and pandas.

In [None]:
# Import necessary libraries
import os  # used to check whether the Tesseract binary exists

import cv2  # OpenCV for image processing
import pytesseract  # pytesseract for OCR
import pandas as pd  # pandas for data manipulation

# Ensure pytesseract is configured correctly.
# Only override the executable path when the default Windows install location
# actually exists on this machine. Otherwise leave pytesseract's own setting
# untouched so the `tesseract` binary is still resolved through PATH
# (Linux, macOS, or a non-default Windows install).
default_windows_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(default_windows_tesseract):
    pytesseract.pytesseract.tesseract_cmd = default_windows_tesseract

# Load and Display Image
Load the image using OpenCV and display it to verify the correct image is loaded.

In [None]:
# Load the image using OpenCV
image_path = 'mongodb_image.jpg'

# cv2.imread reports failure (missing file, unreadable or unsupported data)
# by returning None instead of raising an exception.
image = cv2.imread(image_path)

# Check if the image was loaded successfully.
# Abort here rather than just printing: the preprocessing cell below uses
# `image` unconditionally and would otherwise fail with an opaque OpenCV error.
if image is None:
    raise FileNotFoundError(
        f"⚠️ Error: Image '{image_path}' not found or unable to load!"
    )

# Display the image to verify the correct image is loaded
cv2.imshow('Loaded Image', image)
cv2.waitKey(0)  # wait for a key press before closing the window
cv2.destroyAllWindows()

# Preprocess Image for OCR
Convert the image to grayscale, blur it to reduce noise, and apply adaptive thresholding to improve OCR accuracy.

In [None]:
# Step 1: collapse the BGR image loaded above into a single grayscale channel
gray_image = cv2.cvtColor(image, code=cv2.COLOR_BGR2GRAY)

# Step 2: smooth with a 5x5 Gaussian kernel so noise does not survive
# thresholding (sigmaX=0 leaves the sigma choice to OpenCV)
blurred_image = cv2.GaussianBlur(gray_image, ksize=(5, 5), sigmaX=0)

# Step 3: binarise with a Gaussian adaptive threshold; pixels become 0 or 255
threshold_image = cv2.adaptiveThreshold(
    blurred_image,
    maxValue=255,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)

# Show the binary result for a visual check; the window stays open until a
# key is pressed, then every OpenCV window is closed
cv2.imshow('Preprocessed Image', threshold_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Perform OCR to Extract Text
Use pytesseract to perform OCR on the preprocessed image and extract text.

In [None]:
# Run Tesseract on the binarised image; the result is one text string
extracted_text = pytesseract.image_to_string(image=threshold_image)

# Break the text apart at newline characters (blank lines are kept here;
# they are filtered out later when the table is built)
lines = extracted_text.split('\n')

# One row per OCR line, in a single column named "Text"
df = pd.DataFrame(lines, columns=["Text"])

# Leave the DataFrame as the cell's last expression so the notebook renders it
df

# Convert Extracted Text to Table Format
Parse the extracted text and convert it into a structured table format using pandas.

In [None]:
# Turn the OCR lines into a table.
#
# Assumption: the text is laid out as rows whose columns are separated by
# spaces or tabs, so splitting on whitespace yields one cell per token.

# Tokenise every line on runs of whitespace, then drop lines that produced no
# tokens (empty or whitespace-only lines)
tokenised_lines = (line.split() for line in lines)
data = [tokens for tokens in tokenised_lines if tokens]

# Build the table; rows with fewer tokens than the widest row are padded by
# pandas with missing values
df_table = pd.DataFrame(data)

# Leave the DataFrame as the cell's last expression so the notebook renders it
df_table

# Save Table as CSV
Save the structured table as a CSV file using pandas.

In [None]:
# Write the parsed table to disk as CSV

# Output location, relative to the notebook's working directory
csv_file_path = 'mongodb_table.csv'

# Export without the row index so the file holds only the table columns
df_table.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print(f"\n✅ CSV file saved successfully at: {csv_file_path}")


✅ CSV file saved successfully at: mongodb_table.csv
