<a href="https://colab.research.google.com/github/rahulpandey89/Age_and_Gender_Prediction/blob/master/OCR_extractor_using_phi_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM, AutoProcessor

class Phi3VisionModel:
    """
    Wrapper that loads a Phi-3 vision checkpoint and answers a text prompt about one image.

    The model and its processor are loaded once, in ``__init__``; ``predict`` can then be
    called repeatedly with different images and prompts.
    """

    # Decoding settings passed to ``generate``. Decoding is greedy (``do_sample=False``),
    # so no ``temperature`` is set: it would be ignored and only triggers a warning.
    GENERATION_ARGS = {
        "max_new_tokens": 500,  # Maximum number of tokens to generate
        "do_sample": False,     # Disable sampling for deterministic output
    }

    # Seconds to wait for a remote image before giving up.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, model_id="microsoft/Phi-3-vision-128k-instruct", device="cuda"):
        """
        Initialize the Phi3VisionModel with the specified model ID and device.

        Args:
            model_id (str): The identifier of the pre-trained model from Hugging Face's model hub.
            device (str): The device to load the model on ("cuda" for GPU or "cpu").
        """
        self.model_id = model_id
        self.device = device
        self.model = self.load_model()  # Load the model during initialization
        self.processor = self.load_processor()  # Load the processor during initialization

    def _attention_implementation(self):
        """
        Pick the attention backend that matches ``self.device``.

        Returns:
            str: "flash_attention_2" for CUDA devices (FlashAttention is GPU-only),
            "eager" for everything else.
        """
        on_cuda = str(self.device).lower().startswith("cuda")
        return "flash_attention_2" if on_cuda else "eager"

    def load_model(self):
        """
        Load the pre-trained language model with causal language modeling capabilities.

        Returns:
            model (AutoModelForCausalLM): The loaded model, placed on ``self.device``.
        """
        print("Loading model...")
        # Place the weights directly on the requested device through ``device_map``.
        # The previous ``device_map="auto"`` followed by ``.to(device)`` placed the model
        # twice, and moving a model that accelerate has dispatched/offloaded is rejected.
        return AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map=self.device,
            torch_dtype="auto",  # Use the data type stored in the checkpoint config
            # NOTE(security): this executes Python code shipped in the model repository.
            trust_remote_code=True,
            _attn_implementation=self._attention_implementation(),
        )

    def load_processor(self):
        """
        Load the processor associated with the model for processing inputs and outputs.

        Returns:
            processor (AutoProcessor): The loaded processor for handling text and images.
        """
        print("Loading processor...")
        # trust_remote_code=True is needed because the processor class lives in the model repo
        return AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True)

    @staticmethod
    def _is_remote_source(image_source):
        """Return True when ``image_source`` is an http(s) URL rather than a local path."""
        return str(image_source).lower().startswith(("http://", "https://"))

    @staticmethod
    def _build_prompt(prompt):
        """Wrap ``prompt`` in the single-image chat template used by ``predict``."""
        return f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n"

    @classmethod
    def _load_image(cls, image_source):
        """
        Open an image from an http(s) URL or from a local file path.

        Args:
            image_source (str): URL or filesystem path of the image.

        Returns:
            PIL.Image.Image: The image with its pixel data fully loaded.

        Raises:
            requests.HTTPError: If a remote image cannot be downloaded.
        """
        if cls._is_remote_source(image_source):
            response = requests.get(str(image_source), stream=True, timeout=cls.DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            # Let urllib3 undo any Content-Encoding so PIL receives the raw image bytes.
            response.raw.decode_content = True
            image = Image.open(response.raw)
        else:
            image = Image.open(image_source)
        # PIL opens lazily; read the pixels now, while the underlying stream is still open.
        image.load()
        return image

    def predict(self, image_url, prompt):
        """
        Perform a prediction using the model given an image and a prompt.

        Args:
            image_url (str): An http(s) URL or a local file path of the image to be processed.
            prompt (str): The textual prompt that guides the model's generation.

        Returns:
            response (str): The generated response from the model.
        """
        image = self._load_image(image_url)

        # Format the input prompt template for the model
        prompt_template = self._build_prompt(prompt)

        # Process the inputs, converting the prompt and image into tensor format
        inputs = self.processor(prompt_template, [image], return_tensors="pt").to(self.device)

        print("Generating response...")
        output_ids = self.model.generate(**inputs, **self.GENERATION_ARGS)

        # generate() returns prompt + completion; keep only the newly generated tokens
        prompt_length = inputs['input_ids'].shape[1]
        output_ids = output_ids[:, prompt_length:]

        # Decode the generated output tokens to obtain the response text
        response = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]
        return response

# Initialize the model (loads the checkpoint and the processor)
phi_model = Phi3VisionModel()

# Example prediction
# Local path of the image to run OCR on.
image_url = "/content/bank.jpeg"
prompt = "you are a bank information extractor .Provide OCR for all the text in given image in markdown format."  # Prompt for model guidance
response = phi_model.predict(image_url, prompt)  # Get the response from the model

print("Response:", response)  # Print the generated response

Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading processor...
Generating response...




Response: Certainly! Below is the markdown format of the table from the image:

```markdown
| Customer Number | 23785-54-9674458 |
|-----------------|------------------|
| Branch Name     | <Branch Name>    |
| Statement Date  | mm/dd/yyyy       |
| Payment Due Date| mm/dd/yyyy       |
| Credit Limit    | 390,000.00       |
| Total Amount Due| 3,898.57         |

## SUMMARY

| Card Type | Previous Balance | Payment / Credits and Rebates | Purchases and Advances | Installment Due | Finance Charges and Other Fees | Late Payment Charges | Amount Due |
|-----------|------------------|-------------------------------|------------------------|-----------------|--------------------------------|----------------------|------------|
| Visa Gold | 7,126.14         | 7,126.14                      | 3,898.57               | 0.00            | 0.00                           | 0.00                 | 3,898.57   |

## TRANSACTION

| Date       | Description                  | Amount |
|------------|-----

In [2]:
 # %pip installs into the running kernel's environment; the version is pinned for reproducible runs.
 %pip install flash_attn==2.7.0.post2

Collecting flash_attn
  Downloading flash_attn-2.7.0.post2.tar.gz (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.7.0.post2-cp310-cp310-linux_x86_64.whl size=183291101 sha256=16a849d51b95cf8e47a6e6cd36826e9ffbbc068a8546e7e3501a598bd70905a6
  Stored in directory: /root/.cache/pip/wheels/bf/e3/ed/5e845387d52f2debd1bafb847bf3d774d3f0a3c8e31b1dc948
Successfully built flash_attn
Installing collected packages: flash_attn
Successfully installed flash_attn-2.7.0.post2


In [3]:
# Pin the source install to a fixed commit so a fresh runtime builds the same transformers version.
%pip install git+https://github.com/huggingface/transformers@19dabe96362803fb0a9ae7073d03533966598b17

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-1h8bfjr7
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-1h8bfjr7
  Resolved https://github.com/huggingface/transformers to commit 19dabe96362803fb0a9ae7073d03533966598b17
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.47.0.dev0-py3-none-any.whl size=10106394 sha256=dc976884f3b3fc3580f8ff09458ddf4f1684ee908f7b4e7f2bd7df89ad47e514
  Stored in directory: /tmp/pip-ephem-wheel-cache-q0m20pd7/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer