# <center> Email Parser Demo </center>
***

In [None]:
import os.path
import base64
import re
import datetime, time
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build


## <center> PART-I : EMAIL MONITORING </center>
***
### Set the scopes
***

In [None]:
# OAuth scopes requested for the Gmail API. authenticate_gmail() passes this
# list both when loading the cached token and when running the consent flow.
SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

### Step 1: Authenticate Gmail
***

In [None]:
def authenticate_gmail():
    """Return an authorised Gmail API (v1) service object.

    Credentials cached in ``token.json`` are reused while they are valid.
    If they are expired and carry a refresh token they are refreshed;
    otherwise the local-server OAuth flow is run from ``credentials.json``.
    Newly obtained or refreshed credentials are written back to
    ``token.json``.
    """
    token_file = 'token.json'

    creds = None
    if os.path.exists(token_file):
        creds = Credentials.from_authorized_user_file(token_file, SCOPES)

    # Fast path: the cached credentials can be used as they are.
    if creds and creds.valid:
        return build('gmail', 'v1', credentials=creds)

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = oauth_flow.run_local_server(port=0)

    # Persist the credentials so the next run can skip the consent flow.
    with open(token_file, 'w') as token:
        token.write(creds.to_json())

    return build('gmail', 'v1', credentials=creds)

### Validate File extension
***

In [None]:
def is_valid_extension(file_path):
    """Return True when *file_path* has a supported extension.

    The comparison is case-insensitive.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    supported = ('.txt', '.pdf', '.jpg', '.jpeg', '.png', '.eml')
    file_extension = os.path.splitext(file_path)[1]

    if file_extension.lower() not in supported:
        raise ValueError(f"Unsupported file type: {file_extension}")

    return True

### Step 2 and 3: Inbox Monitoring & Searching for Emails with attachments
***

In [None]:
def search_emails_with_attachments(service, user_id='me',
                                   query='in:inbox has:attachment newer_than:10d'):
    """Return the Gmail message stubs that match *query*.

    Args:
        service: Gmail API service object (as returned by ``build``).
        user_id: Mailbox to search; ``'me'`` is the authenticated user.
        query: Gmail search expression. The default selects inbox messages
            with attachments from the last 10 days. For a shorter polling
            interval pass a different expression (for example one using
            ``after:``/``before:`` bounds).

    Returns:
        A list of message dicts as returned by ``users.messages.list``.
        The list is empty when nothing matches, so callers can iterate the
        result unconditionally.
    """
    messages = []
    page_token = None

    # The list endpoint is paginated: keep requesting pages until the
    # response no longer carries a nextPageToken.
    while True:
        request_args = {'userId': user_id, 'q': query}
        if page_token:
            request_args['pageToken'] = page_token

        results = service.users().messages().list(**request_args).execute()
        messages.extend(results.get('messages', []))

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    if not messages:
        print("No messages found.")
    else:
        print(f"Found {len(messages)} messages with attachments.")
    return messages

### Step 4: Download Attachments
***

In [None]:
def _iter_message_parts(payload):
    """Yield every part below *payload*, descending into nested ``parts``."""
    for part in payload.get('parts') or []:
        yield part
        yield from _iter_message_parts(part)


def download_attachments(service, user_id, msg_id, store_dir):
    """Save the supported attachments of one Gmail message into *store_dir*.

    Args:
        service: Gmail API service object.
        user_id: Mailbox owner (``'me'`` for the authenticated user).
        msg_id: Id of the message whose attachments are downloaded.
        store_dir: Existing directory the files are written to.

    Attachments whose extension is rejected by ``is_valid_extension`` are
    skipped with a message instead of aborting the remaining downloads.
    """
    message = service.users().messages().get(userId=user_id, id=msg_id).execute()
    payload = message.get('payload') or {}

    for part in _iter_message_parts(payload):
        filename = part.get('filename')
        if not filename:
            continue

        # The filename comes from the sender: keep only its last path
        # component so it cannot escape store_dir (e.g. "../../x.pdf").
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if not safe_name:
            continue

        try:
            is_valid_extension(safe_name)
        except ValueError as e:
            print(f"Skipping attachment {safe_name}: {e}")
            continue

        body = part.get('body') or {}
        if 'data' in body:
            # Small attachments are inlined in the message body.
            data = body['data']
        else:
            # Larger ones must be fetched separately by attachment id.
            att_id = body.get('attachmentId')
            if not att_id:
                continue
            att = service.users().messages().attachments().get(
                userId=user_id, messageId=msg_id, id=att_id).execute()
            data = att['data']

        file_data = base64.urlsafe_b64decode(data.encode('UTF-8'))
        path = os.path.join(store_dir, safe_name)
        with open(path, 'wb') as f:
            f.write(file_data)
        print(f"Attachment {safe_name} downloaded.")

In [None]:
# Connect to Gmail and look up the messages that carry attachments.
service = authenticate_gmail()
messages = search_emails_with_attachments(service)

# Make sure the download folder is there before anything is written to it.
store_dir = 'email_downloads'
if not os.path.exists(store_dir):
    os.makedirs(store_dir)

# Nothing is downloaded when the search produced no messages.
for msg in messages or []:
    download_attachments(service, 'me', msg['id'], store_dir)

## <center> PART-II : ATTACHMENT CLASSIFICATION </center>
***
### Step 5: Extract Attachment Content
***

In [None]:
import os
import PyPDF2
import docx
import mammoth
import easyocr
import email
import pdfplumber
import pymupdf
from openai import AzureOpenAI
from dotenv import load_dotenv

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

### For PDF files
***

In [None]:
def read_pdf_pypdf2(file_path):
    """Extract the text of every page of a PDF with PyPDF2.

    Returns the concatenated page text, or an empty string (after printing
    the error) when the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            extracted = ""
            for pdf_page in pdf_reader.pages:
                extracted += pdf_page.extract_text()
            return extracted

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def read_pdf_pdfPlumber(file_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Each page's text is followed by a newline. A page without extractable
    text contributes an empty line instead of failing the whole document.

    Returns:
        The extracted text, or ``""`` when no page yielded any text (for
        example a scanned PDF) or the file could not be read. Returning
        ``""`` in the no-text case lets callers fall back to OCR.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may give None for a page with no text layer.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""

    if not any(text.strip() for text in page_texts):
        return ""
    return "".join(text + "\n" for text in page_texts)

### For Image files
***

In [None]:
def read_img_easyocr(file_path):
    """OCR an image with EasyOCR (English) and return the text found.

    The element at index 1 of every detection is joined with single
    spaces. On any failure the error is printed and ``""`` is returned.
    """
    try:
        ocr_engine = easyocr.Reader(['en'], verbose=False)
        detections = ocr_engine.readtext(file_path)
        return ' '.join(detection[1] for detection in detections)

    except Exception as e:
        print(f"Error reading Image: {e}")
        return ""

def read_img_azure_ocr(file_path):
    """OCR an image with the Azure Computer Vision printed-text API.

    The key and endpoint are read from the ``VISION_KEY`` and
    ``VISION_ENDPOINT`` environment variables (after loading ``.env``).
    Words of a line are joined with spaces and every line is followed by a
    single space. On any failure the error is printed and ``""`` is
    returned.
    """
    try:
        load_dotenv()
        subscription_key = os.environ["VISION_KEY"]
        endpoint = os.environ["VISION_ENDPOINT"]
        vision_client = ComputerVisionClient(
            endpoint, CognitiveServicesCredentials(subscription_key))

        with open(file_path, "rb") as image_stream:
            ocr_result = vision_client.recognize_printed_text_in_stream(image_stream)

        # Flatten regions -> lines, then rebuild each line from its words.
        all_lines = (line for region in ocr_result.regions for line in region.lines)
        return ''.join(
            " ".join(word.text for word in line.words) + ' '
            for line in all_lines
        )

    except Exception as e:
        print(f"Error reading Image: {e}")
        return ""

### For EML files
***

In [None]:
def read_eml(file_path, output_dir='attachments/Positive Examples/eml_downloads/'):
    """Extract the attachments of an ``.eml`` file and read their content.

    Every part that has a ``Content-Disposition`` header and a filename is
    written to *output_dir* (created on demand) and passed to
    ``read_document``.

    Args:
        file_path: Path of the ``.eml`` file.
        output_dir: Directory the extracted attachments are saved in.

    Returns:
        A dict mapping each attachment's filename to the content
        ``read_document`` produced for it. Attachments of an unsupported
        type are skipped. If the message itself cannot be processed the
        error is printed and ``""`` is returned.
    """
    try:
        attachments_content = dict()

        # Parse from bytes: the message declares its own encodings, so
        # decoding the whole file with the platform default can fail.
        with open(file_path, 'rb') as file:
            msg = email.message_from_binary_file(file)

        for part in msg.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            if part.get('Content-Disposition') is None:
                continue

            file_name = part.get_filename()
            if not file_name:
                continue

            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            # The name is sender-controlled: keep only its last component
            # so it cannot point outside output_dir.
            safe_name = os.path.basename(file_name.replace('\\', '/'))
            if not safe_name:
                continue

            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            filepath = os.path.join(output_dir, safe_name)
            with open(filepath, 'wb') as f:
                f.write(payload)

            try:
                content = read_document(filepath)
            except ValueError as e:
                # One unsupported attachment must not discard the others.
                print(f"Skipping attachment {file_name}: {e}")
                continue
            attachments_content[file_name] = content

        return attachments_content

    except Exception as e:
        print(f"Error reading EML: {e}")
        return ""

### For scanned PDF files
***

In [None]:
def read_pdf_as_image_easyocr(file_path, image_path='attachments/Positive Examples/images/sample.png'):
    """OCR a (scanned) PDF page by page with EasyOCR.

    Each page is rendered to *image_path* (overwritten for every page) and
    the rendered image is run through EasyOCR.

    Args:
        file_path: Path of the PDF.
        image_path: Scratch file the rendered page image is written to.

    Returns:
        The recognised text of all pages joined with single spaces. ``""``
        is returned for a PDF without pages or when any step fails (the
        error is printed).
    """
    try:
        page_texts = []
        with pymupdf.open(file_path) as doc:
            # Build the OCR reader once; it does not depend on the page.
            reader = easyocr.Reader(['en'], verbose=False)
            for page in doc:
                page.get_pixmap().save(image_path)
                result = reader.readtext(image_path)
                page_texts.append(' '.join(text[1] for text in result))
        return ' '.join(page_texts)

    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""
    
def read_pdf_as_image_azureocr(file_path, image_path='attachments/Positive Examples/images/sample.png'):
    """OCR a (scanned) PDF page by page with ``read_img_azure_ocr``.

    Each page is rendered to *image_path*, which is overwritten for every
    page, and the per-page results are concatenated. On any failure the
    error is printed and ``""`` is returned.
    """
    print("Reading PDF as image using Azure OCR...")
    try:
        page_texts = []
        pdf = pymupdf.open(file_path)
        for pdf_page in pdf:
            rendered = pdf_page.get_pixmap()
            rendered.save(image_path)
            page_texts.append(read_img_azure_ocr(image_path))
        return ''.join(page_texts)

    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""

In [None]:
def read_document(file_path, flag = 0):
    """Return the textual content of a document, chosen by file extension.

    Args:
        file_path: Path of the file to read.
        flag: Selects the backend. ``0`` uses pdfplumber for PDFs and Azure
            OCR for images; any other value uses PyPDF2 and EasyOCR.

    Returns:
        The extracted text. For ``.eml`` files this is whatever
        ``read_eml`` returns.

    Raises:
        ValueError: if the extension is not supported.
    """
    _, file_extension = os.path.splitext(file_path)
    extension = file_extension.lower()

    if extension == '.pdf':
        content = read_pdf_pdfPlumber(file_path) if flag == 0 else read_pdf_pypdf2(file_path)
        # A scanned PDF can yield only whitespace (e.g. one newline per
        # page), so treat blank text like empty text and fall back to OCR.
        if not content or not content.strip():
            content = read_pdf_as_image_azureocr(file_path)
        return content

    if extension in ('.jpg', '.jpeg', '.png'):
        return (read_img_azure_ocr(file_path) if flag == 0 else read_img_easyocr(file_path))

    if extension == '.eml':
        return read_eml(file_path)

    if extension == '.txt':
        # is_valid_extension() accepts .txt attachments, so they must be
        # readable here too; undecodable bytes are replaced, not fatal.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()

    raise ValueError(f"Unsupported file type: {file_extension}")

### Step 6: Invoice Classifier
***

In [None]:
# Read the Azure OpenAI connection settings from the environment
# (load_dotenv() also picks up values from a local .env file).
load_dotenv()

GPT_KEY, GPT_ENDPOINT, GPT_VERSION, GPT_DEPLOYMENT_NAME = map(
    os.getenv,
    ('GPT_KEY', 'GPT_ENDPOINT', 'GPT_VERSION', 'GPT_DEPLOYMENT_NAME'),
)

# Shared client used by invoice_classifier().
client = AzureOpenAI(api_key=GPT_KEY, api_version=GPT_VERSION, azure_endpoint=GPT_ENDPOINT)

In [None]:
def invoice_classifier(content):
    """Ask the Azure OpenAI deployment whether *content* looks like an invoice.

    A system prompt listing invoice-related keywords and a user prompt
    embedding *content* are sent to the module-level ``client``; the text
    of the first completion choice is returned.
    """
    system_prompt = ''' You are an helpful AI assistant. You help users to decide whether the content provided by them represents an invoice or not. You can judge the content based on certain invoice-related keywords. A few of them include:
                                    - Invoice Number
                                    - Invoice Date
                                    - Statement
                                    - Total Due
                                    - Due Date
                                    - Bill To
                                    - Sold to
                                    - Ship To
                                    - Customer
                                    - Order Number
                                    - Purchase Order (PO) Number
                                    - Terms
                                    - Line Item
                                    - Quantity
                                    - Unit Price
                                    - Total
                                    - Subtotal
                                    - Tax
                                    - Discount
                                    - Shipping Cost
                                    - Balance Due
                                    - Amount Paid
                                    - Remit To
                                    - Description
                                    - Net Price
                                    - Contact Information
                                    If you find a **good amount of keywords in the given content**, you can **return your verdict as Yes, else No**. Give your verdict in a simple Yes/No format.
                                    '''

    query = f'''Does the below content indicate an Invoice statement ?
            {content}
            // Yes or No.'''

    def _text_message(role, text):
        # Build one chat message holding a single text content item.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    response = client.chat.completions.create(
        model=GPT_DEPLOYMENT_NAME,
        messages=[
            _text_message("system", system_prompt),
            _text_message("user", query),
        ],
        max_tokens=100,
        temperature=0,
        top_p=0.95,
        seed=101,
    )

    return response.choices[0].message.content

In [None]:
# Classify every downloaded attachment as invoice / not invoice.
for file in os.listdir(store_dir):
    file_path = os.path.join(store_dir, file)
    try:
        content = read_document(file_path)
    except ValueError as e:
        # read_document() raises ValueError for file types it cannot parse;
        # skip that entry rather than aborting the remaining files.
        print(f"Skipping {file}: {e}")
        continue
    isInvoice = invoice_classifier(content)
    print(f"The attachment {file} is a Invoice ?: {isInvoice}")