# Using AWS Textract to Fetch Data

In [None]:
#Detects text in a document stored in an S3 bucket. The image is enhanced with PIL before it is sent to Textract
import boto3
import io
from io import BytesIO
import sys
import os
import pandas as pd
from PIL import Image, ImageDraw, ImageFont,ImageOps
import matplotlib.pyplot as plt
%matplotlib inline
import re

os.environ['AWS_DEFAULT_REGION'] = 'us-east-2'

info=[]
def process_text_detection(bucket, document):
    """Fetch an image from S3, enhance it and return the text Textract detects.

    The image is colour-boosted, converted to grayscale, contrast-enhanced and
    sharpened, then re-encoded as PNG and sent to Textract as raw bytes.

    Args:
        bucket: Name of the S3 bucket that holds the document.
        document: Key of the image object inside ``bucket``.

    Returns:
        The ``Text`` of every non-PAGE block in the Textract response, joined
        with single spaces in the order the blocks were returned.
    """
    # ImageEnhance is not part of the file-level PIL import; import it here so
    # the enhancer calls below do not raise NameError.
    from PIL import ImageEnhance

    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()
    stream = io.BytesIO(s3_response['Body'].read())

    # Image processing
    image = Image.open(stream)
    # brightness enhancer applied with a factor of 1
    image_brightened = ImageEnhance.Brightness(image).enhance(1)
    # increasing color by factor of 2
    image_colored = ImageEnhance.Color(image_brightened).enhance(2)
    # changing image to grayscale
    im2 = ImageOps.grayscale(image_colored)
    # increasing the contrast of the image
    image_contrasted = ImageEnhance.Contrast(im2).enhance(1.3)
    # increasing the sharpness of the image
    image_sharped = ImageEnhance.Sharpness(image_contrasted).enhance(3)

    # Serialise the processed image to PNG bytes for the Textract request
    png_buffer = io.BytesIO()
    image_sharped.save(png_buffer, format='PNG')
    image_bytes = png_buffer.getvalue()

    # Detect text in the document, processing it from image bytes
    client = boto3.client('textract')
    response = client.detect_document_text(Document={'Bytes': image_bytes})

    # Keep the text of every block except the PAGE container; a block that
    # carries no 'Text' entry is skipped instead of raising KeyError.
    text = [
        block['Text']
        for block in response['Blocks']
        if block['BlockType'] != 'PAGE' and 'Text' in block
    ]
    return " ".join(text)

# Getting the list of files contained in the s3 bucket

In [1173]:
# Gather the key of every object stored in the 'drivinglicense' bucket.
listoffiles = []
s3_connection = boto3.resource('s3')
bucketl = s3_connection.Bucket('drivinglicense')
listoffiles.extend(s3_object.key for s3_object in bucketl.objects.all())
# newlist is a second name for the same list object, not a copy.
newlist = listoffiles

# Extracting the text of every file contained in the s3 bucket

In [1338]:
# Input to be given here:
#the first argument of the function process_text_detection is the name of the bucket in s3 (AWS)
#the second argument of the function process_text_detection is the name of the file

In [1337]:
# Run text detection on every key collected in listoffiles. Iterating the list
# itself (instead of a fixed count of 46) keeps this working when the bucket
# holds fewer or more files.
# input the s3 bucket name and name of files here
p = [process_text_detection('drivinglicense', key) for key in listoffiles]

In [None]:
# creating a dataframe of the extracted data from the Textract API
#column content contains all the data

In [1179]:
# One row per S3 object key: 'Image_name' holds the key and 'Content' holds the
# text returned by process_text_detection for that file.
df=pd.DataFrame({'Image_name':listoffiles,'Content':p})

# Getting the License plate number

In [1181]:
# using regular expression to fetch the Driving license:
# 'DL', 3-4 word characters, an optional whitespace, then 4-5 word characters.
# The DL prefix can be modified to fetch data of any other state.
license_pattern = re.compile(r"DL[\w]{3,4}[\s]{0,1}[\w]{4,5}")

drivinglicense = []
# Walk every row of df instead of a fixed count of 46, so the result always
# has one entry per row.
for content in df['Content']:
    matches = license_pattern.findall(content)
    if len(matches) > 1:
        # More than one hit: keep the second one.
        drivinglicense.append(matches[1])
    else:
        # Zero or one hit: keep the match list itself ([] or [match]).
        drivinglicense.append(matches)

In [1182]:
# Store the per-row results collected in drivinglicense as a new column of df.
df['license_plate_number']=drivinglicense

# Name from the text

In [1339]:
#Regular expressions are used to fetch the name data
#Patterns for the various output formats have been incorporated

In [1193]:
# Patterns are tried in this order; the first one that matches decides the
# value stored for the row. Each entry is (regex, fixed_value): when
# fixed_value is not None it is stored instead of the captured text.
name_patterns = [
    (r"MR[\W]{0,1}[\s]{1}([\w]{2,}[\s]{0,1}[\w]{2,})", None),
    (r"Owner's N[\w]{3,4}[\s]{1}([\w]{2,}[\W]{0,1}[\s]{0,1}[\w]{2,})", None),
    (r"Owner's Name[\s]{1}([\w]{2,})", None),
    (r"Dealer's N[\w]{3,4}[\s]{1}([\w]{2,}[\W]{0,1}[\s]{0,1}[\w]{2,})", "No Name Available"),
    (r"Name & Address ([\w]{2,}[\s]{1,}[\w]{2,})", None),
    (r"Name &Address ([\w]{2,})", None),
    (r"NAME ([\w]{2,}[\s]{1}[\w]{2,})", None),
    (r"NAME - ([\w]{2,}[\s]{1}[\w]{2,})", None),
    (r"NAME I ([\w]{2,}[\s]{1}[\w]{2,})", None),
    (r"N[\w]{3,4}[\s]{1}([\w]{2,})", None),
    (r"Name. ([\w]{2,}[\s]{1,}[\w]{2,})", None),
    (r"Name[\s]{1}[-]{0,1}[\s]{0,1}([\w]{2,}[\s]{1,}[\w]{2,})", None),
]

name = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    for pattern, fixed_value in name_patterns:
        matches = re.findall(pattern, content)
        if not matches:
            continue
        if fixed_value is not None:
            name.append(fixed_value)
        else:
            # Prefer the second hit when there is one; with a single hit use
            # it rather than failing with IndexError on index 1.
            name.append(matches[1] if len(matches) > 1 else matches[0])
        break
    else:
        # No pattern matched: store an empty list for this row.
        name.append([])
                                            



In [1196]:
# Store the per-row results collected in name as a new column of df.
df['Name_Owner']=name

# Fetching the registration number in the RCs

In [1198]:
# Registration number formats, tried in order: 'HR' + 3 word characters +
# optional whitespace + 4 word characters, then the dash-separated form.
registration_patterns = (
    r"HR[\w]{3}[\s]{0,1}[\w]{4}",
    r"HR[\w]{2}-{1,2}[\w]{1}-[\w]{4}",
)

registration_num = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    for pattern in registration_patterns:
        matches = re.findall(pattern, content)
        if matches:
            # Prefer the second hit when there is one; with a single hit use
            # it rather than failing with IndexError on index 1.
            registration_num.append(matches[1] if len(matches) > 1 else matches[0])
            break
    else:
        # Neither format matched: store an empty list for this row.
        registration_num.append([])

In [1199]:
# Store the per-row results collected in registration_num as a new column of df.
df['Registration_number']=registration_num

# VIN NUMBER OR CHASSIS NUMBER

In [None]:
#using regular expressions to fetch the chassis number based on the pattern in which it occurs

In [1201]:
# Chassis number formats, tried in order: 'M' followed by 15 word characters,
# then 6 digits followed by 9 word characters.
chasis_patterns = (
    r"M[\w]{15}",
    r"[\d]{6}[\w]{9}",
)

chasis_num = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    for pattern in chasis_patterns:
        matches = re.findall(pattern, content)
        if len(matches) > 1:
            # More than one hit: keep the second one.
            chasis_num.append(matches[1])
            break
    else:
        # No pattern matched more than once: keep the match list of the last
        # pattern ([] or [match]).
        # NOTE(review): a single hit of the first pattern is dropped here --
        # confirm that is intended.
        chasis_num.append(matches)


In [1202]:
# Store the per-row results collected in chasis_num as a new column of df.
df['chasis_num']=chasis_num

# Getting the Engine Number

In [None]:
#Using the regular expression to fetch Engine Number

In [1291]:
# Engine number formats, tried in order.
engine_patterns = (
    r"E NO[\s]{1}([\w]{7,})",
    r"E[\s]{0,1}NO[\s]{1}-{0,1}[\w]{0,1}[\s]{0,1}([\w]{7,})",
    r"Engine No. ([\w]{7,})",
    r"Engine ([\w]{5,}[\s]{0,1}[\d]{2,7})",
)

enginer_number = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    for pattern in engine_patterns:
        matches = re.findall(pattern, content)
        if len(matches) > 1:
            # More than one hit: keep the second one.
            enginer_number.append(matches[1])
            break
    else:
        # No pattern matched more than once: keep the match list of the last
        # pattern ([] or [match]).
        # NOTE(review): a single hit of an earlier pattern is dropped here --
        # confirm that is intended.
        enginer_number.append(matches)


In [1293]:
# Store the per-row results collected in enginer_number as a new column of df.
df['engine_number']=enginer_number

# Getting the Reg date 

In [1306]:
# Captures a dd/dd/dd date that follows the 'REG. DT: ' label.
registration_date_pattern = re.compile(r"REG. DT: (\d\d/\d\d/\d\d)")

registration_date = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    matches = registration_date_pattern.findall(content)
    if len(matches) > 1:
        # More than one hit: keep the second one.
        registration_date.append(matches[1])
    else:
        # Zero or one hit: keep the match list itself ([] or [match]).
        registration_date.append(matches)

In [1309]:
# Store the per-row results collected in registration_date as a new column of df.
df['Registration_Date']=registration_date

# Getting the Manufacturing Date

In [1331]:
# Manufacturing date formats, tried in order; both capture a month/year value
# written as one or two digits, a slash and four digits.
manufacturing_date_patterns = (
    r"M[\w]{2}[\W]{0,1}[\s]{0,1}DT[\W]{0,1}[\s][\W]{0,1}[\s]{0,1}(\d{0,1}\d/\d\d\d\d)",
    r"Year of Mfg. (\d{0,1}\d/\d\d\d\d)",
)

manufacturing_date = []
# Walk every row of df instead of a fixed count of 46.
for content in df['Content']:
    for pattern in manufacturing_date_patterns:
        matches = re.findall(pattern, content)
        if len(matches) > 1:
            # More than one hit: keep the second one.
            manufacturing_date.append(matches[1])
            break
    else:
        # No pattern matched more than once: keep the match list of the last
        # pattern ([] or [match]).
        # NOTE(review): a single hit of the first pattern is dropped here --
        # confirm that is intended.
        manufacturing_date.append(matches)

In [1334]:
# Store the per-row results collected in manufacturing_date as a new column of df.
df['manufacturing_date']=manufacturing_date

# Exporting the dataframe into a csv file

In [1340]:
# Write the assembled table to data_rc.csv (relative path, so it lands in the
# current working directory).
df.to_csv("data_rc.csv")