In [0]:
# Installing dependencies
# Uncomment and run this cell once at the beginning
# ! pip install spacy google-cloud-vision pandas tqdm

In [0]:
# Importing libraries
import random
import re
import spacy
import os 
import json
from google.cloud import vision
import io
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [0]:
# The GOOGLE_APPLICATION_CREDENTIALS environment variable must contain the path
# to a Service Account JSON file with access to the Google Vision API.
# If it is not set, the rest of the notebook won't run.
# NOTE: the assignment below overwrites whatever value the environment already
# holds with a placeholder; replace "/path-to-file" with the real key file path.

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/path-to-file"
# Vision API client; getOutput uses it for text detection on image inputs
client = vision.ImageAnnotatorClient()
# Directory the spaCy pipeline is loaded from
output_dir = "NER/"
# Loaded pipeline; getOutput calls it on the text and reads the entities from doc.ents
nlp = spacy.load(output_dir)

In [0]:
# Empty module-level DataFrame that getOutput(..., save_to_excel=True) fills in,
# inserting a column for each entity key the first time it is seen.
# Re-running this cell discards everything accumulated so far.
outputDF = pd.DataFrame()

In [0]:
def getOutput(type, data, save_to_excel=False):
  """
  Parameters: 
  type: type of data, either img (image file), file (text file) or text (plain text)
  data: image, file of text
  save_to_excel = Boolean
  Output: Prints the dictionary 
  """
  dfLen = len(outputDF)
  textToPredict = ""
  if (type == "img"):
    with io.open(data, 'rb') as image_file:
        content = image_file.read()
        image = vision.types.Image(content=content)
        text_response = client.text_detection(image=image)
        texts = [text.description for text in text_response.text_annotations]
        textToPredict = texts[0]
        
  elif (type == "file"):
    f = open(data, "r")
    textToPredict = f.read()
  else:
    textToPredict = data

  doc = nlp(textToPredict)
  max_amt = 0
  i = 1
  dataDict = {}
  items_list = []
  for ent in doc.ents:
    if ent.label_ in dataDict.keys():
      dataDict[ent.label_+"-"+str(i)] = ent.text
      i +=1
    elif ent.label_ == "Registration date":
      dataDict["Registration date"] = ent.text[4:] if (len(ent.text)>11) else ent.text
    elif ent.label_ == "Mfg. date":
      dataDict["Mfg. date"] = ent.text[-7:] if (len(ent.text)>7) else ent.text
    else:
      dataDict[ent.label_] = ent.text
  if save_to_excel is True: 
    for key, value in dataDict.items():
      if key not in outputDF.columns:
        outputDF.insert(len(outputDF.columns), key, None)
      outputDF.loc[dfLen, key] = value
  else:
    dataDict = dict(sorted(dataDict.items()))
    print(json.dumps(dataDict, indent=2))

In [0]:
# Run OCR and entity extraction on a test image; save_to_excel is left at its
# default of False, so the result is printed as JSON rather than stored.
getOutput("img", "1.jpg")

{
  "Chassis No": "MASFJEB1S00372805",
  "Engine No": "2205067",
  "Mfg. date": "7/2013",
  "Name": "PAWAN KUMAR",
  "Registration No": "HR14K 6035",
  "Registration date": "From27-Aug",
  "Registration date-1": "2013"
}


In [0]:
# Read the dataset from a JSON Lines file (lines=True: one JSON object per line).
# The batch loop below reads the text of each record from its 'content' column.
json_data = pd.read_json("CvisionAI.json", lines=True)

In [0]:
# Extract entities from every record's text and accumulate them in outputDF
for content in tqdm(json_data['content']):
  getOutput("text", content, save_to_excel=True)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(IntProgress(value=0, max=41), HTML(value='')))




In [0]:
# Display the rows accumulated in outputDF by the getOutput(..., save_to_excel=True) calls
outputDF

Unnamed: 0,Registration No,Name,Chassis No,Engine No,Mfg. date,Registration date,Chassis No-1,Name-1
0,HR10--P-0840,SANDEEP LAKRA,MALCG41GLAM255721*,G4EB9M256677,01/2010,,,
1,HR10-P-5470,DAVENDER SINGH,MALCM4IVR9M079263*L,D4FA9U817848,12/2009,,,
2,HR10-M-5657,ANUBHAV JAIN,MA3FKEB2S00121766,D13A1120011,08/2008,,,
3,DL8CP 1150,ADITYA GUPTA,MA3EMD81S00167856,F10DN3294185,2007,20/12/2007,,
4,DL2CAT9109,ANOOP SURESH DHAWALE,MA3ETDE1S00218363,7567094,07/2015,21/07/2015,,
5,DL9CX 3096,MANOJ VERMA,MA3EFJC1S0151460,M16AN2011764,08/2009,26/09/2009,,
6,DL1CU8248,SURESH KUMAR GOYAL,MBJEBPEH204509282,22RY295253,01/2016,14022016,,
7,L30AD0141,RAKESH KUMA,0036228,371772454,01/7213,08/02/2013,,
8,DL5CJ 6088,RAJNISH JAIN,MA3FHEB1S00520199,D13A2235055,09/2013,06/10/2013,,
9,HRO6P 5988,SUBE SINGH,MA3EYD81S00765439,F8DN3321864,9/2006,07-Feb-2007,,


In [0]:
# Save the extracted data to an Excel file, leaving out the DataFrame index column
outputDF.to_excel("ExtractedData.xlsx",index=False)

# Output on unseen data (Tested before training the model on all the data)
text = REGN NO - DL9CAC6215\nO SNo\n- 01\nREG. DT: 24/12/2012\nMFG CD CD - MUL\nCH. NO MA3FHEB1S00358580\nCOLOUR - P M A WHITE\nE NO - D13A0338461\nCLASS\n-\nL.M.V\nNAME\nSRISHTI NAYAR\nS/WID OF\n-\nSANJIV NAYAR\nADDRESS: 80 c BLOCK NEELAMBER APTS\nRANI BAGH\nNEW DELH\n110034\nMODEL\nSWIFT VDI BS4 M\nRegistering Authority\nBODY\nSALOON\nNO. OF CYL $4 4\nPalam\nWHEEL BASE - 0\nUNLADEN WT - 1060\nMFG.DT.\n- 12/2012\nSEATING C.\n-\n5\nFUEL\nDIESEL\nSTANDING c\nREG.UPTO\n23/12/2027\nCU.CAP\n1248\nTAX UPTO\n- OTT\nSignature\nTeritory of Delhi\nLicenca to drive vehicles throughout Indie\nLICENCE NO.\n: P08072007637601\nN\nNAME\n- SRISHTI NAYAR\nDaughter of\n- SH SANJIV NAYAR\nADDRESS\n: C-80 NEELAMBER\nAPTS SAINIK VIHAR\nDELHI 110034\nDT. OF BIRTH\n- 14/12/1988\nVEHICLE CLASS LMVINT)\n23/07/2007\n(Holder's Signature)\nDT OF ISSUE 24/07/2007\nVALIDITY - 23/07/2027\nNV CARR NO NA\nSi Lola Aaelalhael

{
  
  "Chassis No": "MA3FHEB1S00358580",

  "Engine No": "D13A0338461",

  "Mfg. date": "12/2012",

  "Name": "SRISHTI NAYAR",

  "Registration No": "DL9CAC6215",

  "Registration date": "24/12/2012"

}