##**Data Preprocessing**

## TesseractOCR

In [2]:
!pip install pytesseract
!sudo apt install tesseract-ocr

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,905 kB/s)
debc

In [63]:
import pytesseract
from PIL import Image


# Load an image using PIL
image_path = '/content/personal-info.png'
image = Image.open(image_path)

# Perform OCR on the image
text = pytesseract.image_to_string(image)

# Print the extracted text
print(text)


PERSONAL INFORMATION.

 

Full Name : Mr. Anthony Daniel.
Date Of Birth : 23.06.1992.

Gender : Male.

Address : No 3,Martin Circle.
E-Mail : daniel@gmail.com
Phone Number : +14565214785.

ID Number : 199125412325.
Status : Maried.

Occupation : Accountant



### Word tokenization

In [64]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [65]:
tokens = nltk.word_tokenize(text)
print (tokens)

['PERSONAL', 'INFORMATION', '.', 'Full', 'Name', ':', 'Mr.', 'Anthony', 'Daniel', '.', 'Date', 'Of', 'Birth', ':', '23.06.1992', '.', 'Gender', ':', 'Male', '.', 'Address', ':', 'No', '3', ',', 'Martin', 'Circle', '.', 'E-Mail', ':', 'daniel', '@', 'gmail.com', 'Phone', 'Number', ':', '+14565214785', '.', 'ID', 'Number', ':', '199125412325', '.', 'Status', ':', 'Maried', '.', 'Occupation', ':', 'Accountant']


## Stemming

Reduce words to their base or root form to improve generalization.

In [66]:
# import these modules
from nltk.stem import PorterStemmer
import numpy as np

portstem = PorterStemmer()

stem=np.array([])
for i in tokens:
    stem= np.append(stem,portstem.stem (i))


In [67]:
print(stem)

['person' 'inform' '.' 'full' 'name' ':' 'mr.' 'anthoni' 'daniel' '.'
 'date' 'of' 'birth' ':' '23.06.1992' '.' 'gender' ':' 'male' '.'
 'address' ':' 'no' '3' ',' 'martin' 'circl' '.' 'e-mail' ':' 'daniel' '@'
 'gmail.com' 'phone' 'number' ':' '+14565214785' '.' 'id' 'number' ':'
 '199125412325' '.' 'statu' ':' 'mari' '.' 'occup' ':' 'account']


## Lemmatization

Reduce words to their base or root form to improve generalization.

In [68]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
# import these modules
from nltk.stem import WordNetLemmatizer

lemmati = WordNetLemmatizer()

lem=np.array([])
for j in stem:
    lem= np.append(lem, lemmati.lemmatize(j))

In [70]:
lem

array(['person', 'inform', '.', 'full', 'name', ':', 'mr.', 'anthoni',
       'daniel', '.', 'date', 'of', 'birth', ':', '23.06.1992', '.',
       'gender', ':', 'male', '.', 'address', ':', 'no', '3', ',',
       'martin', 'circl', '.', 'e-mail', ':', 'daniel', '@', 'gmail.com',
       'phone', 'number', ':', '+14565214785', '.', 'id', 'number', ':',
       '199125412325', '.', 'statu', ':', 'mari', '.', 'occup', ':',
       'account'], dtype='<U32')

In [71]:
text_to_anonymize = " ".join(lem)

In [72]:
text_to_anonymize

'person inform . full name : mr. anthoni daniel . date of birth : 23.06.1992 . gender : male . address : no 3 , martin circl . e-mail : daniel @ gmail.com phone number : +14565214785 . id number : 199125412325 . statu : mari . occup : account'

#**NLP Model Architecture**

In [73]:
# Install Presidio
!pip install presidio_analyzer presidio_anonymizer

# Download spaCy's English language model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [74]:
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import json
from pprint import pprint

In [75]:
# Initialize the analyzer
analyzer = AnalyzerEngine()

# Initialize the anonymizer
anonymizer = AnonymizerEngine()



In [76]:
import pickle

# Save the Analyzer model to a file
with open('/content/analyzer_model.pkl', 'wb') as f:
    pickle.dump(analyzer, f)

# Save the Anonymizer model to a file
with open('/content/anonymizer_model.pkl', 'wb') as f:
    pickle.dump(anonymizer, f)


In [36]:
import pickle

# Load the Analyzer model from the saved file
with open('/content/analyzer_model.pkl', 'rb') as f:
    loaded_analyzer = pickle.load(f)

# Load the Anonymizer model from the saved file
with open('/content/anonymizer_model.pkl', 'rb') as f:
    loaded_anonymizer = pickle.load(f)


# **Prediction**

In [77]:
# Analyze text using the loaded analyzer
loaded_analyzer_results = loaded_analyzer.analyze(text=text_to_anonymize, language='en')

# Anonymize text using the loaded anonymizer based on the loaded analyzer results
anonymized_results = loaded_anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=loaded_analyzer_results,
    operators={
        "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
        "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char": "*", "chars_to_mask": 12, "from_end": True}),
        "TITLE": OperatorConfig("redact", {})
    }
)

In [78]:
# Print the anonymized text
print(f"text: {anonymized_results.text}")

text: person inform . full name : mr. <ANONYMIZED> . date of birth : <ANONYMIZED> . gender : male . address : no 3 , <ANONYMIZED> . e-mail : daniel @ <ANONYMIZED> phone number : +<ANONYMIZED> . id number : <ANONYMIZED> . statu : <ANONYMIZED> . occup : account
