# Asbestos Test Reports Data Collection From .doc Files

My goal is to extract features such as the report number, report date, location (city, state, zip) and asbestos mineral test result from Word files into a structured dataset, using regular expressions and the `os` library. Since the manually created documents do not always follow a standard format, I will have to adjust the code to cover all the different templates in order to extract all data accurately.

In [161]:
# Import libraries

import os
import zipfile
import re
import xml.dom.minidom

import textract
import docx
from docx import Document
from docx import *
import re
import json
import win32com.client as win32
from win32com.client import constants
from glob import glob
import fnmatch
import pythoncom
import sys

In [168]:
# convert .doc files to .txt files

# Convert every file in ``source_directory`` to a UTF-8 ``.txt`` file inside
# ``<cwd>/training_data``.
#
# NOTE(review): ``source_directory`` is not defined in the visible part of this
# notebook; it must be set to the folder holding the .doc files before this
# cell runs.
training_directory = os.path.join(os.getcwd(), "training_data")

# Create the output folder up front; otherwise every open() below fails and
# each file is reported as "skipped" for the wrong reason.
os.makedirs(training_directory, exist_ok=True)

for process_file in os.listdir(source_directory):
    file, extension = os.path.splitext(process_file)

    # The text file keeps the source file's base name and gets a .txt extension.
    dest_file_path = file + '.txt'

    try:
        # Extract the text from the file; textract returns encoded bytes.
        content = textract.process(os.path.join(source_directory, process_file), encoding='utf-8', extension='doc')

        # Write the bytes in binary mode; the context manager closes the file
        # even when the write itself fails.
        with open(os.path.join(training_directory, dest_file_path), "wb") as write_text_file:
            write_text_file.write(content)

    except Exception as error:
        # Best-effort conversion: report the file and the reason, then carry on.
        # Catching Exception (not a bare except) lets Ctrl-C still stop the loop.
        print("{} has been skipped.".format(process_file))
        print("    reason: {!r}".format(error))

### Regular Expressions

#### 1. Report No (i.e. REPORT NO:		3180 , 137-644)
REPORT NO:\s*\d{3,}|\d{3}-\d{3}
#### 2. Date (i.e. April 21, 2019)
\w{3,}\s*\w\d,\s*\d{4}
#### 3. City State Zip (i.e. Fresno CA 93722 , Sacramento,  CA  95814)
\w+,?\s*\w\w\s*\d{5}
#### 4. Asbestos result  (i.e. Chrysotile       3-4% Chrysotile        4% Chrysotile        13-14% Chrysotile        <1%)
Chrysotile\s*<?((\d+-)?\d+)% 

## A) Extract Features From Documents

In [441]:
# Define a function to get report value from text

def get_report_value_from_text(text, regex1, regex2=None):
    """Extract a value from ``text`` using one or two regular expressions.

    ``regex1`` is applied to ``text`` with ``re.findall`` and only its first
    match is considered. Without ``regex2`` that first match is returned as
    is. With ``regex2``, the pattern is applied to the first match and the
    first result of that second search is returned.

    Returns ``None`` when ``regex1`` finds nothing, or when ``regex2`` is
    given and finds nothing inside the first match.
    """
    first_pass = re.findall(regex1, text)
    if not first_pass:
        return None

    candidate = first_pass[0]
    if regex2 is None:
        return candidate

    # Narrow the first match down to the part the caller is interested in.
    second_pass = re.findall(regex2, candidate)
    return second_pass[0] if second_pass else None

In [442]:
# Define a function to get report value from paragraph

def get_report_value_from_paragraph(document, regex1, regex2=None):
    """Return the first value found in the document's paragraphs.

    Iterates ``document.paragraphs`` in order, runs
    ``get_report_value_from_text`` on each paragraph's text, and returns the
    first result that is not ``None``. Returns ``None`` when no paragraph
    yields a value.
    """
    # Lazy generator: paragraphs after the first hit are never searched.
    per_paragraph = (
        get_report_value_from_text(paragraph.text, regex1, regex2)
        for paragraph in document.paragraphs
    )
    return next((hit for hit in per_paragraph if hit is not None), None)

In [443]:
# Define a function to get report value from table

def get_report_value_from_table(document, regex1, regex2=None):
    """Return the first value found in any table cell of the document.

    Visits every paragraph of every cell, going table by table and row by
    row, runs ``get_report_value_from_text`` on the paragraph text, and
    returns the first result that is not ``None``. Returns ``None`` when no
    cell paragraph yields a value.
    """
    # Flatten tables -> rows -> cells -> paragraphs, preserving document order.
    cell_paragraphs = (
        paragraph
        for table in document.tables
        for row in table.rows
        for cell in row.cells
        for paragraph in cell.paragraphs
    )
    for paragraph in cell_paragraphs:
        found = get_report_value_from_text(paragraph.text, regex1, regex2)
        if found is not None:
            return found
    return None

In [444]:
# Define a function to get report value

def get_report_value(document, regex1, regex2=None):
    """Look a value up in the document's paragraphs, then in its tables.

    The paragraphs are searched first; the tables are searched only when the
    paragraphs yield nothing. Returns ``None`` when neither contains a match.
    """
    from_body = get_report_value_from_paragraph(document, regex1, regex2)
    if from_body is not None:
        return from_body
    return get_report_value_from_table(document, regex1, regex2)

In [445]:
# Load the two sample reports that the extraction functions are tried out on.
document1 = Document('./reports_archive/lab_report.docx')
document2 = Document('./reports_archive/lab_report2.docx')

In [446]:
# Ad-hoc check on document1: the first pattern locates the report-number text,
# the second pattern keeps only the number itself.
get_report_value(document1, r"REPORT NO:\s*\d{3,}|\d{3}-\d{3}", r"\d{3}-\d{3}|\d{3,}")

'44154'

In [447]:
# Ad-hoc check on document2 with the same two report-number patterns.
get_report_value(document2, r"REPORT NO:\s*\d{3,}|\d{3}-\d{3}", r"\d{3}-\d{3}|\d{3,}")

'138-147'

## 1. Extract Report Number From Documents

In [448]:
# Define a function to get report number

def get_report_number(document):
    """Return the report number found in ``document``, or ``None``.

    The number is either a run of three or more digits following
    ``REPORT NO:`` or a dashed ``ddd-ddd`` number; only the number itself is
    returned, without the label.
    """
    # First pass: the labelled number, or a bare dashed number.
    labelled_or_dashed = r"REPORT NO:\s*\d{3,}|\d{3}-\d{3}"
    # Second pass: drop the label and keep just the number.
    number_only = r"\d{3}-\d{3}|\d{3,}"
    return get_report_value(document, labelled_or_dashed, number_only)

In [449]:
# Report number extracted from the first sample document.
get_report_number(document1)

'44154'

In [450]:
# Report number extracted from the second sample document.
get_report_number(document2)

'138-147'

## 2. Extract Report Date From Documents

In [451]:
# Define a function to get report date

def get_report_date(document):
    """Return the first ``Month D, YYYY`` style date in ``document``, or ``None``.

    The pattern expects a word of three or more characters (the month name),
    one whitespace character, a one- or two-digit day, a comma, one
    whitespace character and a four-digit year, e.g. ``April 21, 2019`` or
    ``May 5, 2019``.
    """
    # \d{1,2} instead of the former \w\d: the old form required exactly two
    # characters for the day, so single-digit days were never matched.
    return get_report_value(document, r"\w{3,}\s\d{1,2},\s\d{4}")

In [452]:
# Report date extracted from the first sample document.
get_report_date(document1)

'November 13, 2006'

In [453]:
# Report date extracted from the second sample document.
get_report_date(document2)

'May 29, 2019'

## 3. Extract Location (City, State, Zip)

In [454]:
# Define a function to get location (city, state, zip)

def get_location(document):
    """Return the first ``City, ST 12345`` style location in ``document``, or ``None``.

    The pattern matches one word (optionally followed by a comma), a
    two-character state code and a five-digit zip code, separated by one or
    more whitespace characters, e.g. ``Fresno CA 93722`` or
    ``Sacramento,  CA  95814``.

    Limitation: only the last word of a multi-word city name is captured,
    because the city part of the pattern is a single ``\\w+``.
    """
    # \s+ instead of a single \s: addresses typed with two spaces or a tab
    # between the parts were previously not matched at all.
    return get_report_value(document, r"\w+,?\s+\w\w\s+\d{5}")

In [455]:
# Location extracted from the first sample document; the pattern in
# get_location captures a single word for the city.
get_location(document1)

'Hills, CA 91344'

In [456]:
get_location(document2) # missing location data

## 4. Extract Asbestos Test Result

Asbestos refers to six unique minerals — chrysotile, amosite, crocidolite, anthophyllite, tremolite and actinolite — belonging to the serpentine and amphibole families.
Source: https://www.asbestos.com/asbestos/types/

In [465]:
# Define a function to get test result

def get_mineral_test_result(document, mineral):
    """Return the percentage reported for ``mineral`` in ``document``, or ``None``.

    Looks for the mineral name followed by optional whitespace and a
    percentage, and returns only the percentage part. Accepted forms are a
    single number (``4%``), a range (``3-4%``, ``13-14%``) and either of
    those with a leading ``<`` (``<1%``).

    Args:
        document: object exposing ``paragraphs`` and ``tables`` as consumed by
            ``get_report_value``.
        mineral: mineral name to search for; it is matched literally.
    """
    # The former \d+-?\d+ needed at least two digits, so "4%" and "<1%" were
    # never matched. The inner group is non-capturing so re.findall still
    # returns the single captured percentage string.
    pattern = r"{}\s*(<?(?:\d+-)?\d+%)".format(re.escape(mineral))
    return get_report_value(document, pattern)


In [474]:
# The six asbestos minerals, in the order they are looked up in a report.
asbestos_minerals = [
    'Chrysotile',
    'Amosite',
    'Crocidolite',
    'Anthophyllite',
    'Tremolite',
    'Actinolite',
]

In [475]:
def get_test_result(document, minerals_list):
    """Return the first mineral result found in ``document`` as text.

    The minerals are tried in the order given by ``minerals_list``. For the
    first one that ``get_mineral_test_result`` finds, the result is returned
    as ``"<mineral> <percentage>"``. When none is found, the string
    ``"None Detected"`` is returned.
    """
    for name in minerals_list:
        percentage = get_mineral_test_result(document, name)
        if percentage is None:
            continue
        return " ".join((name, percentage))
    return "None Detected"

In [477]:
# Asbestos test result extracted from the first sample document.
get_test_result(document1, asbestos_minerals)

'Chrysotile 2-3%'

In [478]:
# Asbestos test result extracted from the second sample document.
get_test_result(document2, asbestos_minerals)

'None Detected'