# Exploring and Processing

## Import Statements

In [66]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
import random as rng
import nltk
import fnmatch
import docx
from lxml import etree

import docx2txt
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.collocations import *
from nltk.util import ngrams
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import sys as sys
from sys import platform
import re as re
from statistics import mode
import datefinder
from nltk import ne_chunk
from nltk.tag import StanfordNERTagger
import csv
import datetime

try:
    if(platform == "win32"):
        import win32com.client as win32
        from win32com.client import constants
except Exception as e:
    print(str(e))

import subprocess
import shutil


In [67]:
### Test Cell for Validating Document Conversion

test_path = "C:\\fileValidation"
doc_Path = os.path.join(test_path, 'TestDocConversion.doc')
try:
    if(platform == "win32"):
        word = win32.Dispatch("Word.application")
        #word = win32.gencache.EnsureDispatch('Word.Application')
        doc = word.Documents.Open(doc_Path)
        doc.Activate()

        # Rename path with .docx
        new_file_abs = os.path.abspath(doc_Path)
        new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)

        # Save and Close
        word.ActiveDocument.SaveAs(
            new_file_abs, FileFormat=16
        )
        doc.Close(False)
        
except Exception as e:
    #doc.Close(False)
    errorFile.write(filepath + ", " + str(e) + "\n")
    print(str(e))


In [68]:
### Test Cell for Validating Document Extraction
test_Path = "C:\\fileValidation"
doc_Path = os.path.join(test_path, 'TestDocXExtraction.docx')


In [69]:
###Test Cell for Validating Path Creation
test_Path = "C:\\fileValidation"




## File Prep Functions

### Function: Is it docx or pdf?

In [70]:
def checkFileType(filename):
        if(filename.lower().endswith(('.docx'))):
            return 0
        elif(filename.lower().endswith(('.pdf'))):
            return 1
        elif(filename.lower().endswith(('.doc'))):
            return 2
        else:
            return -1

### Function: read in file

In [71]:
def makeFilePath(docName):
    raw_data_path = os.path.join(os.getcwd(), 'data', 'raw')
    return os.path.join(raw_data_path, docName)

### Function: clean up text

In [72]:
def cleanText(text):
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    #text = text.replace(",", ' ')
    
    dblSpacesRemaining = True
    while(dblSpacesRemaining):
        text = text.replace("  ", " ")
        if not "  " in text:
            dblSpacesRemaining = False
    return text

### Function: process dataFrame and group

In [73]:
def processDF(txtFile):
    df = pd.read_csv(txtFile, sep=" ", header=None) #this doesn't work for me bc of variable number of cols
    df = df.T 
    df = df.dropna()

    df['SingleRow']=1

    df=df.rename(columns={0 : 'Words'})
    print("in processDF " + txtFile)
    df.describe(include="all")
    #print(df.groupby('Words').SingleRow.sum().sort_values())
    #print(df)
    return df

### Function: Process a text file

In [74]:
def processDocFileWindows(filepath, errorFile):
    try:
        word = win32.Dispatch("Word.application")
        #word = win32.gencache.EnsureDispatch('Word.Application')
        doc = word.Documents.Open(filepath)
        doc.Activate()

        # Rename path with .docx
        new_file_abs = os.path.abspath(filepath)
        new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)

        # Save and Close
        word.ActiveDocument.SaveAs(
            new_file_abs, FileFormat=16
        )
        doc.Close(False)

        return new_file_abs
        
    except Exception as e:
        #doc.Close(False)
        errorFile.write(filepath + ", " + str(e) + "\n")
        print(str(e))
        return None

    
def processDocFileMac(filepath, errorFile):
    #textutil -convert docx ~/Desktop/mypage.webarchive
    try:
        subprocess.run(["textutil", "-convert", "docx", filepath])
        newFilePath = re.sub(r'\.\w+$', '.docx', filepath)
        #print("in Mac doc file process: " + newFilePath)
        return newFilePath
    except Exception as e:
        #errorFile.write(filepath + ", " + str(e) + "\n")
        print("Error processDocFileMac:  " + str(e))
        return None
    

def processDocFile(filePath, errorFile):
    
    try:
        if(platform == "win32"):
            newFilePath = processDocFileWindows(filePath, errorFile)
        else:
            newFilePath = processDocFileMac(filePath, errorFile)
        #print("return from mac vs windows: " + newFilePath)
        processedFilePath = processDocxFile(newFilePath, errorFile)
        if(processedFilePath):
            #rmv old .doc file
            os.remove(filePath)
    except Exception as e:
        #errorFile.write(filePath + ", " + str(e) + "\n")
        print("Error processDocFile:  " + str(e))
        return None
    
    
    return processedFilePath




def processDocxFile(filePath, errorFile):
    #print(filePath)
    try:
    ######below is the footer extraction and concatenation code.
    ######it is concatenated into a single list even when there are multiple footers.
        docxText = docx2txt.process(filePath)
        #print(docxText)
        ######Pre-appending ReplacedText with Footer information

        #replacedText = footer + "\n" + cleanText(docxText)
        replacedText = cleanText(docxText)
        #print(replacedText)
        #print(filePath)
        fileName = os.path.split(filePath)[1]#.split('/')[-1]
        #print(fileName)
        #fileName = fileName.replace(",","")
        #print(fileName)
        baseFileName = fileName[0:-5]
        #print(baseFileName)
        cwd = os.getcwd()

        newFilePath = os.path.join(cwd,'output',  baseFileName + ".txt")

        #print(newFilePath)
        singleFileDocx=open(newFilePath, 'wb+')
        singleFileDocx.write(replacedText.encode("utf-8"))
        singleFileDocx.close()
    except Exception as e:
        print(str(e))
        #errorFile.write(filePath + ", " + str(e) + "\n")
        return None

    #temp_df = processDF('singleTextDocx.txt')
    return newFilePath

### Function: Process pdf file

In [75]:
def processPDFfile(filePath, errorFile):
    password = ""
    extracted_text = ""
    try:
        #print(filePath)
        fileName = os.path.split(filePath)[1]#.split('/')[-1]
        #print(fileName)
        fileName = fileName.replace(",", " ")
        baseFileName = fileName[0:-4]
    
        fp = open(filePath, "rb")
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
            
        # Create PDFResourceManager object that stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()

        # set parameters for analysis
        laparams = LAParams()

        # Create a PDFDevice object which translates interpreted information into desired format
        # Device needs to be connected to resource manager to store shared resources
        # device = PDFDevice(rsrcmgr)
        # Extract the decive to page aggregator to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create interpreter object to process page content from PDFDocument
        # Interpreter needs to be connected to resource manager for shared resources and device 
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Ok now that we have everything to process a pdf document, lets process it page by page
        for page in PDFPage.create_pages(document):
            # As the interpreter processes the page stored in PDFDocument object
            interpreter.process_page(page)
            # The device renders the layout from interpreter
            layout = device.get_result()
            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    newText = lt_obj.get_text()
                    newText = newText.replace('\n', ' ')
                    extracted_text += newText

        #close the pdf file
        fp.close()
        
        extracted_text = cleanText(extracted_text)#extracted_text.replace("\n", ' ')
        cwd = os.getcwd()
        newFilePath = os.path.join(cwd,'output', baseFileName + '-pdf' + ".txt")
        #print(newFilePath)
        with open(newFilePath, 'wb+') as singleFilePDF:
            singleFilePDF.write(extracted_text.encode("utf-8"))
        
    except Exception as e:
        errorFile.write(filePath + ", " + str(e) + "\n")
        return None
        #temp_df = processDF('./data/output/' + baseFileName + ".txt")
    return newFilePath

## NLTK Tokenizing Functions

### Function: DD specific text cleaning

In [76]:
def ddCleanText(text):
    newText = text.replace('Delta Dental', 'DeltaDental')
    newText = newText.replace('DELTA DENTAL', 'DELTADENTAL')
    newText = newText.replace('DeltaDental Insurance Company', 'DeltaDentalInsuranceCompany')
    return newText



### Function: Make tokenized word list

In [77]:
def getTokens(text):
    words = word_tokenize(text)
    
    customStopWords = set(stopwords.words('english')+list(punctuation))
    wordsWOStop=[word for word in words if word not in customStopWords]
    
    return wordsWOStop

### Function: Make tokenized Sentence list

In [78]:
def getSents(text):
    sents = sent_tokenize(text)

    return sents

### Function: Get Bigrams

In [79]:
def getBigrams(tokens):
    bigram_measures=nltk.collocations.BigramAssocMeasures();
    finder = BigramCollocationFinder.from_words(tokens)
    sorted_bgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))
    
    return sorted_bgs

### Function: Get Trigrams

In [80]:
def getTrigrams(tokens):
    trigram_measures =nltk.collocations.TrigramAssocMeasures();
    finder = TrigramCollocationFinder.from_words(tokens)
    sorted_tgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))

    return sorted_tgs

### Function: Get Footer

In [81]:
def getFooter(txtFile):
    try:
        if not("-pdf." in txtFile):
            docxFileName = (os.path.split(txtFile)[1]).replace(".txt",".docx")
            filePath = os.path.join(os.getcwd(),"processed",docxFileName)
            doc = docx.Document(filePath)
            footerXML = [x.blob.decode() for x in doc.part.package.parts if x.partname.find('footer')>0]

            footer = []
            for i in range(0,len(footerXML)):
                root = etree.XML(footerXML[i].split("\n",1)[1].replace("w:", ""))
                footer.append('')
                for p in root:
                    for r in p:
                    #print(r.get("t"))
                        for t in r:
                            if(t.tag == "t"):
                                footer[i] = footer[i] + t.text

            return footer
        else:
            return None
    except Exception as e:
        print(str(e))
        return None

## MetaData and Attribute Functions

### NLTK synonyms

#### Function: translate from syn POS to nltk POS

In [82]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return ''

#### Function: Use POS to cull wordnet synonyms

In [83]:
def getSynonyms_usingPOS(word_tuple):
    #print(word_tuple)
    word_tagged = word_tuple[0]
    word_pos = get_wordnet_pos(word_tuple[1])
    syns = wn.synsets(word_tagged, pos=word_pos)
    
    set1 = set()
    for syn in syns:
        for lem in syn.lemmas():
            set1.add(lem.name())
    #print(syns)
    return set1

#### Function: get synonyms of a single word. Helper function to Bigram and Trigram function

In [84]:
## You can't cull this one down with the POS b/c you can't tag a single word
def getSyns(word):
    syns1 = wn.synsets(word)
    
    set1 = set()
    for syn in syns1:
        for lem in syn.lemmas():
            set1.add(lem.name())
    
    return set1

#### Function: Get a similar bigram

In [85]:
def getSimilarBigrams(word1, word2):
    
    tagged_words = nltk.pos_tag([word1,word2])
    set1 = getSynonyms_usingPOS(tagged_words[0])
    if not len(set1):
        set1.add(word1)
    set2 = getSynonyms_usingPOS(tagged_words[1])
    if not len(set2):
        set2.add(word2)
    word_set = set()
    for word1 in set1:
        for word2 in set2:
            word_set.add(" ".join([word1, word2]))
    
    return word_set

#### Function: get a similar trigram

In [86]:
def getSimilarTrigrams(word1, word2, word3):
    tagged_words = nltk.pos_tag([word1,word2,word3])
    
    set1 = getSynonyms_usingPOS(tagged_words[0])
    if not len(set1):
        set1.add(word1)
    set2 = getSynonyms_usingPOS(tagged_words[1])
    if not len(set2):
        set2.add(word2)
    set3 = getSynonyms_usingPOS(tagged_words[2])
    if not len(set3):
        set3.add(word3)
    
    word_set = set()
    for word1 in set1:
        for word2 in set2:
            for word3 in set3:
                word_set.add(" ".join([word1, word2, word3]))
    #print(word_set)
    
    return word_set
        

#### Function: Get synonyms from a list of key words. Returns more keywords/phrases

In [87]:
def getSynonymsFromList(keywords):
    matches = []

    for kw in keywords:
        try:
            words = word_tokenize(kw)
        except Exception as e:
            print(str(e))
        #print(words)
        if(len(words) == 1):
            #print("is 1")
            syns = getSyns(words[0])
            for syn in syns:
                matches.append(syn)
            #keywords.append(list(syns))
        elif(len(words) == 2):
            #print("is 2")
            syns = getSimilarBigrams(words[0],words[1])
            #print(syns)
            matches.extend(getSimilarBigrams(words[0],words[1]))
        elif(len(words) == 3):
            #print("is 3")
            matches.extend(getSimilarTrigrams(words[0],words[1],words[2]))
        else:
            print("keyword string too long")
        #print(matches)
    keywords.extend(matches)
    keywords = set(keywords)

    #print(start_keywords)
    
    return keywords

### Metadata

#### get file name
Get the filename from a full path. Determines the OS and splits the string correctly based on that

In [88]:
def getFileName(fullPath):
    try:
        if(platform == "win32"):
            fileName = fullPath.split("\\")[-1]
        else:
            fileName = fullPath.split("/")[-1]
        return fileName
    except Exception as e:
        return None

#### get group number
Uses regex's made from keywords to attempt to find a group number in the file. Failing that, it searches the filename for the number.

In [89]:
def getGroupNumber(sents_tokens, filePath):
    
    group_keywords = ["group number", "groupnumber"]
    regex_exps = []
    poss_nums = []
    finalGN = None
    
    try:
        #Create regex exps out of group number keywords
        for kw in group_keywords:
                temp_re = kw + "\W\s*(?P<gn>\d+)(?P<gn2>[-\s]\d+)"
                regex = re.compile(temp_re)
                regex_exps.append(regex)

        #For each sentence, search for the expression, if found add the number to
        #list of possible group numbers
        for sent in sents_tokens:
            #print(sent)
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    temp_gn = result.group('gn')
                    temp_gn2 = result.group('gn2')
                    if(temp_gn2):
                        if(len(temp_gn2[1:])>len(temp_gn)):
                            poss_nums.append(temp_gn2[1:])
                        else:
                            poss_nums.append(temp_gn)
                    else:
                        #temp_gn2 = temp_gn2[1:]
                        #print(temp_gn2)
                        
                        poss_nums.append(temp_gn)

        #Try and get the number from the file name, looking for list of numeric chars
        num_regex = re.compile("(?P<gn>\d+)(?P<gn2>[-\s]+\d+)?")
        fileName = getFileName(filePath)
        fileGN = num_regex.search(fileName)

        if not fileGN==None:#if they filename has a number sequence
            temp_gn = None
            if(fileGN.group('gn2')):
                if(len(fileGN.group('gn'))>=len(fileGN.group('gn2')[1:])):
                    temp_gn = fileGN.group('gn')
                else:
                    temp_gn = fileGN.group('gn2')[1:]

            else:
                temp_gn = fileGN.group()
            if (temp_gn in poss_nums):#then if the file group number matches one in the document, choose it
                finalGN = temp_gn
            else:
                poss_nums.append(temp_gn)#otherwise add the filename one to the list and try to get the most co
                try:
                    finalGN = mode(poss_nums)
                except Exception as e:
                    return -1
        else: #it is none and there was no group number in the filename
            try:
                finalGN = mode(poss_nums)
            except Exception as e:
                #no mode found, couldn't find a group number
                return -1
    except Exception as e:
        #print(str(e))
        return None
    
    return finalGN

#### get contract start
Uses regex and a list of keywords to attempt to find the start date of the contract. It makes multiple passes based on patterns seen in contract samples so far.

Some of the passes are necessary to filter out non-date numbers that the datefinder incorrectly parses to dates

In [90]:
def getContractStart(sents_tokens):
    start_keywords = ["effective date\S\s*\S", "effective"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    try:
        start_keywords = getSynonymsFromList(start_keywords)
        #print(start_keywords)
        for kw in start_keywords:
                temp_re = kw
                regex = re.compile(temp_re)
                regex_exps.append(regex)

    ## Original pass through sentence tokens to find possible dates
        for sent in sents_tokens:
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    subset = word_tokenize(sent[sent.lower().find(result.group()):])[:20]
                    subset = " ".join(subset)
                    poss_dates.append(subset)
                    if "effective date" in subset.lower():
                        poss_dates.append(subset)

                
    ## Second pass through sentence tokens to find possible dates based on a date range format
        regex_exps = []
        backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
        for kw in backup_kw:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)

        for sent in sents_tokens:
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    half_1 = sent[sent.lower().find(result.group()):]
                    half_2 = sent[:sent.lower().find(result.group())]

                    subset_1 = " ".join(word_tokenize(half_1)[:6])
                    subset_2 = " ".join(word_tokenize(half_2)[-6:])
                    subset = subset_2 + subset_1

                    m = datefinder.find_dates(subset, strict=True)
                    temp_matches = []
                    for match in m:
                        if match.year >= 1966:
                            temp_matches.append(subset)
                    if len(temp_matches)>=2:
                        poss_dates.append(subset)
                        #print(subset)

                    
    ## Second pass through sentences with possible dates to eliminate ones without a year or with an invalid year
    ## These are likely other values flagged incorrectly as dates by the datefinder
    ## 1966 is the year Delta Dental was created
        for sent in poss_dates:
            #print(sent)
            find_year_re = re.compile("\d\d\d\d")
            year = find_year_re.search(sent)

            if not year==None:
                #print(sent)
                #print(year.group())
                m = datefinder.find_dates(sent, strict=True)
                for match in m:
                    if match.year >= 1966:
                        matches.append(match)
## Last pass: try to find the most common date. If there is more than one mode, choose the earliest date
##.           this seems to occur when it is finding the contract start and end in equal quantities
        #print(matches)
        try:
            finalDate = mode(matches)
        except ValueError as e:
            #print(str(e))
            if matches:
                earliestMatch = matches[0]
                for match in matches:
                    if(match < earliestMatch):
                        earliestMatch = match
                finalDate = earliestMatch
            else:
                finalDate = datetime.datetime(1066, 1, 1)
        except Exception as e:
            return None #i.e. not only could they not find a start date, something failed
    except Exception as e:
        return None
    return finalDate

####  get Contract End
Similar to get contract start, it uses regex and keywords over multiple passes to attempt and find the contract end.

In [91]:
def getContractEnd(sents_tokens):
    
    start_keywords = ["contract term\S\s*\S", "contract term ", "contract end", "termination date"]
    #start_keywords = ["termination date"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    try:
        start_keywords = getSynonymsFromList(start_keywords)
        #print(start_keywords)
        for kw in start_keywords:
                temp_re = kw
                regex = re.compile(temp_re)
                regex_exps.append(regex)

    ## Original pass through sentence tokens to find possible dates based on keywords
        for sent in sents_tokens:
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    subset = word_tokenize(sent[sent.lower().find(result.group()):])[:30]
                    #print(subset[2])
                    if not (subset[2]=='beginning'):
                        subset = " ".join(subset)
                       # print(subset)
                        poss_dates.append(subset)


    ## Second pass through sentence tokens to find possible dates based on a date range format
        regex_exps = []
        backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S", "\d\d\d\d\sto\s\S"]

        for kw in backup_kw:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)

        for sent in sents_tokens:
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    half_1 = sent[sent.lower().find(result.group()):]
                    half_2 = sent[:sent.lower().find(result.group())]

                    subset_1 = " ".join(word_tokenize(half_1)[:12])
                    subset_2 = " ".join(word_tokenize(half_2)[-12:])
                    subset = subset_2 + subset_1
                    #print(subset)
                    m = datefinder.find_dates(subset, strict=True)
                    temp_matches = []
                    for match in m:
                        #print(match)
                        if match.year >= 1966:
                            temp_matches.append(subset)
                    if len(temp_matches)>=2:
                        
                        poss_dates.append(subset)
                        #print(subset)


    ## Pass through sentences with possible dates to eliminate ones without a year or with an invalid year
    ## These are likely other values flagged incorrectly as dates by the datefinder
    ## 1966 is the year Delta Dental was created
        for sent in poss_dates:
            #print(sent)
            find_year_re = re.compile("\d\d\d\d")
            year = find_year_re.findall(sent)
            #print(year)
            if len(year)>=2:
                #print(year)
                m = datefinder.find_dates(sent, strict=True)
                maxMatch = datetime.datetime(1066,1,1)
                if not m == None:
                    #print(m)
                    for match in m:
                        if match > maxMatch:
                            maxMatch = match
                    if maxMatch.year >= 1966:
                        matches.append(match)
                        #print(matches)
            elif len(year)==1:
                find_year_re = re.compile("termination date")
                valid_sent = find_year_re.findall(sent.lower())
           #     print(sent)
                m = datefinder.find_dates(sent, strict=True)
                if not m == None:
          #          print(m)
                    for match in m:
                        if match.year >= 1966:
                          #  print(matches)
                            matches.append(match)

        #if(len(matches)):
        #    print(matches)

    ### If there are exactly two matches, try to find a max. If error b/c they're the same, choose one
        if(len(matches) == 2):
            try:
                finalDate = max(matches)
            except ValueError as e:
                finalDate = matches[0]
            except Exception as e:
                return None

    ## If there are more, try and find the top two most mentioned and take the later. else just take the latest            
        elif(len(matches) > 2):

            try:
                date1 = mode(matches)
                matches.remove(date1)

                date2 = mode(matches)
                matches.remove(date2)

                finalDate = max([date1, date2])
            except ValueError as e:
                #print(str(e))
                if matches:
                    latestMatch = matches[0]
                    for match in matches:
                        if(match > latestMatch):
                            latestMatch = match
                    finalDate = latestMatch
            except Exception as e:
                return None
        else:
            finalDate = mode(matches)
            #return datetime.datetime(1066, 1, 1)
            #print("could not find contract end for file")
    except Exception as e:
        return None
    #print("\n")
    return finalDate

#### get Contract Duration
Uses the functions getContractStart and getContractEnd to calculate a duration if possible

In [92]:
def getContractDuration(sents_tokens):
    start = None
    end = None
    duration = None
    
    try:
        start = getContractStart(sents_tokens)
        end = getContractEnd(sents_tokens)
        
        if(start and end):
            if (start.year==1066) or (end.year == 1066):
                return -1
            else:
                duration = (end - start).days
                if duration <= 0:
                    return -1
        else:
            return -1
    except Exception as e:
        print(str(e))
        return None

    return duration

#### get State/Location: -- Not Done

##### Helper function to create location data set from csv

In [93]:
def makeLocationDataStruct():
    categories = []
    location_data = {}
    
    us_filename = 'us_cities_states_counties.csv'
    cwd = os.getcwd()
    filepath = os.path.join(cwd, us_filename)
    #print(filepath)
    with open(filepath, newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='|',escapechar=',')
        category_row = 1
        for row in spamreader:
            if category_row:
                for cat in row:
                    categories.append(cat)
                    location_data[cat]=set()
                category_row = 0
            else:
                
                for item in range(len(row)):
                    #print(row[item])
                    if len(row[item]):
                        location_data[categories[item]].add(row[item])
        location_data['State full'].add("Washington , DC")
        location_data['State full'].add("Washington , D.C.")
    csvfile.close()
    
    states_filename = 'state_abbrv_to_name.csv'
    filepath = os.path.join(cwd, states_filename)
    
    location_data['translate_s2l'] = {}
    location_data['translate_l2s'] = {}
    
    with open(filepath, newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        category_row = 1
        for row in spamreader:
            if category_row:
                category_row = 0
            else:
                location_data['translate_s2l'][row[0]] = row[1]
                location_data['translate_l2s'][row[1]] = row[0]
    csvfile.close()
    #print(location_data)
    return location_data

##### Call function to set up location data

In [94]:
location_data = makeLocationDataStruct()


##### Get Client Location

In [95]:
def checkIfCity(loc_str, loc_data, isDelta, isContractholder):
    cities = []
    
    if (loc_str in loc_data['City']):
        cities.append(loc_str)
    return cities

def checkIfState(loc_str, loc_data):
    states = []
    
    if(loc_str in loc_data['State full']):
        states.append(loc_str.lower())
    if(loc_str in location_data['State short']):
        try:
            states.append(location_data['translate_s2l'][loc_str].lower())
        except Exception as e:
            return states
    return states
    
def getClientLocation(sents_tokens, bgs, tgs, location_data, filename):
    loc_sents = set()
    cities = []
    states = []
    
    
    try:
        
        for sent in sents_tokens:

            sent_cities = []
            sent_states = []

            isDelta = "deltadental" in sent.lower()
            isContractholder = "contractholder" in sent.lower()
            isNotice = "notice to contractholder" in sent.lower()
            DCtext = ["washington , d.c.","washington , dc", "washington dc","district of columbia","washington d.c."]

            #Used to filter out sentences in ALL caps. They interfere with the Abbreviated States lists
            if sent.isupper():
                sent = sent.lower()


            text = nltk.word_tokenize(sent)

            i = 0

            while (i < len(text)) and (len(text) > 2):
                if(i < len(text)-2):
                    text_bg = " ".join([text[i], text[i+1]])
                else:
                    text_bg = ""
                if(i < len(text)-2):
                    text_tg = " ".join([text[i], text[i+1], text[i+2]])
                else:
                    text_tg = ""

                sent_cities.extend(checkIfCity(text[i], location_data, isDelta, isContractholder))
                sent_cities.extend(checkIfCity(text_bg, location_data, isDelta, isContractholder))
                sent_cities.extend(checkIfCity(text_tg, location_data, isDelta, isContractholder))

                sent_states.extend(checkIfState(text[i], location_data))
                sent_states.extend(checkIfState(text_bg, location_data))
                sent_states.extend(checkIfState(text_tg, location_data))

                i+=1

            if (len(sent_states)>0) and not isDelta:
                
                if bool(set(sent_states).intersection(DCtext)):
                    
                    for st in sent_states:
                        if st in DCtext:
                            states.append(st.lower())
                            if(isNotice):
                                states.append(st.lower())
                           
                short_sent_states = []
                for st in sent_states:
                    try:
                        short_sent_states.append(location_data['translate_l2s'][st.title()].lower())
                    except Exception as e:
                        e=e

                for city in sent_cities:
                    for state in sent_states+short_sent_states:
                        
                        if (len(checkIfState(city + " " + state.title(), location_data))==0) and not (city is "New York"):
                                
                                add_regex_str = str(city) + "[-,\s]+" + str(state) + "[-,\s]"
                                add_regex = re.compile(add_regex_str, re.IGNORECASE)
                                matches = add_regex.findall(sent)
                                if matches:
                                    cities.append(city)
                                    states.append(state.lower())
                                    if(isNotice):
                                        states.append(state.lower())
                                    
            
        FN_chunks = re.findall(r"[\w]+|[-\s_]", filename)
        
        for chunk in FN_chunks:
            FN_state = checkIfState(chunk, location_data)
            if len(FN_state)>0:
                for i in range(0,5):
                    states.append(FN_state[0].lower())
                    i+=1
                
              
        try:
            for st in states:
                st = st.lower()
            final_state = mode(states)
            #print(final_state)
            if(len(final_state)<3):
                try:
                    final_state = location_data['translate_s2l'][final_state.upper()].lower()
                except Exception as e:
                    #print(str(e))
                    final_state = final_state
        except ValueError as ve:
            #print(str(ve))
            if(len(states)>0):
                for st in states:
                    st = st.lower()
                try:
                    final_state = mode(states)
                except Exception as e:
                    final_state = "not_a_state"
            else:
                final_state = "not_a_state"
        except Exception as e:
            #print(str(e))
            final_state = None
        
    
    except Exception as e:
        print(str(e))
        return None #error in function execution
    return final_state

#### Delta Office Involved V2:

In [177]:
def getDeltaOffice(sents_tokens, bgs, tgs, location_data):
    loc_sents = set()
    states = []
    try:
        for sent in sents_tokens:
           
            sent_states = []

            isDelta = "deltadental" in sent.lower()
            isContractholder = "contractholder" in sent.lower()
            isNotice = "notice to delta dental" in sent.lower()

            if isDelta:
                
                if sent.isupper():
                    sent = sent.lower()

                #valid_states_regex = ["[wW]ashington *,? +[dD].?[cC].?","[dD]istrict [oO]f [cC]olumbia","[, ]+[gG][aA][\s,]+","[gG]eorgia","[, ]+[cC][aA][\s,]+","[cC]alifornia","[, ]+[pP][aA][\s,]+","[pP]ennsylvania","[, ]+[nN][yY][\s,]+","[nN]ew [yY]ork","[, ]+[wW][vV][\s,]+","[wW]est [vV]irginia","[, ]+[uU][tT][\s,]+","[uU]tah"]                       
                valid_states_regex = ["[, ]+(?P<state>[gG][aA])[\s,]+","(?P<state>[gG]eorgia)","[, ]+(?P<state>[cC][aA])[\s,]+","(?P<state>[cC]alifornia)","[, ]+(?P<state>[pP][aA])[\s,]+","(?P<state>[pP]ennsylvania)"]                       
                for r in valid_states_regex:
                    r = re.compile(r)
                    results = r.search(sent)
                    if(results):
                        print(sent)
                        states.append(results.group('state'))
            else:#(?P<word>\b\w+\b)
                state_and_cities_regex = ["[Aa]lpharetta[, ]+(?P<state>[gG][aA])[\s,]+","[aA]lpharetta[, ]+(?P<state>[gG]eorgia)",
                                          "[sS]acramento[, ]+(?P<state>[cC][aA])[\s,]+","[sS]acramento[, ]+(?P<state>[cC]alifornia)",
                                          "[sS]an [dD]iego[, ]+(?P<state>[cC][aA])[\s,]+","[sS]an [dD]iego[, ]+(?P<state>[cC]alifornia)",
                                          "[sS]an [fF]rancisco[, ]+(?P<state>[cC][aA])[\s,]+","[sS]an [fF]rancisco[, ]+(?P<state>[cC]alifornia)",
                                          "[mM]echanicsburg[, ]+(?P<state>[pP][aA])[\s,]+","[mM]echanicsburg[, ]+(?P<state>[pP]ennsylvania)"] 
                for r in state_and_cities_regex:
                    r = re.compile(r)
                    results = r.search(sent)
                    if(results):
                        print(sent)
                        states.append(results.group('state'))

        print(states)
        #print(valid_states)
        temp_states = []
        for state in states:
            if(len(state)==2):
                try:
                    long_state = location_data['translate_s2l'][state.upper()]
                    temp_states.append(long_state.lower())
                except Exception as e:
                    print(str(e))
            else:
                temp_states.append(state.lower())
        states = temp_states
        print(states)
        
        try:
            final_state = mode(states)
            if(len(final_state)<3):
                try:
                    final_state = location_data['translate_s2l'][final_state.upper()]
                except:
                    final_state = final_state
        except ValueError as ve:
            final_state = "not_a_state"
        except Exception as e:
            #print(e)
            final_state = None
        
    except Exception as e:
        print(str(e))
        final_state = None
    print(final_state)
    return final_state

#### Delta Office Involved -- V1

In [178]:
def getDeltaOffice_V1(sents_tokens, bgs, tgs, location_data):
    loc_sents = set()
    cities = []
    states = []
    DCtext = ["washington , d.c.","washington , dc", "washington dc","district of columbia","washington d.c."]
    try:
        for sent in sents_tokens:
           
            sent_cities = []
            sent_states = []

            isDelta = "deltadental" in sent.lower()
            isContractholder = "contractholder" in sent.lower()
            isNotice = "notice to delta dental" in sent.lower()



            if isDelta:
                
                if sent.isupper():
                    sent = sent.lower()
                    
                text = nltk.word_tokenize(sent)

                i = 0

                while (i < len(text)) and (len(text) > 2):
                    if(i < len(text)-2):
                        text_bg = " ".join([text[i], text[i+1]])
                    else:
                        text_bg = ""
                    if(i < len(text)-2):
                        text_tg = " ".join([text[i], text[i+1], text[i+2]])
                    else:
                        text_tg = ""

                    sent_cities.extend(checkIfCity(text[i], location_data, isDelta, isContractholder))
                    sent_cities.extend(checkIfCity(text_bg, location_data, isDelta, isContractholder))
                    sent_cities.extend(checkIfCity(text_tg, location_data, isDelta, isContractholder))

                    sent_states.extend(checkIfState(text[i], location_data))
                    sent_states.extend(checkIfState(text_bg, location_data))
                    sent_states.extend(checkIfState(text_tg, location_data))

                    i+=1

                if (len(sent_states)>0):
                    if bool(set(sent_states).intersection(DCtext)):
                        
                        for st in sent_states:
                            if st in DCtext:
                                states.append(st)
                                if(isNotice):
                                    states.append(st)
                                
                    short_sent_states = []
                    for st in sent_states:
                        try:
                            short_sent_states.append(location_data['translate_l2s'][st.title()].lower())
                        except Exception as e:
                            e=e
                           
                    for city in sent_cities:
                        for state in sent_states+short_sent_states:
                            if (len(checkIfState(city + " " + state.title(), location_data))==0) and not (city is "New York"):

                                add_regex_str = str(city) + "[-,\s]+" + str(state) + "[-,\s]"
                                add_regex = re.compile(add_regex_str, re.IGNORECASE)
                                matches = add_regex.findall(sent)
                                if matches:
                                    cities.append(city)
                                    states.append(state)
                                    if(isNotice):
                                        states.append(state)
        valid_states = ["washington , d.c.","washington , dc", "washington dc","district of columbia","washington d.c.","ga","GA","georgia","Georgia","CA","ca","California","california","PA","pa","Pennsylvania","pennsylvania","NY","ny","New York","new york","WV","wv","West Virginia","west virginia","UT","ut","Utah","utah"]                       
        
        #print(states)
        #print(valid_states)
        tempstates = []
        if(states):
            for state in states:
                #print(state)
                if state in valid_states:
                    tempstates.append(state)
                else:
                    print(state)
        #print(tempstates)
        states = tempstates
        try:
            final_state = mode(states)
            if(len(final_state)<3):
                try:
                    final_state = location_data['translate_s2l'][final_state.upper()]
                except:
                    final_state = final_state
        except ValueError as ve:
            final_state = "not_a_state"
        except Exception as e:
            #print(e)
            final_state = None
        
    except Exception as e:
        print(str(e))
        final_state = None
    print(final_state)
    return final_state

#### Get contractholder name

In [98]:
def longestSubstring(filename, sent):
    filename = filename.replace("docx", "")
    filename = filename.replace("pdf", "")
    filename = filename.replace("txt", "")
    filename = re.sub(r'[-,()._]', r' ',filename)
    filename = re.sub(r'\d+', r' ', filename)
    #print(filename)
    fn_words = nltk.word_tokenize(filename.lower())
    #print(fn_words)
    
    sent = sent.replace("docx", "")
    sent = sent.replace("pdf", "")
    sent = sent.replace("txt", "")
    sent = re.sub(r'[-,()._]', r' ',sent)
    sent = re.sub(r'\d+', r' ', sent)
    sent_words = nltk.word_tokenize(sent.lower())
    matches = []
    match = []
    i=0
    j=0
    
    while(i < len(fn_words)):
        while(j < len(sent_words)):
            if fn_words[i] == sent_words[j]:
                #match found
                #match.append(fn_words[i])
                m=i
                k=j
                while((m<len(fn_words)) and (k<len(sent_words)) and (fn_words[m] == sent_words[k])):
                    match.append(fn_words[m])
                    m+=1
                    k+=1
                if(len(match)>1):
                    #re.match(r"hello[0-9]+", 'hello1')
                    file_kw = [r'schedule\s\w{1,2}', r'attachment\s[a-zA-Z]', r'appendix\s[a-zA-Z]']
                    falseMatch = 0
                    for fm in file_kw:
                        if(re.match(fm, " ".join(match))):
                            falseMatch = 1
                    if not falseMatch:
                        matches.append(match)
                match = []
                j+=1
            else:
                j+=1
        j=0
        i+=1
            
        
    longest_match = []
    if(len(matches)):
        #print(matches)
        
        for m in matches:
            m = " ".join(m)
            if(len(m) > len(longest_match)):
                longest_match = m
        
        return longest_match
    
    return None



def getContractholder(sents_tokens, fileName):
    #start_keywords = ["contractholder\s?[-:]+\s?(?P<name>\w+)"]
    start_keywords = ["contractholder name\s?[-:]+\s+(?P<name>[\w\s]+)[gG]roup\s?[nN]umber", "contractholder\s?[-:]\s+(?P<name>[\w\s]+)[gG]roup\s?[nN]umber"]
    regex_exps = []
    poss_names = []
    finalName = ""
    matches = []
    
    try:
        #start_keywords = getSynonymsFromList(start_keywords)
        #print(start_keywords)
        for kw in start_keywords:
                temp_re = kw
                regex = re.compile(temp_re)
                regex_exps.append(regex)

        for sent in sents_tokens:
            for my_regex in regex_exps:
                result = my_regex.search(sent.lower())
                if not result==None:
                    #subset = word_tokenize(sent[sent.lower().find(result.group('name')):])
                    subset = result.group('name')
                    #subset = " ".join(subset)
                    if not (subset.startswith('the employer')):
                        poss_names.append(subset)
            
            
            
        for sent in sents_tokens[0:5]:
            #print(sent)
            substring = longestSubstring(fileName, sent)
            if substring:
                poss_names.append(substring)
        
        
        
        #if(len(poss_names)>0):
            #print(poss_names)
        try:
            finalName = mode(poss_names)
            #print(finalName)
        except ValueError as e:
            e=e
            #print(str(e))
            #if matches:
                #earliestMatch = matches[0]
                #for match in matches:
                #    if(match < earliestMatch):
                #        earliestMatch = match
                #finalDate = earliestMatch
            #else:
            #    finalDate = datetime.datetime(1066, 1, 1)
        except Exception as e:
            print(str(e))
            return None #i.e. not only could they not find a start date, something failed
    except Exception as e:
        print(str(e))
        return None
    return finalName


#for file in base_info:
 #   fileName = os.path.split(file['filepath'])[1]
  #  print(fileName)
   # contractholderName = getContractholder(file['sentTokens'], fileName)
    #if(contractholderName):
     #   print(contractholderName)
      #  print("\n")
    #for sent in file['sentTokens'][0:5]:
        #print(sent)
        ##longestSubstring(fileName, sent)
    #longestSubstring(fileName, "test string")

#### File type functions

In [99]:
def isEnterprise(fileName, footer):

    isEnterprise = 0

    #print(fileName)
    enterpriseList = ["-ENT","ENT-","E-"]
    if(footer and len(footer)>0):
        for f in footer:
            #print(f)
            if(any([x in f for x in enterpriseList])):
                isEnterprise = 1

    #print("\n")
    return isEnterprise

def isASC(filePath, footer):
    isASC = 0

    filename = os.path.split(filePath)[1]


    ascList = ["ASC-", "-ASC","ASO-","-ASO"," ASC", "ASC "," ASO", "ASO "]
    ascRegex = ["[- (]ASC"]
    for r in ascRegex:
        r = re.compile(r)
        results = r.search(filename)
        if(results):
            isASC = 1
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in ascList])):
                isASC = 1

    return isASC


def isEOC(filePath, footer):
    isEOC = 0

    filename = os.path.split(filePath)[1]


    eocList = ["EOC-", "-EOC"]
    eocRegex = ["[- (]EOC"]
    for r in eocRegex:
        r = re.compile(r)
        results = r.search(filename)
        if(results):
            isEOC = 1
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in eocList])):
                isEOC = 1

    return isEOC

def isEBB(filePath, footer):
    isEBB = 0

    filename = os.path.split(filePath)[1]


    ebbList = ["EBB-", "-EBB"]
    ebbRegex = ["[- (]EBB"]
    for r in ebbRegex:
        r = re.compile(r)
        results = r.search(filename)
        if(results):
            isEBB = 1
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in ebbList])):
                isEBB = 1

    return isEBB

def isSchedule(filePath, footer):
    isSchedule = 0

    filename = os.path.split(filePath)[1]
    #print(filename)
    schRegex = ["[sS]chedule\s+I","[ (]S[I]+[) ]","[sS]ch[ed]*[ ]+[I12]+"]
    if "schedule" in filename.lower():
        isSchedule = 1
        #print("in filename")
    else:
        for r in schRegex:
            r = re.compile(r)
            results = r.search(filename)
            if results:
                isSchedule = 1
                #print(filename)
                #print(results.group())
    #print("\n")

    return isSchedule

def isContract(filePath, footer, sentTokens):
    isContract = 0

    filename = os.path.split(filePath)[1]
    #print(filename)
    #print(sentTokens[0])
    #print("\n")
    #schRegex = ["[cC]ontract","CONTRACT"]
    contractList = ["MC-", "-MC"]
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in contractList])):
                isContract = 1

    if "contract" in filename.lower():
        isContract = 1
        #print("in filename")
        #print("\n")
    if (len(sentTokens)>5):
        for sent in sentTokens[0:5]:
            if "this contract is entered into" in sent.lower():
                isContract = 1

    return isContract

def isAttachment(filePath, footer):
    isAttachment = 0

    filename = os.path.split(filePath)[1]
    #print(filename)
    attachRegex = ["[aA]ttach[ment]*"]

    for r in attachRegex:
        r = re.compile(r)
        results = r.search(filename)
        if results:
            isAttachment = 1
            
    return isAttachment

def isAppendix(filePath, footer):
    isAppendix = 0

    filename = os.path.split(filePath)[1]
    appRegex = ["[aA]ppendix"]

    for r in appRegex:
        r = re.compile(r)
        results = r.search(filename)
        if results:
            isAppendix = 1

    return isAppendix

def isRider(filePath, footer):
    isRider = 0

    filename = os.path.split(filePath)[1]
    #print(filename)
    riderRegex = ["[-( ]R\d[\d]?"]

    for r in riderRegex:
        r = re.compile(r)
        results = r.search(filename)
        if results:
            isRider = 1
            #print(filename)
            #print(results.group())
    #print("\n")

    return isRider

def isTaxModification(filePath, footer):
    isTaxModification = 0

    filename = os.path.split(filePath)[1]
    print(filename)
    tmRegex = ["[tT]ax[- ]+[mM]odif[ication]*","TAX[- ]+MODIF[ICATION]*"]

    for r in tmRegex:
        r = re.compile(r)
        results = r.search(filename)
        if results:
            isTaxModification = 1
            #print(filename)
            print(results.group())
    print("\n")

    return isTaxModification

def isSBCModification(filePath, footer):
    isSBC = 0

    filename = os.path.split(filePath)[1]


    sbcList = ["SBC-", "-SBC"]
    sbcRegex = ["[- (_]SBC"]
    for r in sbcRegex:
        r = re.compile(r)
        results = r.search(filename)
        if(results):
            isSBC = 1
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in sbcList])):
                isSBC = 1

    return isSBC

def isPremiumAgreement(filePath, footer):
    isPremiumAgreement = 0

    filename = os.path.split(filePath)[1]


    PAList = ["PRM-", "-PRM"]
    PARegex = ["[- (]PREM AGMT", "[Pp]remium[ ]+[Aa]greement"]
    for r in PARegex:
        r = re.compile(r)
        results = r.search(filename)
        if(results):
            isPremiumAgreement = 1
    if(footer and len(footer)>0):
        for f in footer:
            if(any([x in f for x in PAList])):
                isPremiumAgreement = 1

    return isPremiumAgreement


def getFileTypes(filePath, footer, sentTokens):
    types = []

    if(isEOC(filePath, footer)):
        types.append("EOC")

    if(isEBB(filePath, footer)):
        types.append("EBB")

    if(isSchedule(filePath, footer)):
        types.append("Schedule")

    if(isContract(filePath, footer, sentTokens)):
        types.append("Contract")

    if(isAttachment(filePath, footer)):
        types.append("Attachment")

    if(isRider(filePath, footer)):
        types.append("Rider")

    if(isTaxModification(filePath, footer)):
        types.append("TaxModification")

    if(isSBCModification(filePath, footer)):
        types.append("SBCModification")

    if(isPremiumAgreement(filePath, footer)):
        types.append("PremiumAgreement")
    
    if(isAppendix(filePath, footer)):
        types.append("Appendix")

    return types




### Batch Run to get attributes
#### Functions to process multiple files and their attributes at once

#### Function: batch pre process: fill output folder

In [100]:
def batchPreProcess(errorFile, pathToData):
    cwd = os.getcwd()
    ##print(cwd)
    processedTextPath = ""
    
    #dataPath = os.path.join(cwd, "processed")
    if(os.path.isdir(pathToData)):

        for file in os.listdir(pathToData):
            filepath = os.path.join(pathToData, file)
            if(os.path.isfile(filepath)):
                print("pre-processing: " + file)
                try:
                    
                    if(checkFileType(filepath) == 0):
                        processedTextPath = processDocxFile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " +  filepath)
                            
                    elif(checkFileType(filepath) == 1):
                        processedTextPath = processPDFfile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " + filepath)
                            
                    elif(checkFileType(filepath) == 2):
                        processedTextPath = processDocFile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " + filepath)
                            
                    else:
                        errorFile.write(filepath + ", pre-processing: invalid filetype\n")
                        raise TypeError('This path does not lead to a valid file type!')                     
                except Exception as e:
                    print(str(e))
                    errorFile.write(filepath + ", pre-processing," + str(e) + "\n")
                    print("Error pre-processing file: " + filepath)

    else:
        print("Folder data/raw doesn't exist")
        return None
    return "success"

#### Function: Batch return token and bigram sets for all output files
Returns file information as an array of objects containing key:value information about the file: 

In [101]:
def batchGetTokens(errorFile, dataPath):
    all_tokens = []
    cwd = os.getcwd()
    processedTextPath = ""
    
    #dataPath = os.path.join(cwd, "output")
    
    if(os.path.isdir(dataPath)):

        for file in os.listdir(dataPath):
            filepath = os.path.join(dataPath, file)
            if(os.path.isfile(filepath) and file.endswith(".txt")):
                try:

                    temp_obj = {}

                    with open(filepath, 'r', encoding='utf-8') as txtFile:
                        text = txtFile.read()

                    temp_obj['filepath'] = filepath

                    text = ddCleanText(text)
                    temp_obj['cleanText'] = text

                    wordTokens = getTokens(text)
                    sentTokens = getSents(text)
                    temp_obj['wordTokens'] = wordTokens
                    temp_obj['sentTokens'] = sentTokens

                    bgs = getBigrams(wordTokens)
                    tgs = getTrigrams(wordTokens)
                    temp_obj['bgs'] = bgs
                    temp_obj['tgs'] = tgs

                    temp_obj['footer'] = getFooter(filepath)
                    txtFile.close()
                    all_tokens.append(temp_obj)
                except Exception as e:
                    print(str(e))


    else:
        print("Folder /output doesn't exist. Pre-processing failed.")
        return None
    return all_tokens


#### Function: get metadata attributes
This function takes in a single files info -- in this section because it will be used in a batch function

In [179]:
def getMetaDataAtt(file_info):
    #print(file_info)
    file_attr = {}
    file_attr['filepath'] = file_info['filepath']

    fileName = getFileName(file_info['filepath'])
    if not fileName:
        fileName = os.path.split(file_info['filepath'])[1]
        file_attr['fileName'] = fileName
    else:
        #print(fileName)
        file_attr['fileName'] = fileName

    fileType = isEnterprise(fileName, file_info['footer'])
    file_attr['fileType'] = fileType

    groupNumber = getGroupNumber(file_info['sentTokens'], file_info['filepath'])
    file_attr['groupNumber'] = groupNumber


    contractStartDate = getContractStart(file_info['sentTokens'])
    file_attr['contractStartDate'] = contractStartDate


    contractEndDate = getContractEnd(file_info['sentTokens'])
    file_attr['contractEndDate'] = contractEndDate


    contractDuration = getContractDuration(file_info['sentTokens'])
    file_attr['contractDuration'] = contractDuration

    
    clientLocation = getClientLocation(file_info['sentTokens'], file_info['bgs'], file_info['tgs'], location_data, fileName)
    clientLocation = clientLocation.replace(",","")
    #if not clientLocation == "not_a_state":
    #    clientLocation[0].upper()
    file_attr['clientLocation'] = clientLocation


    deltaOfficeLocation = getDeltaOffice(file_info['sentTokens'], file_info['bgs'], file_info['tgs'], location_data)
    deltaOfficeLocation = deltaOfficeLocation.replace(",","")
    #if not deltaOfficeLocation == "not_a_state":
     #   deltaOfficeLocation[0].upper()
    file_attr['deltaOfficeLocation'] = deltaOfficeLocation


    contractholderName = getContractholder(file_info['sentTokens'], fileName)
    file_attr['contractholderName'] = contractholderName

    ent = isEnterprise(file_info['filepath'], file_info['footer'])
    file_attr['isENT'] = ent

    asc = isASC(file_info['filepath'], file_info['footer'])
    file_attr['isASC'] = asc

    fileTypes = getFileTypes(file_info['filepath'], file_info['footer'], file_info['sentTokens'])
    typeRanks = ["Contract","EOC","Attachment","Schedule","EBB","Appendix","Rider","TaxModification","SBCModification","PremiumAgreement"]
    mainFileType = ""
    for rank in typeRanks:
        if rank in fileTypes:
            mainFileType = rank
            break
    file_attr['mainFileType'] = mainFileType
    file_attr['fileTypes'] = fileTypes


    file_attr['footer'] = file_info['footer']
    eoc = isEOC(file_info['filepath'], file_info['footer'])
    file_attr['isEOC'] = eoc


    return file_attr



## Workspace Prep Functions

### Function: Initial setup/fill Raw data folder
    Just sets up the expected folder structure. Fills nothing.

In [103]:
def setupWorkspace(rawPath, errorFile):
    cwd = os.getcwd()
    print("cwd: " + cwd)
    
    
    processedPath = os.path.join(cwd, "processed")
    outputPath = os.path.join(cwd, "output")
    dataPath = os.path.join(cwd, "data")
    
    try:
        if not(os.path.isdir(rawPath)):
            raise Exception(rawPath + " doesn't exist.")
        elif not(os.listdir(rawPath)):
            raise Exception(rawPath + " is empty.")
        else:
        
            if(os.path.isdir(processedPath)):
                raise Exception(processedPath + " already exists.")
            else:
                print("creating " + processedPath + "...") 
                os.makedirs(processedPath)

            if(os.path.isdir(outputPath)):
                raise Exception(outputPath + " already exists.")
            else:
                print("creating " + outputPath + "...")
                os.makedirs(outputPath)

            if(os.path.isdir(dataPath)):
                raise Exception(dataPath + " already exists.")
            else:
                print("creating " + dataPath + "...")
                os.makedirs(dataPath)
            return "Success"
    except Exception as e:
        print(str(e))
        return None   

### Function: Move to processed folder
    Moves all files in the raw folder to the processed folder

In [104]:
def moveToProcessedFolder(rawPath, errorFile):
    cwd = os.getcwd()
    print("cwd: " + cwd)
    
    processedPath = os.path.join(cwd, "processed")
    outputPath = os.path.join(cwd, "output")
    dataPath = os.path.join(cwd, "data")
    
    try:
        if(os.path.isdir(rawPath) and os.path.isdir(processedPath)):
            #is it empty?
            if not os.listdir(rawPath):
                raise Exception("raw data folder is empty.")
            else:
                for file in os.listdir(rawPath):
                    print("processing: " + file)
                    #move a copy to processed Path
                    try:
                        #print("moving: " + file)
                        shutil.copy(os.path.join(rawPath, file), os.path.join(processedPath, file))
                    #catch copy exception here so it doesn't stop all files (?)
                    except Exception as e:
                        print(str(e))
                return processedPath
                        
            
        else:
            raise Exception("Expected file structure doesn't exist.")
            
        
    except Exception as e:
        print(str(e))
        return None

### Function: Fill output folder with .txt files
    Process all files in the Processed folder and output resulting txt to output folder

In [105]:
def createTxtFiles(pathToData, errorFile):
    cwd = os.getcwd()
    processedTextPath = ""
    
    #dataPath = os.path.join(cwd, "processed")
    if(os.path.isdir(pathToData)):

        for file in os.listdir(pathToData):
            filepath = os.path.join(pathToData, file)
            if(os.path.isfile(filepath)):
                print("pre-processing: " + file)
                try:
                    
                    if(checkFileType(filepath) == 0):
                        processedTextPath = processDocxFile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " +  filepath)
                            
                    elif(checkFileType(filepath) == 1):
                        processedTextPath = processPDFfile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " + filepath)
                            
                    elif(checkFileType(filepath) == 2):
                        processedTextPath = processDocFile(filepath, errorFile)
                        if not processedTextPath:
                            print("Error pre-processing file: " + filepath)
                            
                    else:
                        #errorFile.write(filepath + ", pre-processing: invalid filetype\n")
                        raise TypeError('This path does not lead to a valid file type!')                     
                except Exception as e:
                    print(str(e))
                    #errorFile.write(filepath + ", pre-processing," + str(e) + "\n")
                    print("Error pre-processing file: " + filepath)

    else:
        print("Folder data/raw doesn't exist")
        return None
    return "success"

### Run it

In [106]:
def rawToWorkspace(pathToRawData):
    errorFilePath = os.path.join(os.getcwd(),'cannot_process.csv')
    errorFile = open(errorFilePath, 'w')
    
    setup = setupWorkspace(pathToRawData,errorFile)
    if(setup):
        moveToProcessed = moveToProcessedFolder(pathToRawData,errorFile)
        if(moveToProcessed):
            createTxt = createTxtFiles(os.path.join(os.getcwd(), "processed"),errorFile)
            if(createTxt):
                errorFile.close()
                return "Success"
    errorFile.close()
    return None
    
#rawToWorkspace("C:\\Users\\Sydney.knox\\Documents\\rawDataDI")

## Organize the Data

### Function: Create Group folders and move group files in

In [107]:
def getGroupFolders():
    cwd = os.getcwd()
    dataPath = os.path.join(cwd, "data")
    outputPath = os.path.join(cwd, "output")
    
    errorFilePath = os.path.join(os.getcwd(), 'cannot_process.csv')
    errorFile = open(errorFilePath, 'w')
    
    try:
        if not(os.path.isdir(dataPath)):
            raise Exception(dataPath + " doesn't exist.")
        elif not(os.path.isdir(outputPath)):
            raise Exception(outputPath + " doesn't exist.")
        elif not(os.listdir(outputPath)):
            raise Exception(dataPath + " is empty.")
        else:
            #get all of the base info for each file
            base_info = batchGetTokens(errorFile, outputPath)
            if not base_info:
                print("Error in getting tokens")
            else:
                
                sorted_file_info = {}
                #print(sorted_file_info)
                #Get all the meta data for each file
                for file in base_info:
                    destPath = dataPath
                    
                    file_attr = getMetaDataAtt(file)
                    if((file_attr['groupNumber']) and not(int(file_attr['groupNumber']) is -1)):
                        gn = file_attr['groupNumber']
                        destPath = os.path.join(destPath, str(int(gn)))
                    else:
                        gn = -1
                    #destPath = os.path.join(destPath, "no_group_number")
                    try:
                        sorted_file_info[str(int(gn))].append(file_attr)
                    except KeyError as e:
                        sorted_file_info[str(int(gn))] = []
                        sorted_file_info[str(int(gn))].append(file_attr)
                    except Exception as e:
                        print(str(e))

                for group in sorted_file_info:
                    print(group)
                    group_start_dates = set()
                    group_ch_names = set()
                    oneStartDate = 0
                    oneCHName = 0
                    
                    
                    for file in sorted_file_info[group]:
                        if(file['contractStartDate'].year > 1950):
                            group_start_dates.add(file['contractStartDate'].date())
                        if(len(file['contractholderName'])>0):
                            group_ch_names.add(file['contractholderName'].rstrip())
                
                
                    if(len(group_start_dates)==1):
                        oneStartDate = 1
                    elif(len(group_start_dates)==0):
                        oneStartDate = 1
                        group_start_dates.add("unknown_startDate")
                    if(len(group_ch_names)==1):
                        oneCHName = 1
                    elif(len(group_ch_names)==0):
                        oneCHName = 1
                        group_ch_names.add("unknown_name")

                    for file in sorted_file_info[group]:
                        if(int(group) == -1):
                            destPath = os.path.join(dataPath, "no_group_number")
                        else:
                            destPath = os.path.join(dataPath, str(int(group)))
                        #print(destPath)
                        if oneCHName:
                            temp = group_ch_names.pop()
                            destPath = os.path.join(destPath, temp)
                            #print(destPath)
                            group_ch_names.add(temp)
                        else:
                            if(len(file['contractholderName'])>0):
                                destPath = os.path.join(destPath, file['contractholderName'])
                            else:
                                destPath = os.path.join(destPath, "unknown_name")
                        if oneStartDate:
                            temp = group_start_dates.pop()
                            destPath = os.path.join(destPath, str(temp))
                            #print(destPath)
                            group_start_dates.add(temp)
                        else:
                            destPath = os.path.join(destPath, str(file['contractStartDate'].date()))
                        
                        try:
                            print(destPath)
                            os.makedirs(destPath, exist_ok=True)
                            #print(file['filepath'])
                            shutil.copy(file['filepath'], destPath)
                        except Exception as e:
                            print(str(e))
        
                    
    except Exception as e:
        print(str(e))


### Run 'em

In [108]:
#getGroupFolders()

## Run through using group folders

In [109]:
def createGroupCSV():
    cwd = os.getcwd()
    dataPath = os.path.join(cwd, "data")

    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')

    #attrByGroupFilePath = os.path.join(os.getcwd(), 'data', 'raw_attr_data_byGroup.csv')
    #attrByGroupFile = open(attrByGroupFilePath, 'w')

    try:
        if(os.path.isdir(dataPath)):

            for group in os.listdir(dataPath):
                print(group)            
                groupPath = os.path.join(dataPath, group)
                if(os.path.isdir(groupPath)):
                    base_info = batchGetTokens(errorFile, groupPath)
                    if not base_info:
                        print("Error in getting tokens")
                    else:
                        outputFilePath = os.path.join(groupPath,'group_'+ group + '_attribute_data.csv')
                        outputFile=open(outputFilePath, 'w')    
                        first_row = 1

                        for file in base_info:
                            file_attr = getMetaDataAtt(file)
                            if first_row:
                                for key in file_attr:
                                    outputFile.write(key + ",")
                                outputFile.write("\n")
                                first_row = 0
                            else:
                                for attr in file_attr:
                                    outputFile.write(str(file_attr[attr])+ ",")
                            outputFile.write("\n")

                        outputFile.close()

        else:
            raise Exception("data folder doesn't exist.")
    except Exception as e:
        print(str(e))

    errorFile.close()
    
    
def collectGroupInfo():
    cwd = os.getcwd()
    dataPath = os.path.join(cwd, "data")

    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')

    attrByGroupFilePath = os.path.join(os.getcwd(), 'data', 'raw_attr_data_byGroup.csv')
    attrByGroupFile = open(attrByGroupFilePath, 'w')

    try:
        if(os.path.isdir(dataPath)):

            for group in os.listdir(dataPath):
                
                
                
                groupPath = os.path.join(dataPath, group)
                if(os.path.isdir(groupPath)):
                    print(group)
                    group_attr = {}
                    first_file = 1
                    
                    base_info = batchGetTokens(errorFile, groupPath)
                    if not base_info:
                        print("Error in getting tokens")
                    else:
                        
                        for file in base_info:
                            file_attr = getMetaDataAtt(file)
                            #print(file_attr)
                            if first_file:
                                for key in file_attr:
                                    try:
                                        group_attr[key] = []
                                    except Exception as e:
                                        print(str(e))
                                first_file = 0
                            
                            print(group_attr)
                            
                            for attr in file_attr:
                                #print(attr)
                                #print(file_attr[attr] == "-1")
                                #print(attr)
                                #print((attr is 'groupNumber') or (attr is 'contractDuration'))
                                if((attr is 'groupNumber') or (attr is 'contractDuration')):
                                    #print(file_attr[attr] == "-1")
                                    #print(file_attr[attr] == -1)
                                    if not ((file_attr[attr]) == -1):
                                        group_attr[attr].append(file_attr[attr])
                                if((attr is 'contractStartDate') or (attr is 'contractEndDate')):
                                    #print((file_attr[attr] == datetime.datetime(1066,1,1)))
                                    if not (file_attr[attr] == datetime.datetime(1066,1,1)):
                                        group_attr[attr].append(file_attr[attr])
                                if((attr is 'clientLocation') or (attr is 'deltaOfficeLocation')):
                                    #print(file_attr[attr] is 'not_a_state')
                                    if not file_attr[attr] is 'not_a_state':
                                        group_attr[attr].append(file_attr[attr])
                                if(attr is 'contractHolderName'):
                                    if not file_attr[attr] is '':
                                        group_attr[attr].append(file_attr[attr])
                            print(group_attr)
                        #attrByGroupFile.write("\n")
                        for attr in group_attr:
                            #print(group_attr[attr])
                            try:
                                attrMode = mode(group_attr[attr])
                                attrByGroupFile.write(str(attrMode) + ",")
                                print(attrMode)
                            except ValueError as ve:
                                if(len(group_attr[attr])>0):
                                    attrByGroupFile.write(str(set(group_attr[attr])) + ",")
                                else:
                                    attrByGroupFile.write(",")
                            except Exception as e:
                                #print(attr + "has no mode.")
                                attrByGroupFile.write(",")
                                print(str(e))
                        attrByGroupFile.write("\n")
                            

        else:
            raise Exception("data folder doesn't exist.")
    except Exception as e:
        print(str(e))

    errorFile.close()
    attrByGroupFile.close()

In [110]:
#collectGroupInfo()

## Rate Table Extraction

In [111]:
def isRateTable(table):
    count = 0
    paymentFlag = 0
    percentFind = re.compile("%")
    for row in table.rows:
        for cell in row.cells:
            result = re.findall(percentFind, cell.text)
            if(len(result)):
                count += len(result)
            if(((cell.text).lower()).find('contractholder shall pay') > -1) or (((cell.text).lower()).find('primary enrollee shall pay') > -1):
                paymentFlag = 1
    if (count >= 2) and (paymentFlag == 0):
        return True
    else:
        return False


    
def isMultiPlan(table):
    isMultiPlan = False
    multiPlanList = ["high plan", "enhanced plan", "plan design"]
    for row in table.rows:
        for cell in row.cells:
            if(any([x in (cell.text).lower() for x in multiPlanList])):
                isMultiPlan = True
    return isMultiPlan



def processSinglePlan(table):
    first_row=1
    first_row_found = 0
    title_row=1
    temp_obj = []
    row_template = []

    for row in table.rows:

        if(((row.cells[0]).text).lower().find("contract benefit level") > -1):
            first_row_found = 1
        else:
            if first_row_found:

                if(title_row):
                    row_template = []
                    for cell in row.cells:
                        row_template.append(cell.text)
                    title_row = 0
                else:
                    new_row = {}
                    for cat in row_template:
                        new_row[cat] = ""
                    for index, cell in enumerate(row.cells):
                        new_row[row_template[index]] = cell.text

                    temp_obj.append(new_row)

    if not first_row_found:#never found contract benefit levels
         for row in table.rows:
            ppoFlag = 0
            non_ppoFlag = 0

            ppoList = ["ppo providers", "pposm providers", "dpo providers", "delta dental ppo", "in-network"]
            non_ppoList = ["non-delta dental providers","out-of-network"]

            for cell in row.cells:
                #print(cell.text)
                if any([x in (cell.text).lower() for x in ppoList]):
                    ppoFlag = 1
                    #print(cell.text)
                if any([x in (cell.text).lower() for x in non_ppoList]):
                    non_ppoFlag = 1
                    #print(cell.text)

            if(ppoFlag and non_ppoFlag):
                first_row_found = 1

            if first_row_found:
                if(title_row):
                    row_template = []
                    for cell in row.cells:
                        row_template.append(cell.text)
                    title_row = 0
                    #print(row_template)
                else:
                    new_row = {}
                    for cat in row_template:
                        new_row[cat] = ""
                    for index, cell in enumerate(row.cells):
                        new_row[row_template[index]] = cell.text

                    #print(new_row)
                    temp_obj.append(new_row)

    #print(temp_obj)
    #print("\n")

    plan_obj = []

    for row in temp_obj:
        cat = {}
        categoryIdentifiers = ["categor","benefits"]
        for cell in row:
            if (len(cell)==0) or (any([x in cell.lower() for x in categoryIdentifiers])):
                if ("diagnostic" in (row[cell]).lower()) and ("preventive" in (row[cell]).lower()):
                    cat['category'] = "DandP"
                elif ("basic" in (row[cell]).lower()):
                    cat['category'] = "basic"
                elif ("major" in (row[cell]).lower()):
                    cat['category'] = "major"
                elif ("orthodontic" in (row[cell]).lower()):
                    cat['category'] = "orthodontic"

                else:
                    if(len(row)>=3):
                        cat['category'] = row[cell].lower().replace("\n"," ")

            elif(" ppo" in cell.lower()) or ("in-network" in cell.lower()) or (" dpo " in cell.lower()):
                kw = ["%","not covered"]

                if any([x in (row[cell]).lower() for x in kw]):
                    cat['PPO_rate'] = row[cell]

            elif("non-delta" in cell.lower()) or ("out-of-network" in cell.lower()):
                kw = ["%","not covered"]

                if any([x in (row[cell]).lower() for x in kw]):
                    cat['non-PPO_rate'] = row[cell]

        if(len(cat)>=3):
            plan_obj.append(cat)

    return plan_obj



def processMultiPlan(table):
    first_row=1
    first_row_found = 0
    plan_row=1
    title_row=1
    temp_obj = {}
    row_template = []

    for row in table.rows:
        if not first_row_found:
            for cell in row.cells:
                if((cell.text).lower().find("contract benefit level") == 0):
                    first_row_found = 1
        else:
            if(plan_row):
                plan_template = []
                for cell in row.cells:
                    plan_template.append(cell.text)
                plan_row = 0
               # print(plan_template)
                for plan in plan_template:
                    temp_obj[plan] = []
               # print(temp_obj)
            elif(title_row):
                row_template = []
                for cell in row.cells:
                    row_template.append(cell.text)
                title_row = 0
                #print(row_template)
            else:
                for plan in temp_obj:
                    if ("plan" in plan.lower()) and not ("delta dental will pay" in (row.cells[0].text).lower()):
                       # print(plan)
                        new_row = {}
                        for index, cat in enumerate(row_template):
                            if not("plan" in plan_template[index].lower()):
                                new_row[cat] = ""
                            if(plan_template[index] == plan):
                                new_row[cat] = ""
                        for index, cell in enumerate(row.cells):
                            if not("plan" in plan_template[index].lower()):
                                new_row[row_template[index]] = cell.text
                            if(plan_template[index] == plan):
                                new_row[row_template[index]] = cell.text
                        #print(new_row)
                        temp_obj[plan].append(new_row)

    if not first_row_found:#never found contract benefit levels
        highPlanFlag = 0
        lowPlanFlag = 0
        for row in table.rows:
            #ppoFlag = 0
            #non_ppoFlag = 0
            if(len(set(row.cells))==1):
                #print(row)
                break

            #ppoList = ["ppo providers", "pposm providers", "dpo providers", "delta dental ppo", "in-network"]
            #non_ppoList = ["non-delta dental providers","out-of-network"]
            highPlanList = ["high plan", "enhanced plan", "premier plan"]
            lowPlanList = ["low plan", "standard plan", "ppo plan"]

            for cell in row.cells:
                #print(cell.text)
                if any([x in (cell.text).lower() for x in highPlanList]):
                    highPlanFlag = 1
                    #print(cell.text)
                if any([x in (cell.text).lower() for x in lowPlanList]):
                    lowPlanFlag = 1
                    #print(cell.text)

            if(highPlanFlag and lowPlanFlag):
                if(plan_row):
                    plan_template = []
                    for cell in row.cells:
                        plan_template.append(cell.text)
                    plan_row = 0
                   # print(plan_template)
                    for plan in plan_template:
                        temp_obj[plan] = []
                   # print(temp_obj)
                elif(title_row):
                    row_template = []
                    for cell in row.cells:
                        row_template.append(cell.text)
                    title_row = 0
                    #print(row_template)
                else:
                    for plan in temp_obj:
                        if ("plan" in plan.lower()) and not ("delta dental will pay" in (row.cells[0].text).lower()):
                           # print(plan)
                            new_row = {}
                            for index, cat in enumerate(row_template):
                                if not("plan" in plan_template[index].lower()):
                                    new_row[cat] = ""
                                if(plan_template[index] == plan):
                                    new_row[cat] = ""
                            for index, cell in enumerate(row.cells):
                                if not("plan" in plan_template[index].lower()):
                                    new_row[row_template[index]] = cell.text
                                if(plan_template[index] == plan):
                                    new_row[row_template[index]] = cell.text
                            #print(new_row)
                            temp_obj[plan].append(new_row)

    #print(temp_obj)
    plan_set = {}


    for plan in temp_obj:
        #print(plan)
        plan_obj = []
        for row in temp_obj[plan]:
            #print(row)
            cat = {}
            for cell in row:
                if(" ppo" in cell.lower()) or ("in-network" in cell.lower()) or (" dpo " in cell.lower()):
                    kw = ["%","not covered","n/a"]

                    if any([x in (row[cell]).lower() for x in kw]):
                        cat['PPO_rate'] = row[cell]
                elif("non-delta" in cell.lower()) or ("out-of-network" in cell.lower()):
                    kw = ["%","not covered","n/a"]

                    if any([x in (row[cell]).lower() for x in kw]):
                        cat['non-PPO_rate'] = row[cell]

                else:#("categor" in cell.lower()) or (len(cell)==0):
                    #print(cell.lower())
                    #print((row[cell]).lower())
                    if ("diagnostic" in (row[cell]).lower()) and ("preventive" in (row[cell]).lower()):
                        cat['category'] = "DandP"
                    elif ("basic benefit" in (row[cell]).lower()) or ("basic service" in (row[cell]).lower()):
                        cat['category'] = "basic"
                    elif ("major benefit" in (row[cell]).lower()) or ("major service" in (row[cell]).lower()):
                        cat['category'] = "major"
                    elif ("orthodontic benefit" in (row[cell]).lower()) or ("orthodontic service" in (row[cell]).lower()):
                        cat['category'] = "orthodontic"

                    else:
                        cat['category'] = (row[cell]).lower().replace("\n"," ")

            if(len(cat)>=3):
                plan_obj.append(cat)
        if(len(plan_obj)>0):
            plan_set[plan] = plan_obj

    #print(plan_set)
    return plan_set
    





def isNormalized(table):
    isNormalized = False
    for row in table.rows:
        if(((row.cells[0].text).lower()).find('contract benefit levels')):
            isNormalized = True  
    return isNormalized



def isDenormalized(sentTokens):
    isDeNormalized = False
    BCSfound = False

    for index, sent in enumerate(sentTokens):
        if("benefit summary chart" in sent.lower()):
            count = 0
            regex = re.compile("%")
            for i in range(0,20):
                try:
                    numPercents = regex.findall(sentTokens[index + i])
                    #print(len(numPercents))
                    count += len(numPercents)
                except Exception as e:
                    print(str(e))
            #print(count)
            if(count > 20):
                    isDeNormalized = 1
    return isDeNormalized



def printAttributes(plan_obj, outputFile, row):
    #    "DandP PPO":"",
    #    "DandP non-PPO":"",
    #    "basic PPO":"",
    #    "basic non-PPO":"",
    #    "major PPO":"",
    #    "major non-PPO":"",
    #    "orthodontic PPO":"",
    #    "orthodontic non-PPO":""

    categoryList = ['DandP','basic','major','orthodontic']
    try:
        for cat in categoryList:
            found=0
            #print(cat)
            for plan_row in plan_obj:
                if (len(plan_row)>=3) and (plan_row['category'] == cat):
                    found = 1
                    #outputFile.write(row['PPO_rate'] + "," + row['non-PPO_rate'] + ",")
                    row[str(cat)+" PPO"] = plan_row['PPO_rate']
                    row[str(cat)+" non-PPO"] = plan_row['non-PPO_rate']

            #if not found:
            #    outputFile.write(",,")
        for plan_row in plan_obj:
            if (plan_row['category'] not in categoryList) and (len(plan_row)>=3 and plan_row['PPO_rate'] and plan_row['non-PPO_rate']):
                row[str(plan_row['category']) + " PPO"] = plan_row['PPO_rate']
                row[str(plan_row['category']) + " non-PPO"] = plan_row['non-PPO_rate']

                #outputFile.write(row['category'] + ":" + row['PPO_rate'] + "," + row['category'] + ":" + row['non-PPO_rate'] + ",")
    except Exception as e:
        print(str(e))
        print(row['Filename'])
        print(plan_obj)

    return row
    #outputFile.write("\n")
            

        
def getRateTables(file_info):
    plan_objs = []
    try:
        filePath = file_info['filepath']
        fileName = file_info['fileName']

        if not "-pdf." in fileName:
            docxFileName = fileName.replace(".txt", ".docx")
            docxFilePath = os.path.join(os.getcwd(), "processed", docxFileName)

            document = docx.Document(docxFilePath)
            docRateTables = []

            for table in document.tables:
                if(isRateTable(table)):
                    if(isMultiPlan(table)):
                        plan_objs = processMultiPlan(table)
                    else:
                        plan_obj = processSinglePlan(table)
                        plan_objs.append(plan_obj)
    except Exception as e:
        print(str(e))
        return None
    return plan_objs     
        

def rateTableCSV():
    from docx import Document
    cwd = os.getcwd()

    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')

    base_info = None
    #pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    #if pp == None:
    #    print("Error in pp")
    #else:
    dataPath = os.path.join(cwd, "output")
    base_info = batchGetTokens(errorFile, dataPath)
    if not base_info:
        print("Error in getting tokens")
    else:
        outputFilePath = os.path.join(os.getcwd(),'rate_table_output.csv')
        outputFile=open(outputFilePath, 'w')
        #first_row = 1
        table_rows = []
        #outputFile.write("Filename,hasRateTable,isMultiplan,isNormalized,DandP PPO,DandP non-PPO,basic PPO,basic non-PPO,major PPO,major non-PPO,orthodontics PPO,orthodontics non-PPO\n")

        for file in base_info:#os.listdir(os.path.join(cwd, "processed")):
            row_template = {
                        "Filename":"",
                        "hasRateTable":"",
                        "isMultiplan":"",
                        "isNormalized":"",
                        "DandP PPO":"",
                        "DandP non-PPO":"",
                        "basic PPO":"",
                        "basic non-PPO":"",
                        "major PPO":"",
                        "major non-PPO":"",
                        "orthodontic PPO":"",
                        "orthodontic non-PPO":""
                    }
            row = row_template
            try:
                #file['fileName'] = os.path.split(file['filepath'])[1]
                row['Filename'] = os.path.split(file['filepath'])[1]
                if(isDenormalized(file['sentTokens'])):
                    #outputFile.write("\""+file['fileName'] + "\"" + ",True,False,False,\n")
                    row['hasRateTable'] = "True"
                    row['isMultiplan'] = "False"
                    row['isNormalized'] = "False"
                    table_rows.append(row)
                elif not(row['Filename'].endswith("-pdf.txt")):
                    docxFileName = row['Filename'].replace("txt","docx")
                    document = Document(os.path.join(cwd, "processed", docxFileName))
                    docRateTables = []
                    #print(docxFileName)#outputFile.write(file+",")
                    if(len(document.tables)>0):
                        rateTableFound = 0
                        for table in document.tables:
                            if(isRateTable(table)):
                                rateTableFound = 1
                                #print("is rate table")
                                if(isMultiPlan(table)):
                                    #print("is Multiplan\n")
                                    list_plan_objs = processMultiPlan(table)
                                    #print(len(list_plan_objs))
                                    for plan_obj in list_plan_objs:
                                        row = row_template
                                        row['Filename'] = os.path.split(file['filepath'])[1]
                                        #outputFile.write("\"" + docxFileName + "\"" + ",True,True,True,")
                                        row['hasRateTable'] = "True"
                                        row['isMultiplan'] = "True"
                                        row['isNormalized'] = "True"
                                        row = printAttributes(list_plan_objs[plan_obj], outputFile, row)
                                        table_rows.append(row)
                                else:
                                    #print("is single plan")
                                    plan_obj = processSinglePlan(table)
                                    #outputFile.write("\"" + docxFileName + "\""+",True,False,True,")
                                    row['hasRateTable'] = "True"
                                    row['isMultiplan'] = "False"
                                    row['isNormalized'] = "True"
                                    row = printAttributes(plan_obj, outputFile, row)
                                    table_rows.append(row)
                        if not rateTableFound:
                            #outputFile.write("\"" + docxFileName + "\""+",False,,,,,,,,,,\n")
                            row['hasRateTable'] = "False"
                            table_rows.append(row)
                    else:
                        #outputFile.write("\"" + docxFileName + "\""+",False,,,,,,,,,,\n")
                        row['hasRateTable'] = "False"
                        table_rows.append(row)
                else:
                    #outputFile.write("\""+file['fileName'] + "\""+",False,\n")
                    row['hasRateTable'] = "False"
                    table_rows.append(row)

            except Exception as e:
                print(str(e))

        outputFile.close()
    errorFile.close()

    #for row in table_rows:
    #    print(row)

    return table_rows

def printRateTableCSV(table_rows):
    cwd = os.getcwd()

    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')

    outputFilePath = os.path.join(os.getcwd(),'rate_table_output.csv')
    outputFile=open(outputFilePath, 'w')

    categoryList = ["Filename","hasRateTable","isMultiplan","isNormalized","DandP PPO","DandP non-PPO","basic PPO","basic non-PPO","major PPO","major non-PPO","orthodontic PPO","orthodontic non-PPO"]

    for row in table_rows:
        for cat in row:
            if not (cat in categoryList):
                categoryList.append(cat)

    #print(categoryList)
    for cat in categoryList:
        outputFile.write(cat + ",")
    outputFile.write("\n")

    for row in table_rows:
        for index,category in enumerate(categoryList):
            found = 0
            for attr in row:
                if attr == category:
                    outputFile.write("\""+row[attr]+"\"" + ",")
                    found = 1
            if not found:
                outputFile.write(",")
        outputFile.write("\n")


    outputFile.close()
    errorFile.close()    
    
    
def findingRateTablesTestFunction():
    from docx import Document
    cwd = os.getcwd()


    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')


    base_info = None
    #pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    #if pp == None:
    #    print("Error in pp")
    #else:
    dataPath = os.path.join(cwd, "output")
    base_info = batchGetTokens(errorFile, dataPath)
    if not base_info:
        print("Error in getting tokens")
    else:
        outputFilePath = os.path.join(os.getcwd(),'rate_table_output.csv')
        outputFile=open(outputFilePath, 'w')
        first_row = 1

        outputFile.write("Filename,hasRateTable,isMultiplan,isNormalized,DandP PPO,DandP non-PPO,basic PPO,basic non-PPO,major PPO,major non-PPO,orthodontics PPO,orthodontics non-PPO\n")

        for file in base_info:#os.listdir(os.path.join(cwd, "processed")):
            count = 0
            try:
                file['fileName'] = os.path.split(file['filepath'])[1]
                if(isDenormalized(file['sentTokens'])):
                    outputFile.write("\""+file['fileName'] + "\"" + ",True,False,False,\n")

                elif not(file['fileName'].endswith("-pdf.txt")):
                    docxFileName = file['fileName'].replace("txt","docx")
                    document = Document(os.path.join(cwd, "processed", docxFileName))
                    docRateTables = []
                    print(docxFileName)#outputFile.write(file+",")
                    if(len(document.tables)>0):
                        rateTableFound = 0
                        for table in document.tables:
                            if(isRateTable(table)):
                                rateTableFound = 1
                                #print("is rate table")
                                if(isMultiPlan(table)):
                                    print("is Multiplan\n")
                                    list_plan_objs = processMultiPlan(table)
                                    print(len(list_plan_objs))
                                    for plan_obj in list_plan_objs:
                                        outputFile.write("\"" + docxFileName + "\"" + ",True,True,True,")
                                        printAttributes(list_plan_objs[plan_obj], outputFile)
                                else:
                                    print("is single plan")
                                    plan_obj = processSinglePlan(table)
                                    outputFile.write("\"" + docxFileName + "\""+",True,False,True,")
                                    printAttributes(plan_obj, outputFile)
                        if not rateTableFound:
                            outputFile.write("\"" + docxFileName + "\""+",False,,,,,,,,,,\n")
                    else:
                        outputFile.write("\"" + docxFileName + "\""+",False,,,,,,,,,,\n")
                else:
                    outputFile.write("\""+file['fileName'] + "\""+",False,\n")
            except Exception as e:
                print(str(e))

        outputFile.close()
    errorFile.close()

## Deductible Extraction

In [112]:
def isDeductibleTable(table):
    isDeductible = False
    keywords = ["deductible", "deductibles and maximum"]
    for kw in keywords:
        regex = re.compile(kw)
        if table.rows:
            for row in table.rows:
                for cell in row.cells:

                    results = re.search(regex, (cell.text).lower())
                    #print(results)
                    if results:
                        isDeductible = True
                    #print(cell.text)
        #if not isDeductible:



    return isDeductible

def processDeductibleTable(table):

    tableInfo = {}
    categories = ["annual deductible","annual maximum","orthodontic maximum"]
    if isMultiPlan(table):
        print("is multiplan")
    else:
        first_row=1
        first_row_found = 0
        title_row=1
        cat_row = {}

        for row in table.rows:

            if(((row.cells[0]).text).lower().find("deductible") > -1) and not first_row_found:
                first_row_found = 1
            else:
                if(title_row):
                    ppoFlag = 0
                    non_ppoFlag = 0

                    ppoList = ["ppo providers", "pposm providers", "dpo providers", "delta dental ppo", "in-network"]
                    non_ppoList = ["non-delta dental providers","out-of-network"]

                    for cell in row.cells:
                        #print(cell.text)
                        if any([x in (cell.text).lower() for x in ppoList]):
                            ppoFlag = 1
                            #print(cell.text)
                        if any([x in (cell.text).lower() for x in non_ppoList]):
                            non_ppoFlag = 1
                    if ppoFlag and non_ppoFlag:

                        cat_row["categories"] = ""
                        for cell in range(1,len(row.cells)):
                            cat_row["categories"] = ";".join([cat_row["categories"], row.cells[cell].text])

                        #tableInfo.append(cat_row)
                    title_row = 0

                if first_row_found:
                    new_row = {}
                    for cat in categories:
                        if cat in (row.cells[0].text).lower():
                            tableInfo[cat] = ""
                            for cell in range(1, len(row.cells)):
                                tableInfo[cat] = ";".join([tableInfo[cat], row.cells[cell].text])
                    #if(len(new_row)>0):
                        #print(new_row)
                    #    tableInfo.append(new_row)


        #print(tableInfo)
        plan = {}
        if(len(cat_row)>0):
            cats = cat_row["categories"].split(";")
            print(cats)
            for cat in cats:
                plan[cat] = {}
            for row in tableInfo:
                #print(row)
                if row is "annual deductible":
                    cells = tableInfo[row].split(";")
                    moneyRegex = re.compile("\$\d+[,]?\d*")
                    for index, cell in enumerate(cells):
                        #print(cell)
                        results = re.search(moneyRegex, cell)
                        if(results):
                            plan[cats[index]] = cell
                            #print(results.group())
        print(plan)



        return tableInfo

def findingDeductibleTablesTestFunction():
    from docx import Document
    cwd = os.getcwd()


    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')


    base_info = None
    #pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    #if pp == None:
    #    print("Error in pp")
    #else:
    dataPath = os.path.join(cwd, "output")
    base_info = batchGetTokens(errorFile, dataPath)
    if not base_info:
        print("Error in getting tokens")
    else:
        #outputFilePath = os.path.join(os.getcwd(),'rate_table_output.csv')
        #outputFile=open(outputFilePath, 'w')
        #first_row = 1

        #outputFile.write("Filename,isRateTable,isMultiplan,isNormalized,DandP PPO,DandP non-PPO,basic PPO,basic non-PPO,major PPO,major non-PPO,orthodontics PPO,orthodontics non-PPO\n")

        for file in os.listdir(os.path.join(cwd, "processed")):
            #count = 0
            try:
                if(file.endswith(".docx")):
                    document = Document(os.path.join(cwd, "processed", file))
                    docRateTables = []
                    #print(file)#outputFile.write(file+",")
                    #print(file)
                    for table in document.tables:
                        if isDeductibleTable(table):#print(table)
                            print(file)
                            if(len(table.rows[0].cells[0].text)>0):
                                #print(file)
                                #print(table.rows[0].cells[0].text)
                                #print("\n")
                                deductibleInfo = processDeductibleTable(table)
                        #outputFile.write(file+",")
                        #if(isDeductibleTable(table)):
                        #    print(file)

            except Exception as e:
                print(str(e))

        #outputFile.close()
    errorFile.close()

## Modification Tests

In [114]:
def modificationTest(pathToRawFile):
        import stat
        try:
            if os.path.isfile(pathToRawFile):

                dirPath = os.path.join(os.getcwd(),"modificationTestDir")
                if not os.path.isdir(dirPath):
                    os.makedirs(dirPath)

                        #print("moving: " + file)
                    shutil.copy(pathToRawFile, os.path.join(dirPath, os.path.split(pathToRawFile)[1]))

                    os.chmod(dirPath, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
                    os.chmod(os.path.join(dirPath, os.path.split(pathToRawFile)[1]), stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
                    rawToWorkspace("C:\\Users\\Sydney.knox\\Documents\\data-insights\\modificationTestDir")
                    testOutputAttrCSV()
                    tableRows = rateTableCSV()
                    printRateTableCSV(tableRows)
                    #catch copy exception here so it doesn't stop all files (?)
            else:
                print("not a file")
        except Exception as e:
            print(str(e))

def modificationTest_changeFile(pathToRawFile):
        import stat
        try:
            if os.path.isfile(pathToRawFile):

                dirPath = os.path.join(os.getcwd(),"modificationTestDir")
                if not os.path.isdir(dirPath):
                    os.makedirs(dirPath)

                filepath = os.path.join(dirPath, os.path.split(pathToRawFile)[1])

                if not os.path.isfile(filepath):
                    shutil.copy(pathToRawFile, filepath)

                os.chmod(dirPath, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
                os.chmod(filepath, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

                word = win32.Dispatch("Word.application")
                #word = win32.gencache.EnsureDispatch('Word.Application')
                doc = word.Documents.Open(filepath)
                doc.Activate()

                # Rename path with .docx
                #new_file_abs = os.path.abspath(filepath)
                #new_file_abs = re.sub(r'\.\w+$', '.docx', new_file_abs)
                word.Selection.Find.Text = "the"
                word.Selection.Find.Replacement.Text = "Pariveda Solutions"
                word.Selection.Find.Execute(Replace=2, Forward=True)
                # Save and Close
                word.ActiveDocument.SaveAs(
                    filepath, FileFormat=16
                )

                word.Quit()
                    #catch copy exception here so it doesn't stop all files (?)
            else:
                print("not a file")
        except Exception as e:
            print(str(e))


#pathToRawFile = "C:\\Users\\Sydney.knox\\Documents\\rawDataDI\\TX 17404 Contract Regional (7.2.18).docx"
#modificationTest(pathToRawFile)
#modificationTest_changeFile(pathToRawFile)

## Output Attribute CSV -- Temp Function
    Assumes output folder is full of text files -- ie workspace setup has been run successfully

In [175]:
def testOutputAttrCSV():
    cwd = os.getcwd()


    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')


    base_info = None
    #pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    #if pp == None:
    #    print("Error in pp")
    #else:
    dataPath = os.path.join(cwd, "output")
    base_info = batchGetTokens(errorFile, dataPath)
    if not base_info:
        print("Error in getting tokens")
    else:
        outputFilePath = os.path.join(os.getcwd(),'data','raw_attribute_data.csv')
        outputFile=open(outputFilePath, 'w')
        first_row = 1
        for file in base_info:
            
            file_attr = getMetaDataAtt(file)
            #plans = getRateTables(file_attr)
            #if(plans and len(plans)>0):
            #    file_attr['hasRateTable'] = "True"
            #else:
            #    file_attr['hasRateTables'] = "False"

            try:
                filename = (os.path.split(file['filepath'])[1])
            except Exception as e:
                print(str(e))

            if first_row:
                for key in file_attr:
                    #print(key)
                    outputFile.write(key + ",")
                outputFile.write("\n")
                for attr in file_attr:
                    #print(attr + ": ")
                    attrStr = ""
                    if ((attr == 'fileTypes') or (attr == 'footer')) and file_attr[attr]:
                        for substr in file_attr[attr]:
                            attrStr = attrStr + substr + ";"
                        outputFile.write(attrStr+",")
                    else:
                        outputFile.write(str(file_attr[attr])+ ",")
                outputFile.write("\n")
                first_row = 0
            else:
                for attr in file_attr:
                    #print(attr + ": " + file_attr[attr])
                    attrStr = ""
                    if ((attr == 'fileTypes') or (attr == 'footer')) and file_attr[attr]:
                        for substr in file_attr[attr]:
                            attrStr = attrStr + substr + ";"
                        outputFile.write(attrStr+",")
                    else:
                        outputFile.write("\"" + str(file_attr[attr])+ "\",")
            #    outputFile.write(filename + ",")
            #    outputFile.write(str(count) + ",")
                outputFile.write("\n")#

        outputFile.close()
    errorFile.close()

def testDenormalized():
    cwd = os.getcwd()


    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')


    base_info = None
    #pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    #if pp == None:
    #    print("Error in pp")
    #else:
    dataPath = os.path.join(cwd, "output")
    base_info = batchGetTokens(errorFile, dataPath)
    if not base_info:
        print("Error in getting tokens")
    else:
        #outputFilePath = os.path.join(os.getcwd(),'data','raw_attribute_data.csv')
        #outputFile=open(outputFilePath, 'w')
        #first_row = 1
        for file in base_info:
            #file_attr = getMetaDataAtt(file)
            #plans = getRateTables(file_attr)
            filename = os.path.split(file['filepath'])[1]
            if(isDenormalized(file['sentTokens'])):
                print(filename)
                print("Is denormalized")

        #outputFile.close()
    errorFile.close()


def fullOutputAttrCSV():
    cwd = os.getcwd()


    errorFilePath = os.path.join(os.getcwd(),'data','cannot_process.csv')
    errorFile = open(errorFilePath, 'w')


    base_info = None
    pp = batchPreProcess(errorFile, os.path.join(cwd, "processed"))
    if pp == None:
        print("Error in pp")
    else:
        dataPath = os.path.join(cwd, "output")
        base_info = batchGetTokens(errorFile, dataPath)
        if not base_info:
            print("Error in getting tokens")
        else:
            outputFilePath = os.path.join(os.getcwd(),'data','raw_attribute_data.csv')
            outputFile=open(outputFilePath, 'w')
            first_row = 1
            for file in base_info:
                file_attr = getMetaDataAtt(file)
                plans = getRateTables(file_attr)
                if(plans and len(plans)>0):
                    file_attr['hasRateTable'] = "True"
                else:
                    file_attr['hasRateTables'] = "False"

                try:
                    filename = (os.path.split(file['filepath'])[1])
                    #print(filename)
        #
                except Exception as e:
                    print(str(e))###
    ##
    #
        #    #print(count)
                if first_row:
                    for key in file_attr:
                        #print(key)
                        outputFile.write(key + ",")
                    outputFile.write("\n")
                    for attr in file_attr:
                        #print(attr + ": ")
                        attrStr = ""
                        if ((attr == 'fileTypes') or (attr == 'footer')) and file_attr[attr]:
                            for substr in file_attr[attr]:
                                attrStr = attrStr + substr + ";"
                            outputFile.write(attrStr+",")
                        else:
                            outputFile.write(str(file_attr[attr])+ ",")
                    outputFile.write("\n")
                    first_row = 0
                else:
                    for attr in file_attr:
                        #print(attr + ": " + file_attr[attr])
                        attrStr = ""
                        if ((attr == 'fileTypes') or (attr == 'footer')) and file_attr[attr]:
                            for substr in file_attr[attr]:
                                attrStr = attrStr + substr + ";"
                            outputFile.write(attrStr+",")
                        else:
                            outputFile.write("\"" + str(file_attr[attr])+ "\",")
                #    outputFile.write(filename + ",")
                #    outputFile.write(str(count) + ",")
                    outputFile.write("\n")#

            outputFile.close()
    errorFile.close()

#rawToWorkspace("C:\\Users\\Sydney.knox\\Documents\\rawDataDI")
#fullOutputAttrCSV()
#tableRows = rateTableCSV()
#printRateTableCSV(tableRows)
testOutputAttrCSV()


Package not found at '/Users/sydneyknox/Documents/data-insights/data/processed/01255 (Schedule I) January 1 2014.docx'
Package not found at '/Users/sydneyknox/Documents/data-insights/data/processed/01255 (Schedule I) January 1 2013.docx'
Package not found at '/Users/sydneyknox/Documents/data-insights/data/processed/01094 (Schedule I) July 1 2013.docx'
Package not found at '/Users/sydneyknox/Documents/data-insights/data/processed/01255 - EOC (January 1 2011).docx'
Package not found at '/Users/sydneyknox/Documents/data-insights/data/processed/1036 (Schedule I) July 1 2009.docx'
list index out of range
19168 EOC.txt
If you prefer to write us with your question(s), please mail your inquiry to the following address: DeltaDental of West Virginia Administrative Offices One Delta Drive Mechanicsburg, PA 17055 Anthony S. Barth, President & CEO DEFINITIONS Terms when capitalized in your Evidence of Coverage booklet have defined meanings, given in the section below or throughout the booklet secti

It only summarizes the detailed provisions of the group dental contract issued by DeltaDental of California (“DeltaDental”) and cannot modify the Contract in any way.
If you prefer to write us with your question(s), please mail your inquiry to the following address: DELTADENTAL OF CALIFORNIA 100 First Street San Francisco, CA 94105 19223 1 CA-ENT-ASC-PPO-E(2014) Employee Benefit Booklet City of Alhambra Dental Plan DEFINITIONS Terms when capitalized in your Employee Benefit Booklet have defined meanings, given in the section below or throughout the booklet sections.
If you receive dental 19223 6 CA-ENT-ASC-PPO-E(2014) Employee Benefit Booklet City of Alhambra Dental Plan services from a Provider outside the state of California, the Provider will be paid according to DeltaDental’s network payment provisions for said state according to the terms of the Contract.
Box 997330 Sacramento, CA 95899-7339 Payment Guidelines We do not pay PPO or Premier Providers any incentive as an inducement t

Send your appeal or grievance to DeltaDental at the address shown below: DeltaDental of West Virginia One Delta Drive Mechanicsburg, PA 17055-6999 DeltaDental will send the Enrollee a written acknowledgment within five (5) days upon receipt of the appeal or grievance.
5.16 Holding Company DeltaDental is a member of the Insurance Holding Company System of DeltaDental of California (the “Enterprise”).
Box 2105 Mechanicsburg, PA 17055-2105 5.12 Protection Disclaimer Residents of West Virginia who purchase life insurance, annuities or health insurance should know that the insurance companies licensed in this state to write these types of insurance are members of the West Virginia Life and Health Insurance Guaranty Association.
['PA', 'PA', 'California', 'PA']
['pennsylvania', 'pennsylvania', 'california', 'pennsylvania']
pennsylvania
test2-pdf.txt
[]
[]
not_a_state
1036- sch 1 7-1-09-pdf.txt
[]
[]
not_a_state
01094 EOC  7-1-16.txt
Combined Evidence of Coverage and Disclosure Form DeltaDent

100, Suite 100 Alpharetta, GA 30022 (Hereinafter called “DDIC”) WITNESSETH THAT THE PARTIES AGREE AS FOLLOWS: This Dental Service Contract affords national access to all DeltaDental Plans to all Enrollees.
Communications intended for DeltaDental shall be addressed to One Delta Drive, Mechanicsburg, PA 17055.
['GA', 'PA']
['georgia', 'pennsylvania']
not_a_state
19380 - JACKSON CHILD DEVELOPMENT CENTER INC - SCHEDULE A - Eff 7-1-18 to 6-30-19.txt
[]
[]
not_a_state
01094 (Schedule I) July 1 2013.txt
[]
[]
not_a_state
01248 (R8 FOR SMITHSONIAN INSTITUTION) Eff. 10-1-14.txt
[]
[]
not_a_state
01094 _SBC Modification_-pdf.txt
01094 This Modification made by and between Company and DeltaDentalInsuranceCompany, 1130 Sanctuary Parkway, Suite 600, Alpharetta, GA 30009, hereinafter called “DDIC”.
['GA']
['georgia']
georgia
01255 - EOC (January 1 2011).txt
EVIDENCE OF COVERAGE FORMTEXT NATIONAL DEMOCRATIC INSTITUTE FOR INTERNATIONAL AFFAIRS Group Number: FORMTEXT 01255 Effective Date: FORMTEXT 1/1/

 DeltaDental of California COMBINED EVIDENCE OF COVERAGE AND DISCLOSURE FORM CHARGERS FOOTBALL COMPANY, LLC ENT-PPO-CA-E deltadentalins.com Group No: 19340 Effective Date: May 1, 2018 Can you read this document?
Your plan is underwritten and administered by DeltaDental of California (“DeltaDental”).
If you prefer to write us with your question(s), please mail your inquiry to the following address: DeltaDental of California P.O.
Box 997330 Sacramento, CA 95899-7330 Anthony S. Barth, President & CEO ENT-PPO-CA-E 1 19340 Chargers Football Company, LLC Dental Plan Evidence of Coverage DEFINITIONS Terms when capitalized in your Evidence of Coverage booklet have defined meanings, given in the section below or throughout the booklet sections.
If you receive dental services from a Provider outside the state of California, the Provider will be paid according to DeltaDental’s network payment provisions for said state according to the terms of the Contract.
DeltaDental is not required to continue

Communications intended for DeltaDental shall be addressed to One Delta Drive, Mechanicsburg, PA 17055.
['PA']
['pennsylvania']
pennsylvania
10041 Appendix B EOC Non-Union (Jan2018)-pdf.txt
Claims Administered by: DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Alpharetta, Georgia 30009 (770) 641-5100 (800) 521-2651 MS-ASC-PPO-EOC(2006) 1 10041 TABLE OF CONTENTS GROUP HIGHLIGHTS .................................................................................................................................. 3 DEFINITIONS ............................................................................................................................................... 3 CHOICE OF DENTIST .................................................................................................................................. 5 WHO IS ELIGIBLE?
Box #1809 Alpharetta, Georgia 30023 (800) 521-2651 deltadentalins.com MS-ASC-PPO-EOC(2006) 12 10041 PRE-TREATMENT ESTIMATES A Dentist may file a Claim Form 

Dental Plan Evidence of Coverage DeltaDental Administrative Offices One Delta Drive Mechanicsburg, PA 17055-6999 (717) 766-8500 Toll free: (800) 932-0783 TTY/TDD: (888) 373-3582 www.deltadentalins.com Jackson Children’s Services, Inc.
Mechanicsburg, PA 17055 DeltaDental One Delta Drive EOC-NY-POS-06 1 Jackson Children’s Services, Inc.
Mechanicsburg, PA 17055-6999 form DeltaDental P.O.
Mechanicsburg, PA 17055 One Delta Drive DeltaDental EOC-NY-POS-06 10 Jackson Children’s Services, Inc.
Box 997330 Sacramento, CA 95899-7330 This notice is effective on and after January 1, 2017.
– • effective January 1, 2015 Updated contact information (mailing address and phone number) – effective July 1, 2013 • Updated DeltaDental’s duty to notify affected individuals if a breach of their unsecured PHI occurs – effective July 1, 2013 • Clarified that DeltaDental does not and will not sell your information without your express written authorization – effective July 1, 2013 • Clarified several instances w

Combined Evidence of Coverage and Disclosure Form DeltaDental Administrative Offices One Delta Drive Mechanicsburg, PA 17055-6999 (717) 766-8500 Toll free: (800) 932-0783 TTY/TDD: (888) 373-3582 www.deltadentalins.com TABLE OF CONTENTS TOC \O "1-3" \H \Z HYPERLINK \l "_Toc223151226" INTRODUCTION PAGEREF _Toc223151226 \h 1 HYPERLINK \l "_Toc223151227" Using This Evidence of Coverage PAGEREF _Toc223151227 \h 1 HYPERLINK \l "_Toc223151228" Contact Us PAGEREF _Toc223151228 \h 1 HYPERLINK \l "_Toc223151229" SELECTING YOUR DENTIST PAGEREF _Toc223151229 \h 1 HYPERLINK \l "_Toc223151230" Free Choice of Dentist PAGEREF _Toc223151230 \h 1 HYPERLINK \l "_Toc223151231" Referrals to Specialists PAGEREF _Toc223151231 \h 2 HYPERLINK \l "_Toc223151232" Locating a DeltaDental Participating Dentist PAGEREF _Toc223151232 \h 2 HYPERLINK \l "_Toc223151233" PLAN INFORMATION PAGEREF _Toc223151233 \h 2 HYPERLINK \l "_Toc223151234" Benefit Summary Charts PAGEREF _Toc223151234 \h 2 HYPERLINK \l "_Toc223151235" 

Notice to DeltaDental shall be to: DeltaDentalInsuranceCompany 1000 Mansell Exchange West Building 100, Suite 100 Alpharetta, Georgia 30022 Notice to Employer shall be to: Atlas Roofing Corporation 802 Highway 19 North, Suite 190 Meridian, Mississippi 39307 5.05 Both parties to the Contract agree to permit and encourage the professional relationship between Dentist and patient to be maintained without interference.
This plan is self-funded by your employer Claims Administered by: EMBED WPDraw30.Drawing 1000 Mansell Exchange West Building 100, Suite 100 Alpharetta, Georgia 30022 (770) 645-8700 (800) 521-2651 TABLE OF CONTENTS TOC \o "1-1" GROUP HIGHLIGHTS PAGEREF _Toc49747231 \h 3 DEFINITIONS PAGEREF _Toc49747232 \h 3 CHOICE OF DENTIST PAGEREF _Toc49747233 \h 4 WHO IS ELIGIBLE?
Box 1809 Alpharetta, GA 30023 (800) 521-2651 AVA (800) 510-9545 PREDETERMINATIONS A Dentist may file an Attending Dentist Statement before treatment, showing the services to be provided to an Enrollee.
['Georgia'

DeltaDental is a member of the DeltaDental of California Holding Company System (the “Enterprise”).
Notice to DeltaDental shall be to: DeltaDental One Delta Drive Mechanicsburg, PA 17055-6999 Notice to Contractholder shall be to: Ms. Maria Jones City of Charleston 501 Virginia Street East Charleston, WV 25330 5.06 Both parties to the Contract agree to permit and encourage the professional relationship between Provider and patient to be maintained without interference.
If you prefer to write us with your question(s), please mail your inquiry to the following address: DeltaDental One Delta Drive Mechanicsburg, PA 17055-6999 DEFINITIONS Terms when capitalized in your Employee Benefit Booklet have defined meanings, given in the section below or throughout the booklet sections.
Box 2105 Mechanicsburg, PA 17055 Payment Guidelines We do not pay PPO or Premier Providers any incentive as an inducement to deny, reduce, limit or delay any appropriate service.
Send your appeal or grievance to us a

Communications intended for DeltaDental shall be addressed to One Delta Drive, Mechanicsburg, PA 17055.
['PA']
['pennsylvania']
pennsylvania
test.txt
Craig Nottingham Pariveda Solutions 201 California Street, Suite 1250 San Francisco, CA 94111 craig.nottingham@parivedasolutions.com 201-565-6242 Craig Nottingham Pariveda Solutions 201 California Street, Suite 1250 San Francisco, CA 94111 craig.nottingham@parivedasolutions.com 201-565-6242 LEGAL AFFAIRS DEPARTMENT Data Insights POC Prepared for DeltaDental of California LEGAL AFFAIRS DEPARTMENT Data Insights POC Prepared for DeltaDental of California Andrea Fegley Vice President Legal, Regulatory and Compliance DeltaDental of California 560 Mission Street San Francisco, CA 94105 AFegley@delta.org (415) 972-8376 May 30, 2018 Dear Andrea, We appreciate the opportunity to present this proposal as a follow-on to the Regulatory Filing strategy work.
Craig Nottingham Pariveda Solutions 201 California Street, Suite 1250 San Francisco, CA 94111 

Claims Administered by: DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Alpharetta, Georgia 30009 (770) 641-5100 (800) 521-2651 TABLE OF CONTENTS GROUP HIGHLIGHTS 3 DEFINITIONS 3 CHOICE OF DENTIST 5 WHO IS ELIGIBLE?
Box #1809 Alpharetta, Georgia 30023 (800) 521-2651 deltadentalins.com PRE-TREATMENT ESTIMATES A Dentist may file a Claim Form before treatment, showing the services to be provided to an Enrollee.
['Georgia', 'Georgia']
['georgia', 'georgia']
georgia
TX 19015 EOC ENT (7.2.18).txt
Box 1809 Alpharetta, GA 30023 You may contact the Texas Department of Insurance to obtain information on companies, coverages, rights or complaints at: 1-800-252-3439 You may write the Texas Department of Insurance at: P.O.
Box 1809 Alpharetta, GA 30023 You may contact the Texas Department of Insurance to obtain information on companies, coverages, rights or complaints at: 1-800-252-3439 You may write the Texas Department of Insurance at: P.O.
Box 1809 Alpharetta, GA 30023 Usted puede comunicarse

01248 Effective Date: 10/1/2014 DeltaDental Administrative Offices One Delta Drive Mechanicsburg, PA 17055-6999 (717) 766-8500 Toll free: (800) 932-0783 TTY/TDD: (888) 373-3582 www.deltadentalins.com Smithsonian Institution Dental Plan TABLE OF CONTENTS Evidence of Coverage INTRODUCTION ........................................................................................................................... 1 Using This Evidence of Coverage .................................................................................................... 1 Contact Us .................................................................................................................................... 1 SELECTING YOUR DENTIST ........................................................................................................ 2 Free Choice of Dentist ................................................................................................................... 2 Referrals to Specialists .........

Claims Administered by: DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Suite 600 Alpharetta, Georgia 30009 (770) 641-5100 (800) 521-2651 TABLE OF CONTENTS TOC \o "1-1" GROUP HIGHLIGHTS PAGEREF _Toc121112666 \h 3 DEFINITIONS PAGEREF _Toc121112667 \h 3 CHOICE OF DENTIST PAGEREF _Toc121112668 \h 5 WHO IS ELIGIBLE?
Box #1809 Alpharetta, Georgia 30023 (800) 521-2651 HYPERLINK "http://www.WeKeepYouSmiling.com" deltadentalins.com PRE-TREATMENT ESTIMATES A Dentist may file a Claim Form before treatment, showing the services to be provided to an Enrollee.
['Georgia', 'Georgia']
['georgia', 'georgia']
georgia
19340 Attach D (05-01-18).txt
[]
[]
not_a_state
TX-19278 ASC-ENT (7.2.18)-pdf.txt
[]
[]
not_a_state
19168 Attachment B High.txt
[]
[]
not_a_state
25-1149 ASC Agreement Signed-pdf.txt
[]
[]
not_a_state
TX 19015 EOC ENT(7.2.18)-pdf.txt
Box 1809 Alpharetta, GA 30023 You may contact the Texas Department of Insurance to obtain information on companies, coverages, rights or complaints at: 1-8

Claims Administered by: DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Suite 600 Alpharetta, Georgia 30009 (770) 641-5100 (800) 521-2651 TABLE OF CONTENTS TOC \o "1-1" GROUP HIGHLIGHTS PAGEREF _Toc121112666 \h 3 DEFINITIONS PAGEREF _Toc121112667 \h 3 CHOICE OF DENTIST PAGEREF _Toc121112668 \h 5 WHO IS ELIGIBLE?
Box #1809 Alpharetta, Georgia 30023 (800) 521-2651 HYPERLINK "http://www.WeKeepYouSmiling.com" deltadentalins.com PRE-TREATMENT ESTIMATES A Dentist may file a Claim Form before treatment, showing the services to be provided to an Enrollee.
['Georgia', 'Georgia']
['georgia', 'georgia']
georgia
01255 (Full Contract) Eff. 2-1-07-pdf.txt
[]
[]
not_a_state
19340 Attach C (05-01-18).txt
Premiums: Monthly Amount: Per Primary Enrollee: $58.38 Per Primary Enrollee and Spouse: $116.74 Per Primary Enrollee and Child(ren): $129.15 Per Primary Enrollee and Family: $196.95 Premiums are to be remitted to: DeltaDental of California P.O.
Box 44460 San Francisco, CA 94144-0460 Payment Breakdo

WE, US and OUR always refers to DeltaDental of California (DeltaDental).
DELTADENTAL OF CALIFORNIA 100 First Street San Francisco, CA 94105 For claims, eligibility and benefits inquiries, or additional information, call DeltaDental’s Customer Service department toll-free at: 800-765-6003 or contact us on our website: deltadentalins.com.
Sponsor - means the Carpenters Health and Welfare Fund for California who offers voluntary Benefits to Northern California Union Carpenter retirees through the dental plan provided by DeltaDental and administered by the Carpenter Funds Administrative Office of Northern California, Inc. Usual, Customary and Reasonable (UCR) - A Usual fee is the amount that an individual dentist regularly charges and receives for a given service or the fee actually charged, whichever is less.
Nearly 29,000 dentists in active practice in California are DeltaDental Dentists.
Box 997330, Sacramento, CA 95899-7330.
Claims submitted by out-of-country dentists for Enrollees res

## Notes

### Contract Start Date

With contract start date I began by searching through the tokenized sentences with a regex expression. 
I found a datefinder module to use on each flagged sentence to pull out the dates
	Issue: the datefinder module works poorly on large, run -on sentences which are common in the contracts. It tends to find other numbers that aren't dates and try to make a date out of them.
	
	Sol: only take a subset, starting at the flagged word
	
Sometimes a match isn't found with the keywords I've seen related to the start date
	Sol: look for keywords related to contract term and take the earlier date from that sentence
	
Issue: Datefinder focusing on numbers that aren't dates
	Sol: filter for sentences that have a year (ie four digits in a row) and dates that are before Delta Dental existed (in 1966)
	
Issue: Sometimes there are multiple modes. Usually I saw this when there were equal mentions of the end date
	Sol: if there are multiple modes, take the earliest date. 
	



### Testing finding the contract end date

This was much the same as the contract start date Issues

	Looking for keywords Contract Term/Contract End
	Filtering on invalid years
	Filtering on if there IS a year in the sentence (assuming it won't be written out like nineteen ninety-four)
	If there are only two results, take the later one
	If there are more than two, take the top two most common and then take the latter of the two


### Contract Duration:

Call contract start and end and try to get a duration out of them

 ^^^ pretty much worked


### Comments so far:


	Even with trying to be variable, this won't work if they even change the wording a bit. Maybe spend some time looking into using the libraries to get synonyms.
	
	It would also be great to get the other contracts to see exactly where we're going wrong
	


### Working with synonyms in NLTK

	Getting an expanded set of search terms can be done, but I can't yet figure out how to pick the right contexts. For example, the search bigram "contract term" gives back a huge amount of synonyms, with only 3 or 4 actually being equivalent in meaning to "contract term"
	
	We could always manually select ones that are similar but that seems to defeat the purpose: ie we could select only the noun meanings of contract
	
	We could build our own corpus of words based on all of the documents that we have, and then compare the given synonyms to a freq distribution of those words to pick out the ways other contracts might say the same thing
		But this would still miss things for sure
		
	We could include the word type with the seed words and only choose synonyms of the same type, although this does involve more hardcoding


### Folder Format

data/raw

data/processed/[group number - group name]/

data/output

*** Not every file seems to have a name, so we would have to parse the file to get it

*** Additionally each num - name combo may have contracts from multiple dates


### Using NER tagging to identify location sections

#### NLTK NER tagging
Basic NER tagging with nltk works horribly on our files out of the box

#### Polyglot
Polyglot has a lot of issues getting downloaded

#### NLTK wrapper for Stanford NER
NLTK has a wrapper for the Stanford NER tagger so I'm going to try that next
	Download the model jar file
	
	
The stanford NER tagger is working a bit better
http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.StanfordTagger
https://nlp.stanford.edu/software/stanford-ner-2018-02-27.zip (download of jar files)
https://textminingonline.com/how-to-use-stanford-named-entity-recognizer-ner-in-python-nltk-and-other-programming-languages

The stanford one takes FOREVER though
	There is a faster version in CoreNLP but that's all in Java and I don't think the wrapper interacts with it

#### GeoText
GeoText

Easy to set up and use, but doesn't do states or state abbreviations
And it misses a LOT that the NER tagger got
	It is completely unreliable honestly. 

#### Options
Option 1: Use the NLTK wrapper for Stanford NER tagger and just wait forever
Option 2: Get a giant csv of all US cities/states/abbreviations/counties (exists) and make a data set out of that to compare to
	Cons: not flexible or extensible, is already 4 years out of date
	Pros: much faster, will only have to create the thing once
	
Note: even with the NER it will only give us pieces of the address, we would still have to go into the sentence and try to regex it out

Option 3: create our own trained model from some of the files we already have and see how that does with the Stanford tagger. Might be faster
	Could also see how it does with the native NLTK tagger


### Location Function Issues

Trying to Regex out a full address proves difficult

You can kind of get it down to the right sentence by looking for ones that contain 'contractholder' and avoiding ones that contain 'deltadental'

But it's still not perfect

Tabling getting the entire address for now, I'm looking at getting the contractholder state and city

Getting the state so far works okay, but there are some strange cities out there that cause issues. IE DPO is apparently a city in the US, as is Premium. These words show up enough in other locations that they interfere with trying to find the most popular ACTUAL city mentioned. Even filtering on the above keywords comes back with Premium as the city.

Honestly, without a bit of work the city is completely unreliable

Running into issues with filtering by keyword because the keyword is often cut off from parts of the sentence containing the location information by punctuation within the address itself


Issues with a lot of false positives. I think it would be easier to find the location of the Delta office handling it instead of the client address, which doesn't seem to be clearly marked anywhere




### Docx vs Doc issues

#### Right now we are using a docx specific library

#### They gave us a bunch of doc files, docx2text can't handle those

#### Catdoc software and the python subprocess module
       Catdoc does NOT pay attention to formatting, so that could get messy. it simply looks for readable text and extracts it in the order it finds it
       Catdoc works natively on Mac but not windows
       https://blog.brush.co.nz/2009/09/catdoc-windows/ is a pckg for windows...but just from some random guy
       

#### Antiword
        Linux specific, you can get packages for both Mac and Windows...the windows one looks especially iffy
        Also just the fact that we would need to have a separate setup is not very desirable. 
        
#### Textutil
        Can be used on Mac pretty easily with python subprocess
        
        
       


### Mac vs Windows Issues

#### Running on windows

#### Before going through any of this, ensure your windows parallels setup is done from the OneNote directions!
#### i.e., make sure Anaconda is installed :| 

        when importing modules using pip install
                Instead of "pip install pdfminer" use "pip install pdfminer.six"
        when installing datefinder
                It won't work and will tell you that you need Visual Studio 2015
                Instead, download the src code from https://github.com/akoumjian/datefinder
                Open file: setup.py and look for line 
                    install_requires=['regex==2016.01.10', 'python-dateutil>=2.4.2', 'pytz'],
                and change the first == to >=
                    install_requires=['regex>=2016.01.10', 'python-dateutil>=2.4.2', 'pytz'],
                save and then run "pip install './pathToDatefinderSrc'
                
                see: https://stackoverflow.com/questions/44016287/error-in-pip-install-datefinder?noredirect=1&lq=1
              May also need to run pip install --upgrade setuptools
              
         Some things don't get installed automatically:
                 in python window run : import nltk 
                                        nltk.download('punkt')
                                        nltk.download('stopwords')
                                        nltk.download('averaged_perceptron_tagger')
                                        nltk.download('wordnet')
              
              