# Exploring and Processing

### Import Statements

In [68]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
import random as rng
import nltk
import fnmatch

import docx2txt
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

from nltk.book import *
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.collocations import *
from nltk.util import ngrams
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import sys as sys
from sys import platform
import re as re
from statistics import mode
import datefinder
from nltk import ne_chunk
from geotext import GeoText
from nltk.tag import StanfordNERTagger


## Pseudocode

In [10]:
### Analysis Function

## Function which calculates the number of combinations (Not Permutations) of distinct values in a dataset. Would 
## take in a Pandas DataFrame ideally and multiply all distinct value counts for all available columns.

# Expected input: Pandas DataFrame
# Expected output: Int, the number of possible combinations.

In [11]:
### Parsing Function

## Function to collect mentions of States/Districts/Territories in the United States (DC and Puerto Rico are included)
## and rank their relevance; frequency should be a good factor. Note: California, Pennsylvania, and Georgia could 
## appear multiple times from addresses.

# Expected input: A single file, text.
# Expected output: A list of locations mentioned, ranked by most used to least used.

In [12]:
### Parsing Function

## Function for searching a tokenized sentence dictionary for words or phrases. There's probably 
## already something for this.

# Expected input: A tokenized list, and a phrase to search for.
# Expected output: A list of sentences containing the phrase searched.

In [13]:
### Parsing Function

## Function to take a file and produce the Acronyms from them.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, most likely first.

In [14]:
### Parsing Function

## Function to take Acronyms and match them to the best possible N-gram for them from a document. Not all will be 
## possible, but a list of options will help.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, sorted most likely first.

In [15]:
### Generally important

## Need to expand stopwords to include States and Districts, also Delta Dental adjacent names as another available set.

In [16]:
## Function to load in two different text extracted contracts and return a comparison metric between 
## the two (Similarity, possibly as a percent?)

# Expected Input: Two contracts for Comparison
# Expected Output: A measure, some kind of decimal to represent similarity.

## File Prep Functions

### Function: Is it docx or pdf?

In [17]:
def checkFileType(filename):
    """Classify a file by its extension (case-insensitive).

    Returns 0 for Word documents (.doc / .docx), 1 for PDFs (.pdf) and
    -1 for anything else.
    """
    lowered = filename.lower()
    if lowered.endswith(('.doc', '.docx')):
        return 0
    if lowered.endswith('.pdf'):
        return 1
    return -1

### Function: read in file

In [18]:
def makeFilePath(docName):
    """Return the path of docName inside <current working dir>/data/raw."""
    return os.path.join(os.getcwd(), 'data', 'raw', docName)

### Function: clean up text

In [19]:
def cleanText(text):
    """Flatten text onto one line with single spaces.

    Newlines and tabs become spaces, then every run of spaces is collapsed
    to a single space. Commas are left untouched.
    """
    for whitespace_char in ("\n", "\t"):
        text = text.replace(whitespace_char, ' ')

    # A single replace() pass only halves a run of spaces, so repeat until
    # no double space is left.
    while "  " in text:
        text = text.replace("  ", " ")
    return text

### Function: process dataFrame and group

In [20]:
def processDF(txtFile):
    """Load a space-separated text file into a one-word-per-row DataFrame.

    The file is read without a header, transposed so the first line's fields
    become rows, and rows containing NaN are dropped. Column 0 is renamed to
    'Words' and a constant 'SingleRow' column of 1s is added.

    NOTE: the original author observed that read_csv fails here when lines
    have a variable number of columns.
    """
    words_df = pd.read_csv(txtFile, sep=" ", header=None).T.dropna()
    words_df['SingleRow'] = 1
    words_df = words_df.rename(columns={0: 'Words'})

    print("in processDF " + txtFile)
    # Result is discarded; kept from the original exploratory cell.
    words_df.describe(include="all")
    return words_df

### Function: Process a text file

In [21]:
def processTextFile(filePath):
    """Extract the text of a Word document and save it as a cleaned .txt file.

    The document text is pulled with docx2txt, normalised with cleanText and
    written UTF-8 encoded to ./data/output/<base name>.txt.

    Parameters:
        filePath: path to the Word document to convert.

    Returns:
        The path of the text file that was written.
    """
    docxText = docx2txt.process(filePath)
    replacedText = cleanText(docxText)

    # splitext/basename instead of split('/') and a fixed [0:-5] slice: the
    # slice assumed a ".docx" suffix and chopped a character off ".doc" names,
    # and split('/') does not separate Windows-style paths.
    baseFileName = os.path.splitext(os.path.basename(filePath))[0]
    newFilePath = './data/output/' + baseFileName + ".txt"

    # Context manager guarantees the handle is closed even if the write fails.
    with open(newFilePath, 'wb+') as singleFileDocx:
        singleFileDocx.write(replacedText.encode("utf-8"))

    return newFilePath

### Function: Process pdf file

In [22]:
def processPDFfile(filePath):
    """Extract the text of a PDF with pdfminer and save it as a cleaned .txt file.

    Text is collected page by page from LTTextBox / LTTextLine layout objects,
    normalised with cleanText and written UTF-8 encoded to
    ./data/output/<base name>-pdf.txt.

    Parameters:
        filePath: path to the PDF to convert.

    Returns:
        The path of the text file that was written.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    password = ""
    text_parts = []

    # basename/splitext instead of split('/') and a fixed [0:-4] slice so that
    # Windows-style paths are handled as well.
    baseFileName = os.path.splitext(os.path.basename(filePath))[0]

    # The context manager closes the PDF even when extraction is refused or
    # parsing raises part-way through (the handle used to leak in those cases).
    with open(filePath, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager holds resources shared between interpreter and device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        # The page aggregator device exposes each page's layout (LT*) objects.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only text boxes and text lines are of interest here.
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text().replace('\n', ' '))

    extracted_text = cleanText("".join(text_parts))

    newFilePath = './data/output/' + baseFileName + '-pdf' + ".txt"
    with open(newFilePath, 'wb+') as singleFilePDF:
        singleFilePDF.write(extracted_text.encode("utf-8"))

    return newFilePath

## Docx Extraction -- Currently not using this: see File Prep Functions

DocX extractor for data purposes. Requires customization to each purpose but is important for pulling data out of DocX files. Does not understand tables or bullet points, however is visually consistent with what's on the page.

Strongest values: Order, consistency, noise reduction

Weakest values: Completeness, flexibility, whitespace characters, formatting

#### Docx Practice

In [23]:
import docx2txt


def cleanText(text):
    """Flatten text onto one line with single spaces, dropping commas.

    Newlines, tabs and commas are each turned into a space, then every run of
    spaces is collapsed to one. Because this definition appears later in the
    file than the comma-preserving cleanText, it replaces that one once this
    cell has been executed.
    """
    # One pass mapping each of "\n", "\t" and "," to a single space.
    text = text.translate(str.maketrans("\n\t,", "   "))

    while "  " in text:
        text = text.replace("  ", " ")

    return text
# Practice run: convert one hard-coded contract from ./data/raw into a
# cleaned text file in ./data/output.
fileName = "TX 17404 Contract Regional (7.2.18).docx"
baseFileName = ""
# Strip the extension to get the output base name.
if fileName.lower().endswith('.docx'):
    baseFileName = fileName[0:-5]
elif fileName.lower().endswith('.pdf'):
    baseFileName = fileName[0:-4]
else:
    print("ending error")

docText = docx2txt.process("./data/raw/" + fileName)
replacedText = cleanText(docText)
#print(replacedText)

# Write the cleaned text: previously the raw docText was written and the
# cleaned result was computed but never used. The context manager also closes
# the file if the write fails.
with open('./data/output/' + baseFileName + ".txt", 'wb+') as singleFileDocx:
    singleFileDocx.write(replacedText.encode("utf-8"))


## PDF Extraction -- Currently not using this: see File Prep Functions

#### PDF practice

In [24]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
# pdfTextMiner.py
# Python 2.7.6
# For Python 3.x use pdfminer3k module
# This link has useful information on components of the program
# https://euske.github.io/pdfminer/programming.html
# http://denis.papathanasiou.org/posts/2010.08.04.post.html


''' Important classes to remember
PDFParser - fetches data from pdf file
PDFDocument - stores data parsed by PDFParser
PDFPageInterpreter - processes page contents from PDFDocument
PDFDevice - translates processed information from PDFPageInterpreter to whatever you need
PDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice
LAParams - A layout analyzer returns a LTPage object for each page in the PDF document
PDFPageAggregator - Extract the decive to page aggregator to get LT object elements
'''

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

''' This is what we are trying to do:
1) Transfer information from PDF file to PDF document object. This is done using parser
2) Open the PDF file
3) Parse the file using PDFParser object
4) Assign the parsed content to PDFDocument object
5) Now the information in this PDFDocumet object has to be processed. For this we need
   PDFPageInterpreter, PDFDevice and PDFResourceManager
 6) Finally process the file page by page 
'''

# Practice script: extract all text from one hard-coded PDF and dump it,
# UTF-8 encoded, to a log file next to it.
# NOTE(review): base_path is machine-specific; the captured output of this
# cell is a FileNotFoundError for 'C://data/test.pdf'.
base_path = "C://data"

# os.path.join receives a single, already-concatenated argument here, so the
# "/" separator comes from the string concatenation, not from join.
my_file = os.path.join(base_path + "/" + "test.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")

password = ""
extracted_text = ""

# Open and read the pdf file in binary mode
# NOTE(review): fp is only closed at the end of the script, so it stays open
# if anything in between raises.
fp = open(my_file, "rb")

# Create parser object to parse the pdf content
parser = PDFParser(fp)

# Store the parsed content in PDFDocument object
document = PDFDocument(parser, password)

# Check if document is extractable, if not abort
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
    
# Create PDFResourceManager object that stores shared resources such as fonts or images
rsrcmgr = PDFResourceManager()

# set parameters for analysis
laparams = LAParams()

# Create a PDFDevice object which translates interpreted information into desired format
# Device needs to be connected to resource manager to store shared resources
# device = PDFDevice(rsrcmgr)
# Use the page aggregator as the device so LT layout objects can be read back
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create interpreter object to process page content from PDFDocument
# Interpreter needs to be connected to resource manager for shared resources and device 
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Ok now that we have everything to process a pdf document, lets process it page by page
for page in PDFPage.create_pages(document):
    # As the interpreter processes the page stored in PDFDocument object
    interpreter.process_page(page)
    # The device renders the layout from interpreter
    layout = device.get_result()
    # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            # Unlike processPDFfile, newlines are kept in the extracted text
            extracted_text += lt_obj.get_text()
            
#close the pdf file
fp.close()

# print (extracted_text.encode("utf-8"))
            
# Write the collected text to the log file as UTF-8 bytes
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")


FileNotFoundError: [Errno 2] No such file or directory: 'C://data/test.pdf'

## NLTK Tokenizing Functions

### Function: DD specific text cleaning

In [25]:
def ddCleanText(text):
    """Fuse Delta Dental names into single tokens so tokenizers keep them whole.

    The replacements run in order: 'Delta Dental' and 'DELTA DENTAL' lose
    their space first, which is why the company-name rule matches on the
    already-fused 'DeltaDental Insurance Company'.
    """
    replacements = (
        ('Delta Dental', 'DeltaDental'),
        ('DELTA DENTAL', 'DELTADENTAL'),
        ('DeltaDental Insurance Company', 'DeltaDentalInsuranceCompany'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text



### Function: Make tokenized word list

In [26]:
def getTokens(text):
    """Tokenize text into words, dropping English stopwords and punctuation.

    Filtering is an exact, case-sensitive membership test against NLTK's
    English stopword list plus the characters in string.punctuation.
    """
    excluded = set(stopwords.words('english') + list(punctuation))
    return [token for token in word_tokenize(text) if token not in excluded]

### Function: Make tokenized Sentence list

In [27]:
def getSents(text):
    """Split text into a list of sentences using NLTK's sent_tokenize."""
    return sent_tokenize(text)

### Function: Get Bigrams

In [28]:
def getBigrams(tokens):
    """Count the bigrams in a token list and return them most frequent first.

    Parameters:
        tokens: sequence of word tokens.

    Returns:
        A list of (bigram, count) pairs sorted by count, descending. The
        bigrams are the keys of the collocation finder's ngram_fd.
    """
    # The unused BigramAssocMeasures instance and the redundant list() copy
    # from the original were dropped; the ordering is unchanged.
    finder = BigramCollocationFinder.from_words(tokens)
    return sorted(finder.ngram_fd.items(), key=lambda item: item[-1], reverse=True)

### Function: Get Trigrams

In [29]:
def getTrigrams(tokens):
    """Count the trigrams in a token list and return them most frequent first.

    Parameters:
        tokens: sequence of word tokens.

    Returns:
        A list of (trigram, count) pairs sorted by count, descending. The
        trigrams are the keys of the collocation finder's ngram_fd.
    """
    # The unused TrigramAssocMeasures instance and the redundant list() copy
    # from the original were dropped; the ordering is unchanged.
    finder = TrigramCollocationFinder.from_words(tokens)
    return sorted(finder.ngram_fd.items(), key=lambda item: item[-1], reverse=True)

## MetaData and Attribute Functions

#### Filename
Contract Start
Contract End
Contract Duration
State
Delta Office Involved

#### Group Information
(Group Number)

#### Numeric attributes Only
Basics
Diagnostics
Major
Endo
Oral
Perio
Prostho
Ortho

In [30]:
##Establish a dataframe to capture the attributes
## (the DataFrame sketch below is disabled; only an empty dict is created for now)
#d = {'key': 'file','value':fileName}
#{'key':'state', 'value':state}
#df = pd.DataFrame(d, index=['uid'])
d = {}

### NLTK synonyms

#### I kept this to processing single words, bigrams or trigrams so as to keep the complexity down

#### Function: translate from syn POS to nltk POS

In [31]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to WordNet's ADJ, VERB, NOUN and ADV
    constants respectively; any other tag (including an empty one) maps to ''.

    Parameters:
        treebank_tag: POS tag string, e.g. 'NN' or 'VBD'.

    Returns:
        The WordNet POS constant, or '' when there is no mapping.
    """
    # Go through the explicitly imported `wn` alias rather than the bare
    # `wordnet` name, which was only in scope via `from nltk.book import *`.
    pos_by_prefix = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    return pos_by_prefix.get(treebank_tag[:1], '')

#### Function: Use POS to cull wordnet synonyms

In [32]:
def getSynonyms_usingPOS(word_tuple):
    """Return the WordNet lemma names for a (word, treebank_tag) pair.

    Synsets are restricted to the part of speech that get_wordnet_pos derives
    from the tag; the result is the set of lemma names across those synsets.
    """
    token = word_tuple[0]
    wordnet_pos = get_wordnet_pos(word_tuple[1])
    synsets = wn.synsets(token, pos=wordnet_pos)
    return {lemma.name() for synset in synsets for lemma in synset.lemmas()}

#### Function: get synonyms of a single word. Helper function to Bigram and Trigram function

In [33]:
## You can't cull this one down with the POS b/c you can't tag a single word
def getSyns(word):
    """Return the set of WordNet lemma names across every synset of word.

    No part-of-speech filter is applied (per the original author's note, a
    lone word cannot be tagged).
    """
    return {lemma.name() for synset in wn.synsets(word) for lemma in synset.lemmas()}

#### Function: Get a similar bigram

In [34]:
def getSimilarBigrams(word1, word2):
    """Build every two-word phrase made from synonyms of word1 and word2.

    Both words are POS-tagged together, then each is expanded with
    getSynonyms_usingPOS. A word with no synonyms stands in for itself.
    Returns the set of "first second" strings over all combinations.
    """
    tagged = nltk.pos_tag([word1, word2])

    first_options = getSynonyms_usingPOS(tagged[0]) or {word1}
    second_options = getSynonyms_usingPOS(tagged[1]) or {word2}

    return {
        " ".join([first, second])
        for first in first_options
        for second in second_options
    }

#### Function: get a similar trigram

In [35]:
def getSimilarTrigrams(word1, word2, word3):
    """Build every three-word phrase made from synonyms of the given words.

    The words are POS-tagged together, then each is expanded with
    getSynonyms_usingPOS. A word with no synonyms stands in for itself.
    Returns the set of "first second third" strings over all combinations.
    """
    tagged = nltk.pos_tag([word1, word2, word3])

    first_options = getSynonyms_usingPOS(tagged[0]) or {word1}
    second_options = getSynonyms_usingPOS(tagged[1]) or {word2}
    third_options = getSynonyms_usingPOS(tagged[2]) or {word3}

    return {
        " ".join([first, second, third])
        for first in first_options
        for second in second_options
        for third in third_options
    }
        

#### Function: Get synonyms from a list of key words. Returns more keywords/phrases

In [36]:
def getSynonymsFromList(keywords):
    """Expand keywords/phrases with WordNet-based synonym variants.

    Each keyword is tokenized; one-, two- and three-token keywords are
    expanded with getSyns, getSimilarBigrams and getSimilarTrigrams
    respectively. Keywords of any other length are reported and left
    unexpanded.

    Parameters:
        keywords: iterable of keyword strings. It is not modified.

    Returns:
        A set holding the original keywords plus every generated variant.
    """
    matches = []

    for kw in keywords:
        try:
            words = word_tokenize(kw)
        except Exception as e:
            # Skip this keyword: falling through would use an undefined
            # `words` (first iteration) or the previous keyword's tokens.
            print(str(e))
            continue

        if len(words) == 1:
            print("is 1")
            matches.extend(getSyns(words[0]))
        elif len(words) == 2:
            print("is 2")
            # Computed once; the original called getSimilarBigrams twice.
            matches.extend(getSimilarBigrams(words[0], words[1]))
        elif len(words) == 3:
            print("is 3")
            matches.extend(getSimilarTrigrams(words[0], words[1], words[2]))
        else:
            print("keyword string too long")

    # Build a new set rather than extending the caller's list in place, so
    # the input is left untouched and any iterable (not just a list) works.
    return set(keywords).union(matches)

#### POS tagging key

ADJ	adjective	new, good, high, special, big, local
ADV	adverb	really, already, still, early, now
CNJ	conjunction	and, or, but, if, while, although
DET	determiner	the, a, some, most, every, no
EX	existential	there, there's
FW	foreign word	dolce, ersatz, esprit, quo, maitre
MOD	modal verb	will, can, would, may, must, should
N	noun	year, home, costs, time, education
NP	proper noun	Alison, Africa, April, Washington
NUM	number	twenty-four, fourth, 1991, 14:24
PRO	pronoun	he, their, her, its, my, I, us
P	preposition	on, of, at, with, by, into, under
TO	the word to	to
UH	interjection	ah, bang, ha, whee, hmpf, oops
V	verb	is, has, get, do, make, see, run
VD	past tense	said, took, told, made, asked
VG	present participle	making, going, playing, working
VN	past participle	given, taken, begun, sung
WH	wh determiner	who, which, when, what, where, how

#### Attempt with using POS tagging in the synonyms to reduce extraneous syns
We would then have to manually tag all of the original phrases and words we use to seed the decisions.
It does seem to reduce the number of extraneous synonyms, though.

In [37]:
# Demo: expand the contract-start seed keywords with synonyms.
# The raw string keeps the regex escapes (\S, \s) literal without relying on
# Python passing unknown escape sequences through; the value is unchanged.
start_keywords = [r"effective date\S\s*\S", "effective"]
print(getSynonymsFromList(start_keywords))


is 2
is 1
{'effectual date\\S\\s*\\S', 'in_force', 'efficient date\\S\\s*\\S', 'efficacious', 'good date\\S\\s*\\S', 'efficient', 'in_effect date\\S\\s*\\S', 'good', 'effective', 'efficacious date\\S\\s*\\S', 'in_effect', 'effectual', 'in_force date\\S\\s*\\S', 'effective date\\S\\s*\\S'}


### Metadata

#### get file name
Get the filename from a full path. Determines the OS and splits the string correctly based on that

In [38]:
def getFileName(fullPath):
    """Return the last path component (the file name) of fullPath.

    On Windows (sys.platform == "win32") both "\\" and "/" are treated as
    separators; elsewhere only "/" is.

    Parameters:
        fullPath: path string.

    Returns:
        The text after the final separator, or the whole string when it
        contains no separator. A path ending in a separator yields "".
    """
    if platform == "win32":
        # Normalise forward slashes first so that paths such as
        # "C:/data/raw/a.docx" are split as well.
        return fullPath.replace("/", "\\").split("\\")[-1]
    return fullPath.split("/")[-1]

In [39]:
# Demo call; needs a module-level `filePath` (the captured output below shows
# it was not defined when this cell ran).
print(getFileName(filePath))

NameError: name 'filePath' is not defined

#### get group number
Uses regexes built from keywords to try to find a group number in the file. Failing that, it searches the filename for the number.

In [None]:
def getGroupNumber(sents_tokens, filePath):
    """Determine a contract's group number from its text and file name.

    Every sentence is searched (lower-cased) for "group number" or
    "groupnumber" followed by one non-word character, optional whitespace
    and digits. The first digit run in the file name is a second source.

    Selection order:
      1. the file-name number, if it also appears in the document;
      2. otherwise the most common candidate among the document numbers and
         the file-name number (if any).

    Parameters:
        sents_tokens: iterable of sentence strings.
        filePath: path of the source file; only its file name is inspected.

    Returns:
        The group number as a string of digits, or None if no candidate was
        found or the most common one could not be determined.
    """
    group_keywords = ["group number", "groupnumber"]
    # Raw string so \W, \s and \d reach the regex engine verbatim.
    regex_exps = [re.compile(kw + r"\W\s*(?P<gn>\d+)") for kw in group_keywords]

    # Collect every number that follows a group-number keyword in the text.
    poss_nums = []
    for sent in sents_tokens:
        lowered = sent.lower()
        for my_regex in regex_exps:
            result = my_regex.search(lowered)
            if result is not None:
                poss_nums.append(result.group('gn'))

    # First run of digits in the file name, if there is one.
    fileGN = re.search(r"\d+", getFileName(filePath))
    if fileGN is not None:
        if fileGN.group() in poss_nums:
            # File name and document agree: strongest signal.
            return fileGN.group()
        poss_nums.append(fileGN.group())

    # Previously numbers found in the document were thrown away whenever the
    # file name contained no digits; they are now used as the fallback.
    if not poss_nums:
        return None

    try:
        return mode(poss_nums)
    except ValueError:
        # statistics.StatisticsError subclasses ValueError; depending on the
        # Python version, mode() raises it when there is no unique mode.
        print("Unexpected error: Cannot determine group number of file: " + filePath)
        return None

#### get contract start
Uses regex and a list of keywords to attempt to find the start date of the contract. It makes multiple passes based on patterns seen in contract samples so far.

Some of the passes are necessary to filter out non-date numbers that the datefinder incorrectly parses to dates

In [40]:
def getContractStart(sents_tokens):
    """Try to find the contract start date in a list of sentence strings.

    Passes, in order:
      1. Keyword pass: for each sentence matching one of the start keywords
         (expanded through getSynonymsFromList and compiled as regexes), keep
         the first 20 word tokens from the match onwards as a candidate.
      2. Range pass: for sentences containing " through " / " thru ", keep the
         6 tokens either side of the match when datefinder finds at least two
         dates in them with year >= 1966.
      3. Filter pass: keep the dates datefinder finds in candidates that
         contain a four-digit run, again requiring year >= 1966.
      4. Pick statistics.mode of the kept dates, falling back to the earliest
         one when mode raises ValueError.

    Returns the chosen date object, or "" when nothing was found.
    """
    # NOTE(review): the regex escapes below live in non-raw string literals.
    start_keywords = ["effective date\S\s*\S", "effective"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    # Expand the seed keywords with synonyms; every result is used as a regex
    start_keywords = getSynonymsFromList(start_keywords)
    print(start_keywords)
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates
    for sent in sents_tokens:
        for my_regex in regex_exps:
            # Matching is done on the lower-cased sentence
            result = my_regex.search(sent.lower())
            if not result==None:
                # Slice the original-case sentence from where the match starts
                # and keep its first 20 word tokens
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:20]
                subset = " ".join(subset)
                poss_dates.append(subset)

                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                # The match begins one character before the whitespace, so
                # half_1 starts with the final character of the preceding word
                # and half_2 ends just before that character.
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                # Joined without a separator, which puts that split character
                # back next to the text it was cut from.
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                # Only keep the snippet if it holds at least two plausible dates
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                    
                    
## Third pass, over the candidate snippets, to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
## Last pass: try to find the most common date. If there is more than one mode, choose the earliest date
##.           this seems to occur when it is finding the contract start and end in equal quantities
## NOTE(review): since Python 3.8 statistics.mode returns the first mode instead of raising
## for multimodal data, so on those versions the earliest-date fallback is only reached when
## matches is empty (and then does nothing) -- confirm the target Python version.
    #print(matches)
    try:
        finalDate = mode(matches)
    except ValueError as e:
        #print(str(e))
        if matches:
            earliestMatch = matches[0]
            for match in matches:
                if(match < earliestMatch):
                    earliestMatch = match
            finalDate = earliestMatch
    except Exception as e:
        print(str(e))
  
    return finalDate

####  get Contract End
Similar to get contract start, it uses regex and keywords over multiple passes to attempt to find the contract end.

In [41]:
def getContractEnd(sents_tokens):
    """Try to find the contract end date in a list of sentence strings.

    Passes, in order:
      1. Keyword pass: for each sentence matching one of the end keywords
         (expanded through getSynonymsFromList and compiled as regexes), keep
         the first 25 word tokens from the match onwards as a candidate.
      2. Range pass: for sentences containing " through " / " thru ", keep the
         6 tokens either side of the match when datefinder finds at least two
         dates in them with year >= 1966.
      3. Filter pass: keep the dates datefinder finds in candidates that
         contain a four-digit run, again requiring year >= 1966.
      4. Selection: with exactly two dates take the later; with more than two
         take the later of two mode() picks, falling back to the latest
         remaining date when mode raises ValueError.

    Returns the chosen date object, or "" when nothing was found (including
    the case where exactly one date was kept, which no branch handles).
    """
    
    # NOTE(review): named start_keywords although these are end-of-contract
    # keywords; the regex escapes live in non-raw string literals.
    start_keywords = ["contract term\S\s*\S", "contract term ", "contract end"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    # Expand the seed keywords with synonyms; every result is used as a regex
    start_keywords = getSynonymsFromList(start_keywords)
    print(start_keywords)
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates based on keywords
    for sent in sents_tokens:
        for my_regex in regex_exps:
            # Matching is done on the lower-cased sentence
            result = my_regex.search(sent.lower())
            if not result==None:
                # Slice the original-case sentence from where the match starts
                # and keep its first 25 word tokens
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:25]
                subset = " ".join(subset)
                poss_dates.append(subset)
                
                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                # The match begins one character before the whitespace, so
                # half_1 starts with the final character of the preceding word
                # and half_2 ends just before that character.
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                # Joined without a separator, which puts that split character
                # back next to the text it was cut from.
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                # Only keep the snippet if it holds at least two plausible dates
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                
                
## Pass through sentences with possible dates to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        #print(sent)
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
                    
   
    #print(matches)
    
### If there are exactly two matches, try to find a max. If error b/c they're the same, choose one
### NOTE(review): max() does not raise for equal values, so the ValueError branch looks
### unreachable for a two-element list -- confirm what failure it was meant to cover.
    if(len(matches) == 2):
        try:
            finalDate = max(matches)
        except ValueError as e:
            finalDate = matches[0]
        except Exception as e:
            print(str(e))

## If there are more, try and find the top two most mentioned and take the later. else just take the latest            
    elif(len(matches) > 2):
        
        try:
            date1 = mode(matches)
            # NOTE(review): list.remove drops only the first occurrence of date1, so the
            # second mode() call can return the same date again rather than the
            # second most mentioned one -- confirm this is intended.
            matches.remove(date1)
            
            date2 = mode(matches)
            matches.remove(date2)
            
            finalDate = max([date1, date2])
        except ValueError as e:
            #print(str(e))
            # Fallback: latest date among whatever is still in matches (entries removed
            # above before the error are no longer considered)
            if matches:
                latestMatch = matches[0]
                for match in matches:
                    if(match > latestMatch):
                        latestMatch = match
                finalDate = latestMatch
        except Exception as e:
            print(str(e))
    #else:
        #print("could not find contract end for file")
    
    #print("\n")
    return finalDate

#### get Contract Duration
Uses the functions getContractStart and getContractEnd to calculate a duration if possible

In [None]:
def getContractDuration(sents_tokens):
    """Return contract end minus contract start, when both can be found.

    Uses getContractStart and getContractEnd on the same sentence list.
    Returns None if either lookup raises (after printing a message) or if
    either result is falsy.
    """
    try:
        start = getContractStart(sents_tokens)
    except Exception:
        print("Can't find contract start")
        return

    try:
        end = getContractEnd(sents_tokens)
    except Exception:
        print("Can't find contract end")
        return

    # Both helpers can return a falsy placeholder when nothing was found.
    if not (start and end):
        return None
    return end - start

#### get State/Location: -- Not Done

In [64]:
# Paths to the Stanford NER download: arg1 is the 3-class English classifier
# model, arg2 the stanford-ner jar.
# NOTE(review): these are absolute paths on one developer's machine and must
# be changed to run anywhere else.
arg1 = '/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz'
arg2 = '/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode/stanford-ner-2018-02-27/stanford-ner.jar'

# Module-level tagger used by getContractLocation
st = StanfordNERTagger(arg1, arg2)

def getContractLocation(sents_tokens):
    """Print the sentences in which the Stanford NER tagger finds a LOCATION.

    Work in progress: each sentence is word-tokenized and tagged with the
    module-level tagger `st`; every tag is echoed, and sentences with at
    least one LOCATION tag are collected into a set that is printed at the
    end. Nothing is returned.

    Earlier experiments with GeoText and nltk.ne_chunk were left disabled in
    the original cell and are not reproduced here.
    """
    loc_sents = set()
    for sent in sents_tokens:
        for tag in st.tag(nltk.word_tokenize(sent)):
            try:
                print("--->   " + str(tag))
                if tag[1] == 'LOCATION':
                    loc_sents.add(sent)
            except Exception as e:
                print(str(e))
    print(loc_sents)
    return

In [61]:
# Demo call; needs a module-level `sentsTokens` list of sentences (the
# captured output below shows it was not defined when this cell ran).
getContractLocation(sentsTokens)

NameError: name 'sentsTokens' is not defined

#### Delta Office Involved -- Not Done

In [45]:
def getDeltaOffice():
    """Placeholder for detecting the Delta office involved; not implemented.

    Always returns None.
    """
    return None

### Batch Run to get attributes
#### Functions to process multiple files and their attributes at once

#### Function: batch pre process: fill output folder

In [46]:
def batchPreProcess():
    """Pre-process every regular file directly inside ``<cwd>/data/raw``.

    The current working directory is printed first. For each file,
    checkFileType decides the handler: a result of 0 goes to processTextFile,
    1 goes to processPDFfile, and anything else is reported as an invalid
    file type. A failure on one file is printed, prefixed with the file name,
    and does not stop the batch.

    Returns:
        None. If ``data/raw`` does not exist a message is printed instead.
    """
    cwd = os.getcwd()
    print(cwd)

    dataPath = os.path.join(cwd, "data/raw")
    if not os.path.isdir(dataPath):
        print("data/raw doesn't exist")
        return

    for file in os.listdir(dataPath):
        filepath = os.path.join(dataPath, file)
        if not os.path.isfile(filepath):
            continue
        try:
            # Classify once and reuse the result for both comparisons.
            fileType = checkFileType(filepath)
            if fileType == 0:
                processTextFile(filepath)
            elif fileType == 1:
                processPDFfile(filepath)
            else:
                raise TypeError('This path does not lead to a valid file type!')
        except Exception as e:
            # Best-effort batch: report which file failed and keep going.
            print(file + ": " + str(e))

#### Function: Batch return token and bigram sets for all output files
Returns a list of dictionaries, one per processed file, each holding key:value information about that file (`filepath`, `cleanText`, `wordTokens`, `sentTokens`, `bgs`, `tgs`). Illustrative shape:

In [47]:
[ 
    {
        'filepath':'users/sydneyknox...', 
        'wordTokens':[*tokens*], 
        ...  
    }, 
    {  
        'sentenceTokens':[*tokens*],
        'cleanText':"string containing the original text from the processed file..."
    }
]

SyntaxError: invalid syntax (<ipython-input-47-479752cbe912>, line 4)

In [48]:
def batchGetTokens():
    """Tokenize every regular file directly inside ``<cwd>/data/output``.

    The data path is printed first. Each file is read as text, cleaned with
    ddCleanText, and turned into word tokens, sentence tokens, bigrams and
    trigrams by the notebook's helper functions.

    Returns:
        A list with one dict per successfully processed file, with the keys
        'filepath', 'cleanText', 'wordTokens', 'sentTokens', 'bgs' and 'tgs'.
        Files that fail are reported on stdout, with the reason, and skipped.
        The list is empty when ``data/output`` does not exist.
    """
    all_tokens = []
    dataPath = os.path.join(os.getcwd(), "data/output")
    print(dataPath)

    if not os.path.isdir(dataPath):
        print("data/output doesn't exist")
        return all_tokens

    for file in os.listdir(dataPath):
        filepath = os.path.join(dataPath, file)
        if not os.path.isfile(filepath):
            continue
        try:
            # The with-block closes the file; no explicit close() is needed.
            with open(filepath, 'r') as txtFile:
                text = txtFile.read()

            text = ddCleanText(text)
            wordTokens = getTokens(text)
            all_tokens.append({
                'filepath': filepath,
                'cleanText': text,
                'wordTokens': wordTokens,
                'sentTokens': getSents(text),
                'bgs': getBigrams(wordTokens),
                'tgs': getTrigrams(wordTokens),
            })
        except Exception as e:
            # Best-effort batch: say which file failed and why, then continue.
            print("Error opening and tokenizing " + file + ": " + str(e))
    return all_tokens

#### Function: get metadata attributes
This function takes in a single file's info. It lives in this section because it will be used by a batch function.

In [49]:
def getMetaDataAtt(file_info):
    """Print and collect metadata attributes for one file.

    Args:
        file_info: dict for a single file; the 'filepath' and 'sentTokens'
            entries are read here.

    Returns:
        A one-row pandas DataFrame, indexed 'MetaData', with columns 'key'
        and 'value' holding the file name. The contract start and end dates
        are only printed (when truthy), not stored.
    """
    name = getFileName(file_info['filepath'])
    print(name)

    sentences = file_info['sentTokens']

    startDate = getContractStart(sentences)
    if startDate:
        print("start: " , startDate)

    endDate = getContractEnd(sentences)
    if endDate:
        print("end: " , endDate)

    # Group number, contract duration and the extra DataFrame rows for
    # group_number / contract_start_date are not wired in yet.
    return pd.DataFrame({'key': 'filename', 'value': name}, index=['MetaData'])

### Workspace

#### Chunking/ POS practice

In [50]:
# Practice: print every chunk in the Brown corpus matching a verb tag
# followed by the tag 'to'.
cp = nltk.RegexpParser('CHUNK: {<V.* to>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    # Let subtrees() do the label filtering instead of an inline if.
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'CHUNK'):
        print(subtree)

##### Practice with synonyms

In [51]:
def testSyns(sentTokens, word1, word2):
    """Practice helper: build synonym phrases for a two-word term.

    All WordNet lemma names for word1 are paired with all lemma names for
    word2 to form lower-cased "lemma1 lemma2" phrases. Each sentence is then
    word-tokenized, but the step that would compare adjacent word pairs
    against those phrases is still disabled, so nothing is printed.

    Args:
        sentTokens: iterable of sentence strings.
        word1: first word of the term.
        word2: second word of the term.

    Returns:
        None.
    """
    first_synsets = wn.synsets(word1)
    second_synsets = wn.synsets(word2)

    first_lemmas = {lem.name() for syn in first_synsets for lem in syn.lemmas()}
    second_lemmas = {lem.name() for syn in second_synsets for lem in syn.lemmas()}

    # Every "lemma1 lemma2" combination, lower-cased.
    phrases = {
        " ".join([left, right]).lower()
        for left in first_lemmas
        for right in second_lemmas
    }

    for sent in sentTokens:
        # Tokenization is kept; the sliding-window lookup of adjacent word
        # pairs in `phrases` has not been enabled yet.
        words = word_tokenize(sent)
    return
            
#testSyns(sentTokens, "effective", "date")            


## workspace: batch process files

In [52]:
# Workspace: run the pre-processing and tokenizing pipeline on one hard-coded
# contract file, leaving the results in notebook globals for later cells.
filePath = "./data/raw/TX 17404 Contract Regional (7.2.18).docx"
processedTextPath = ""

# checkFileType result 0 -> processTextFile, 1 -> processPDFfile; anything
# else aborts the cell with a TypeError.
if(checkFileType(filePath) == 0):
    processedTextPath = processTextFile(filePath)
elif(checkFileType(filePath) == 1):
    processedTextPath = processPDFfile(filePath)
else:
    raise TypeError("File type incorrect") 

#print(processedTextPath)
# The handler's return value is used as the path of the file to read back.
with open(processedTextPath, 'r') as txtFile:
            text = txtFile.read()


# Clean the text, then derive word tokens, sentence tokens, bigrams and
# trigrams from it using the notebook's helper functions.
text = ddCleanText(text)
wordTokens = getTokens(text)
sentTokens = getSents(text)
#print(sentTokens)
bgs = getBigrams(wordTokens)
tgs = getTrigrams(wordTokens)


#### Call PreProcess

In [53]:
# Run the batch pre-processing over data/raw; per-file errors are printed below.
batchPreProcess()

/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode
This path does not lead to a valid file type!


#### Call the tokenizing functions to get organized data for all the files in the pre-processed folder

In [54]:
# base_info: list returned by batchGetTokens, one entry per file it could tokenize.
base_info = batchGetTokens()

/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode/data/output
Error opening and tokenizing .DS_Store


#### Call the functions to begin extracting and storing the data from all files

In [42]:
#for file in base_info:
 #   dfMD = getMetaDataAtt(file)
    #testSyns(file['sentTokens'], "group","number")
    
    #transpose=dfMD.transpose()
    #print(transpose)
    #print(dfMD)

In [65]:
# Tag the sentence tokens (sentTokens) of the single workspace file; the
# tagger output for each token is printed below.
getContractLocation(sentTokens)

--->   ('DeltaDentalInsuranceCompany', 'O')
--->   ('DeltaDentalInsuranceCompany', 'O')
--->   ('Contract', 'O')
--->   ('Delivery', 'O')
--->   ('Receipt', 'O')
--->   ('Contract', 'O')
--->   ('#', 'O')
--->   ('17404', 'O')
--->   ('For', 'O')
--->   ('CAMERON', 'O')
--->   ('COUNTY', 'O')
--->   ('I', 'O')
--->   ('acknowledge', 'O')
--->   ('receipt', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('above', 'O')
--->   ('contract', 'O')
--->   ('on', 'O')
--->   ('at', 'O')
--->   ('(', 'O')
--->   ('month', 'O')
--->   ('day', 'O')
--->   ('year', 'O')
--->   (')', 'O')
--->   ('(', 'O')
--->   ('city', 'O')
--->   ('state', 'O')
--->   (')', 'O')
--->   ('(', 'O')
--->   ('print', 'O')
--->   ('name', 'O')
--->   (')', 'O')
--->   ('(', 'O')
--->   ('signature', 'O')
--->   (')', 'O')
--->   ('Thank', 'O')
--->   ('you', 'O')
--->   ('for', 'O')
--->   ('choosing', 'O')
--->   ('DeltaDentalInsuranceCompany', 'O')
--->   ('.', 'O')
--->   ('DELTADENTAL', 'O')
--->   ('INSURAN

--->   ('THE', 'O')
--->   ('EMPLOYER', 'O')
--->   ('MUST', 'O')
--->   ('COMPLY', 'O')
--->   ('WITH', 'O')
--->   ('THE', 'O')
--->   ('WORKERS', 'O')
--->   ('’', 'O')
--->   ('COMPENSATION', 'O')
--->   ('LAW', 'O')
--->   ('AS', 'O')
--->   ('IT', 'O')
--->   ('PERTAINS', 'O')
--->   ('TO', 'O')
--->   ('NON-SUBSCRIBERS', 'O')
--->   ('AND', 'O')
--->   ('THE', 'O')
--->   ('REQUIRED', 'O')
--->   ('NOTIFICATIONS', 'O')
--->   ('THAT', 'O')
--->   ('MUST', 'O')
--->   ('BE', 'O')
--->   ('FILED', 'O')
--->   ('AND', 'O')
--->   ('POSTED', 'O')
--->   ('.', 'O')
--->   ('TEXAS', 'LOCATION')
--->   ('NOTICE', 'O')
--->   ('OF', 'O')
--->   ('COMPLAINT', 'O')
--->   ('IMPORTANT', 'O')
--->   ('NOTICE', 'O')
--->   ('To', 'O')
--->   ('obtain', 'O')
--->   ('information', 'O')
--->   ('or', 'O')
--->   ('make', 'O')
--->   ('a', 'O')
--->   ('complaint', 'O')
--->   (':', 'O')
--->   ('You', 'O')
--->   ('may', 'O')
--->   ('call', 'O')
--->   ('DeltaDentalInsuranceCompany', 'O')
---

--->   ('TABLE', 'O')
--->   ('OF', 'O')
--->   ('CONTENTS', 'O')
--->   ('ARTICLE', 'O')
--->   ('1', 'O')
--->   ('DEFINITIONS', 'O')
--->   ('ARTICLE', 'O')
--->   ('2', 'O')
--->   ('ELIGIBILITY', 'O')
--->   ('AND', 'O')
--->   ('ENROLLMENT', 'O')
--->   ('ARTICLE', 'O')
--->   ('3', 'O')
--->   ('MONTHLY', 'O')
--->   ('PREMIUMS', 'O')
--->   ('ARTICLE', 'O')
--->   ('4', 'O')
--->   ('BENEFITS', 'O')
--->   ('LIMITATIONS', 'O')
--->   ('AND', 'O')
--->   ('EXCLUSIONS', 'O')
--->   ('ARTICLE', 'O')
--->   ('5', 'O')
--->   ('DEDUCTIBLE', 'O')
--->   ('MAXIMUM', 'O')
--->   ('&', 'O')
--->   ('COORDINATION', 'O')
--->   ('OF', 'O')
--->   ('BENEFITS', 'O')
--->   ('ARTICLE', 'O')
--->   ('6', 'O')
--->   ('CONDITIONS', 'O')
--->   ('UNDER', 'O')
--->   ('WHICH', 'O')
--->   ('BENEFITS', 'O')
--->   ('WILL', 'O')
--->   ('BE', 'O')
--->   ('PROVIDED', 'O')
--->   ('ARTICLE', 'O')
--->   ('7', 'O')
--->   ('GENERAL', 'O')
--->   ('PROVISIONS', 'O')
--->   ('ARTICLE', 'O')
--->   ('8

--->   ('1.10', 'O')
--->   ('Contracting', 'O')
--->   ('Dentist', 'O')
--->   ('Agreement', 'O')
--->   ('--', 'O')
--->   ('an', 'O')
--->   ('agreement', 'O')
--->   ('between', 'O')
--->   ('DeltaDental', 'O')
--->   ('and', 'O')
--->   ('a', 'O')
--->   ('Dentist', 'O')
--->   ('that', 'O')
--->   ('establishes', 'O')
--->   ('the', 'O')
--->   ('terms', 'O')
--->   ('and', 'O')
--->   ('conditions', 'O')
--->   ('under', 'O')
--->   ('which', 'O')
--->   ('services', 'O')
--->   ('are', 'O')
--->   ('provided', 'O')
--->   ('.', 'O')
--->   ('1.11', 'O')
--->   ('DPO', 'O')
--->   ('Dentist', 'O')
--->   ('--', 'O')
--->   ('a', 'O')
--->   ('contracting', 'O')
--->   ('DeltaDental', 'PERSON')
--->   ('Dentist', 'PERSON')
--->   ('who', 'O')
--->   ('agrees', 'O')
--->   ('to', 'O')
--->   ('accept', 'O')
--->   ('DPO', 'O')
--->   ('Dentist', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('Fees', 'O')
--->   ('as', 'O')
--->   ('payment', 'O')
--->   ('in', 'O')
--->   ('full

--->   ('1.23', 'O')
--->   ('Open', 'O')
--->   ('Enrollment', 'O')
--->   ('Period', 'O')
--->   ('--', 'O')
--->   ('the', 'O')
--->   ('month', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('year', 'O')
--->   ('during', 'O')
--->   ('which', 'O')
--->   ('employees', 'O')
--->   ('may', 'O')
--->   ('change', 'O')
--->   ('coverage', 'O')
--->   ('for', 'O')
--->   ('the', 'O')
--->   ('next', 'O')
--->   ('Contract', 'O')
--->   ('Year', 'O')
--->   ('.', 'O')
--->   ('1.24', 'O')
--->   ('Pre-Treatment', 'O')
--->   ('Estimate', 'O')
--->   ('--', 'O')
--->   ('an', 'O')
--->   ('estimation', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('allowable', 'O')
--->   ('Benefits', 'O')
--->   ('under', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('for', 'O')
--->   ('the', 'O')
--->   ('services', 'O')
--->   ('proposed', 'O')
--->   ('assuming', 'O')
--->   ('the', 'O')
--->   ('patient', 'O')
--->   ('is', 'O')
--->   ('eligible', 'O')
--->   ('.', 'O')
---

--->   ('Said', 'O')
--->   ('termination', 'O')
--->   ('date', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('adjusted', 'O')
--->   ('retroactively', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('immediately', 'O')
--->   ('preceding', 'O')
--->   ('3', 'O')
--->   ('months', 'O')
--->   ('plus', 'O')
--->   ('the', 'O')
--->   ('current', 'O')
--->   ('month', 'O')
--->   ('provided', 'O')
--->   (':', 'O')
--->   ('a', 'O')
--->   (')', 'O')
--->   ('no', 'O')
--->   ('claims', 'O')
--->   ('were', 'O')
--->   ('submitted', 'O')
--->   ('to', 'O')
--->   ('be', 'O')
--->   ('processed', 'O')
--->   ('on', 'O')
--->   ('said', 'O')
--->   ('Enrollee', 'O')
--->   ('subsequent', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('date', 'O')
--->   ('of', 'O')
--->   ('retroactive', 'O')
--->   ('termination', 'O')
--->   (';', 'O')
--->   ('and', 'O')
--->   ('b', 'O')
--->   (')', 'O')
--->   ('Premiums', 'O')
--->   ('were', 'O')
--->   ('actually', 'O')
--->   ('p

--->   ('Coverage', 'O')
--->   ('for', 'O')
--->   ('said', 'O')
--->   ('grandchild', 'O')
--->   ('may', 'O')
--->   ('not', 'O')
--->   ('be', 'O')
--->   ('terminated', 'O')
--->   ('solely', 'O')
--->   ('because', 'O')
--->   ('the', 'O')
--->   ('grandchild', 'O')
--->   ('is', 'O')
--->   ('no', 'O')
--->   ('longer', 'O')
--->   ('dependent', 'O')
--->   ('upon', 'O')
--->   ('the', 'O')
--->   ('Eligible', 'O')
--->   ('Person', 'O')
--->   ('for', 'O')
--->   ('federal', 'O')
--->   ('income', 'O')
--->   ('tax', 'O')
--->   ('purposes', 'O')
--->   ('.', 'O')
--->   ('Children', 'O')
--->   ('including', 'O')
--->   ('grandchildren', 'O')
--->   ('under', 'O')
--->   ('26', 'O')
--->   ('years', 'O')
--->   ('of', 'O')
--->   ('age', 'O')
--->   ('for', 'O')
--->   ('whom', 'O')
--->   ('the', 'O')
--->   ('Eligible', 'O')
--->   ('Person', 'O')
--->   ('is', 'O')
--->   ('required', 'O')
--->   ('to', 'O')
--->   ('insure', 'O')
--->   ('under', 'O')
--->   ('a', 'O')
---

--->   ('2.06', 'O')
--->   ('Except', 'O')
--->   ('for', 'O')
--->   ('an', 'O')
--->   ('employee', 'O')
--->   ('absent', 'O')
--->   ('from', 'O')
--->   ('work', 'O')
--->   ('due', 'O')
--->   ('to', 'O')
--->   ('a', 'O')
--->   ('leave', 'O')
--->   ('of', 'O')
--->   ('absence', 'O')
--->   ('governed', 'O')
--->   ('by', 'O')
--->   ('the', 'O')
--->   ('“', 'O')
--->   ('Family', 'O')
--->   ('&', 'O')
--->   ('Medical', 'O')
--->   ('Leave', 'O')
--->   ('Act', 'O')
--->   ('of', 'O')
--->   ('1993', 'O')
--->   ('”', 'O')
--->   ('(', 'O')
--->   ('P.L', 'O')
--->   ('.', 'O')
--->   ('103.3', 'O')
--->   (')', 'O')
--->   ('an', 'O')
--->   ('Enrollee', 'O')
--->   ('will', 'O')
--->   ('not', 'O')
--->   ('be', 'O')
--->   ('covered', 'O')
--->   ('for', 'O')
--->   ('any', 'O')
--->   ('dental', 'O')
--->   ('services', 'O')
--->   ('received', 'O')
--->   ('while', 'O')
--->   ('a', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('is', 'O')
--->   ('on',

--->   ('Continuation', 'O')
--->   ('of', 'O')
--->   ('coverage', 'O')
--->   ('under', 'O')
--->   ('USERRA', 'LOCATION')
--->   ('may', 'O')
--->   ('not', 'O')
--->   ('extend', 'O')
--->   ('beyond', 'O')
--->   ('the', 'O')
--->   ('earlier', 'O')
--->   ('of', 'O')
--->   (':', 'O')
--->   ('24', 'O')
--->   ('months', 'O')
--->   ('beginning', 'O')
--->   ('on', 'O')
--->   ('the', 'O')
--->   ('date', 'O')
--->   ('the', 'O')
--->   ('leave', 'O')
--->   ('of', 'O')
--->   ('absence', 'O')
--->   ('begins', 'O')
--->   ('or', 'O')
--->   ('the', 'O')
--->   ('date', 'O')
--->   ('the', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('fails', 'O')
--->   ('to', 'O')
--->   ('return', 'O')
--->   ('to', 'O')
--->   ('work', 'O')
--->   ('within', 'O')
--->   ('the', 'O')
--->   ('time', 'O')
--->   ('required', 'O')
--->   ('by', 'O')
--->   ('USERRA', 'ORGANIZATION')
--->   ('.', 'O')
--->   ('For', 'O')
--->   ('USERRA', 'LOCATION')
--->   ('leave', 'O')
--->   

--->   ('But', 'O')
--->   ('it', 'O')
--->   ('does', 'O')
--->   ('not', 'O')
--->   ('include', 'O')
--->   ('the', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('new', 'O')
--->   ('spouse', 'O')
--->   ('stepchild', 'O')
--->   ('or', 'O')
--->   ('foster', 'O')
--->   ('child', 'O')
--->   ('acquired', 'O')
--->   ('during', 'O')
--->   ('the', 'O')
--->   ('continuation', 'O')
--->   ('period', 'O')
--->   ('whether', 'O')
--->   ('or', 'O')
--->   ('not', 'O')
--->   ('the', 'O')
--->   ('new', 'O')
--->   ('Dependent', 'O')
--->   ('is', 'O')
--->   ('enrolled', 'O')
--->   ('for', 'O')
--->   ('coverage', 'O')
--->   ('.', 'O')
--->   ('b', 'O')
--->   (')', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('remains', 'O')
--->   ('in', 'O')
--->   ('force', 'O')
--->   ('.', 'O')
--->   ('Continuation', 'O')
--->   ('Periods', 'O')
--->   ('.', 'O')
--->   ('The', 'O')
--->   ('maximum', 'O')
--->   ('period', 'O')

--->   ('c', 'O')
--->   (')', 'O')
--->   ('Subsequent', 'O')
--->   ('Qualifying', 'O')
--->   ('Event', 'O')
--->   ('.', 'O')
--->   ('If', 'O')
--->   ('the', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('Dependent', 'O')
--->   (':', 'O')
--->   ('i', 'O')
--->   (')', 'O')
--->   ('is', 'O')
--->   ('a', 'O')
--->   ('Qualified', 'O')
--->   ('Beneficiary', 'O')
--->   (';', 'O')
--->   ('and', 'O')
--->   ('ii', 'O')
--->   (')', 'O')
--->   ('has', 'O')
--->   ('a', 'O')
--->   ('subsequent', 'O')
--->   ('Qualifying', 'O')
--->   ('Event', 'O')
--->   ('during', 'O')
--->   ('the', 'O')
--->   ('18', 'O')
--->   ('or', 'O')
--->   ('29', 'O')
--->   ('month', 'O')
--->   ('continuation', 'O')
--->   ('period', 'O')
--->   (';', 'O')
--->   ('then', 'O')
--->   ('coverage', 'O')
--->   ('for', 'O')
--->   ('that', 'O')
--->   ('Dependent', 'O')
--->   ('Enrollee', 'O')
--->   ('may', 'O')
--->   ('be', 'O')
--->   ('continue

--->   ('(', 'O')
--->   ('2', 'O')
--->   (')', 'O')
--->   ('Continued', 'O')
--->   ('coverage', 'O')
--->   ('elected', 'O')
--->   ('under', 'O')
--->   ('this', 'O')
--->   ('section', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('effective', 'O')
--->   ('on', 'O')
--->   ('the', 'O')
--->   ('date', 'O')
--->   ('after', 'O')
--->   ('the', 'O')
--->   ('person', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('coverage', 'O')
--->   ('under', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('would', 'O')
--->   ('otherwise', 'O')
--->   ('terminate', 'O')
--->   ('due', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('occurrence', 'O')
--->   ('of', 'O')
--->   ('a', 'O')
--->   ('Qualifying', 'O')
--->   ('Event', 'O')
--->   ('provided', 'O')
--->   (':', 'O')
--->   ('a', 'O')
--->   (')', 'O')
--->   ('the', 'O')
--->   ('person', 'O')
--->   ('has', 'O')
--->   ('notified', 'O')
--->   ('the', 'O')
--->   ('Contractholder', 'O')
--->   ('within', 'O

--->   ('This', 'O')
--->   ('Contract', 'O')
--->   ('will', 'O')
--->   ('continue', 'O')
--->   ('in', 'O')
--->   ('force', 'O')
--->   ('during', 'O')
--->   ('this', 'O')
--->   ('period', 'O')
--->   (';', 'O')
--->   ('if', 'O')
--->   ('the', 'O')
--->   ('Premium', 'O')
--->   ('remains', 'O')
--->   ('unpaid', 'O')
--->   ('at', 'O')
--->   ('the', 'O')
--->   ('end', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('grace', 'O')
--->   ('period', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('may', 'O')
--->   ('be', 'O')
--->   ('terminated', 'O')
--->   ('by', 'O')
--->   ('DeltaDental', 'PERSON')
--->   ('in', 'O')
--->   ('accordance', 'O')
--->   ('with', 'O')
--->   ('the', 'O')
--->   ('notice', 'O')
--->   ('requirements', 'O')
--->   ('of', 'O')
--->   ('Section', 'O')
--->   ('8.01', 'O')
--->   ('.', 'O')
--->   ('3.03', 'O')
--->   ('If', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('is', 'O')
--->   ('terminated', 'O')
--->   ('bef

--->   ('Additional', 'O')
--->   ('eligibility', 'O')
--->   ('periods', 'O')
--->   ('if', 'O')
--->   ('any', 'O')
--->   ('for', 'O')
--->   ('specific', 'O')
--->   ('services', 'O')
--->   ('are', 'O')
--->   ('shown', 'O')
--->   ('in', 'O')
--->   ('Appendix', 'O')
--->   ('A', 'O')
--->   ('.', 'O')
--->   ('If', 'O')
--->   ('an', 'O')
--->   ('Enrollee', 'O')
--->   ('receives', 'O')
--->   ('dental', 'O')
--->   ('services', 'O')
--->   ('from', 'O')
--->   ('a', 'O')
--->   ('Dentist', 'O')
--->   ('outside', 'O')
--->   ('the', 'O')
--->   ('state', 'O')
--->   ('of', 'O')
--->   ('Texas', 'LOCATION')
--->   ('the', 'O')
--->   ('Dentists', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('reimbursed', 'O')
--->   ('according', 'O')
--->   ('to', 'O')
--->   ('DeltaDental', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('network', 'O')
--->   ('payment', 'O')
--->   ('provisions', 'O')
--->   ('for', 'O')
--->   ('said', 'O')
--->   ('state', 'O')
--->   ('according

--->   ('Basic', 'O')
--->   ('Benefits', 'O')
--->   ('(', 'O')
--->   ('1', 'O')
--->   (')', 'O')
--->   ('Oral', 'O')
--->   ('Surgery', 'O')
--->   (':', 'O')
--->   ('extractions', 'O')
--->   ('and', 'O')
--->   ('other', 'O')
--->   ('surgical', 'O')
--->   ('procedures', 'O')
--->   ('(', 'O')
--->   ('including', 'O')
--->   ('pre-and', 'O')
--->   ('post-operative', 'O')
--->   ('care', 'O')
--->   (')', 'O')
--->   ('.', 'O')
--->   ('(', 'O')
--->   ('2', 'O')
--->   (')', 'O')
--->   ('General', 'O')
--->   ('Anesthesia', 'O')
--->   ('or', 'O')
--->   ('IV', 'O')
--->   ('Sedation', 'O')
--->   (':', 'O')
--->   ('when', 'O')
--->   ('administered', 'O')
--->   ('by', 'O')
--->   ('a', 'O')
--->   ('Dentist', 'O')
--->   ('for', 'O')
--->   ('covered', 'O')
--->   ('oral', 'O')
--->   ('surgery', 'O')
--->   ('or', 'O')
--->   ('selected', 'O')
--->   ('endodontic', 'O')
--->   ('and', 'O')
--->   ('periodontal', 'O')
--->   ('surgical', 'O')
--->   ('procedures', 'O')
-

--->   ('Optional', 'ORGANIZATION')
--->   ('Services', 'ORGANIZATION')
--->   ('also', 'O')
--->   ('include', 'O')
--->   ('the', 'O')
--->   ('use', 'O')
--->   ('of', 'O')
--->   ('specialized', 'O')
--->   ('techniques', 'O')
--->   ('instead', 'O')
--->   ('of', 'O')
--->   ('standard', 'O')
--->   ('procedures', 'O')
--->   ('.', 'O')
--->   ('For', 'O')
--->   ('example', 'O')
--->   (':', 'O')
--->   ('a', 'O')
--->   ('crown', 'O')
--->   ('where', 'O')
--->   ('a', 'O')
--->   ('filling', 'O')
--->   ('would', 'O')
--->   ('restore', 'O')
--->   ('the', 'O')
--->   ('tooth', 'O')
--->   (';', 'O')
--->   ('a', 'O')
--->   ('precision', 'O')
--->   ('denturepartial', 'O')
--->   ('where', 'O')
--->   ('a', 'O')
--->   ('standard', 'O')
--->   ('denturepartial', 'O')
--->   ('could', 'O')
--->   ('be', 'O')
--->   ('used', 'O')
--->   (';', 'O')
--->   ('an', 'O')
--->   ('inlayonlay', 'O')
--->   ('instead', 'O')
--->   ('of', 'O')
--->   ('an', 'O')
--->   ('amalgam', 'O')
-

--->   ('(', 'O')
--->   ('3', 'O')
--->   (')', 'O')
--->   ('Prefabricated', 'O')
--->   ('stainless', 'O')
--->   ('steel', 'O')
--->   ('restorations', 'O')
--->   ('are', 'O')
--->   ('limited', 'O')
--->   ('to', 'O')
--->   ('once', 'O')
--->   ('in', 'O')
--->   ('a', 'O')
--->   ('12', 'O')
--->   ('month', 'O')
--->   ('period', 'O')
--->   ('if', 'O')
--->   ('the', 'O')
--->   ('service', 'O')
--->   ('is', 'O')
--->   ('provided', 'O')
--->   ('by', 'O')
--->   ('the', 'O')
--->   ('same', 'O')
--->   ('Dentist', 'O')
--->   ('.', 'O')
--->   ('(', 'O')
--->   ('4', 'O')
--->   (')', 'O')
--->   ('DeltaDental', 'O')
--->   ('limits', 'O')
--->   ('payment', 'O')
--->   ('for', 'O')
--->   ('stainless', 'O')
--->   ('steel', 'O')
--->   ('crowns', 'O')
--->   ('under', 'O')
--->   ('this', 'O')
--->   ('section', 'O')
--->   ('to', 'O')
--->   ('services', 'O')
--->   ('on', 'O')
--->   ('baby', 'O')
--->   ('teeth', 'O')
--->   ('.', 'O')
--->   ('However', 'O')
--->   ('a

--->   ('Benefits', 'O')
--->   ('are', 'O')
--->   ('not', 'O')
--->   ('provided', 'O')
--->   ('for', 'O')
--->   ('orthodontic', 'O')
--->   ('retreatment', 'O')
--->   ('procedures', 'O')
--->   ('.', 'O')
--->   ('(', 'O')
--->   ('5', 'O')
--->   (')', 'O')
--->   ('Orthodontic', 'O')
--->   ('Benefits', 'O')
--->   ('are', 'O')
--->   ('limited', 'O')
--->   ('to', 'O')
--->   ('Dependent', 'O')
--->   ('Child', 'O')
--->   ('Enrollees', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('end', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('month', 'O')
--->   ('of', 'O')
--->   ('their', 'O')
--->   ('19th', 'O')
--->   ('birthday', 'O')
--->   ('.', 'O')
--->   ('(', 'O')
--->   ('6', 'O')
--->   (')', 'O')
--->   ('Non-orthodontic', 'O')
--->   ('procedures', 'O')
--->   ('performed', 'O')
--->   ('for', 'O')
--->   ('the', 'O')
--->   ('purpose', 'O')
--->   ('of', 'O')
--->   ('orthodontic', 'O')
--->   ('treatment', 'O')
--->   ('are', 'O')
--->   ('subject', 'O')


--->   ('For', 'O')
--->   ('example', 'O')
--->   (':', 'O')
--->   ('equilibration', 'O')
--->   ('periodontal', 'O')
--->   ('splinting', 'O')
--->   ('occlusal', 'O')
--->   ('adjustment', 'O')
--->   ('.', 'O')
--->   ('any', 'O')
--->   ('Single', 'O')
--->   ('Procedure', 'O')
--->   ('started', 'O')
--->   ('prior', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('date', 'O')
--->   ('the', 'O')
--->   ('patient', 'O')
--->   ('became', 'O')
--->   ('eligible', 'O')
--->   ('for', 'O')
--->   ('services', 'O')
--->   ('under', 'O')
--->   ('this', 'O')
--->   ('program', 'O')
--->   ('.', 'O')
--->   ('prescribed', 'O')
--->   ('drugs', 'O')
--->   ('medication', 'O')
--->   ('pain', 'O')
--->   ('killers', 'O')
--->   ('or', 'O')
--->   ('experimental', 'O')
--->   ('procedures', 'O')
--->   ('.', 'O')
--->   ('charges', 'O')
--->   ('by', 'O')
--->   ('any', 'O')
--->   ('hospital', 'O')
--->   ('or', 'O')
--->   ('other', 'O')
--->   ('surgical', 'O')
--->   ('or', 'O')


--->   ('The', 'O')
--->   ('reduction', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('the', 'O')
--->   ('amount', 'O')
--->   ('paid', 'O')
--->   ('for', 'O')
--->   ('or', 'O')
--->   ('provided', 'O')
--->   ('under', 'O')
--->   ('the', 'O')
--->   ('terms', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('primary', 'O')
--->   ('plan', 'O')
--->   ('for', 'O')
--->   ('covered', 'O')
--->   ('services', 'O')
--->   ('under', 'O')
--->   ('Article', 'O')
--->   ('4', 'O')
--->   ('.', 'O')
--->   ('Order', 'O')
--->   ('of', 'O')
--->   ('Benefit', 'O')
--->   ('Determination', 'O')
--->   ('Rules', 'O')
--->   (':', 'O')
--->   ('The', 'O')
--->   ('following', 'O')
--->   ('rules', 'O')
--->   ('determine', 'O')
--->   ('which', 'O')
--->   ('is', 'O')
--->   ('the', 'O')
--->   ('“', 'O')
--->   ('primary', 'O')
--->   ('”', 'O')
--->   ('plan', 'O')
--->   (':', 'O')
--->   ('If', 'O')
--->   ('the', 'O')
--->   ('other', 'O')
--->   ('Plan', 'O')
--->   ('is', 'O

--->   ('(', 'O')
--->   ('5', 'O')
--->   (')', 'O')
--->   ('If', 'O')
--->   ('the', 'O')
--->   ('specific', 'O')
--->   ('terms', 'O')
--->   ('of', 'O')
--->   ('a', 'O')
--->   ('court', 'O')
--->   ('decree', 'O')
--->   ('state', 'O')
--->   ('that', 'O')
--->   ('the', 'O')
--->   ('parents', 'O')
--->   ('will', 'O')
--->   ('share', 'O')
--->   ('joint', 'O')
--->   ('custody', 'O')
--->   ('without', 'O')
--->   ('stating', 'O')
--->   ('that', 'O')
--->   ('one', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('parents', 'O')
--->   ('is', 'O')
--->   ('responsible', 'O')
--->   ('for', 'O')
--->   ('the', 'O')
--->   ('health', 'O')
--->   ('care', 'O')
--->   ('expenses', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('child', 'O')
--->   ('the', 'O')
--->   ('Plans', 'O')
--->   ('covering', 'O')
--->   ('the', 'O')
--->   ('child', 'O')
--->   ('will', 'O')
--->   ('follow', 'O')
--->   ('the', 'O')
--->   ('order', 'O')
--->   ('of', 'O')
--->   ('benefit', 

--->   ('Additionally', 'O')
--->   ('Enrollees', 'O')
--->   ('should', 'O')
--->   ('always', 'O')
--->   ('confirm', 'O')
--->   ('with', 'O')
--->   ('the', 'O')
--->   ('dentist', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('office', 'O')
--->   ('that', 'O')
--->   ('a', 'O')
--->   ('listed', 'O')
--->   ('Dentist', 'O')
--->   ('is', 'O')
--->   ('still', 'O')
--->   ('a', 'O')
--->   ('contracted', 'O')
--->   ('DPO', 'O')
--->   ('Dentist', 'O')
--->   ('or', 'O')
--->   ('a', 'O')
--->   ('Premier', 'O')
--->   ('Dentist', 'O')
--->   ('.', 'O')
--->   ('DPO', 'O')
--->   ('Dentist', 'O')
--->   ('The', 'O')
--->   ('DPO', 'O')
--->   ('program', 'O')
--->   ('potentially', 'O')
--->   ('allows', 'O')
--->   ('the', 'O')
--->   ('greatest', 'O')
--->   ('reduction', 'O')
--->   ('in', 'O')
--->   ('Enrollees', 'O')
--->   ('’', 'O')
--->   ('out-of-pocket', 'O')
--->   ('expenses', 'O')
--->   ('since', 'O')
--->   ('this', 'O')
--->   ('select', 'O')
--->   ('group', '

--->   ('If', 'O')
--->   ('DeltaDental', 'PERSON')
--->   ('does', 'O')
--->   ('not', 'O')
--->   ('furnish', 'O')
--->   ('the', 'O')
--->   ('form', 'O')
--->   ('within', 'O')
--->   ('15', 'O')
--->   ('days', 'O')
--->   ('after', 'O')
--->   ('requested', 'O')
--->   ('by', 'O')
--->   ('a', 'O')
--->   ('Dentist', 'O')
--->   ('or', 'O')
--->   ('Enrollee', 'O')
--->   ('the', 'O')
--->   ('requirements', 'O')
--->   ('for', 'O')
--->   ('proof', 'O')
--->   ('of', 'O')
--->   ('loss', 'O')
--->   ('set', 'O')
--->   ('forth', 'O')
--->   ('in', 'O')
--->   ('section', 'O')
--->   ('6.05', 'O')
--->   ('of', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('deemed', 'O')
--->   ('to', 'O')
--->   ('have', 'O')
--->   ('been', 'O')
--->   ('complied', 'O')
--->   ('with', 'O')
--->   ('upon', 'O')
--->   ('the', 'O')
--->   ('submission', 'O')
--->   ('to', 'O')
--->   ('DeltaDental', 'O')
--->   ('within', 'O')
--->   ('the', '

--->   ('If', 'O')
--->   ('the', 'O')
--->   ('requested', 'O')
--->   ('information', 'O')
--->   ('is', 'O')
--->   ('not', 'O')
--->   ('received', 'O')
--->   ('within', 'O')
--->   ('45', 'O')
--->   ('days', 'O')
--->   ('the', 'O')
--->   ('claim', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('denied', 'O')
--->   ('.', 'O')
--->   ('Subject', 'O')
--->   ('to', 'O')
--->   ('due', 'O')
--->   ('written', 'O')
--->   ('proof', 'O')
--->   ('of', 'O')
--->   ('loss', 'O')
--->   ('all', 'O')
--->   ('accrued', 'O')
--->   ('indemnities', 'O')
--->   ('for', 'O')
--->   ('loss', 'O')
--->   ('for', 'O')
--->   ('which', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('provides', 'O')
--->   ('periodic', 'O')
--->   ('payment', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('paid', 'O')
--->   ('monthly', 'O')
--->   ('.', 'O')
--->   ('6.07', 'O')
--->   ('Claims', 'O')
--->   ('Appeal', 'O')
--->   ('DeltaDental', 'O')
--->   ('will', 'O')
--->   ('notif

--->   ('Any', 'O')
--->   ('other', 'O')
--->   ('payments', 'O')
--->   ('provided', 'O')
--->   ('by', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('made', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('unless', 'O')
--->   ('the', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('requests', 'O')
--->   ('when', 'O')
--->   ('filing', 'O')
--->   ('proof', 'O')
--->   ('of', 'O')
--->   ('loss', 'O')
--->   ('that', 'O')
--->   ('the', 'O')
--->   ('payment', 'O')
--->   ('be', 'O')
--->   ('made', 'O')
--->   ('directly', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('Dentist', 'O')
--->   ('providing', 'O')
--->   ('the', 'O')
--->   ('services', 'O')
--->   ('.', 'O')
--->   ('All', 'O')
--->   ('Benefits', 'O')
--->   ('not', 'O')
--->   ('paid', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('Dentist', 'O')
--->   ('will', 'O')
--->   ('be', 'O')


--->   ('7.07', 'O')
--->   ('Not', 'O')
--->   ('in', 'O')
--->   ('Lieu', 'O')
--->   ('of', 'O')
--->   ('Workers', 'O')
--->   ('’', 'O')
--->   ('Compensation', 'O')
--->   ('This', 'O')
--->   ('Contract', 'O')
--->   ('is', 'O')
--->   ('not', 'O')
--->   ('in', 'O')
--->   ('lieu', 'O')
--->   ('of', 'O')
--->   ('and', 'O')
--->   ('does', 'O')
--->   ('not', 'O')
--->   ('affect', 'O')
--->   ('any', 'O')
--->   ('requirements', 'O')
--->   ('for', 'O')
--->   ('coverage', 'O')
--->   ('by', 'O')
--->   ('workers', 'O')
--->   ('’', 'O')
--->   ('compensation', 'O')
--->   ('insurance', 'O')
--->   ('.', 'O')
--->   ('7.08', 'O')
--->   ('Certificate', 'O')
--->   ('of', 'O')
--->   ('Insurance', 'O')
--->   ('DeltaDental', 'O')
--->   ('will', 'O')
--->   ('issue', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('Contractholder', 'O')
--->   ('an', 'O')
--->   ('electronic', 'O')
--->   ('copy', 'O')
--->   ('containing', 'O')
--->   ('a', 'O')
--->   ('certificate', 'O'

--->   ('No', 'O')
--->   ('statement', 'O')
--->   ('by', 'O')
--->   ('you', 'O')
--->   ('with', 'O')
--->   ('respect', 'O')
--->   ('to', 'O')
--->   ('the', 'O')
--->   ('an', 'O')
--->   ('Enrollee', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('insurability', 'O')
--->   ('will', 'O')
--->   ('be', 'O')
--->   ('used', 'O')
--->   ('to', 'O')
--->   ('reduce', 'O')
--->   ('or', 'O')
--->   ('deny', 'O')
--->   ('a', 'O')
--->   ('claim', 'O')
--->   ('or', 'O')
--->   ('contest', 'O')
--->   ('the', 'O')
--->   ('validity', 'O')
--->   ('of', 'O')
--->   ('insurance', 'O')
--->   ('for', 'O')
--->   ('such', 'O')
--->   ('Enrollee', 'O')
--->   ('after', 'O')
--->   ('that', 'O')
--->   ('person', 'O')
--->   ('’', 'O')
--->   ('s', 'O')
--->   ('coverage', 'O')
--->   ('has', 'O')
--->   ('been', 'O')
--->   ('in', 'O')
--->   ('effect', 'O')
--->   ('two', 'O')
--->   ('(', 'O')
--->   ('2', 'O')
--->   (')', 'O')
--->   ('years', 'O')
--->   ('.', 'O')
--->   ('7.14', '

--->   ('By', 'O')
--->   ('DeltaDental', 'PERSON')
--->   ('at', 'O')
--->   ('the', 'O')
--->   ('end', 'O')
--->   ('of', 'O')
--->   ('a', 'O')
--->   ('contract', 'O')
--->   ('term', 'O')
--->   ('upon', 'O')
--->   ('60', 'O')
--->   ('days', 'O')
--->   ('written', 'O')
--->   ('notice', 'O')
--->   ('.', 'O')
--->   ('8.02', 'O')
--->   ('In', 'O')
--->   ('the', 'O')
--->   ('event', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('is', 'O')
--->   ('terminated', 'O')
--->   ('under', 'O')
--->   ('the', 'O')
--->   ('second', 'O')
--->   ('bullet', 'O')
--->   ('item', 'O')
--->   ('in', 'O')
--->   ('Section', 'O')
--->   ('8.01', 'O')
--->   ('Contractholder', 'O')
--->   ('will', 'O')
--->   ('become', 'O')
--->   ('immediately', 'O')
--->   ('obligated', 'O')
--->   ('upon', 'O')
--->   ('termination', 'O')
--->   ('to', 'O')
--->   ('pay', 'O')
--->   ('DeltaDental', 'O')
--->   ('for', 'O')
--->   ('that', 'O')
--->   ('portion', 'O')
--->   ('of', 'O')
--->

--->   ('Waiting', 'O')
--->   ('periods', 'O')
--->   ('are', 'O')
--->   ('calculated', 'O')
--->   ('for', 'O')
--->   ('each', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('andor', 'O')
--->   ('Dependent', 'O')
--->   ('Enrollee', 'O')
--->   ('from', 'O')
--->   ('the', 'O')
--->   ('Effective', 'O')
--->   ('Date', 'O')
--->   ('reported', 'O')
--->   ('by', 'O')
--->   ('the', 'O')
--->   ('Contractholder', 'O')
--->   ('for', 'O')
--->   ('said', 'O')
--->   ('Primary', 'O')
--->   ('Enrollee', 'O')
--->   ('.', 'O')
--->   ('Orthodontic', 'O')
--->   ('Benefits', 'O')
--->   ('are', 'O')
--->   ('limited', 'O')
--->   ('to', 'O')
--->   ('dependent', 'O')
--->   ('children', 'O')
--->   ('of', 'O')
--->   ('Primary', 'O')
--->   ('Enrollees', 'O')
--->   ('who', 'O')
--->   ('have', 'O')
--->   ('been', 'O')
--->   ('enrolled', 'O')
--->   ('in', 'O')
--->   ('this', 'O')
--->   ('Contract', 'O')
--->   ('for', 'O')
--->   ('12', 'O')
--->   ('consecutive', '

--->   ('Only', 'O')
--->   ('the', 'O')
--->   ('policyholders', 'O')
--->   ('of', 'O')
--->   ('insurance', 'O')
--->   ('companies', 'O')
--->   ('which', 'O')
--->   ('are', 'O')
--->   ('members', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('Association', 'ORGANIZATION')
--->   ('are', 'O')
--->   ('eligible', 'O')
--->   ('for', 'O')
--->   ('this', 'O')
--->   ('protection', 'O')
--->   ('.', 'O')
--->   ('However', 'O')
--->   ('even', 'O')
--->   ('if', 'O')
--->   ('a', 'O')
--->   ('company', 'O')
--->   ('is', 'O')
--->   ('a', 'O')
--->   ('member', 'O')
--->   ('of', 'O')
--->   ('the', 'O')
--->   ('Association', 'ORGANIZATION')
--->   ('protection', 'O')
--->   ('is', 'O')
--->   ('limited', 'O')
--->   ('and', 'O')
--->   ('policyholders', 'O')
--->   ('must', 'O')
--->   ('meet', 'O')
--->   ('certain', 'O')
--->   ('guidelines', 'O')
--->   ('to', 'O')
--->   ('qualify', 'O')
--->   ('.', 'O')
--->   ('(', 'O')
--->   ('The', 'O')
--->   ('law', 'O')
--->   

--->   ('When', 'O')
--->   ('you', 'O')
--->   ('are', 'O')
--->   ('selecting', 'O')
--->   ('an', 'O')
--->   ('insurance', 'O')
--->   ('company', 'O')
--->   ('you', 'O')
--->   ('should', 'O')
--->   ('not', 'O')
--->   ('rely', 'O')
--->   ('on', 'O')
--->   ('Association', 'O')
--->   ('coverage', 'O')
--->   ('.', 'O')
--->   ('Texas', 'ORGANIZATION')
--->   ('Life', 'ORGANIZATION')
--->   ('Accident', 'ORGANIZATION')
--->   ('Health', 'ORGANIZATION')
--->   ('and', 'ORGANIZATION')
--->   ('Hospital', 'ORGANIZATION')
--->   ('Service', 'ORGANIZATION')
--->   ('Insurance', 'ORGANIZATION')
--->   ('Guaranty', 'ORGANIZATION')
--->   ('Association', 'ORGANIZATION')
--->   ('6504', 'ORGANIZATION')
--->   ('Bridge', 'ORGANIZATION')
--->   ('Point', 'ORGANIZATION')
--->   ('Parkway', 'ORGANIZATION')
--->   ('Suite', 'ORGANIZATION')
--->   ('450', 'O')
--->   ('Austin', 'LOCATION')
--->   ('Texas', 'LOCATION')
--->   ('78730', 'O')
--->   ('800-982-6362', 'O')
--->   ('www.txlifega.or

In [74]:
loc_sents = {'For USERRA leave that extends beyond 31 days the Premium for continuation of coverage will be the same as for COBRA coverage.', '4.07 Exclusions DeltaDental does not pay Benefits for: treatment of injuries or illness covered by workers’ compensation or employers’ liability laws; services received without cost from any federal state or local agency except for services covered by the Medical Assistance Act of 1967 as amended (Article 695j-1 Vernon’s Texas Civil Statutes).', '(The law is found in the Texas Insurance Code Article 21.28-D.) BECAUSE OF STATUTORY LIMITATIONS ON POLICYHOLDER PROTECTION IT IS POSSIBLE THAT THE ASSOCIATION MAY NOT COVER YOUR POLICY OR MAY NOT COVER YOUR POLICY IN FULL.', '7.03 Conformity With State Laws All legal questions about this Contract will be governed by the state of Texas where this Contract was entered into and is to be performed.', 'Premiums: Monthly Amount: Per Primary Enrollee: $20.40 Per Primary Enrollee with one Dependent Enrollee: $38.08 Per Primary Enrollee with two or more Dependent Enrollees: $57.60 Payment Breakdown: Primary Enrollee shall pay: 100% for personal coverage 100% for Dependent coverage Contractholder may charge persons electing continued coverage pursuant to Title X of P.L.', 'AVISO IMPORTANTE Para obtener informacion o para someter una queja: Usted puede llamar al numero de telefon gratis de DeltaDentalInsuranceCompany’s para informacion o para someter una queja al 1-800-521-2651 Usted tambien puede escribir a DeltaDentalInsuranceCompany DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Suite 600 Alpharetta Georgia 30009 Puede comunicarse con el Departamento de Seguros de Texas para obtener informacion acerca de companias coberturas derechos o quejas al 1-800-252-3439 Puede escribir al Departamento de Seguros de Texas P.O.', 'Notice by United States mail will be effective 48 hours after mailing with fully prepaid postage.', 'This Contract is issued and delivered in the State of Texas and is 
governed by its laws.', 'Box 149104 Austin TX 78714-9104 FAX # (512) 475-1771 Web: http://www.tdi.state.tx.us E-mail: ConsumerProtection@tdi.state.tx.us DISPUTAS SOBRE PRIMAS O RECLAMOS: Si tiene una disputa concerniente a su prima o a un reclamo debe comunicarse con el agente o DeltaDentalInsuranceCompany primero.', '2.08 Continued Coverage Under USERRA As required under the Uniformed Services Employment and Reemployment Rights Act of 1994 (USERRA) if a Primary Enrollee is covered by this Contract on the date his or her USERRA leave of absence begins the Primary Enrollee may continue dental coverage for himself or herself and any covered dependents.', 'DELTADENTAL INSURANCE COMPANY 1130 Sanctuary Parkway Suite 600 Alpharetta Georgia 30009 (770) 641-5100 (888) 858-5252 Dental Provider Organization Program Cameron County (“Contractholder”) has applied for a group dental insurance contract with DeltaDentalInsuranceCompany (“DeltaDental”).', 'Children including grandchildren under 26 years of age for whom the Eligible Person is required to insure under a medical support order issued under Chapter 154 Family Code or enforceable by a court in Texas.', 'ARTICLE 3 MONTHLY PREMIUMS 3.01 Contractholder will remit the monthly Premium in the amount and manner shown in Appendix A for all Primary Enrollees and Dependent Enrollees to: DeltaDentalInsuranceCompany Post Office Box 7564 San Francisco CA 94120 DeltaDental will receive a full month’s Premium for Enrollees whose coverage is effective on the first (1st) through the 15th calendar day of a month.', 'Continuation of coverage under USERRA may not extend beyond the earlier of: 24 months beginning on the date the leave of absence begins or the date the Primary Enrollee fails to return to work within the time required by USERRA.', 'If an Enrollee receives dental services from a Dentist outside the state of Texas the Dentists will be reimbursed according to DeltaDental’s network payment provisions for said state according to 
the terms of this Contract.', 'Notice: the premium under this Contract is payable to DeltaDentalInsuranceCompany P.O.', 'ARTICLE 9 ATTACHMENTS These documents are attached to this Contract and made a part of it: Appendix A Group Policy Schedule Appendix B Texas Life Accident Health and Hospital Service Insurance Guaranty Association Copy of Application APPENDIX A GROUP POLICY SCHEDULE Contractholder Name: Cameron County Address: 1100 East Monroe Street Suite 118 Brownsville TX 78250 Group Number: 17404 Effective Date: October 1 2014 Contract Term: October 1 2014 thru September 30 2017 Benefits: In-Network Out-of-Network Diagnostic and Preventive Benefits: 100% 100% Basic Benefits: 80% 80% Major Benefits: 50% 50% Orthodontic Benefits: 50% 50% Waiting Periods: Major Benefits are limited to Enrollees who have been enrolled in this Contract for 12 consecutive months.', 'Box 149104 Austin Texas 78714-9104 800-252-3439 TOA (01/05) DAT 03013-5 TX-DPO-C(2006) 16 17404', 'Texas Life Accident Health and Hospital Service Insurance Guaranty Association 6504 Bridge Point Parkway Suite 450 Austin Texas 78730 800-982-6362 www.txlifega.org Texas Department of Insurance P.O.', 'APPENDIX B TEXAS LIFE ACCIDENT HEALTH AND HOSPITAL SERVICE INSURANCE GUARANTY ASSOCIATION IMPORTANT INFORMATION ABOUT COVERAGE UNDER THE TEXAS LIFE ACCIDENT HEALTH AND HOSPITAL SERVICE INSURANCE GUARANTY ASSOCIATION Texas law establishes a system administered by the Texas Life Accident Health and Hospital Service Insurance Guaranty Association (the “Association”) to protect policyholders if their life or health insurance company fails to or cannot meet its contractual obligations.', 'TEXAS NOTICE OF COMPLAINT IMPORTANT NOTICE To obtain information or make a complaint: You may call DeltaDentalInsuranceCompany’s toll free number for information or to make a complaint at 1-800-521-2651 You may also write to DeltaDentalInsuranceCompany at DeltaDentalInsuranceCompany 1130 Sanctuary Parkway Suite 600 Alpharetta 
Georgia 30009 You may contact the Texas Department of Insurance to obtain information on companies coverages rights or complaints at 1-800-252-3439 You may write the Texas Department of Insurance at P.O.', 'Eligibility for Protection by the Association When an insurance company which is a member of the Association is designated as impaired by the Texas Commissioner of Insurance the Association provides coverage to policyholders who are: Residents of Texas at the time that their insurance is impaired Residents of other states ONLY if the following conditions are met: The policyholder has a policy with a company based in Texas; The company has never held a license in the policyholder’s state of residence; The policyholder’s state of residence has a similar guaranty association; and The policyholder is not eligible for coverage by the guaranty association of the policyholder’s state of residence.', 'Box 7564 San Francisco CA 94120-7564 The premium under this Contract may be increased upon renewal with 150 days written notice prior to the end of the initial or any subsequent contract terms.', 'Any part of this Contract which on its Effective Date conflicts with the laws of Texas is hereby amended to conform to the minimum requirements of such laws.', '7.11 Notice; Where Directed All formal notice under this Contract must be in writing and sent by first-class United States mail overnight delivery service or personal delivery.', 'Box 149104 Austin TX 78714-9104 FAX # (512) 475-1771 Web: http://www.tdi.state.tx.us E-mail: ConsumerProtection@tdi.state.tx.us PREMIUM OR CLAIM DISPUTES: Should you have a dispute concerning your premium or about a claim you should contact your agent or DeltaDentalInsuranceCompany first.'}

for sent in loc_sents:
    #print(sent + "\n")
    


For USERRA leave that extends beyond 31 days the Premium for continuation of coverage will be the same as for COBRA coverage.
[]
[]
4.07 Exclusions DeltaDental does not pay Benefits for: treatment of injuries or illness covered by workers’ compensation or employers’ liability laws; services received without cost from any federal state or local agency except for services covered by the Medical Assistance Act of 1967 as amended (Article 695j-1 Vernon’s Texas Civil Statutes).
[]
['Vernon']
(The law is found in the Texas Insurance Code Article 21.28-D.) BECAUSE OF STATUTORY LIMITATIONS ON POLICYHOLDER PROTECTION IT IS POSSIBLE THAT THE ASSOCIATION MAY NOT COVER YOUR POLICY OR MAY NOT COVER YOUR POLICY IN FULL.
[]
[]
7.03 Conformity With State Laws All legal questions about this Contract will be governed by the state of Texas where this Contract was entered into and is to be performed.
[]
['Texas']
Premiums: Monthly Amount: Per Primary Enrollee: $20.40 Per Primary Enrollee with one Dependen

## Not done attributes

In [None]:
## Group Information
# Stub from the "Not done attributes" section: builds a frame from `d` under
# the single index label 'uid'.
# NOTE(review): `d` is not assigned in this cell -- confirm which dict it is
# meant to hold before relying on this frame.

dfGI = pd.DataFrame(data=d, index=['uid'])

In [None]:
## Diagnostic and Preventative (D&P) [Appendix A]

# Single key/value record describing the D&P benefit.
# NOTE(review): '100' is hard-coded here; presumably the 100% D&P benefit
# listed in Appendix A -- confirm and extract it from the contract instead.
d = {'key': 'D&P Services_PPO', 'value': '100'}

# Every value in `d` is a scalar string, so pandas needs an explicit index to
# build a one-row frame.
# The row label is spelled 'D&P Services' so it agrees with the 'D&P' used in
# the key and the section heading.
dfDP = pd.DataFrame(d, index=['D&P Services'])


In [None]:
## Basic Service
# Stub: builds a frame from `d` under the index label 'Basic Service'.
# NOTE(review): `d` is not reassigned in this cell; presumably it still holds
# the dict from an earlier cell, so this frame would repeat those values --
# replace with Basic Service data.

dfBS = pd.DataFrame(data=d, index=['Basic Service'])

In [None]:
## Endo Perio (Endodontics (Periodontal(?)))
# Stub: builds a frame from `d` under the index label 'Endo Perio'.
# NOTE(review): `d` is not reassigned in this cell; presumably it still holds
# the dict from an earlier cell -- replace with Endo/Perio data.

dfEP = pd.DataFrame(data=d, index=['Endo Perio'])

In [None]:
## Oral (Oral Surgery)
# Stub: builds a frame from `d` under the generic index label 'uid'.
# NOTE(review): `d` is not reassigned in this cell, and several sibling stubs
# also use 'uid', so the concatenated result will carry duplicate row labels.

dfOa = pd.DataFrame(data=d, index=['uid'])

In [None]:
## Perio (Periodontal)
# Stub: builds a frame from `d` under the generic index label 'uid'.
# NOTE(review): `d` is not reassigned in this cell, and several sibling stubs
# also use 'uid', so the concatenated result will carry duplicate row labels.

dfPe = pd.DataFrame(data=d, index=['uid'])

In [None]:
## Major (Major Benefits)
# Stub: builds a frame from `d` under the generic index label 'uid'.
# NOTE(review): `d` is not reassigned in this cell, and several sibling stubs
# also use 'uid', so the concatenated result will carry duplicate row labels.

dfMj = pd.DataFrame(data=d, index=['uid'])

In [None]:
## Prostho (Prosthodontics)
# Stub: builds a frame from `d` under the generic index label 'uid'.
# NOTE(review): `d` is not reassigned in this cell, and several sibling stubs
# also use 'uid', so the concatenated result will carry duplicate row labels.

dfPr = pd.DataFrame(data=d, index=['uid'])

In [None]:
## Ortho (Orthodontics)
# Stub: builds a frame from `d` under the generic index label 'uid'.
# NOTE(review): `d` is not reassigned in this cell, and several sibling stubs
# also use 'uid', so the concatenated result will carry duplicate row labels.

dfOt = pd.DataFrame(data=d, index=['uid'])

In [None]:
# Stack the per-attribute frames from the cells above row-wise, then flip the
# result so each row label becomes a column, and display it.
# NOTE(review): `df` is not assigned in any visible cell of this section --
# confirm it exists before running this cell.

frames = [
    df,
    dfGI, dfDP, dfBS, dfEP,
    dfOa, dfPe, dfMj, dfPr, dfOt,
]

result = pd.concat(frames, axis=0)
transpose = result.T
print(transpose)

## Notes

### Contract Start Date

For the contract start date, I began by searching through the tokenized sentences with a regular expression.
I found a datefinder module to use on each flagged sentence to pull out the dates
	Issue: the datefinder module works poorly on large, run-on sentences, which are common in the contracts. It tends to find other numbers that aren't dates and tries to make a date out of them.
	
	Sol: only take a subset, starting at the flagged word
	
Sometimes a match isn't found with the keywords I've seen related to the start date
	Sol: look for keywords related to contract term and take the earlier date from that sentence
	
Issue: Datefinder focusing on numbers that aren't dates
	Sol: keep only sentences that contain a year (i.e. four digits in a row), and discard dates from before Delta Dental existed (i.e. before 1966)
	
Issue: Sometimes there are multiple modes. Usually I saw this when there were equal mentions of the end date
	Sol: if there are multiple modes, take the earliest date. 
	



### Testing finding the contract end date

This was much the same as the contract start date, with the same issues:

	Looking for keywords Contract Term/Contract End
	Filtering on invalid years
	Filtering on if there IS a year in the sentence (assuming it won't be written out like nineteen ninety-four)
	If there are only two results, take the later one
	If there are more than two, take the top two most common and then take the later of the two


### Contract Duration:

Call contract start and end and try to get a duration out of them

 ^^^ pretty much worked


### Comments so far:


	Even with trying to be variable, this won't work if they change the wording even a bit. Maybe spend some time looking into using the libraries to get synonyms.
	
	It would also be great to get the other contracts to see exactly where we're going wrong
	


### Working with synonyms in NLTK

	Getting an expanded set of search terms can be done, but I can't yet figure out how to pick the right contexts. For example, the search bigram "contract term" gives back a huge number of synonyms, with only 3 or 4 actually being equivalent in meaning to "contract term"
	
	We could always manually select ones that are similar but that seems to defeat the purpose: ie we could select only the noun meanings of contract
	
	We could build our own corpus of words based on all of the documents that we have, and then compare the given synonyms to a freq distribution of those words to pick out the ways other contracts might say the same thing
		But this would still miss things for sure
		
	We could include the word type with the seed words and only choose synonyms of the same type, although this does involve more hardcoding


### Folder Format

data/raw

data/processed/[group number - group name]/

data/output

*** Not every file seems to have a name, so we would have to parse the file to get it

*** Additionally each num - name combo may have contracts from multiple dates


### Using NER tagging to identify location sections

Basic NER tagging with nltk works horribly on our files out of the box

Polyglot has a lot of issues getting downloaded

NLTK has a wrapper for the Stanford NER tagger so I'm going to try that next
	Download the model jar file
	
	
The Stanford NER tagger is working a bit better
http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.StanfordTagger
https://nlp.stanford.edu/software/stanford-ner-2018-02-27.zip (download of jar files)
https://textminingonline.com/how-to-use-stanford-named-entity-recognizer-ner-in-python-nltk-and-other-programming-languages

The Stanford one takes FOREVER though
	There is a faster version in CoreNLP but that's all in Java and I don't think the wrapper interacts with it
	
GeoText

Easy to set up and use, but doesn't do states or state abbreviations
And it misses a LOT that the NER tagger got
	It is completely unreliable honestly. 
	
Option 1: Use the NLTK wrapper for Stanford NER tagger and just wait forever
Option 2: Get a giant csv of all US cities/states/abbreviations/counties (exists) and make a data set out of that to compare to
	Cons: not flexible or extensible, is already 4 years out of date
	Pros: much faster, will only have to create the thing once
	
Note: even with the NER it will only give us pieces of the address, we would still have to go into the sentence and try to regex it out

Option 3: create our own trained model from some of the files we already have and see how that does with the Stanford tagger. Might be faster
	Could also see how it does with the native NLTK tagger
