# Exploring and Processing

### Import Statements

In [17]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
import random as rng
import nltk
import fnmatch

import docx2txt
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

from nltk.book import *
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.collocations import *
from nltk.util import ngrams
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import sys as sys
from sys import platform
import re as re
from statistics import mode
import datefinder


## Psuedocode

In [18]:
### Analysis Function

## Function which calculates the number of combinations (Not Permutations) of distinct values in a dataset. Would 
## take in a Pandas DataFrame ideally and multiply all distinct value counts for all available columns.

# Expected input: Pandas DataFrame
# Expected output: Int, the number of possible combinations.

In [19]:
### Parsing Function

## Function to collect mentions of States/Districts/Territories in the United States (DC and Puerto Rico are included)
## and rank their relevence, frequency should be a good factor. Note: California, Pennsylvania, and Georgia could 
## appear multiple times from addresses.

# Expected input: A single file, text.
# Expected output: A list of locations mentioned, ranked by most used to least used.

In [20]:
### Parsing Function

## Function for searching a tokenized sentence dictionary for words or phrases. There's probably 
## already something for this.

# Expected input: A tokenized list, and a phrase to search for.
# Expected output: A list of sentences containing the phrase searched.

In [21]:
### Parsing Function

## Function to take a file and produce the Acronyms from them.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, most likely first.

In [22]:
### Parsing Function

## Function to take Acronyms and match them to the best possible N-gram for them from a document. Not all will be 
## possible, but a list of options will help.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, sorted most likely first.

In [23]:
### Generally important

## Need to expand stopwords to include States and Districts, also Delta Dental adjacent names as another available set.

In [24]:
## Function to load in two different text extracted contracts and return a comparison metric between 
## the two (Similarity, possibly as a percent?)

# Expected Input: Two contracts for Comparison
# Expected Output: A measure, some kind of decimal to represent similarity.

## File Prep Functions

### Function: Is it docx or pdf?

In [25]:
def checkFileType(filename):
        if(filename.lower().endswith(('.doc','.docx'))):
            return 0
        elif(filename.lower().endswith(('.pdf'))):
            return 1
        else:
            return -1

### Function: read in file

In [26]:
def makeFilePath(docName):
    raw_data_path = os.path.join(os.getcwd(), 'data', 'raw')
    return os.path.join(raw_data_path, docName)

### Function: clean up text

In [27]:
def cleanText(text):
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    #text = text.replace(",", ' ')
    
    dblSpacesRemaining = True
    while(dblSpacesRemaining):
        text = text.replace("  ", " ")
        if not "  " in text:
            dblSpacesRemaining = False
    return text

### Function: process dataFrame and group

In [28]:
def processDF(txtFile):
    df = pd.read_csv(txtFile, sep=" ", header=None) #this doesn't work for me bc of variable number of cols
    df = df.T 
    df = df.dropna()

    df['SingleRow']=1

    df=df.rename(columns={0 : 'Words'})
    print("in processDF " + txtFile)
    df.describe(include="all")
    #print(df.groupby('Words').SingleRow.sum().sort_values())
    #print(df)
    return df

### Function: Process a text file

In [29]:
def processTextFile(filePath):
    #print(filePath)
    docxText = docx2txt.process(filePath)
    #print(docxText)
    replacedText = cleanText(docxText)
    #print(replacedText)
    fileName = filePath.split('/')[-1]
    #print(fileName)
    baseFileName = fileName[0:-5]
    #print(baseFileName)
    newFilePath = './data/output/' + baseFileName + ".txt"
    #print(newFilePath)
    singleFileDocx=open(newFilePath, 'wb+')    
    singleFileDocx.write(replacedText.encode("utf-8"))
    singleFileDocx.close()

    #temp_df = processDF('singleTextDocx.txt')
    return newFilePath

### Function: Process pdf file

In [30]:
def processPDFfile(filePath):
        password = ""
        extracted_text = ""

        fileName = filePath.split('/')[-1]
        baseFileName = fileName[0:-4]
    
        fp = open(filePath, "rb")
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
            
        # Create PDFResourceManager object that stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()

        # set parameters for analysis
        laparams = LAParams()

        # Create a PDFDevice object which translates interpreted information into desired format
        # Device needs to be connected to resource manager to store shared resources
        # device = PDFDevice(rsrcmgr)
        # Extract the decive to page aggregator to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create interpreter object to process page content from PDFDocument
        # Interpreter needs to be connected to resource manager for shared resources and device 
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Ok now that we have everything to process a pdf document, lets process it page by page
        for page in PDFPage.create_pages(document):
            # As the interpreter processes the page stored in PDFDocument object
            interpreter.process_page(page)
            # The device renders the layout from interpreter
            layout = device.get_result()
            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    newText = lt_obj.get_text()
                    newText = newText.replace('\n', ' ')
                    extracted_text += newText

        #close the pdf file
        fp.close()
        
        extracted_text = cleanText(extracted_text)#extracted_text.replace("\n", ' ')
        
        newFilePath = './data/output/' + baseFileName + '-pdf' + ".txt"
        with open(newFilePath, 'wb+') as singleFilePDF:
            singleFilePDF.write(extracted_text.encode("utf-8"))
        
        
        #temp_df = processDF('./data/output/' + baseFileName + ".txt")
        return newFilePath

## Docx Extraction -- Currently not using this: see File Prep Functions

DocX extractor for data purposes. Requires customization to each purpose but is important for pulling data out of DocX files. Does not understand tables or bullet points, however is visually consistent with what's on the page.

Strongest values: Order, consistentcy, noise reduction

Weakest values: Completeness, flexibility, whitespace characters, formatting

#### Docx Practice

In [31]:
import docx2txt


def cleanText(text):
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    text = text.replace(",", ' ')
    
    dblSpacesRemaining = True
    while(dblSpacesRemaining):
        text = text.replace("  ", " ")
        if not "  " in text:
            dblSpacesRemaining = False
        
    return text
fileName = "TX 17404 Contract Regional (7.2.18).docx"
baseFileName = ""
if(fileName.lower().endswith(('.docx'))):
    baseFileName = fileName[0:-5]
    #print(baseFileName)
elif(fileName.lower().endswith(('.pdf'))):
    baseFileName = fileName[0:-4]
else:
    print("ending error")

docText = docx2txt.process("./data/raw/" + fileName)
singleFileDocx=open('./data/output/' + baseFileName + ".txt", 'wb+')
replacedText = cleanText(docText)
#print(replacedText)
singleFileDocx.write(docText.encode("utf-8"))
singleFileDocx.close()


## PDF Extraction -- Currently not using this: see File Prep Functions

#### PDF practice

In [32]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
# pdfTextMiner.py
# Python 2.7.6
# For Python 3.x use pdfminer3k module
# This link has useful information on components of the program
# https://euske.github.io/pdfminer/programming.html
# http://denis.papathanasiou.org/posts/2010.08.04.post.html


''' Important classes to remember
PDFParser - fetches data from pdf file
PDFDocument - stores data parsed by PDFParser
PDFPageInterpreter - processes page contents from PDFDocument
PDFDevice - translates processed information from PDFPageInterpreter to whatever you need
PDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice
LAParams - A layout analyzer returns a LTPage object for each page in the PDF document
PDFPageAggregator - Extract the decive to page aggregator to get LT object elements
'''

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

''' This is what we are trying to do:
1) Transfer information from PDF file to PDF document object. This is done using parser
2) Open the PDF file
3) Parse the file using PDFParser object
4) Assign the parsed content to PDFDocument object
5) Now the information in this PDFDocumet object has to be processed. For this we need
   PDFPageInterpreter, PDFDevice and PDFResourceManager
 6) Finally process the file page by page 
'''

base_path = "C://data"

my_file = os.path.join(base_path + "/" + "test.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")

password = ""
extracted_text = ""

# Open and read the pdf file in binary mode
fp = open(my_file, "rb")

# Create parser object to parse the pdf content
parser = PDFParser(fp)

# Store the parsed content in PDFDocument object
document = PDFDocument(parser, password)

# Check if document is extractable, if not abort
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
    
# Create PDFResourceManager object that stores shared resources such as fonts or images
rsrcmgr = PDFResourceManager()

# set parameters for analysis
laparams = LAParams()

# Create a PDFDevice object which translates interpreted information into desired format
# Device needs to be connected to resource manager to store shared resources
# device = PDFDevice(rsrcmgr)
# Extract the decive to page aggregator to get LT object elements
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create interpreter object to process page content from PDFDocument
# Interpreter needs to be connected to resource manager for shared resources and device 
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Ok now that we have everything to process a pdf document, lets process it page by page
for page in PDFPage.create_pages(document):
    # As the interpreter processes the page stored in PDFDocument object
    interpreter.process_page(page)
    # The device renders the layout from interpreter
    layout = device.get_result()
    # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            extracted_text += lt_obj.get_text()
            
#close the pdf file
fp.close()

# print (extracted_text.encode("utf-8"))
            
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")


FileNotFoundError: [Errno 2] No such file or directory: 'C://data/test.pdf'

## NLTK Tokenizing Functions

### Function: DD specific text cleaning

In [33]:
def ddCleanText(text):
    newText = text.replace('Delta Dental', 'DeltaDental')
    newText = newText.replace('DELTA DENTAL', 'DELTADENTAL')
    newText = newText.replace('DeltaDental Insurance Company', 'DeltaDentalInsuranceCompany')
    return newText



### Function: Make tokenized word list

In [34]:
def getTokens(text):
    words = word_tokenize(text)
    
    customStopWords = set(stopwords.words('english')+list(punctuation))
    wordsWOStop=[word for word in words if word not in customStopWords]
    
    return wordsWOStop

### Function: Make tokenized Sentence list

In [35]:
def getSents(text):
    sents = sent_tokenize(text)

    return sents

### Function: Get Bigrams

In [36]:
def getBigrams(tokens):
    bigram_measures=nltk.collocations.BigramAssocMeasures();
    finder = BigramCollocationFinder.from_words(tokens)
    sorted_bgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))
    
    return sorted_bgs

### Function: Get Trigrams

In [37]:
def getTrigrams(tokens):
    trigram_measures =nltk.collocations.TrigramAssocMeasures();
    finder = TrigramCollocationFinder.from_words(tokens)
    sorted_tgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))

    return sorted_tgs

## MetaData and Attribute Functions

#### Filename
Contract Start
Contract End
Contract Duration
State
Delta Office Involved

#### Group Information
(Group Number)

#### Numeric attributes Only
Basics
Diagnostics
Major
Endo
Oral
Perio
Prostho
Ortho

In [39]:
##Establish a dataframe to capture the attributes
#d = {'key': 'file','value':fileName}
#{'key':'state', 'value':state}
#df = pd.DataFrame(d, index=['uid'])
d = {}

### NLTK synonyms

#### I kept this to processing single words, bigrams or trigrams so as to keep the complexity down

#### Function: get synonyms of a single word. Helper function to Bigram and Trigram function

In [40]:
def getSyns(word):
    syns1 = wn.synsets(word)
    
    set1 = set()
    for syn in syns1:
        for lem in syn.lemmas():
            set1.add(lem.name())
    
    return set1

#### Function: Get a similar bigram

In [41]:
def getSimilarBigrams(word1, word2):
    
    set1 = getSyns(word1)
    if not len(set1):
        set1.add(word1)
    set2 = getSyns(word2)
    if not len(set2):
        set2.add(word2)
    
    word_set = set()
    for word1 in set1:
        for word2 in set2:
            word_set.add(" ".join([word1, word2]))
    #print(word_set)
    
    return word_set

#### Function: get a similar trigram

In [42]:
def getSimilarTrigrams(word1, word2, word3):
    set1 = getSyns(word1)
    set2 = getSyns(word2)
    set3 = getSyns(word3)
    
    word_set = set()
    for word1 in set1:
        for word2 in set2:
            for word3 in set3:
                word_set.add(" ".join([word1, word2, word3]))
    #print(word_set)
    
    return word_set
        

#### Function: Get synonyms from a list of key words. Returns more keywords/phrases

In [43]:
def getSynonymsFromList(keywords):
    matches = []

    for kw in keywords:
        try:
            words = word_tokenize(kw)
        except Exception as e:
            print(str(e))
        #print(words)
        if(len(words) == 1):
            #print("is 1")
            syns = getSyns(words[0])
            for syn in syns:
                matches.append(syn)
            #keywords.append(list(syns))
        elif(len(words) == 2):
           # print("is 2")
            syns = getSimilarBigrams(words[0],words[1])
           # print(syns)
            matches.extend(getSimilarBigrams(words[0],words[1]))
        elif(len(words) == 3):
           # print("is 3")
            matches.extend(getSimilarTrigrams(words[0],words[1],words[2]))
        #print(matches)
    keywords.extend(matches)
    keywords = set(keywords)

    #print(start_keywords)
    
    return keywords

#### POS tagging key

In [None]:
ADJ	adjective	new, good, high, special, big, local
ADV	adverb	really, already, still, early, now
CNJ	conjunction	and, or, but, if, while, although
DET	determiner	the, a, some, most, every, no
EX	existential	there, there's
FW	foreign word	dolce, ersatz, esprit, quo, maitre
MOD	modal verb	will, can, would, may, must, should
N	noun	year, home, costs, time, education
NP	proper noun	Alison, Africa, April, Washington
NUM	number	twenty-four, fourth, 1991, 14:24
PRO	pronoun	he, their, her, its, my, I, us
P	preposition	on, of, at, with, by, into, under
TO	the word to	to
UH	interjection	ah, bang, ha, whee, hmpf, oops
V	verb	is, has, get, do, make, see, run
VD	past tense	said, took, told, made, asked
VG	present participle	making, going, playing, working
VN	past participle	given, taken, begun, sung
WH	wh determiner	who,

#### Attempt with using POS tagging in the synonyms to reduce extraneous syns
We would then have to manually tag all of the original phrases and words we use to seed the decisions
It does seem to reduce though

In [82]:
start_keywords = ["contract/n term\S\s*\S/n","contract/n term/n","contract/n end/n"]

for kw in start_keywords:
    words = word_tokenize(kw)
    
    for word in words:
        tup = word.split('/')
        print(tup)
        syns1 = wn.synsets(tup[0])
        set1 = set()
        for syn in syns1:
            
            if(syn.pos() == tup[1]):
                set1.add(syn)
        print(set1)
#set1 = set()
#for syn in syns1:
#    for lem in syn.lemmas():
#        set1.add(lem.name())



['contract', 'n']
{Synset('contract.n.01'), Synset('contract.n.02'), Synset('contract.n.03')}
['term\\S\\s*\\S', 'n']
set()
['contract', 'n']
{Synset('contract.n.01'), Synset('contract.n.02'), Synset('contract.n.03')}
['term', 'n']
{Synset('term.n.05'), Synset('term.n.02'), Synset('term.n.04'), Synset('term.n.01'), Synset('term.n.06'), Synset('terminus.n.03'), Synset('condition.n.07')}
['contract', 'n']
{Synset('contract.n.01'), Synset('contract.n.02'), Synset('contract.n.03')}
['end', 'n']
{Synset('end.n.01'), Synset('end.n.08'), Synset('end.n.05'), Synset('end.n.09'), Synset('end.n.10'), Synset('end.n.11'), Synset('end.n.14'), Synset('conclusion.n.08'), Synset('end.n.07'), Synset('end.n.13'), Synset('end.n.06'), Synset('end.n.03'), Synset('end.n.02'), Synset('goal.n.01')}


In [70]:
start_keywords = ["term"]# term\S\s*\S", "contract term ", "contract end"]
contract = wn.synsets('term')[1]
print(contract)
#for d in contract:
#    print(d.definition())
for lem in contract.lemmas():
    print(lem.name())
#syn_list = (getSynonymsFromList(start_keywords))
#print(syn_list)

Synset('term.n.02')
term


### Metadata

#### get file name
Get the filename from a full path. Determines the OS and splits the string correctly based on that

In [45]:
def getFileName(fullPath):
    if(platform == "win32"):
        fileName = fullPath.split("\\")[-1]
    else:
        fileName = fullPath.split("/")[-1]
    return fileName

In [46]:
print(getFileName(filePath))

TX-18745 ASC Contract (7.2.18).pdf


#### get group number
Uses regex's made from keywords to attempt to find a group number in the file. Failing that, it searches the filename for the number.

In [47]:
def getGroupNumber(sents_tokens, filePath):
    
    group_keywords = ["group number", "groupnumber"]
    regex_exps = []
    poss_nums = []
    finalGN = None
    
    #Create regex exps out of group number keywords
    for kw in group_keywords:
            temp_re = kw + "\W\s*(?P<gn>\d+)"
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
    #For each sentence, search for the expression, if found add the number to
    #list of possible group numbers
    for sent in sents_tokens:
        #print(sent)
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                temp_gn = result.group('gn')
                poss_nums.append(temp_gn)
    
    #Try and get the number from the file name, looking for list of numeric chars
    num_regex = re.compile("\d+")
    fileName = getFileName(filePath)
    fileGN = num_regex.search(fileName)
    
    if not fileGN==None:#if they filename has a number sequence
        if fileGN.group() in poss_nums:#then if the file group number matches one in the document, choose it
            finalGN = fileGN.group()
        else:
            poss_nums.append(fileGN.group())#otherwise add the filename one to the list and try to get the most co
            try:
                finalGN = mode(poss_nums)
            except:
                print("Unexpected error: Cannot determine group number of file: " + filePath)
    
    return finalGN

#### get contract start
Uses regex and a list of keywords to attempt to find the start date of the contract. It makes multiple passes based on patterns seen in contract samples so far.

Some of the passes are necessary to filter out non-date numbers that the datefinder incorrectly parses to dates

In [50]:
def getContractStart(sents_tokens):
    start_keywords = ["effective date\S\s*\S", "effective"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    start_keywords = getSynonymsFromList(start_keywords)
    print(start_keywords)
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:20]
                subset = " ".join(subset)
                poss_dates.append(subset)

                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                    
                    
## Second pass through sentences with possible dates to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
## Last pass: try to find the most common date. If there is more than one mode, choose the earliest date
##.           this seems to occur when it is finding the contract start and end in equal quantities
    #print(matches)
    try:
        finalDate = mode(matches)
    except ValueError as e:
        #print(str(e))
        if matches:
            earliestMatch = matches[0]
            for match in matches:
                if(match < earliestMatch):
                    earliestMatch = match
            finalDate = earliestMatch
    except Exception as e:
        print(str(e))
  
    return finalDate

####  get Contract End
Similar to get contract start, it uses regex and keywords over multiple passes to attempt and find the contract end.

In [51]:
def getContractEnd(sents_tokens):
    
    start_keywords = ["contract term\S\s*\S", "contract term ", "contract end"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    start_keywords = getSynonymsFromList(start_keywords)
    print(start_keywords)
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates based on keywords
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:25]
                subset = " ".join(subset)
                poss_dates.append(subset)
                
                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                
                
## Pass through sentences with possible dates to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        #print(sent)
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
                    
   
    #print(matches)
    
### If there are exactly two matches, try to find a max. If error b/c they're the same, choose one
    if(len(matches) == 2):
        try:
            finalDate = max(matches)
        except ValueError as e:
            finalDate = matches[0]
        except Exception as e:
            print(str(e))

## If there are more, try and find the top two most mentioned and take the later. else just take the latest            
    elif(len(matches) > 2):
        
        try:
            date1 = mode(matches)
            matches.remove(date1)
            
            date2 = mode(matches)
            matches.remove(date2)
            
            finalDate = max([date1, date2])
        except ValueError as e:
            #print(str(e))
            if matches:
                latestMatch = matches[0]
                for match in matches:
                    if(match > latestMatch):
                        latestMatch = match
                finalDate = latestMatch
        except Exception as e:
            print(str(e))
    #else:
        #print("could not find contract end for file")
    
    #print("\n")
    return finalDate

#### get Contract Duration
Uses the functions getContractStart and getContractEnd to calculate a duration if possible

In [52]:
def getContractDuration(sents_tokens):
    start = None
    end = None
    duration = None
    
    try:
        start = getContractStart(sents_tokens)
    except Exception as e:
        print("Can't find contract start")
        return
    
    try:
        end = getContractEnd(sents_tokens)
    except Exception as e:
        print("Can't find contract end")
        return
    if(start and end):
        duration = end - start

    return duration

#### get State/Location: -- Not Done

In [53]:
def getContractLocation(sents_tokens):
   
    return

#### Delta Office Involved -- Not Done

In [54]:
def getDeltaOffice():
    
    return

### Batch Run to get attributes
#### Functions to process multiple files and their attributes at once

#### Function: batch pre process: fill output folder

In [55]:
def batchPreProcess():
    cwd = os.getcwd()
    print(cwd)
    processedTextPath = ""
    
    dataPath = os.path.join(cwd, "data/raw")

    if(os.path.isdir(dataPath)):

        for file in os.listdir(dataPath):
            filepath = os.path.join(dataPath, file)
            if(os.path.isfile(filepath)):
                #print(file)
                try:
                    if(checkFileType(filepath) == 0):
                        processedTextPath = processTextFile(filepath)
                    elif(checkFileType(filepath) == 1):
                        processedTextPath = processPDFfile(filepath)
                    else:
                        raise TypeError('This path does not lead to a valid file type!')                     
                except Exception as e:
                    print(str(e))

    else:
        print("data/raw doesn't exist")

#### Function: Batch return token and bigram sets for all output files
Returns file information as an array of objects containing key:value information about the file: 

In [None]:
[ 
    {
        'filepath':'users/sydneyknox...', 
        'wordTokens':[*tokens*], 
        ...  
    }, 
    {  
        'sentenceTokens':[*tokens*],
        'cleanText':"string containing the original text from the processed file..."
    }
]

In [56]:
def batchGetTokens():
    all_tokens = []
    cwd = os.getcwd()
    processedTextPath = ""
    
    dataPath = os.path.join(cwd, "data/output")
    print(dataPath)
    
    if(os.path.isdir(dataPath)):

        for file in os.listdir(dataPath):
            filepath = os.path.join(dataPath, file)
            if(os.path.isfile(filepath)):
                try:
                    #print(file)

                    temp_obj = {}

                    with open(filepath, 'r') as txtFile:
                        text = txtFile.read()

                    temp_obj['filepath'] = filepath

                    text = ddCleanText(text)
                    temp_obj['cleanText'] = text

                    wordTokens = getTokens(text)
                    sentTokens = getSents(text)
                    temp_obj['wordTokens'] = wordTokens
                    temp_obj['sentTokens'] = sentTokens

                    bgs = getBigrams(wordTokens)
                    tgs = getTrigrams(wordTokens)
                    temp_obj['bgs'] = bgs
                    temp_obj['tgs'] = tgs

                    txtFile.close()
                    all_tokens.append(temp_obj)
                except Exception as e:
                    print("Error opening and tokenizing " + file)
                    #print(str(e))

    else:
        print("data/output doesn't exist")
    return all_tokens

#### Function: get metadata attributes
This function takes in a single files info -- in this section because it will be used in a batch function

In [57]:
def getMetaDataAtt(file_info):
    #print(file_info)
    fileName = getFileName(file_info['filepath'])
    print(fileName)
    
    #groupNumber = getGroupNumber(file_info['sentTokens'], file_info['filepath'])
    #if(groupNumber):
    #    print("group number: " , groupNumber)
    
    contractStartDate = getContractStart(file_info['sentTokens'])
    if(contractStartDate):
        print("start: " , contractStartDate)
    
    contractEndDate = getContractEnd(file_info['sentTokens'])
    if(contractEndDate):
        print("end: " , contractEndDate)
    
    #contractDuration = getContractDuration(file_info['sentTokens'])
    #if(contractDuration):
    #    print("duration: " , contractDuration)
    #print("\n")
    
    d={'key':'filename', 'value':fileName}
    dfMD = pd.DataFrame(d, index=['MetaData'])
    
   ## df=pd.DataFrame({'key':'group_number','value':groupNumber}, index=['MetaData'])
   ## dfMD = pd.concat([dfMD, df])
    
   ## df=pd.DataFrame({'key':'contract_start_date','value':contractStartDate}, index=['MetaData'])
   ## dfMD = pd.concat([dfMD, df])
    
    return dfMD

### Workspace

##### Practice with synonyms

In [None]:
def testSyns(sentTokens, word1, word2):
    syns1 = wn.synsets(word1)
    syns2 = wn.synsets(word2)
    #print(syns)
    #print(syns[0].lemmas()[1].name())
    set1 = set()
    for syn in syns1:
        for lem in syn.lemmas():
            set1.add(lem.name())
    #print(set1)
    set2 = set()
    for syn in syns2:
        for lem in syn.lemmas():
            set2.add(lem.name())
    #print(set2)
    word_set = set()
    for word1 in set1:
        for word2 in set2:
            word_set.add(" ".join([word1, word2]).lower())
    #print(word_set)
    for sent in sentTokens:
        words = word_tokenize(sent)
        start = 0
        end = 1
        #while(end < len(words)):
            #print(" ".join([words[start], words[end]]))
            #if(" ".join([words[start], words[end]]).lower() in word_set):
                #print(" ".join([words[start], words[end]]))
                #print(sent)
            #start += 1
            #end += 1
    return
            
#testSyns(sentTokens, "effective", "date")            


#### Call the PreProcessing functions to get all .txt output files

## workspace: batch process files

In [38]:
filePath = "./data/raw/TX-18745 ASC Contract (7.2.18).pdf"
processedTextPath = ""

if(checkFileType(filePath) == 0):
    processedTextPath = processTextFile(filePath)
elif(checkFileType(filePath) == 1):
    processedTextPath = processPDFfile(filePath)
else:
    raise TypeError("File type incorrect") 

#print(processedTextPath)
with open(processedTextPath, 'r') as txtFile:
            text = txtFile.read()


text = ddCleanText(text)
wordTokens = getTokens(text)
sentTokens = getSents(text)
#print(sentTokens)
bgs = getBigrams(wordTokens)
tgs = getTrigrams(wordTokens)


#### Call PreProcess

In [58]:
batchPreProcess()

/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode
This path does not lead to a valid file type!


#### Call the tokenizing functions to get organized data for all the files in the pre-processed folder

In [59]:
base_info = batchGetTokens()

/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode/data/output
Error opening and tokenizing .DS_Store


#### Call the functions to begin extracting and storing the data from all files

In [60]:
for file in base_info:
    dfMD = getMetaDataAtt(file)
    #testSyns(file['sentTokens'], "group","number")
    
    #transpose=dfMD.transpose()
    #print(transpose)
    #print(dfMD)

TX 19015 Attachment A ENT (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2018-01-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal

TX 17404 EOC (Regional (7.2.18)-pdf.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2013-07-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goa

TX-19278 ASC-ENT (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2018-03-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'const

end:  2019-12-31 00:00:00
test2.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'constrict remnant', 'contra

test2-pdf.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'constrict remnant', 'contract term ', 'constrict 

TX 19015 Attachment  C ENT (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2018-01-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goa

end:  2019-12-31 00:00:00
TX-18745-ASC Contract (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2017-04-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_s

end:  2019-12-31 00:00:00
TX 17404 Contract Regional (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2014-10-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake fi

end:  2017-09-30 00:00:00
test.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'constrict remnant', 'contrac

TX 19015 Attachment B ENT (7.2.18).txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2018-01-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal

test-pdf.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'constrict remnant', 'contract term ', 'constrict f

TX 19015 EOC ENT(7.2.18)-pdf.txt
{'effectual date\\S\\s*\\S', 'efficacious', 'in_effect date\\S\\s*\\S', 'good', 'in_force', 'efficient', 'good date\\S\\s*\\S', 'effectual', 'in_effect', 'effective date\\S\\s*\\S', 'efficient date\\S\\s*\\S', 'in_force date\\S\\s*\\S', 'effective', 'efficacious date\\S\\s*\\S'}
start:  2018-01-01 00:00:00
{'get conclusion', 'compress terminus', 'sign_on full_term', 'abridge terminate', 'squeeze last', 'contract term\\S\\s*\\S', 'take close', 'foreshorten term', 'squeeze stop', 'sign ending', 'declaration term\\S\\s*\\S', 'shorten terminal_figure', 'shrink condition', 'sign remainder', 'reduce final_stage', 'sign_on close', 'abridge terminal', 'declaration end', 'take condition', 'compact stop', 'condense terminal', 'narrow last', 'sign_on last', 'sign terminal', 'declaration terminal_figure', 'shrink terminate', 'constrict stop', 'press last', 'contract_bridge ending', 'shorten finish', 'declaration stop', 'undertake final_stage', 'constrict goal', 'co

## Not done attributes

In [49]:
## Group Information

dfGI=pd.DataFrame(d, index=['uid'])

In [None]:
## Diagnostic and Preventative (D&P) [Appendix A]

d={'key':'D&P Services_PPO','value':'100'}
##Need to pass index since we're only doing string values
dfDP=pd.DataFrame(d, index=['D%P Services'])


In [None]:
## Basic Service

dfBS=pd.DataFrame(d, index=['Basic Service'])

In [None]:
## Endo Perio (Endodontics (Periodontal(?)))

dfEP=pd.DataFrame(d, index=['Endo Perio'])

In [None]:
## Oral (Oral Surgery)

dfOa=pd.DataFrame(d, index=['uid'])

In [None]:
## Perio (Periodontal)

dfPe=pd.DataFrame(d, index=['uid'])

In [None]:
## Major (Major Benefits)

dfMj=pd.DataFrame(d, index=['uid'])

In [None]:
## Prostho (Prosthodontics)

dfPr=pd.DataFrame(d, index=['uid'])

In [None]:
## Ortho (Orthodontics)

dfOt=pd.DataFrame(d, index=['uid'])

In [None]:
#Concatenate all frames created by the above dataset

frames = [df, dfGI, dfDP, dfBS, dfEP, dfOa, dfPe, dfMj, dfPr, dfOt]

result = pd.concat(frames)
transpose=result.transpose()
print(transpose)

## Notes

### Contract Start Date

With contract start date I began by searching through the tokenized sentences with a regex expression. 
I found a datefinder module to use on each flagged sentence to pull out the dates
	Issue: the datefinder module works poorly on large, run -on sentences which are common in the contracts. It tends to find other numbers that aren't dates and try to make a date out of them.
	
	Sol: only take a subset, starting at the flagged word
	
Sometimes a match isn't found with the keywords I've seen related to the start date
	Sol: look for keywords related to contract term and take the earlier date from that sentence
	
Issue: Datefinder focusing on numbers that aren't dates
	Sol: filter for sentences that have a year (ie four digits in a row) and dates that are before Delta Dental existed (in 1966)
	
Issue: Sometimes there are multiple modes. Usually I saw this when there were equal mentions of the end date
	Sol: if there are multiple modes, take the earliest date. 
	



### Testing finding the contract end date

This was much the same as the contract start date Issues

	Looking for keywords Contract Term/Contract End
	Filtering on invalid years
	Filtering on if there IS a year in the sentence (assuming it won't be written out like nineteen ninety-four)
	If there are only two results, take the later one
	If there are more than two, take the top two most common and then take the latter of the two


### Contract Duration:

Call contract start and end and try to get a duration out of them

 ^^^ pretty much worked


### Comments so far:


	Even with trying to be variable, this won't work if they even change the wording a little bit. Maybe spend some time looking into using the libraries to get synonyms.
	
	It would also be great to get the other contracts to see exactly where we're going wrong
	


### Working with synonyms in NLTK

	Getting an expanded set of search terms can be done, but I can't yet figure out how to pick the right contexts. For example, the search bigram "contract term" gives back a huge amount of synonyms, with only 3 or 4 actually being equivalent in meaning to "contract term"
	
	We could always manually select ones that are similar but that seems to defeat the purpose: ie we could select only the noun meanings of contract
	
	We could build our own corpus of words based on all of the documents that we have, and then compare the given synonyms to a freq distribution of those words to pick out the ways other contracts might say the same thing
		But this would still miss things for sure
		
	We could include the word type with the seed words and only choose synonyms of the same type, although this does involve more hardcoding
