# Exploring and Processing

### Import Statements

In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
import random as rng
import nltk
import fnmatch

import docx2txt
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.collocations import *
from nltk.util import ngrams
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import sys as sys
from sys import platform
import re as re
from statistics import mode
import datefinder
from nltk import ne_chunk
from nltk.tag import StanfordNERTagger
import csv
import datetime

try:
    if(platform == "win32"):
        import win32com.client as win32
        from win32com.client import constants
except Exception as e:
    print(str(e))

import subprocess
import shutil

## Pseudocode

In [6]:
### Analysis Function

## Function which calculates the number of combinations (Not Permutations) of distinct values in a dataset. Would 
## take in a Pandas DataFrame ideally and multiply all distinct value counts for all available columns.

# Expected input: Pandas DataFrame
# Expected output: Int, the number of possible combinations.

In [7]:
### Parsing Function

## Function to collect mentions of States/Districts/Territories in the United States (DC and Puerto Rico are included)
## and rank their relevance; frequency should be a good factor. Note: California, Pennsylvania, and Georgia could 
## appear multiple times from addresses.

# Expected input: A single file, text.
# Expected output: A list of locations mentioned, ranked by most used to least used.

In [8]:
### Parsing Function

## Function for searching a tokenized sentence dictionary for words or phrases. There's probably 
## already something for this.

# Expected input: A tokenized list, and a phrase to search for.
# Expected output: A list of sentences containing the phrase searched.

In [9]:
### Parsing Function

## Function to take a file and produce the Acronyms from them.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, most likely first.

In [10]:
### Parsing Function

## Function to take Acronyms and match them to the best possible N-gram for them from a document. Not all will be 
## possible, but a list of options will help.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, sorted most likely first.

In [11]:
### Generally important

## Need to expand stopwords to include States and Districts, also Delta Dental adjacent names as another available set.

In [12]:
## Function to load in two different text extracted contracts and return a comparison metric between 
## the two (Similarity, possibly as a percent?)

# Expected Input: Two contracts for Comparison
# Expected Output: A measure, some kind of decimal to represent similarity.

## File Prep Functions

### Function: Is it docx or pdf?

In [13]:
def checkFileType(filename):
    """Classify a file by its extension, ignoring case.

    Returns 0 for ``.docx``, 1 for ``.pdf``, 2 for ``.doc`` and -1 for any
    other name.
    """
    lowered = filename.lower()
    # The position in this tuple is the code that is returned.
    for code, suffix in enumerate(('.docx', '.pdf', '.doc')):
        if lowered.endswith(suffix):
            return code
    return -1

### Function: read in file

In [14]:
def makeFilePath(docName):
    """Return the path of ``docName`` under ``<current working dir>/data/raw``."""
    return os.path.join(os.getcwd(), 'data', 'raw', docName)

### Function: clean up text

In [15]:
def cleanText(text):
    """Flatten ``text`` onto one line with single spaces between words.

    Newlines and tabs become spaces, then runs of spaces are collapsed until
    no double space remains. Leading/trailing single spaces are kept.
    """
    for whitespace_char in ("\n", "\t"):
        text = text.replace(whitespace_char, ' ')

    # One replace halves each run of spaces; repeat until none are left.
    while "  " in text:
        text = text.replace("  ", " ")
    return text

### Function: process dataFrame and group

In [16]:
def processDF(txtFile):
    """Load a space-separated text file into a one-word-per-row DataFrame.

    The file is read with a single space as separator and no header, then
    transposed so the fields of the first line end up in a ``Words`` column.
    Rows containing NaN are dropped and a constant ``SingleRow`` column of 1s
    is added.

    NOTE (from the original author): ``read_csv`` fails when lines have a
    varying number of fields.
    """
    words = pd.read_csv(txtFile, sep=" ", header=None).T.dropna()
    words['SingleRow'] = 1
    words = words.rename(columns={0: 'Words'})

    print("in processDF " + txtFile)
    # The summary is computed but its result is not used.
    words.describe(include="all")
    return words

### Function: Process a text file

In [58]:
def processDocFileWindows(filepath, errorFile):
    """Convert a Word document to ``.docx`` through Word COM automation.

    Opens ``filepath`` in Word, saves it next to the original with the
    extension replaced by ``.docx`` and closes the document.

    Args:
        filepath: Path of the document to convert.
        errorFile: Writable text stream; a ``"<path>, <error>"`` line is
            appended when the conversion fails.

    Returns:
        The absolute path of the saved ``.docx`` file, or ``None`` on failure.
    """
    try:
        word = win32.Dispatch("Word.application")
        # NOTE(review): an earlier variant used
        # win32.gencache.EnsureDispatch('Word.Application'); confirm that
        # `constants` is populated when plain Dispatch is used.
        doc = word.Documents.Open(filepath)
        try:
            doc.Activate()

            # Rename path with .docx
            new_file_abs = re.sub(r'\.\w+$', '.docx', os.path.abspath(filepath))

            word.ActiveDocument.SaveAs(
                new_file_abs, FileFormat=constants.wdFormatXMLDocument
            )
        finally:
            # Close without saving changes even when SaveAs fails, so the
            # document does not stay open in Word.
            doc.Close(False)

        return new_file_abs

    except Exception as e:
        errorFile.write(f"{filepath}, {e}\n")
        print(str(e))
        return None

    
def processDocFileMac(filepath, errorFile):
    """Convert a document to ``.docx`` with the ``textutil`` command.

    Equivalent shell call: ``textutil -convert docx <filepath>``.

    Args:
        filepath: Path of the document to convert.
        errorFile: Writable text stream; a ``"<path>, <error>"`` line is
            appended when the conversion fails.

    Returns:
        ``filepath`` with its extension replaced by ``.docx``, or ``None``
        when ``textutil`` is missing or exits with a non-zero status.
    """
    try:
        # check=True turns a failed conversion into an exception instead of
        # returning the path of a file that was never written.
        subprocess.run(["textutil", "-convert", "docx", filepath], check=True)
        return re.sub(r'\.\w+$', '.docx', filepath)
    except Exception as e:
        # f-string keeps the log line working even if filepath is None.
        errorFile.write(f"{filepath}, {e}\n")
        print("Error processDocFileMac:  " + str(e))
        return None
    

def processDocFile(filePath, errorFile):
    """Convert a ``.doc`` file to ``.docx`` and extract its text.

    Uses ``processDocFileWindows`` when ``sys.platform`` is ``"win32"`` and
    ``processDocFileMac`` otherwise, then hands the converted file to
    ``processDocxFile``.

    Args:
        filePath: Path of the document to process.
        errorFile: Writable text stream that receives error lines.

    Returns:
        Whatever ``processDocxFile`` returns for the converted file, or
        ``None`` when the conversion or the extraction fails.
    """
    try:
        if platform == "win32":
            newFilePath = processDocFileWindows(filePath, errorFile)
        else:
            newFilePath = processDocFileMac(filePath, errorFile)

        if newFilePath is None:
            # The converter already logged the failure; do not feed None to
            # the text extractor.
            return None

        return processDocxFile(newFilePath, errorFile)
    except Exception as e:
        errorFile.write(f"{filePath}, {e}\n")
        print("Error processDocFile:  " + str(e))
        return None




def processDocxFile(filePath, errorFile):
    """Extract the text of a ``.docx`` file into ``<cwd>/output/<name>.txt``.

    The text is read with ``docx2txt``, normalised with ``cleanText`` and
    written as UTF-8. Commas are removed from the output file name.

    Args:
        filePath: Path of the ``.docx`` file.
        errorFile: Writable text stream; a ``"<path>, <error>"`` line is
            appended on failure.

    Returns:
        The path of the written text file, or ``None`` on any error.
    """
    try:
        docxText = docx2txt.process(filePath)
        replacedText = cleanText(docxText)

        fileName = os.path.split(filePath)[1].replace(",", "")
        # splitext instead of a fixed [0:-5] slice: correct for any extension.
        baseFileName = os.path.splitext(fileName)[0]

        newFilePath = os.path.join(os.getcwd(), 'output', baseFileName + ".txt")

        # Context manager closes the file even if the write fails.
        with open(newFilePath, 'wb') as singleFileDocx:
            singleFileDocx.write(replacedText.encode("utf-8"))
    except Exception as e:
        # f-string keeps the log line working even if filePath is None.
        errorFile.write(f"{filePath}, {e}\n")
        return None

    return newFilePath

### Function: Process pdf file

In [54]:
def processPDFfile(filePath, errorFile):
    """Extract the text of a PDF into ``<cwd>/output/<name>-pdf.txt``.

    Every page is laid out with pdfminer and the text of each ``LTTextBox``
    / ``LTTextLine`` object is collected, normalised with ``cleanText`` and
    written as UTF-8. Commas in the output file name are replaced by spaces.

    Args:
        filePath: Path of the PDF file.
        errorFile: Writable text stream; a ``"<path>, <error>"`` line is
            appended on failure.

    Returns:
        The path of the written text file, or ``None`` on any error
        (including PDFs that do not allow text extraction).
    """
    password = ""
    text_parts = []
    try:
        fileName = os.path.split(filePath)[1].replace(",", " ")
        baseFileName = os.path.splitext(fileName)[0]

        # Context manager closes the PDF even when parsing raises.
        with open(filePath, "rb") as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser, password)

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager stores shared resources such as fonts or images.
            rsrcmgr = PDFResourceManager()
            # Layout analysis parameters (defaults).
            laparams = LAParams()
            # Page aggregator device: yields the LT layout objects of a page.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Interpreter renders page content into the device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                # Only text boxes and text lines carry the text we want.
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        text_parts.append(lt_obj.get_text().replace('\n', ' '))

        # Join once instead of growing a string inside the loop.
        extracted_text = cleanText("".join(text_parts))
        newFilePath = os.path.join(os.getcwd(), 'output', baseFileName + '-pdf' + ".txt")
        with open(newFilePath, 'wb') as singleFilePDF:
            singleFilePDF.write(extracted_text.encode("utf-8"))

    except Exception as e:
        # f-string keeps the log line working even if filePath is None.
        errorFile.write(f"{filePath}, {e}\n")
        return None
    return newFilePath

## NLTK Tokenizing Functions

### Function: DD specific text cleaning

In [20]:
def ddCleanText(text):
    """Fuse "Delta Dental" names into single tokens.

    The replacements run in order, so "Delta Dental Insurance Company" is
    first turned into "DeltaDental Insurance Company" and then fused fully.
    """
    replacements = (
        ('Delta Dental', 'DeltaDental'),
        ('DELTA DENTAL', 'DELTADENTAL'),
        ('DeltaDental Insurance Company', 'DeltaDentalInsuranceCompany'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text



### Function: Make tokenized word list

In [21]:
def getTokens(text):
    """Tokenize ``text`` into words and drop stop words and punctuation.

    Tokens are compared exactly (case-sensitively) against NLTK's English
    stop word list plus the characters of ``string.punctuation``.
    """
    tokens = word_tokenize(text)
    excluded = set(stopwords.words('english')) | set(punctuation)
    return [token for token in tokens if token not in excluded]

### Function: Make tokenized Sentence list

In [22]:
def getSents(text):
    """Return the sentences of ``text`` as produced by NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)

### Function: Get Bigrams

In [23]:
def getBigrams(tokens):
    """Count the bigrams in ``tokens``.

    Returns:
        A list of ``(bigram, count)`` pairs sorted by count, highest first.
    """
    finder = BigramCollocationFinder.from_words(tokens)
    # ngram_fd maps each bigram to its frequency; item[-1] is that count.
    return sorted(finder.ngram_fd.items(), key=lambda item: item[-1], reverse=True)

### Function: Get Trigrams

In [24]:
def getTrigrams(tokens):
    """Count the trigrams in ``tokens``.

    Returns:
        A list of ``(trigram, count)`` pairs sorted by count, highest first.
    """
    finder = TrigramCollocationFinder.from_words(tokens)
    # ngram_fd maps each trigram to its frequency; item[-1] is that count.
    return sorted(finder.ngram_fd.items(), key=lambda item: item[-1], reverse=True)

## MetaData and Attribute Functions

#### Filename
Contract Start
Contract End
Contract Duration
State
Delta Office Involved

#### Group Information
(Group Number)

#### Numeric attributes Only
Basics
Diagnostics
Major
Endo
Oral
Perio
Prostho
Ortho

In [25]:
##Establish a dataframe to capture the attributes
#d = {'key': 'file','value':fileName}
#{'key':'state', 'value':state}
#df = pd.DataFrame(d, index=['uid'])
# Empty module-level dict; presumably the container for the per-document
# attributes sketched in the comments above -- nothing visible here fills it.
d = {}

### NLTK synonyms

#### I kept this to processing single words, bigrams or trigrams so as to keep the complexity down

#### Function: translate from syn POS to nltk POS

In [26]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag yields an empty string.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('N', wn.NOUN),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ''

#### Function: Use POS to cull wordnet synonyms

In [27]:
def getSynonyms_usingPOS(word_tuple):
    """Return the WordNet lemma names for a POS-tagged word.

    Args:
        word_tuple: ``(word, treebank_tag)`` pair as produced by
            ``nltk.pos_tag``.

    Returns:
        A set with the lemma names of every synset found for the word with
        the translated part of speech (possibly empty).
    """
    token, treebank_tag = word_tuple[0], word_tuple[1]
    synsets = wn.synsets(token, pos=get_wordnet_pos(treebank_tag))
    return {lemma.name() for synset in synsets for lemma in synset.lemmas()}

#### Function: get synonyms of a single word. Helper function to Bigram and Trigram function

In [28]:
## You can't cull this one down with the POS b/c you can't tag a single word
def getSyns(word):
    """Return the lemma names of every WordNet synset of ``word`` as a set.

    No part-of-speech filter is applied.
    """
    return {
        lemma.name()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

#### Function: Get a similar bigram

In [29]:
def getSimilarBigrams(word1, word2):
    """Build every two-word phrase from synonyms of ``word1`` and ``word2``.

    The two words are POS-tagged together and each one is expanded with
    ``getSynonyms_usingPOS``; a word with no synonyms stands for itself.

    Returns:
        A set of space-joined phrases.
    """
    tagged = nltk.pos_tag([word1, word2])

    # An empty synonym set falls back to the original word.
    first_options = getSynonyms_usingPOS(tagged[0]) or {word1}
    second_options = getSynonyms_usingPOS(tagged[1]) or {word2}

    return {
        " ".join([first, second])
        for first in first_options
        for second in second_options
    }

#### Function: get a similar trigram

In [30]:
def getSimilarTrigrams(word1, word2, word3):
    """Build every three-word phrase from synonyms of the three words.

    The words are POS-tagged together and each one is expanded with
    ``getSynonyms_usingPOS``; a word with no synonyms stands for itself.

    Returns:
        A set of space-joined phrases.
    """
    tagged = nltk.pos_tag([word1, word2, word3])

    # An empty synonym set falls back to the original word.
    first_options = getSynonyms_usingPOS(tagged[0]) or {word1}
    second_options = getSynonyms_usingPOS(tagged[1]) or {word2}
    third_options = getSynonyms_usingPOS(tagged[2]) or {word3}

    return {
        " ".join([first, second, third])
        for first in first_options
        for second in second_options
        for third in third_options
    }
        

#### Function: Get synonyms from a list of key words. Returns more keywords/phrases

In [31]:
def getSynonymsFromList(keywords):
    """Expand a list of keywords/phrases with WordNet-based variants.

    Each keyword is tokenized; one-, two- and three-token keywords are
    expanded with ``getSyns``, ``getSimilarBigrams`` and
    ``getSimilarTrigrams`` respectively. Other lengths are reported and
    skipped. The input list is left unmodified.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A set containing the original keywords plus all generated variants.
    """
    matches = []

    for kw in keywords:
        try:
            words = word_tokenize(kw)
        except Exception as e:
            # Skip this keyword: there are no tokens to expand.
            print(str(e))
            continue

        if len(words) == 1:
            matches.extend(getSyns(words[0]))
        elif len(words) == 2:
            matches.extend(getSimilarBigrams(words[0], words[1]))
        elif len(words) == 3:
            matches.extend(getSimilarTrigrams(words[0], words[1], words[2]))
        else:
            print("keyword string too long")

    return set(keywords).union(matches)

### Metadata

#### get file name
Get the filename from a full path. Determines the OS and splits the string correctly based on that

In [32]:
def getFileName(fullPath):
    """Return the last path component of ``fullPath``.

    Splits on a backslash when ``sys.platform`` is ``"win32"`` and on a
    forward slash otherwise. Returns ``None`` if ``fullPath`` cannot be
    split (for example when it is not a string).
    """
    separator = "\\" if platform == "win32" else "/"
    try:
        return fullPath.split(separator)[-1]
    except Exception:
        return None

#### get group number
Uses regexes built from keywords to attempt to find a group number in the file. Failing that, it searches the filename for the number.

In [33]:
def getGroupNumber(sents_tokens, filePath):
    """Guess the group number of a contract from its text and file name.

    Candidates are collected from every sentence that contains
    "group number" / "groupnumber" followed by a number, optionally followed
    by a second number after a dash or whitespace (the longer of the two is
    kept, the first on a tie). The first number sequence in the file name is
    treated the same way; if it equals one of the text candidates it wins
    outright, otherwise it is added to the candidates and
    ``statistics.mode`` picks the result.

    Args:
        sents_tokens: Iterable of sentence strings.
        filePath: Path of the source file; only its last component is used.

    Returns:
        The group number as a string, ``-1`` when ``mode`` cannot pick a
        value (e.g. no candidates at all), or ``None`` on an unexpected
        error.
    """
    group_keywords = ["group number", "groupnumber"]
    poss_nums = []

    try:
        # The second number is optional: without the trailing "?" a lone
        # group number could never match and the single-number handling
        # below would be dead code.
        regex_exps = [
            re.compile(kw + r"\W\s*(?P<gn>\d+)(?P<gn2>[-\s]\d+)?")
            for kw in group_keywords
        ]

        # Collect every candidate mentioned in the text.
        for sent in sents_tokens:
            lowered = sent.lower()
            for my_regex in regex_exps:
                result = my_regex.search(lowered)
                if result is None:
                    continue
                temp_gn = result.group('gn')
                temp_gn2 = result.group('gn2')
                # temp_gn2[1:] drops the single separator character.
                if temp_gn2 and len(temp_gn2[1:]) > len(temp_gn):
                    poss_nums.append(temp_gn2[1:])
                else:
                    poss_nums.append(temp_gn)

        # Try to get the number from the file name as well.
        num_regex = re.compile(r"(?P<gn>\d+)(?P<gn2>[-\s]+\d+)?")
        fileName = getFileName(filePath)
        fileGN = num_regex.search(fileName)

        if fileGN is not None:
            if fileGN.group('gn2'):
                if len(fileGN.group('gn')) >= len(fileGN.group('gn2')[1:]):
                    temp_gn = fileGN.group('gn')
                else:
                    temp_gn = fileGN.group('gn2')[1:]
            else:
                temp_gn = fileGN.group()

            # A file-name number that also appears in the text is trusted.
            if temp_gn in poss_nums:
                return temp_gn
            # Otherwise it is just one more candidate for the vote below.
            poss_nums.append(temp_gn)

        try:
            return mode(poss_nums)
        except Exception:
            # No mode found: could not determine a group number.
            return -1
    except Exception:
        return None

#### get contract start
Uses regex and a list of keywords to attempt to find the start date of the contract. It makes multiple passes based on patterns seen in contract samples so far.

Some of the passes are necessary to filter out non-date numbers that the datefinder incorrectly parses to dates

In [34]:
def getContractStart(sents_tokens):
    """Guess the contract start date from a list of sentences.

    Passes:
      1. Sentences matching an "effective ..." keyword (expanded through
         ``getSynonymsFromList``) contribute the 20 tokens starting at the
         match; snippets containing "effective date" are added twice so
         they weigh more.
      2. Sentences containing "<x> through <y>" / "<x> thru <y>" contribute
         the 6 tokens on either side when ``datefinder`` sees at least two
         dates from 1966 on in them.
      3. Snippets without a four-digit number are dropped; every date from
         1966 on (the year Delta Dental was created) found in the rest
         becomes a candidate.
      4. The most frequent candidate wins. When several candidates tie for
         most frequent, the earliest of all candidates is returned (this
         happens when start and end are mentioned equally often).

    Returns:
        A ``datetime``; ``datetime.datetime(1066, 1, 1)`` as a "not found"
        sentinel when there is no candidate; ``None`` when something failed.
    """
    from collections import Counter

    start_keywords = [r"effective date\S\s*\S", "effective"]
    poss_dates = []
    matches = []

    try:
        start_keywords = getSynonymsFromList(start_keywords)
        regex_exps = [re.compile(kw) for kw in start_keywords]

        # Pass 1: keyword-anchored snippets.
        for sent in sents_tokens:
            lowered = sent.lower()
            for my_regex in regex_exps:
                result = my_regex.search(lowered)
                if result is None:
                    continue
                tail = sent[lowered.find(result.group()):]
                subset = " ".join(word_tokenize(tail)[:20])
                poss_dates.append(subset)
                if "effective date" in subset.lower():
                    poss_dates.append(subset)

        # Pass 2: date ranges ("... through ...").
        range_exps = [re.compile(kw) for kw in (r"\S\sthrough\s\S", r"\S\sthru\s\S")]
        for sent in sents_tokens:
            lowered = sent.lower()
            for my_regex in range_exps:
                result = my_regex.search(lowered)
                if result is None:
                    continue
                split_at = lowered.find(result.group())
                half_1 = sent[split_at:]
                half_2 = sent[:split_at]

                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                # No space on purpose: the match starts one character before
                # the whitespace, so this re-joins the token that was split.
                subset = subset_2 + subset_1

                valid_dates = sum(
                    1
                    for match in datefinder.find_dates(subset, strict=True)
                    if match.year >= 1966
                )
                if valid_dates >= 2:
                    poss_dates.append(subset)

        # Pass 3: keep only dates with a plausible year. Snippets without a
        # four-digit number are likely other values misread as dates.
        find_year_re = re.compile(r"\d\d\d\d")
        for sent in poss_dates:
            if find_year_re.search(sent) is None:
                continue
            for match in datefinder.find_dates(sent, strict=True):
                if match.year >= 1966:
                    matches.append(match)

        # Pass 4: vote. Counted explicitly because statistics.mode only
        # raises on ties before Python 3.8, which would silently disable the
        # tie-break on newer interpreters.
        if not matches:
            return datetime.datetime(1066, 1, 1)
        ranked = Counter(matches).most_common(2)
        if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
            return ranked[0][0]
        return min(matches)
    except Exception:
        # Not only was no start date found, something failed.
        return None

####  get Contract End
Similar to get contract start, it uses regex and keywords over multiple passes to attempt to find the contract end.

In [35]:
def _uniqueMode(values):
    """Return the single most common value; raise ValueError on a tie or empty input."""
    from collections import Counter

    ranked = Counter(values).most_common(2)
    if not ranked:
        raise ValueError("no mode for empty data")
    if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
        raise ValueError("no unique mode")
    return ranked[0][0]


def getContractEnd(sents_tokens):
    """Guess the contract end date from a list of sentences.

    Passes:
      1. Sentences matching a "contract term" / "contract end" keyword
         (expanded through ``getSynonymsFromList``) contribute the 30 tokens
         starting at the match.
      2. Sentences containing "<x> through <y>" / "<x> thru <y>" contribute
         the 12 tokens on either side when ``datefinder`` sees at least two
         dates from 1966 on in them.
      3. Snippets with fewer than two four-digit numbers are dropped; from
         each remaining snippet the latest date is kept if its year is 1966
         (the year Delta Dental was created) or later.
      4. With exactly two candidates the later one wins. With more, the two
         most frequent are determined and the later one wins; when there is
         no unique most-frequent value the latest remaining candidate wins.

    Returns:
        A ``datetime``; ``datetime.datetime(1066, 1, 1)`` as a "not found"
        sentinel when there are fewer than two candidates; ``None`` when
        something failed.
    """
    start_keywords = [r"contract term\S\s*\S", "contract term ", "contract end"]
    poss_dates = []
    finalDate = ""
    matches = []

    try:
        start_keywords = getSynonymsFromList(start_keywords)
        regex_exps = [re.compile(kw) for kw in start_keywords]

        # Pass 1: keyword-anchored snippets.
        for sent in sents_tokens:
            lowered = sent.lower()
            for my_regex in regex_exps:
                result = my_regex.search(lowered)
                if result is None:
                    continue
                tail = sent[lowered.find(result.group()):]
                poss_dates.append(" ".join(word_tokenize(tail)[:30]))

        # Pass 2: date ranges ("... through ...").
        range_exps = [re.compile(kw) for kw in (r"\S\sthrough\s\S", r"\S\sthru\s\S")]
        for sent in sents_tokens:
            lowered = sent.lower()
            for my_regex in range_exps:
                result = my_regex.search(lowered)
                if result is None:
                    continue
                split_at = lowered.find(result.group())
                half_1 = sent[split_at:]
                half_2 = sent[:split_at]

                subset_1 = " ".join(word_tokenize(half_1)[:12])
                subset_2 = " ".join(word_tokenize(half_2)[-12:])
                # No space on purpose: the match starts one character before
                # the whitespace, so this re-joins the token that was split.
                subset = subset_2 + subset_1

                valid_dates = sum(
                    1
                    for match in datefinder.find_dates(subset, strict=True)
                    if match.year >= 1966
                )
                if valid_dates >= 2:
                    poss_dates.append(subset)

        # Pass 3: latest plausible date of every snippet that mentions at
        # least two years.
        find_year_re = re.compile(r"\d\d\d\d")
        for sent in poss_dates:
            if len(find_year_re.findall(sent)) < 2:
                continue
            maxMatch = datetime.datetime(1066, 1, 1)
            for match in datefinder.find_dates(sent, strict=True):
                if match > maxMatch:
                    maxMatch = match
            if maxMatch.year >= 1966:
                # Append the latest date, not whichever date was seen last.
                matches.append(maxMatch)

        # Pass 4: pick the final date.
        if len(matches) == 2:
            finalDate = max(matches)
        elif len(matches) > 2:
            try:
                # NOTE(review): remove() drops a single occurrence, so the
                # second vote can return the same date again -- confirm
                # whether all occurrences were meant to be removed.
                date1 = _uniqueMode(matches)
                matches.remove(date1)

                date2 = _uniqueMode(matches)
                matches.remove(date2)

                finalDate = max([date1, date2])
            except ValueError:
                # No unique most-frequent value: fall back to the latest
                # candidate that is still in the list.
                if matches:
                    finalDate = max(matches)
        else:
            # NOTE(review): a single candidate is also reported as
            # "not found" -- confirm this is intended.
            return datetime.datetime(1066, 1, 1)
    except Exception:
        return None
    return finalDate

#### get Contract Duration
Uses the functions getContractStart and getContractEnd to calculate a duration if possible

In [36]:
def getContractDuration(sents_tokens):
    """Return the contract length in days, based on the detected start/end.

    Returns:
        The positive number of days between ``getContractStart`` and
        ``getContractEnd``; ``-1`` when either date is missing, is the 1066
        "not found" sentinel, or the end is not after the start; ``None``
        when an exception occurs (its message is printed).
    """
    try:
        start = getContractStart(sents_tokens)
        end = getContractEnd(sents_tokens)

        if not (start and end):
            return -1
        # Year 1066 marks a date that could not be determined.
        if start.year == 1066 or end.year == 1066:
            return -1

        duration = (end - start).days
        return duration if duration > 0 else -1
    except Exception as e:
        print(str(e))
        return None

#### get State/Location: -- Not Done

##### Helper function to create location data set from csv

In [37]:
def makeLocationDataStruct():
    """Build the location lookup tables from two CSV files in the cwd.

    ``us_cities_states_counties.csv`` (``|``-delimited) supplies one set of
    distinct non-empty values per header column; two Washington DC spellings
    are added to the ``'State full'`` set. ``state_abbrv_to_name.csv``
    (``,``-delimited, first row skipped) supplies the two translation dicts
    ``'translate_s2l'`` (column 0 -> column 1) and ``'translate_l2s'``
    (column 1 -> column 0).

    Returns:
        A dict holding the per-column sets and the two translation dicts.
    """
    cwd = os.getcwd()
    location_data = {}

    with open(os.path.join(cwd, 'us_cities_states_counties.csv'), newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='|', escapechar=',')

        # First row holds the column names.
        categories = list(next(reader, []))
        for cat in categories:
            location_data[cat] = set()

        for row in reader:
            for idx, value in enumerate(row):
                if value:
                    location_data[categories[idx]].add(value)

        location_data['State full'].add("Washington , DC")
        location_data['State full'].add("Washington , D.C.")

    location_data['translate_s2l'] = {}
    location_data['translate_l2s'] = {}

    with open(os.path.join(cwd, 'state_abbrv_to_name.csv'), newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row
        for row in reader:
            location_data['translate_s2l'][row[0]] = row[1]
            location_data['translate_l2s'][row[1]] = row[0]

    return location_data

##### Call function to set up location data

In [38]:
# Build the location lookup tables once, from the CSV files in the current
# working directory.
location_data = makeLocationDataStruct()
# Show the full state names that were loaded.
print(location_data['State full'])

{'Idaho', 'Indiana', 'New Jersey', 'California', 'Palau', 'Alaska', 'Washington , D.C.', 'Mississippi', 'US Armed Forces Europe', 'Virginia', 'Nevada', 'Iowa', 'Georgia', 'Michigan', 'Maryland', 'Arkansas', 'Montana', 'Federated States of Micronesia', 'Missouri', 'West Virginia', 'North Carolina', 'Nebraska', 'Guam', 'Washington , DC', 'Wyoming', 'American Samoa', 'Florida', 'Virgin Islands', 'Puerto Rico', 'New Mexico', 'Washington DC', 'New York', 'Alabama', 'Minnesota', 'North Dakota', 'Washington', 'New Hampshire', 'Maine', 'Louisiana', 'Ohio', 'Illinois', 'Kentucky', 'Rhode Island', 'US Armed Forces Pacific', 'Pennsylvania', 'Oregon', 'Northern Mariana Islands', 'South Carolina', 'Connecticut', 'Tennessee', 'Hawaii', 'Marshall Islands', 'Washington D.C.', 'Delaware', 'Arizona', 'Wisconsin', 'Kansas', 'Massachusetts', 'Vermont', 'Oklahoma', 'District of Columbia', 'Utah', 'Texas', 'South Dakota', 'Colorado'}


##### Get Client Location

In [39]:
def checkIfCity(loc_str, loc_data, isDelta, isContractholder):
    """Return ``[loc_str]`` if it is a known city, otherwise an empty list.

    ``loc_data['City']`` is the collection of known city names.
    ``isDelta`` and ``isContractholder`` are accepted but not used.
    """
    return [loc_str] if loc_str in loc_data['City'] else []

def checkIfState(loc_str, loc_data):
    """Return the lower-cased full state name(s) that ``loc_str`` refers to.

    Args:
        loc_str: Candidate text (a full state name or an abbreviation).
        loc_data: Location lookup with the keys ``'State full'``,
            ``'State short'`` and ``'translate_s2l'`` (abbreviation -> full
            name).

    Returns:
        A list with the lower-cased full name for a full-name hit and/or the
        translated, lower-cased full name for an abbreviation hit; empty
        when ``loc_str`` is neither. An abbreviation without a usable
        translation is ignored.
    """
    states = []

    if loc_str in loc_data['State full']:
        states.append(loc_str.lower())

    # Use the lookup that was passed in rather than the module-level global.
    if loc_str in loc_data['State short']:
        full_name = loc_data.get('translate_s2l', {}).get(loc_str)
        if isinstance(full_name, str):
            states.append(full_name.lower())
    return states
    
def getClientLocation(sents_tokens, bgs, tgs, location_data, filename):
    """Guess the contract holder's state by majority vote.

    Every sentence that does not contain "deltadental" is tokenized and its
    unigrams, bigrams and trigrams are looked up with checkIfCity and
    checkIfState.  A state earns a vote when

    * it is one of the Washington, D.C. spellings, or
    * it directly follows a recognized city in the sentence
      ("<city>, <state>"), by full name or by abbreviation.

    Votes from sentences containing "notice to contractholder" count twice,
    and a state recognized in ``filename`` counts five times.  The most
    common vote wins; a winner shorter than three characters is expanded
    through ``location_data['translate_s2l']`` when possible.

    Args:
        sents_tokens: iterable of sentence strings.
        bgs: unused; kept for interface compatibility.
        tgs: unused; kept for interface compatibility.
        location_data: location lookup struct (passed on to checkIfCity /
            checkIfState; 'translate_l2s' and 'translate_s2l' are read here).
        filename: name of the source file; scanned for a state name.

    Returns:
        The winning state string, "not_a_state" when ``mode`` cannot pick a
        winner (no votes), or None when an unexpected error occurred (the
        error is printed).
    """
    dc_names = ["washington , d.c.", "washington , dc", "washington dc",
                "district of columbia", "washington d.c."]
    states = []

    try:
        for sent in sents_tokens:
            lowered = sent.lower()
            isDelta = "deltadental" in lowered
            isContractholder = "contractholder" in lowered
            # Matches in a "notice to contractholder" sentence count twice.
            weight = 2 if "notice to contractholder" in lowered else 1

            # ALL-CAPS sentences would make ordinary words look like state
            # abbreviations, so they are lower-cased before matching.
            if sent.isupper():
                sent = lowered

            text = nltk.word_tokenize(sent)
            sent_cities = []
            sent_states = []

            # Sentences of one or two tokens are skipped, as before.
            if len(text) > 2:
                last = len(text) - 1
                for i, token in enumerate(text):
                    # A bigram needs one following token, a trigram two.
                    text_bg = " ".join(text[i:i + 2]) if i < last else ""
                    text_tg = " ".join(text[i:i + 3]) if i < last - 1 else ""
                    for candidate in (token, text_bg, text_tg):
                        sent_cities.extend(
                            checkIfCity(candidate, location_data, isDelta, isContractholder))
                        sent_states.extend(checkIfState(candidate, location_data))

            if not sent_states or isDelta:
                continue

            # D.C. has no separate city, so its spellings vote directly.
            for st in sent_states:
                if st in dc_names:
                    states.extend([st] * weight)

            # Abbreviations for the states found, so "City, TX" also matches.
            short_sent_states = []
            for st in sent_states:
                try:
                    short_sent_states.append(
                        location_data['translate_l2s'][st.title()].lower())
                except KeyError:
                    continue

            for city in sent_cities:
                # "New York" is ambiguous with the state and never counts as a city.
                if city == "New York":
                    continue
                for state in sent_states + short_sent_states:
                    # Skip pairs that together spell a state name.
                    if checkIfState(city + " " + state.title(), location_data):
                        continue
                    # Escape both names so punctuation is matched literally.
                    address = re.compile(
                        re.escape(city) + r"[-,\s]+" + re.escape(state) + r"[-,\s]",
                        re.IGNORECASE)
                    if address.search(sent):
                        states.extend([state] * weight)

        # A state named in the file name is a strong hint: five votes.
        for chunk in re.findall(r"[\w]+|[-\s_]", filename):
            FN_state = checkIfState(chunk, location_data)
            if FN_state:
                states.extend([FN_state[0]] * 5)

        print(states)
        try:
            final_state = mode(states)
        except ValueError:
            # StatisticsError is a ValueError: raised when there are no votes
            # (and, before Python 3.8, when several states tie).
            final_state = "not_a_state"
        else:
            print(final_state)
            if len(final_state) < 3:
                try:
                    final_state = location_data['translate_s2l'][final_state.upper()]
                except KeyError:
                    # No expansion known; keep the abbreviation.
                    pass

    except Exception as e:
        print(str(e))
        return None  # error in function execution
    return final_state

##### To keep: address regex

In [40]:
#    isDelta = "deltadental" in sent.lower()
            #    isContractholder = "contractholder" in sent.lower()
            #    if not isDelta and isContractholder:
                   # print(sent)
                   # print("\n")
                    #[aA]ddress[-: ]
                    #"\s+\d+\D+" + "["+"|".join(cities) + "]" "\s+" + "[" + "|".join(sent_states) + "]" + "\s+\d\d\d\d\d"
            #        add_regex_str = "[aA]ddress[-: ]\s+\d+[\D\d]+" + "["+"|".join(sent_cities) + "]*" "\s+" + "[" + "|".join(sent_states) + "]*" + "\s+\d\d\d\d\d"
                   # print(add_regex_str)
            #        add_regex = re.compile(add_regex_str)
                   # print(str(sent_cities))
                  #  print(sent_states)
            #        matches = add_regex.findall(sent)
                    #print(matches)
                    #if matches:
                     #   for m in matches:
                   #         print(m)
                   # else:
                   #     add_regex_str = "\s+\d+[\D\d]+" + "["+"|".join(sent_cities) + "]*" "\s+" + "[" + "|".join(sent_states) + "]*" + "\s+\d\d\d\d\d"
                   #     add_regex = re.compile(add_regex_str)
                   #     matches = add_regex.findall(sent)
                        #loc_sents.add(matches.group())
        

#### Delta Office Involved -- Not Done

In [41]:
def getDeltaOffice(sents_tokens, bgs, tgs, location_data):
    """Guess the state of the Delta Dental office by majority vote.

    Only sentences containing "deltadental" are examined.  Each one is
    tokenized and its unigrams, bigrams and trigrams are looked up with
    checkIfCity and checkIfState.  A state earns a vote when

    * it is one of the Washington, D.C. spellings, or
    * it directly follows a recognized city in the sentence
      ("<city>, <state>"), by full name or by abbreviation.

    Votes from sentences containing "notice to delta dental" count twice.
    The most common vote wins; a winner shorter than three characters is
    expanded through ``location_data['translate_s2l']`` when possible.

    Args:
        sents_tokens: iterable of sentence strings.
        bgs: unused; kept for interface compatibility.
        tgs: unused; kept for interface compatibility.
        location_data: location lookup struct (passed on to checkIfCity /
            checkIfState; 'translate_l2s' and 'translate_s2l' are read here).

    Returns:
        The winning state string, "not_a_state" when ``mode`` cannot pick a
        winner (no votes), or None when an unexpected error occurred (the
        error is printed).
    """
    dc_names = ["washington , d.c.", "washington , dc", "washington dc",
                "district of columbia", "washington d.c."]
    states = []

    try:
        for sent in sents_tokens:
            lowered = sent.lower()
            if "deltadental" not in lowered:
                continue
            isContractholder = "contractholder" in lowered
            # Matches in a "notice to delta dental" sentence count twice.
            weight = 2 if "notice to delta dental" in lowered else 1

            # ALL-CAPS sentences would make ordinary words look like state
            # abbreviations, so they are lower-cased before matching.
            if sent.isupper():
                sent = lowered

            text = nltk.word_tokenize(sent)
            sent_cities = []
            sent_states = []

            # Sentences of one or two tokens are skipped, as before.
            if len(text) > 2:
                last = len(text) - 1
                for i, token in enumerate(text):
                    # A bigram needs one following token, a trigram two.
                    text_bg = " ".join(text[i:i + 2]) if i < last else ""
                    text_tg = " ".join(text[i:i + 3]) if i < last - 1 else ""
                    for candidate in (token, text_bg, text_tg):
                        sent_cities.extend(
                            checkIfCity(candidate, location_data, True, isContractholder))
                        sent_states.extend(checkIfState(candidate, location_data))

            if not sent_states:
                continue

            # D.C. has no separate city, so its spellings vote directly.
            for st in sent_states:
                if st in dc_names:
                    states.extend([st] * weight)

            # Abbreviations for the states found, so "City, TX" also matches.
            short_sent_states = []
            for st in sent_states:
                try:
                    short_sent_states.append(
                        location_data['translate_l2s'][st.title()].lower())
                except KeyError:
                    continue

            for city in sent_cities:
                # "New York" is ambiguous with the state and never counts as a city.
                if city == "New York":
                    continue
                for state in sent_states + short_sent_states:
                    # Skip pairs that together spell a state name.
                    if checkIfState(city + " " + state.title(), location_data):
                        continue
                    # Escape both names so punctuation is matched literally.
                    address = re.compile(
                        re.escape(city) + r"[-,\s]+" + re.escape(state) + r"[-,\s]",
                        re.IGNORECASE)
                    if address.search(sent):
                        states.extend([state] * weight)

        try:
            final_state = mode(states)
        except ValueError:
            # StatisticsError is a ValueError: raised when there are no votes
            # (and, before Python 3.8, when several states tie).
            final_state = "not_a_state"
        else:
            if len(final_state) < 3:
                try:
                    final_state = location_data['translate_s2l'][final_state.upper()]
                except KeyError:
                    # No expansion known; keep the abbreviation.
                    pass

    except Exception as e:
        # Report the failure the same way getClientLocation does.
        print(str(e))
        final_state = None

    return final_state

### Batch Run to get attributes
#### Functions to process multiple files and their attributes at once

#### Function: batch pre process: fill output folder

In [55]:
def batchPreProcess(errorFile):
    """Convert every file in ``<cwd>/processed`` to text.

    Each regular file is classified once with checkFileType and handed to
    the matching converter (0 -> processDocxFile, 1 -> processPDFfile,
    2 -> processDocFile).  Any other type, and any exception raised while
    converting, is reported on stdout and recorded as a single line in
    ``errorFile``; processing then continues with the next file.

    Args:
        errorFile: writable text file that collects files which could not be
            processed.

    Returns:
        "success" once the folder has been walked, or None when the
        ``processed`` folder does not exist.
    """
    dataPath = os.path.join(os.getcwd(), "processed")
    if not os.path.isdir(dataPath):
        # Name the folder that is actually checked.
        print("Folder " + dataPath + " doesn't exist")
        return None

    for file in os.listdir(dataPath):
        filepath = os.path.join(dataPath, file)
        if not os.path.isfile(filepath):
            continue

        print("pre-processing: " + file)
        try:
            fileType = checkFileType(filepath)
            if fileType == 0:
                processedTextPath = processDocxFile(filepath, errorFile)
            elif fileType == 1:
                processedTextPath = processPDFfile(filepath, errorFile)
            elif fileType == 2:
                processedTextPath = processDocFile(filepath, errorFile)
            else:
                # Recorded once in errorFile by the handler below.
                raise TypeError('This path does not lead to a valid file type!')

            if not processedTextPath:
                print("Error pre-processing file: " + filepath)
        except Exception as e:
            # Best effort: one bad file must not stop the batch.
            print(str(e))
            errorFile.write(filepath + ", pre-processing," + str(e) + "\n")
            print("Error pre-processing file: " + filepath)

    return "success"

#### Function: Batch return token and bigram sets for all output files
Returns a list with one dict per file; each dict holds key:value information about that file (`filepath`, `cleanText`, `wordTokens`, `sentTokens`, `bgs`, `tgs`):

[ 

    {
    
        'filepath':'users/sydneyknox...', 
        
        'wordTokens':[*tokens*], 
        
        ...
        
    }, 
    
    {  
    
        'sentTokens':[*tokens*],
        
        'cleanText':"string containing the original text from the processed file..."
        
    }
    
]

In [60]:
def batchGetTokens(errorFile):
    """Tokenize every text file in ``<cwd>/output``.

    For each regular file the text is read as UTF-8, cleaned with
    ddCleanText and split into word tokens, sentence tokens, bigrams and
    trigrams.  Files that fail are reported on stdout, recorded in
    ``errorFile`` and skipped.

    Args:
        errorFile: writable text file that collects files which could not be
            tokenized.

    Returns:
        A list with one dict per file (keys 'filepath', 'cleanText',
        'wordTokens', 'sentTokens', 'bgs', 'tgs'), or None when the
        ``output`` folder does not exist.
    """
    dataPath = os.path.join(os.getcwd(), "output")
    if not os.path.isdir(dataPath):
        print("Folder /output doesn't exist. Pre-processing failed.")
        return None

    all_tokens = []
    for file in os.listdir(dataPath):
        filepath = os.path.join(dataPath, file)
        if not os.path.isfile(filepath):
            continue

        try:
            with open(filepath, 'r', encoding='utf-8') as txtFile:
                rawText = txtFile.read()

            text = ddCleanText(rawText)
            wordTokens = getTokens(text)
            sentTokens = getSents(text)

            all_tokens.append({
                'filepath': filepath,
                'cleanText': text,
                'wordTokens': wordTokens,
                'sentTokens': sentTokens,
                'bgs': getBigrams(wordTokens),
                'tgs': getTrigrams(wordTokens),
            })
        except Exception as e:
            errorFile.write(filepath + ", " + str(e) + "\n")
            print("Error opening and tokenizing " + file)

    return all_tokens

#### Function: get metadata attributes
This function takes in a single file's info -- it is in this section because it will be used in a batch function

In [44]:
def getMetaDataAtt(file_info):
    """Collect the metadata attributes for one tokenized file.

    Reads the module-level ``location_data`` lookup struct, so that must be
    set up before this function is called.

    Args:
        file_info: dict for one file as produced by batchGetTokens; the keys
            'filepath', 'sentTokens', 'bgs' and 'tgs' are read.

    Returns:
        dict with the keys, in this order:
            'filepath'            - the path taken from ``file_info``
            'fileName'            - result of getFileName, or the full path
                                    when getFileName returns nothing
            'groupNumber'         - result of getGroupNumber
            'clientLocation'      - result of getClientLocation
            'deltaOfficeLocation' - result of getDeltaOffice
    """
    filepath = file_info['filepath']
    file_attr = {'filepath': filepath}

    fileName = getFileName(filepath)
    if fileName:
        print(fileName)
    else:
        # Fall back to the full path when no file name could be extracted.
        fileName = filepath
    file_attr['fileName'] = fileName

    # NOTE(review): earlier commented-out checks treated a falsy result as
    # "function failed" and -1 as "no valid group number found" -- confirm
    # against getGroupNumber.
    file_attr['groupNumber'] = getGroupNumber(file_info['sentTokens'], filepath)

    # TODO: contract dates are currently disabled; re-enable when ready:
    #   file_attr['contractStartDate'] = getContractStart(file_info['sentTokens'])
    #   file_attr['contractEndDate'] = getContractEnd(file_info['sentTokens'])
    #   file_attr['contractDuration'] = getContractDuration(file_info['sentTokens'])
    # The disabled checks used year == 1066 for "no valid date found" and
    # -1 for "could not calculate a valid duration".

    # Both location helpers return None on failure and "not_a_state" when no
    # location could be determined.
    file_attr['clientLocation'] = getClientLocation(
        file_info['sentTokens'], file_info['bgs'], file_info['tgs'],
        location_data, fileName)
    file_attr['deltaOfficeLocation'] = getDeltaOffice(
        file_info['sentTokens'], file_info['bgs'], file_info['tgs'],
        location_data)

    return file_attr

## Workspace Prep Functions

### Function: Initial setup/fill Raw data folder

In [48]:
## Iterate through folder: if pdf/doc/docx move into new folder/make new folder
def setupWorkspace():
    """Create the workspace folders and move the source documents into raw/.

    Creates ``raw``, ``processed``, ``output`` and ``data`` (in that order)
    under the current working directory.  As soon as one of them already
    exists, a message is printed and setup stops: later folders are not
    created and no files are moved.  Otherwise every file in the working
    directory whose checkFileType result is not -1 is moved into ``raw``.
    Any error is printed rather than raised.
    """
    cwd = os.getcwd()
    folders = [os.path.join(cwd, name)
               for name in ("raw", "processed", "output", "data")]
    rawPath = folders[0]

    try:
        for folder in folders:
            if os.path.isdir(folder):
                print(folder + " already exists.")
                return
            os.makedirs(folder)

        for file in os.listdir(cwd):
            if os.path.isfile(file) and checkFileType(file) != -1:
                shutil.move(os.path.join(cwd, file), os.path.join(rawPath, file))
    except Exception as e:
        print(str(e))
        

In [None]:
# Initial setup: create the workspace folders and fill raw/ from the current directory.
setupWorkspace()

### Function: Move to processed folder

In [49]:
def processedFolder():
    """Copy every entry of ``<cwd>/raw`` into ``<cwd>/processed``.

    Prints a message and does nothing when either folder is missing or when
    ``raw`` is empty.  A failure to copy one entry is printed and does not
    stop the remaining copies.
    """
    cwd = os.getcwd()
    rawPath = os.path.join(cwd, "raw")
    processedPath = os.path.join(cwd, "processed")

    try:
        if not (os.path.isdir(rawPath) and os.path.isdir(processedPath)):
            print("Expected file structure doesn't exist.")
            return

        entries = os.listdir(rawPath)
        if not entries:
            print("raw data folder is empty.")
            return

        for name in entries:
            source = os.path.join(rawPath, name)
            target = os.path.join(processedPath, name)
            try:
                shutil.copy(source, target)
            except Exception as e:
                # Keep going so one bad entry does not block the others.
                print(str(e))
    except Exception as e:
        print(str(e))

In [51]:
# Copy the raw documents into processed/ so the originals in raw/ stay untouched.
processedFolder()

## Full run through of batch processing with error checking

In [62]:
# Full batch run: pre-process the documents, tokenize them, extract the
# attributes of every file and write them to data/raw_attribute_data.csv.
# Files that could not be handled are listed in data/cannot_process.csv.
errorFilePath = os.path.join(os.getcwd(), 'data', 'cannot_process.csv')
outputFilePath = os.path.join(os.getcwd(), 'data', 'raw_attribute_data.csv')

base_info = None
# `with` closes both files even if one of the batch steps raises.
with open(errorFilePath, 'w') as errorFile:
    pp = batchPreProcess(errorFile)
    if pp is None:
        print("Error in pp")
    else:
        base_info = batchGetTokens(errorFile)
        if not base_info:
            print("Error in getting tokens")
        else:
            # csv.writer quotes values that contain commas (e.g. file paths);
            # newline='' is what the csv module expects for its output file.
            with open(outputFilePath, 'w', newline='') as outputFile:
                writer = csv.writer(outputFile)
                header_written = False

                for file in base_info:
                    file_attr = getMetaDataAtt(file)
                    if not header_written:
                        writer.writerow(list(file_attr))
                        header_written = True
                    # Every file gets a data row, including the first one.
                    # str() keeps missing values visible as "None".
                    writer.writerow([str(value) for value in file_attr.values()])

Error opening and tokenizing .DS_Store
19168 EOC.txt
['ca', 'pa', 'washington , d.c.', 'pa', 'wv', 'wv']
10041 Appendix C EOCc Meridian Union (Janb2014)-pdf.txt
['ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'washington , d.c.']
ms
TX 19015 Attachment A ENT (7.2.18).txt
['texas', 'texas', 'texas', 'texas', 'texas']
texas
19223 Attach B (01-01-18).txt
[]
TX 17404 EOC Regional (7.2.18).txt
['texas', 'texas', 'texas', 'texas', 'texas']
texas
01248 (SI) Eff. 10-1-14.txt
[]
1036 - SI (Eff. 7-1-09).txt
[]
19168 Contract-pdf.txt
['ca', 'washington , d.c.', 'wv', 'wv', 'wv']
wv
1036 Sched I 7-1-09-pdf.txt
[]
TX 17404 EOC (Regional (7.2.18)-pdf.txt
['ca', 'texas', 'texas', 'texas', 'texas', 'texas']
texas
01094 - SI - Eff 7-1-17 to 6-30-19-pdf.txt
[]
19223 PPO EBB (01-01-18)-pdf.txt
['ca', 'washington , d.c.', 'ca']
ca
01255 (Schedule I) January 1 2014.txt
[]
TX-19278 ASC-ENT (7.2.18).txt
['tx', 'ga', 'ga', 'ga', 'washington , d.c.', 'ga', 'texas', 'texas', 'texas', 'texas', 'texas']
te

['ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'washington , d.c.']
ms
19168 EOC-pdf.txt
['ca', 'washington , d.c.', 'wv', 'wv', 'ca']
1036 - R30 (Eff. 7-1-09).txt
[]
10041 Appendix B EOC Non-Union (Jan2018).txt
['ms', 'ms', 'ms', 'ms', 'ms', 'ms', 'washington , d.c.']
ms
10294-01002 & 02001 PPO EOC (07-01-12).txt
['ca', 'ca', 'ca', 'washington , d.c.', 'ca']
ca
19380 - JACKSON CHILDREN'S SERVICES INC - CONTRACT - Eff 7-1-18 to 6-30-19.txt
[]


## Not done attributes

In [None]:
## Group Information

# Placeholder frame built from the generic key/value dict `d`.
# NOTE(review): in the visible cells `d` is only assigned at module level in
# the D&P cell further down -- confirm it is defined before this cell runs.
dfGI=pd.DataFrame(d, index=['uid'])

In [None]:
## Diagnostic and Preventative (D&P) [Appendix A]

# Hard-coded sample value for the D&P services attribute.
d = {'key': 'D&P Services_PPO', 'value': '100'}
# Both dict values are scalars (strings), so pandas needs an explicit index.
# The row label now reads "D&P" like the key and the heading (was "D%P").
dfDP = pd.DataFrame(d, index=['D&P Services'])


In [None]:
## Basic Service

# Placeholder: presumably still reuses the D&P dict `d` until real
# Basic Service values are parsed -- TODO confirm.
dfBS=pd.DataFrame(d, index=['Basic Service'])

In [None]:
## Endo Perio (Endodontics (Periodontal(?)))

# Placeholder: presumably still reuses the D&P dict `d` until real
# Endo Perio values are parsed -- TODO confirm.
dfEP=pd.DataFrame(d, index=['Endo Perio'])

In [None]:
## Oral (Oral Surgery)

# Placeholder: presumably still reuses the D&P dict `d`; the row label
# 'uid' is a stand-in as well -- TODO confirm.
dfOa=pd.DataFrame(d, index=['uid'])

In [None]:
## Perio (Periodontal)

# Placeholder: presumably still reuses the D&P dict `d`; the row label
# 'uid' is a stand-in as well -- TODO confirm.
dfPe=pd.DataFrame(d, index=['uid'])

In [None]:
## Major (Major Benefits)

# Placeholder: presumably still reuses the D&P dict `d`; the row label
# 'uid' is a stand-in as well -- TODO confirm.
dfMj=pd.DataFrame(d, index=['uid'])

In [None]:
## Prostho (Prosthodontics)

# Placeholder: presumably still reuses the D&P dict `d`; the row label
# 'uid' is a stand-in as well -- TODO confirm.
dfPr=pd.DataFrame(d, index=['uid'])

In [None]:
## Ortho (Orthodontics)

# Placeholder: presumably still reuses the D&P dict `d`; the row label
# 'uid' is a stand-in as well -- TODO confirm.
dfOt=pd.DataFrame(d, index=['uid'])

In [None]:
#Concatenate all frames created by the above dataset

# NOTE(review): `df` is not assigned at module level in any visible cell
# (only in commented-out code inside getMetaDataAtt) -- confirm it exists
# before running this cell.
frames = [df, dfGI, dfDP, dfBS, dfEP, dfOa, dfPe, dfMj, dfPr, dfOt]

# Stack the per-section frames on top of each other, then flip the result so
# the sections become columns for printing.
result = pd.concat(frames)
transpose=result.transpose()
print(transpose)

## Notes

### Contract Start Date

For the contract start date I began by searching through the tokenized sentences with a regular expression.
I found a datefinder module to use on each flagged sentence to pull out the dates
	Issue: the datefinder module works poorly on large, run-on sentences, which are common in the contracts. It tends to find other numbers that aren't dates and tries to make a date out of them.
	
	Sol: only take a subset, starting at the flagged word
	
Sometimes a match isn't found with the keywords I've seen related to the start date
	Sol: look for keywords related to contract term and take the earlier date from that sentence
	
Issue: Datefinder focusing on numbers that aren't dates
	Sol: keep only sentences that contain a year (i.e. four digits in a row) and discard dates from before Delta Dental existed (before 1966)
	
Issue: Sometimes there are multiple modes. Usually I saw this when there were equal mentions of the end date
	Sol: if there are multiple modes, take the earliest date. 
	



### Testing finding the contract end date

The issues here were much the same as for the contract start date:

	Looking for keywords Contract Term/Contract End
	Filtering on invalid years
	Filtering on if there IS a year in the sentence (assuming it won't be written out like nineteen ninety-four)
	If there are only two results, take the later one
	If there are more than two, take the two most common and then take the later of the two


### Contract Duration:

Call contract start and end and try to get a duration out of them

 ^^^ pretty much worked


### Comments so far:


	Even with trying to be variable, this won't work if they even change the wording a bit. Maybe spend some time looking into using the libraries to get synonyms.
	
	It would also be great to get the other contracts to see exactly where we're going wrong
	


### Working with synonyms in NLTK

	Getting an expanded set of search terms can be done, but I can't yet figure out how to pick the right contexts. For example, the search bigram "contract term" gives back a huge amount of synonyms, with only 3 or 4 actually being equivalent in meaning to "contract term"
	
	We could always manually select ones that are similar but that seems to defeat the purpose: ie we could select only the noun meanings of contract
	
	We could build our own corpus of words based on all of the documents that we have, and then compare the given synonyms to a freq distribution of those words to pick out the ways other contracts might say the same thing
		But this would still miss things for sure
		
	We could include the word type with the seed words and only choose synonyms of the same type, although this does involve more hardcoding


### Folder Format

data/raw

data/processed/[group number - group name]/

data/output

*** Not every file seems to have a name, so we would have to parse the file to get it

*** Additionally each num - name combo may have contracts from multiple dates


### Using NER tagging to identify location sections

#### NLTK NER tagging
Basic NER tagging with nltk works horribly on our files out of the box

#### Polyglot
Polyglot has a lot of issues getting downloaded

#### NLTK wrapper for Stanford NER
NLTK has a wrapper for the Stanford NER tagger so I'm going to try that next
	Download the model jar file
	
	
The stanford NER tagger is working a bit better
http://www.nltk.org/api/nltk.tag.html#nltk.tag.stanford.StanfordTagger
https://nlp.stanford.edu/software/stanford-ner-2018-02-27.zip (download of jar files)
https://textminingonline.com/how-to-use-stanford-named-entity-recognizer-ner-in-python-nltk-and-other-programming-languages

The stanford one takes FOREVER though
	There is a faster version in CoreNLP but that's all in Java and I don't think the wrapper interacts with it

#### GeoText
GeoText

Easy to set up and use, but doesn't do states or state abbreviations
And it misses a LOT that the NER tagger got
	It is completely unreliable honestly. 

#### Options
Option 1: Use the NLTK wrapper for Stanford NER tagger and just wait forever
Option 2: Get a giant csv of all US cities/states/abbreviations/counties (exists) and make a data set out of that to compare to
	Cons: not flexible or extensible, is already 4 years out of date
	Pros: much faster, will only have to create the thing once
	
Note: even with the NER it will only give us pieces of the address, we would still have to go into the sentence and try to regex it out

Option 3: create our own trained model from some of the files we already have and see how that does with the Stanford tagger. Might be faster
	Could also see how it does with the native NLTK tagger


### Location Function Issues

Trying to Regex out a full address proves difficult

You can kind of get it down to the right sentence by looking for ones that contain 'contractholder' and avoiding ones that contain 'deltadental'

But it's still not perfect

Tabling getting the entire address for now, I'm looking at getting the contractholder state and city

Getting the state so far works okay, but there are some strange cities out there that cause issues. For example, DPO is apparently a city in the US, as is Premium. These words show up often enough in other contexts that they interfere with trying to find the most frequently mentioned ACTUAL city. Even filtering on the above keywords comes back with Premium as the city.

Honestly, without a bit of work the city is completely unreliable

Running into issues with filtering by keyword because the keyword is often cut off from parts of the sentence containing the location information by punctuation within the address itself


Issues with a lot of false positives. I think it would be easier to find the location of the Delta office handling it instead of the client address, which doesn't seem to be clearly marked anywhere




### Docx vs Doc issues

#### Right now we are using a docx specific library

#### They gave us a bunch of doc files, docx2txt can't handle those

#### Catdoc software and the python subprocess module
       Catdoc does NOT pay attention to formatting, so that could get messy. it simply looks for readable text and extracts it in the order it finds it
       Catdoc works natively on Mac but not windows
       https://blog.brush.co.nz/2009/09/catdoc-windows/ is a package for Windows...but just from some random guy
       

#### Antiword
        Linux specific, you can get packages for both Mac and Windows...the windows one looks especially iffy
        Also just the fact that we would need to have a separate setup is not very desirable. 
        
#### Textutil
        Can be used on Mac pretty easily with python subprocess
        
        
       


### Mac vs Windows Issues

#### Running on windows

#### Before going through any of this, ensure your windows parallels setup is done from the OneNote directions!
#### i.e., make sure Anaconda is installed :| 

        when importing modules using pip install
                Instead of "pip install pdfminer" use "pip install pdfminer.six"
        when installing datefinder
                It won't work and will tell you that you need Visual Studio 2015
                Instead, download the src code from https://github.com/akoumjian/datefinder
                Open file: setup.py and look for line 
                    install_requires=['regex==2016.01.10', 'python-dateutil>=2.4.2', 'pytz'],
                and change the first == to >=
                    install_requires=['regex>=2016.01.10', 'python-dateutil>=2.4.2', 'pytz'],
                save and then run "pip install ./pathToDatefinderSrc"
                
                see: https://stackoverflow.com/questions/44016287/error-in-pip-install-datefinder?noredirect=1&lq=1
              May also need to run pip install --upgrade setuptools
              
         Some things don't get installed automatically:
                 in python window run : import nltk 
                                        nltk.download('punkt')
                                        nltk.download('stopwords')
                                        nltk.download('averaged_perceptron_tagger')
                                        nltk.download('wordnet')
              
              