# Exploring and Processing

### Import Statements

In [246]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
import random as rng
import nltk
import fnmatch

import docx2txt
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

from nltk.book import *
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *
from nltk.util import ngrams
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
import sys as sys
from sys import platform
import re as re
from statistics import mode
import datefinder


## Psuedocode

In [82]:
### Analysis Function

## Function which calculates the number of combinations (Not Permutations) of distinct values in a dataset. Would 
## take in a Pandas DataFrame ideally and multiply all distinct value counts for all available columns.

# Expected input: Pandas DataFrame
# Expected output: Int, the number of possible combinations.

In [83]:
### Parsing Function

## Function to collect mentions of States/Districts/Territories in the United States (DC and Puerto Rico are included)
## and rank their relevence, frequency should be a good factor. Note: California, Pennsylvania, and Georgia could 
## appear multiple times from addresses.

# Expected input: A single file, text.
# Expected output: A list of locations mentioned, ranked by most used to least used.

In [84]:
### Parsing Function

## Function for searching a tokenized sentence dictionary for words or phrases. There's probably 
## already something for this.

# Expected input: A tokenized list, and a phrase to search for.
# Expected output: A list of sentences containing the phrase searched.

In [85]:
### Parsing Function

## Function to take a file and produce the Acronyms from them.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, most likely first.

In [86]:
### Parsing Function

## Function to take Acronyms and match them to the best possible N-gram for them from a document. Not all will be 
## possible, but a list of options will help.

# Expected Input: Text File to be parsed, the Acronym to search
# Expected output: A list of possible phrases, sorted most likely first.

In [87]:
### Generally important

## Need to expand stopwords to include States and Districts, also Delta Dental adjacent names as another available set.

In [88]:
## Function to load in two different text extracted contracts and return a comparison metric between 
## the two (Similarity, possibly as a percent?)

# Expected Input: Two contracts for Comparison
# Expected Output: A measure, some kind of decimal to represent similarity.

## File Prep Functions

### Function: Is it docx or pdf?

In [89]:
def checkFileType(filename):
        if(filename.lower().endswith(('.doc','.docx'))):
            return 0
        elif(filename.lower().endswith(('.pdf'))):
            return 1
        else:
            return -1

### Function: read in file

In [90]:
def makeFilePath(docName):
    raw_data_path = os.path.join(os.getcwd(), 'data', 'raw')
    return os.path.join(raw_data_path, docName)

### Function: clean up text

In [536]:
def cleanText(text):
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    #text = text.replace(",", ' ')
    
    dblSpacesRemaining = True
    while(dblSpacesRemaining):
        text = text.replace("  ", " ")
        if not "  " in text:
            dblSpacesRemaining = False
    return text

### Function: process dataFrame and group

In [92]:
def processDF(txtFile):
    df = pd.read_csv(txtFile, sep=" ", header=None) #this doesn't work for me bc of variable number of cols
    df = df.T 
    df = df.dropna()

    df['SingleRow']=1

    df=df.rename(columns={0 : 'Words'})
    print("in processDF " + txtFile)
    df.describe(include="all")
    #print(df.groupby('Words').SingleRow.sum().sort_values())
    #print(df)
    return df

### Function: Process a text file

In [395]:
def processTextFile(filePath):
    #print(filePath)
    docxText = docx2txt.process(filePath)
    #print(docxText)
    replacedText = cleanText(docxText)
    #print(replacedText)
    fileName = filePath.split('/')[-1]
    #print(fileName)
    baseFileName = fileName[0:-5]
    #print(baseFileName)
    newFilePath = './data/output/' + baseFileName + ".txt"
    #print(newFilePath)
    singleFileDocx=open(newFilePath, 'wb+')    
    singleFileDocx.write(replacedText.encode("utf-8"))
    singleFileDocx.close()

    #temp_df = processDF('singleTextDocx.txt')
    return newFilePath

### Function: Process pdf file

In [497]:
def processPDFfile(filePath):
        password = ""
        extracted_text = ""

        fileName = filePath.split('/')[-1]
        baseFileName = fileName[0:-4]
    
        fp = open(filePath, "rb")
        parser = PDFParser(fp)
        document = PDFDocument(parser, password)
        
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
            
        # Create PDFResourceManager object that stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()

        # set parameters for analysis
        laparams = LAParams()

        # Create a PDFDevice object which translates interpreted information into desired format
        # Device needs to be connected to resource manager to store shared resources
        # device = PDFDevice(rsrcmgr)
        # Extract the decive to page aggregator to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create interpreter object to process page content from PDFDocument
        # Interpreter needs to be connected to resource manager for shared resources and device 
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Ok now that we have everything to process a pdf document, lets process it page by page
        for page in PDFPage.create_pages(document):
            # As the interpreter processes the page stored in PDFDocument object
            interpreter.process_page(page)
            # The device renders the layout from interpreter
            layout = device.get_result()
            # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    newText = lt_obj.get_text()
                    newText = newText.replace('\n', ' ')
                    extracted_text += newText

        #close the pdf file
        fp.close()
        
        extracted_text = cleanText(extracted_text)#extracted_text.replace("\n", ' ')
        
        newFilePath = './data/output/' + baseFileName + '-pdf' + ".txt"
        with open(newFilePath, 'wb+') as singleFilePDF:
            singleFilePDF.write(extracted_text.encode("utf-8"))
        
        
        #temp_df = processDF('./data/output/' + baseFileName + ".txt")
        return newFilePath

## Docx Extraction

DocX extractor for data purposes. Requires customization to each purpose but is important for pulling data out of DocX files. Does not understand tables or bullet points, however is visually consistent with what's on the page.

Strongest values: Order, consistentcy, noise reduction

Weakest values: Completeness, flexibility, whitespace characters, formatting

#### Docx Practice

In [62]:
import docx2txt


def cleanText(text):
    text = text.replace("\n", ' ')
    text = text.replace("\t", ' ')
    text = text.replace(",", ' ')
    
    dblSpacesRemaining = True
    while(dblSpacesRemaining):
        text = text.replace("  ", " ")
        if not "  " in text:
            dblSpacesRemaining = False
        
    return text
fileName = "TX 17404 Contract Regional (7.2.18).docx"
baseFileName = ""
if(fileName.lower().endswith(('.docx'))):
    baseFileName = fileName[0:-5]
    #print(baseFileName)
elif(fileName.lower().endswith(('.pdf'))):
    baseFileName = fileName[0:-4]
else:
    print("ending error")

docText = docx2txt.process("./data/raw/" + fileName)
singleFileDocx=open('./data/output/' + baseFileName + ".txt", 'wb+')
replacedText = cleanText(docText)
#print(replacedText)
singleFileDocx.write(docText.encode("utf-8"))
singleFileDocx.close()


## PDF Extraction

#### PDF practice

In [22]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import sklearn as skl
# pdfTextMiner.py
# Python 2.7.6
# For Python 3.x use pdfminer3k module
# This link has useful information on components of the program
# https://euske.github.io/pdfminer/programming.html
# http://denis.papathanasiou.org/posts/2010.08.04.post.html


''' Important classes to remember
PDFParser - fetches data from pdf file
PDFDocument - stores data parsed by PDFParser
PDFPageInterpreter - processes page contents from PDFDocument
PDFDevice - translates processed information from PDFPageInterpreter to whatever you need
PDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice
LAParams - A layout analyzer returns a LTPage object for each page in the PDF document
PDFPageAggregator - Extract the decive to page aggregator to get LT object elements
'''

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

''' This is what we are trying to do:
1) Transfer information from PDF file to PDF document object. This is done using parser
2) Open the PDF file
3) Parse the file using PDFParser object
4) Assign the parsed content to PDFDocument object
5) Now the information in this PDFDocumet object has to be processed. For this we need
   PDFPageInterpreter, PDFDevice and PDFResourceManager
 6) Finally process the file page by page 
'''

base_path = "C://data"

my_file = os.path.join(base_path + "/" + "test.pdf")
log_file = os.path.join(base_path + "/" + "pdf_log.txt")

password = ""
extracted_text = ""

# Open and read the pdf file in binary mode
fp = open(my_file, "rb")

# Create parser object to parse the pdf content
parser = PDFParser(fp)

# Store the parsed content in PDFDocument object
document = PDFDocument(parser, password)

# Check if document is extractable, if not abort
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
    
# Create PDFResourceManager object that stores shared resources such as fonts or images
rsrcmgr = PDFResourceManager()

# set parameters for analysis
laparams = LAParams()

# Create a PDFDevice object which translates interpreted information into desired format
# Device needs to be connected to resource manager to store shared resources
# device = PDFDevice(rsrcmgr)
# Extract the decive to page aggregator to get LT object elements
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create interpreter object to process page content from PDFDocument
# Interpreter needs to be connected to resource manager for shared resources and device 
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Ok now that we have everything to process a pdf document, lets process it page by page
for page in PDFPage.create_pages(document):
    # As the interpreter processes the page stored in PDFDocument object
    interpreter.process_page(page)
    # The device renders the layout from interpreter
    layout = device.get_result()
    # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
    for lt_obj in layout:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            extracted_text += lt_obj.get_text()
            
#close the pdf file
fp.close()

# print (extracted_text.encode("utf-8"))
            
with open(log_file, "wb") as my_log:
    my_log.write(extracted_text.encode("utf-8"))
print("Done !!")


FileNotFoundError: [Errno 2] No such file or directory: 'C://data/test.pdf'

## NLTK Tutorial

### Function: DD specific text cleaning

In [95]:
def ddCleanText(text):
    newText = text.replace('Delta Dental', 'DeltaDental')
    newText = newText.replace('DELTA DENTAL', 'DELTADENTAL')
    newText = newText.replace('DeltaDental Insurance Company', 'DeltaDentalInsuranceCompany')
    return newText



### Function: Make tokenized word list

In [96]:
def getTokens(text):
    words = word_tokenize(text)
    
    customStopWords = set(stopwords.words('english')+list(punctuation))
    wordsWOStop=[word for word in words if word not in customStopWords]
    
    return wordsWOStop

### Function: Make tokenized Sentence list

In [115]:
def getSents(text):
    sents = sent_tokenize(text)

    return sents

### Function: Get Bigrams

In [98]:
def getBigrams(tokens):
    bigram_measures=nltk.collocations.BigramAssocMeasures();
    finder = BigramCollocationFinder.from_words(tokens)
    sorted_bgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))
    
    return sorted_bgs

### Function: Get Trigrams

In [99]:
def getTrigrams(tokens):
    trigram_measures =nltk.collocations.TrigramAssocMeasures();
    finder = TrigramCollocationFinder.from_words(tokens)
    sorted_tgs = (sorted(list(finder.ngram_fd.items()), key=lambda item: item[-1], reverse=True))

    return sorted_tgs

## workspace: Set up text/tokens to search

In [592]:
filePath = "./data/raw/TX-18745 ASC Contract (7.2.18).pdf"
processedTextPath = ""

if(checkFileType(filePath) == 0):
    processedTextPath = processTextFile(filePath)
elif(checkFileType(filePath) == 1):
    processedTextPath = processPDFfile(filePath)
else:
    raise TypeError("File type incorrect") 

print(processedTextPath)
with open(processedTextPath, 'r') as txtFile:
            text = txtFile.read()


text = ddCleanText(text)
wordTokens = getTokens(text)
sentTokens = getSents(text)
print(sentTokens)
bgs = getBigrams(wordTokens)
tgs = getTrigrams(wordTokens)


./data/output/TX-18745 ASC Contract (7.2.18)-pdf.txt
['DENTAL\xa0ADMINISTRATIVE\xa0SERVICES\xa0CONTRACT\xa0 \xa0 This\xa0Contract\xa0is\xa0made\xa0and\xa0entered\xa0into\xa0as\xa0of\xa0the\xa01st\xa0day\xa0of\xa0April,\xa02017\xa0(Effecitve\xa0Date)\xa0by\xa0and\xa0between\xa0Coca‐Cola\xa0 Southwest\xa0Beverages\xa0LLC\xa0(Contractholder)\xa0and\xa0Delta\xa0Dental\xa0Insurance\xa0Company\xa0(hereinafter\xa0referred\xa0to\xa0as\xa0 Delta\xa0Dental).', 'Whereas,\xa0Contractholder\xa0has\xa0adopted\xa0an\xa0employee\xa0dental\xa0benefit\xa0plan\xa0(the\xa0Plan),\xa0which\xa0is\xa0set\xa0forth\xa0in\xa0the\xa0 employee\xa0benefit\xa0booklet,\xa0as\xa0shown\xa0in\xa0Section\xa07,\xa0mutually\xa0agreed\xa0upon\xa0by\xa0Contractholder\xa0and\xa0Delta\xa0Dental,\xa0and\xa0 for\xa0which\xa0Contractholder\xa0retains\xa0all\xa0liabilities;\xa0 \xa0 Whereas,\xa0Contractholder\xa0has\xa0requested\xa0Delta\xa0Dental\xa0to\xa0provide\xa0certain\xa0administrative\xa0services\xa0to\xa0the\xa0Plan\xa0an

## Attributes

#### Filename
Contract Start
Contract End
Contract Duration
State
Delta Office Involved

#### Group Information
(Group Number)

#### Numeric attributes Only
Basics
Diagnostics
Major
Endo
Oral
Perio
Prostho
Ortho

In [125]:
##Establish a dataframe to capture the attributes
#d = {'key': 'file','value':fileName}
#{'key':'state', 'value':state}
#df = pd.DataFrame(d, index=['uid'])
d = {}

### Metadata

#### get file name

In [397]:
def getFileName(fullPath):
    if(platform == "win32"):
        fileName = fullPath.split("\\")[-1]
    else:
        fileName = fullPath.split("/")[-1]
    return fileName

In [398]:
print(getFileName(filePath))

TX 19015 Attachment A ENT (7.2.18).docx


#### get group number

In [763]:
def getGroupNumber(sents_tokens, filePath):
    
    group_keywords = ["group number", "groupnumber"]
    regex_exps = []
    poss_nums = []
    finalGN = None
    
    #Create regex exps out of group number keywords
    for kw in group_keywords:
            temp_re = kw + "\W\s*(?P<gn>\d+)"
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
    #For each sentence, search for the expression, if found add the number to
    #list of possible group numbers
    for sent in sents_tokens:
        #print(sent)
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                temp_gn = result.group('gn')
                poss_nums.append(temp_gn)
    
    #Try and get the number from the file name, looking for list of numeric chars
    num_regex = re.compile("\d+")
    fileName = getFileName(filePath)
    fileGN = num_regex.search(fileName)
    
    if not fileGN==None:#if they filename has a number sequence
        if fileGN.group() in poss_nums:#then if the file group number matches one in the document, choose it
            finalGN = fileGN.group()
        else:
            poss_nums.append(fileGN.group())#otherwise add the filename one to the list and try to get the most co
            try:
                finalGN = mode(poss_nums)
            except:
                print("Unexpected error: Cannot determine group number of file: " + filePath)
    
    return finalGN

In [764]:
gn = getGroupNumber(sentTokens)
print(gn)

TypeError: getGroupNumber() missing 1 required positional argument: 'filePath'

In [110]:
## Group Information

dfGI=pd.DataFrame(d, index=['uid'])

#### get contract start

In [737]:
def oldversion_getContractStart(sents_tokens):
    start_keywords = ["effective date", "effective"]
    regex_exps = []
    poss_dates = set()
    finalDate = ""
    
    testString = ""
    for kw in start_keywords:
            temp_re = kw + "\S\s*\S"
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    #sents_tokens.append("Effective Date:   10/1/2014  Contract term: October 1st, 2014 to September 2nd, 2019")
    
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:10]
                subset2 = word_tokenize(sent[sent.lower().find(result.group()):])[-10:]
                subset = " ".join(subset)
                subset2 = " ".join(subset2)
                poss_dates.add(subset)
                poss_dates.add(subset2)
                
    matches = set()
    for sent in poss_dates:
        #print(sent)
        m = datefinder.find_dates(sent)
        for match in m:
            matches.add(match)
    #print(matches)
    
    try:
        finalDate = mode(matches)
    except:
        print("Unexpected error: Cannot determine contract start date of file ")
    
  
    return finalDate

In [753]:
def getContractStart(sents_tokens):
    start_keywords = ["effective date\S\s*\S", "effective"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:20]
                subset = " ".join(subset)
                poss_dates.append(subset)

                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                    
                    
## Second pass through sentences with possible dates to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
## Last pass: try to find the most common date. If there is more than one mode, choose the earliest date
##.           this seems to occur when it is finding the contract start and end in equal quantities
    #print(matches)
    try:
        finalDate = mode(matches)
    except ValueError as e:
        #print(str(e))
        if matches:
            earliestMatch = matches[0]
            for match in matches:
                if(match < earliestMatch):
                    earliestMatch = match
            finalDate = earliestMatch
    except Exception as e:
        print(str(e))
  
    return finalDate

####  get Contract End

In [751]:
def getContractEnd(sents_tokens):
    
    start_keywords = ["contract term\S\s*\S", "contract term ", "contract end"]
    regex_exps = []
    poss_dates = []
    finalDate = ""
    matches = []
    
    for kw in start_keywords:
            temp_re = kw
            regex = re.compile(temp_re)
            regex_exps.append(regex)
    
## Original pass through sentence tokens to find possible dates based on keywords
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                subset = word_tokenize(sent[sent.lower().find(result.group()):])[:25]
                subset = " ".join(subset)
                poss_dates.append(subset)
                
                
## Second pass through sentence tokens to find possible dates based on a date range format
    regex_exps = []
    backup_kw = ["\S\sthrough\s\S","\S\sthru\s\S"]
    
    for kw in backup_kw:
        temp_re = kw
        regex = re.compile(temp_re)
        regex_exps.append(regex)
        
    for sent in sents_tokens:
        for my_regex in regex_exps:
            result = my_regex.search(sent.lower())
            if not result==None:
                half_1 = sent[sent.lower().find(result.group()):]
                half_2 = sent[:sent.lower().find(result.group())]
                
                subset_1 = " ".join(word_tokenize(half_1)[:6])
                subset_2 = " ".join(word_tokenize(half_2)[-6:])
                subset = subset_2 + subset_1
                
                m = datefinder.find_dates(subset)
                temp_matches = []
                for match in m:
                    if match.year >= 1966:
                        temp_matches.append(subset)
                if len(temp_matches)>=2:
                    poss_dates.append(subset)
                    #print(subset)
                
                
## Pass through sentences with possible dates to eliminate ones without a year or with an invalid year
## These are likely other values flagged incorrectly as dates by the datefinder
## 1966 is the year Delta Dental was created
    for sent in poss_dates:
        #print(sent)
        find_year_re = re.compile("\d\d\d\d")
        year = find_year_re.search(sent)
        
        if not year==None:
            m = datefinder.find_dates(sent)
            for match in m:
                if match.year >= 1966:
                    matches.append(match)
                    
   
    #print(matches)
    
### If there are exactly two matches, try to find a max. If error b/c they're the same, choose one
    if(len(matches) == 2):
        try:
            finalDate = max(matches)
        except ValueError as e:
            finalDate = matches[0]
        except Exception as e:
            print(str(e))

## If there are more, try and find the top two most mentioned and take the later. else just take the latest            
    elif(len(matches) > 2):
        
        try:
            date1 = mode(matches)
            matches.remove(date1)
            
            date2 = mode(matches)
            matches.remove(date2)
            
            finalDate = max([date1, date2])
        except ValueError as e:
            #print(str(e))
            if matches:
                latestMatch = matches[0]
                for match in matches:
                    if(match > latestMatch):
                        latestMatch = match
                finalDate = latestMatch
        except Exception as e:
            print(str(e))
    #else:
        #print("could not find contract end for file")
    
    #print("\n")
    return finalDate

#### get Contract Duration

In [727]:
def getContractDuration(sents_tokens):
    start = None
    end = None
    duration = None
    
    try:
        start = getContractStart(sents_tokens)
    except Exception as e:
        print("Can't find contract start")
        return
    
    try:
        end = getContractEnd(sents_tokens)
    except Exception as e:
        print("Can't find contract end")
        return
    if(start and end):
        duration = end - start

    return duration

#### get State/Location:

In [679]:
def getContractLocation(sents_tokens):
   
    return

#### Delta Office Involved

In [680]:
def getDeltaOffice():
    
    return

### Batch Run to get attributes

#### Function: batch pre process: fill output folder

In [411]:
def batchPreProcess():
    cwd = os.getcwd()
    print(cwd)
    processedTextPath = ""
    
    dataPath = os.path.join(cwd, "data/raw")

    if(os.path.isdir(dataPath)):

        for file in os.listdir(dataPath):
            filepath = os.path.join(dataPath, file)
            if(os.path.isfile(filepath)):
                #print(file)
                try:
                    if(checkFileType(filepath) == 0):
                        processedTextPath = processTextFile(filepath)
                    elif(checkFileType(filepath) == 1):
                        processedTextPath = processPDFfile(filepath)
                    else:
                        raise TypeError('This path does not lead to a valid file type!')                     
                except Exception as e:
                    print(str(e))

    else:
        print("data/raw doesn't exist")

#### Function: Batch return token and bigram sets for all output files

In [505]:
def batchGetTokens():
    all_tokens = []
    cwd = os.getcwd()
    processedTextPath = ""
    
    dataPath = os.path.join(cwd, "data/output")
    print(dataPath)
    
    if(os.path.isdir(dataPath)):

        for file in os.listdir(dataPath):
            filepath = os.path.join(dataPath, file)
            if(os.path.isfile(filepath)):
                try:
                    #print(file)

                    temp_obj = {}

                    with open(filepath, 'r') as txtFile:
                        text = txtFile.read()

                    temp_obj['filepath'] = filepath

                    text = ddCleanText(text)
                    temp_obj['cleanText'] = text

                    wordTokens = getTokens(text)
                    sentTokens = getSents(text)
                    temp_obj['wordTokens'] = wordTokens
                    temp_obj['sentTokens'] = sentTokens

                    bgs = getBigrams(wordTokens)
                    tgs = getTrigrams(wordTokens)
                    temp_obj['bgs'] = bgs
                    temp_obj['tgs'] = tgs

                    txtFile.close()
                    all_tokens.append(temp_obj)
                except Exception as e:
                    print("Error opening and tokenizing " + file)
                    #print(str(e))

    else:
        print("data/output doesn't exist")
    return all_tokens

In [579]:
## MetaData DataFrame Creation
## Key                       -     Value
## Column that we're pulling -    Number or field that we're pulling

batchPreProcess()

/Users/sydneyknox/Documents/data-insights/jupyter-pseudocode
This path does not lead to a valid file type!


#### Function: get metadata attributes (for single file...batch use the functions)

In [765]:
def getMetaDataAtt(file_info):
    #print(file_info)
    fileName = getFileName(file_info['filepath'])
    print(fileName)
    
    groupNumber = getGroupNumber(file_info['sentTokens'], file_info['filepath'])
    if(groupNumber):
        print("group number: " , groupNumber)
    
    contractStartDate = getContractStart(file_info['sentTokens'])
    if(contractStartDate):
        print("start: " , contractStartDate)
    
    contractEndDate = getContractEnd(file_info['sentTokens'])
    if(contractEndDate):
        print("end: " , contractEndDate)
    
    contractDuration = getContractDuration(file_info['sentTokens'])
    if(contractDuration):
        print("duration: " , contractDuration)
    print("\n")
    
   ## d={'key':'filename', 'value':fileName}
   ## dfMD = pd.DataFrame(d, index=['MetaData'])
    
   ## df=pd.DataFrame({'key':'group_number','value':groupNumber}, index=['MetaData'])
   ## dfMD = pd.concat([dfMD, df])
    
   ## df=pd.DataFrame({'key':'contract_start_date','value':contractStartDate}, index=['MetaData'])
   ## dfMD = pd.concat([dfMD, df])
    
    return dfMD

### Workspace

In [None]:
base_info = batchGetTokens()

In [766]:
for file in base_info:
    dfMD = getMetaDataAtt(file)
    
    #transpose=dfMD.transpose()
    #print(transpose)
    #print(dfMD)

TX 19015 Attachment A ENT (7.2.18).txt
group number:  19015
start:  2018-01-01 00:00:00


TX 17404 EOC Regional (7.2.18).txt
group number:  17404
start:  2017-10-01 00:00:00


TX 17404 EOC (Regional (7.2.18)-pdf.txt
group number:  17404
start:  2017-10-01 00:00:00


TX-19278 ASC-ENT (7.2.18).txt
group number:  19278
start:  2018-03-01 00:00:00
end:  2019-12-31 00:00:00
duration:  670 days, 0:00:00


test2.txt
group number:  2


TX 19015 Contract ENT (7.2.18).txt
group number:  19015


test2-pdf.txt
group number:  2


TX 17404 Contract Regional (7.2.18)-pdf.txt
group number:  17404
start:  2014-10-01 20:14:00
end:  2017-09-30 00:00:00
duration:  1094 days, 3:46:00


TX 19015 Attachment  C ENT (7.2.18).txt
group number:  19015
start:  2018-01-01 00:00:00
end:  2019-12-31 00:00:00
duration:  729 days, 0:00:00


TX 19015 Contract ENT (7.2.18)-pdf.txt
group number:  19015
start:  2018-01-01 00:00:00
end:  2019-12-31 00:00:00
duration:  729 days, 0:00:00


TX-18745-ASC Contract (7.2.18).txt


In [459]:
## Diagnostic and Preventative (D&P) [Appendix A]

d={'key':'D&P Services_PPO','value':'100'}
##Need to pass index since we're only doing string values
dfDP=pd.DataFrame(d, index=['D%P Services'])


In [321]:
## Basic Service

dfBS=pd.DataFrame(d, index=['Basic Service'])

In [123]:
## Endo Perio (Endodontics (Periodontal(?)))

dfEP=pd.DataFrame(d, index=['Endo Perio'])

In [113]:
## Oral (Oral Surgery)

dfOa=pd.DataFrame(d, index=['uid'])

In [114]:
## Perio (Periodontal)

dfPe=pd.DataFrame(d, index=['uid'])

In [115]:
## Major (Major Benefits)

dfMj=pd.DataFrame(d, index=['uid'])

In [116]:
## Prostho (Prosthodontics)

dfPr=pd.DataFrame(d, index=['uid'])

In [117]:
## Ortho (Orthodontics)

dfOt=pd.DataFrame(d, index=['uid'])

In [124]:
#Concatenate all frames created by the above dataset

frames = [df, dfGI, dfDP, dfBS, dfEP, dfOa, dfPe, dfMj, dfPr, dfOt]

result = pd.concat(frames)
transpose=result.transpose()
print(transpose)

                    uid      D%P Services     Basic Service        Endo Perio  \
key    D&P Services_PPO  D&P Services_PPO  D&P Services_PPO  D&P Services_PPO   
value               100               100               100               100   

                    uid               uid               uid               uid  \
key    D&P Services_PPO  D&P Services_PPO  D&P Services_PPO  D&P Services_PPO   
value               100               100               100               100   

                    uid  
key    D&P Services_PPO  
value               100  
