In [208]:
import pandas as pd, numpy as np
import os, glob, re, sys, time
sys.path.append("..")

from project.apps.bidinterpreter.doctools import DocTools
from tqdm.notebook import tqdm
from nltk import sent_tokenize, download

%matplotlib inline

### Setting up the "punkt" nltk punctuation tool for breaking text into a list of sentences.

This only needs to be done once per environment.

In [209]:
# Download NLTK's "punkt" package (the data used for splitting text into sentences).
download('punkt')

[nltk_data] Downloading package punkt to /home/dave/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [210]:
# !conda install -c conda-forge ipywidgets --yes

## Loading Text into DataFrame

Prior to running this notebook, the `matching_performance_stats` notebook must be run; it creates the "searchable" PDF version of each original PDF document.  This bit of code loads each file and records its index, name, and original text.  The end goal of this notebook is to produce a list of sentences with their corresponding documents.

In [211]:
dt = DocTools(use_django_paths = False)  # this is the new class for general document handling which may be expanded to more than just "PDFs"

pdf_directory = "processed/*.pdf.png.processed.pdf"
source_path   = f"{os.getcwd()}/processed"

# glob yields paths in arbitrary (filesystem-dependent) order.  Sort them so that
# document_index is assigned identically on every run and on every machine.
filtered = list(enumerate(sorted(glob.iglob(pdf_directory))))

# Extract the text and word-coordinate vocabulary from each processed PDF and
# collect one record per document.
docs = []
for index, doc in filtered:
    print(f"Processing ({index + 1}/{len(filtered)}) {doc}...")
    text, vocabulary = dt.pdf_to_text_coordinates(doc)
    docs.append(dict(
        document_index = index,
        document   = os.path.basename(doc), # get only the docname, independent of the path separator
        text       = text,
        vocabulary = vocabulary
    ))

# Build the frame once from the list of records (one row per document).
df = pd.DataFrame(docs)

setting use_django_paths False
Processing (1/338) processed/010 400 Divisadero SHAC LOI 09.10.18.pdf.png.processed.pdf...
Processing (2/338) processed/010 AvalonBay LOI - Gateway - 2019-11-7.pdf.png.processed.pdf...
Processing (3/338) processed/010 Cambridge Capital LOI 2016-04-07.pdf.png.processed.pdf...
Processing (4/338) processed/010 DR Horton LOI Crescent Heights 2017-11-1.pdf.png.processed.pdf...
Processing (5/338) processed/010 Dutchints LOI 2018-11-17.pdf.png.processed.pdf...
Processing (6/338) processed/010 Fairfield - Summit at Lime Ridge LOI 2018-4-26.pdf.png.processed.pdf...
Processing (7/338) processed/010 Fairfield LOI - 10-18-11.pdf.png.processed.pdf...
Processing (8/338) processed/010 Greystar LOI 2018-9-25.pdf.png.processed.pdf...
Processing (9/338) processed/010 JDA LOI 2013-4-25.pdf.png.processed.pdf...
Processing (10/338) processed/010 MCRT LOI_400 Divis final.pdf.png.processed.pdf...
Processing (11/338) processed/010 Pulte LOI 2016-2-12.pdf.png.processed.pdf...
Pro

### Convert Raw Text to Series of Sentences

In [212]:
def text_to_sentences(row):
    """Break one document row into a list of per-sentence records.

    `row` is indexed by the keys 'text', 'document_index' and 'document'.
    The text is split with `sent_tokenize`; every resulting sentence becomes a
    dict carrying the parent document's index and name, the sentence itself,
    and a NaN `target_class` placeholder to be filled in by labeling.
    """
    return [
        dict(
            document_index = row['document_index'],
            document       = row['document'],
            sentence       = piece,
            target_class   = np.nan,
        )
        for piece in sent_tokenize(row['text'])
    ]
# One list of sentence records per document row.
preformatted = df.apply(text_to_sentences, axis=1)

# Flatten the per-document lists into a single frame with one row per sentence.
df_sentences = pd.DataFrame(
    [record for document_records in preformatted for record in document_records]
)
df_sentences.index.name = "sentence_index"

### Check DataFrame

Each document will have multiple sentences.  Each row represents a sentence in a document.  The target_class is blank (NaN = not a number = NULL), and the data from this DataFrame is what will be exported to Google Docs for manual labeling of the "target_class" field.


#### Labeling
- For sentences that don't belong to a target class, no need to update them.
- Each document should have an entity of interest for:
   - Purchase price
   - Due Diligence 
   - Deposit
   - Others
-  If a sentence has a purchase price in it, we should label it accordingly.

In [213]:
# Display the sentence-level frame (one row per sentence, indexed by sentence_index).
df_sentences

Unnamed: 0_level_0,document_index,document,sentence,target_class
sentence_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,SUMMERHILL .,
1,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,.,
2,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,"COMMUNITIES OF DISTINCTION September 10, 2018 ...",
3,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,Our Experience SummerHill is based in Palo Alt...,
4,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,"Over the past 40 years, SummerHill has establi...",
...,...,...,...,...
19762,337,Waterton LOI 8-28-12.pdf.png.processed.pdf,If you have any questions or if I can be of fu...,
19763,337,Waterton LOI 8-28-12.pdf.png.processed.pdf,"Sincerely, Waterton Associates L.L.C.",
19764,337,Waterton LOI 8-28-12.pdf.png.processed.pdf,Mark H. Stem Senior Vice President.,
19765,337,Waterton LOI 8-28-12.pdf.png.processed.pdf,Acquisitions Accepted and agreed to this day o...,


### Write CSV Doc for Google docs

In [214]:
# Write every sentence row to a UTF-8 CSV so it can be uploaded for manual labeling.
df_sentences.to_csv("document_sentences.csv", encoding = "utf8")

### Tag Sentences Based on Keyword

In [215]:
# mask1 = df_sentences.sentence.str.contains("price|purchase")
# mask2 = df_sentences.sentence.str.contains("\$")

# df_sentences.loc[(mask1 & mask2), 'target_class'] = "purchase_price"

In [216]:
# df_sentences[(mask1 & mask2)].values

### Use Original Regexes for Matching

In [217]:
## Original regexes.
# Written as raw strings so that backslash sequences such as \$, \(, \. and \d reach the
# regex engine verbatim instead of being (invalid) Python string escapes.  The resulting
# pattern text is unchanged.  Composite patterns are built by concatenating smaller ones.

dollar_amount               = r"\$ {0,}[0-9]{1,3}(,[0-9]{3}){0,}"
first_dollaramt_insentence  = r"([^.])+" + dollar_amount
numberswrittenout           = r"((one|eleven|ten|two|twelve|twenty|three|thirteen|thirty|fourteen|forty|four|fifteen|fifty|five|sixteen|sixty|six|seventeen|seventy|seven|eighteen|eighty|eight|nineteen|ninety|nine|hundred|thousand|million|billion)[- ]{0,1})+"
time_period                 = r"(" + numberswrittenout + r"|\(?[0-9]{1,3}[) -])+((business|calendar)[ -])?(day|month|week|year)s?"
first_timeperiod_insentence = r"([^.])+" + time_period
monthsofyear_regex          = r"(january|jan\.{0,1}|february|feb\.{0,1}|march|mar\.{0,1}|april|apr\.{0,1}|may|june|jun\.{0,1}|july|jul\.{0,1}|august|aug\.{0,1}|september|sept\.{0,1}|october|oct\.{0,1}|november|nov\.{0,1}|december|dec\.{0,1})"
date_slashtype_regex        = r"(\d{1,2}/\d{1,2}/\d{4})"
date_writtenout_regex       = r"(" + monthsofyear_regex + r" \d{1,2}, \d{4}" + r")"
date_eithertype_regex       = r"(" + date_writtenout_regex + r"|" + date_slashtype_regex + r")"
first_date_or_timeperiod_insentence = r"([^.])+" + r"(" + time_period + r"|" + date_eithertype_regex + r")"
deposit_regex               = r"((initial|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth) )?deposit"
price_regex                 = r"((purchase) )?price"
dd_regex                    = r"(dd|due diligence|feasibility|inspection)( period)?"
closing_regex               = r"((closing)( (period|date))?|close of escrow)"

### Sentences per Document Count

Mainly, just want to make sure the original document count matches the original 338.  Looks good.

In [218]:
# Count sentences per document; the number of distinct documents should equal the number of files loaded.
df_sentences.document.value_counts()

Essex LOI 9-16-05.pdf.png.processed.pdf                                               361
Mission Pointe Essex LOI 7-10-06.pdf.png.processed.pdf                                303
LOI_Pacific Property_091605.pdf.png.processed.pdf                                     178
Essex LOI - 11.7.2012.pdf.png.processed.pdf                                           170
082 Wolff LOI 2015-11-5.pdf.png.processed.pdf                                         167
                                                                                     ... 
MCIP Villa Morada LOI.pdf.png.processed.pdf                                            16
Bridgecourt LOI  - Fairstead - 2017-9-14.pdf.png.processed.pdf                         14
IMT LOI - The Courtyards at 65th Street - 4.18.19.pdf.png.processed.pdf                10
032 KB Home VC LOI Alameda Point 8.22.16 COE @ InTract Start.pdf.png.processed.pdf      9
Related LOI 5-23-08.pdf.png.processed.pdf                                               8
Name: docu

## Labeling

### Setting Purchase Price
Matches all sentences matching purchase price regex pattern.

In [219]:
## This one works pretty well, keeping it.
# Flag sentences where "price" is followed, before any period, by a dollar amount.
# Groups are non-capturing: str.contains only needs a yes/no answer, and capturing
# groups make pandas emit a "pattern has match groups" UserWarning.
mask1 = df_sentences.sentence.str.contains(
    r"(?:purchase)?price[^.]+\$ {0,}[0-9]{1,3}(?:,[0-9]{3}){0,}",
    flags = re.IGNORECASE,
    regex = True,
)
df_sentences.loc[mask1, 'target_class'] = "purchase_price"

  return func(self, *args, **kwargs)


### Setting Deposit

In [220]:
# Flag sentences where "deposit" is followed, before any period, by a dollar sign.
# Raw string (the original was an f-string with no placeholders and an invalid "\$"
# escape) and a non-capturing group, so that pandas does not warn about match groups.
# The set of matching sentences is unchanged.
mask1 = df_sentences.sentence.str.contains(
    r"(?:initial|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth)?deposit[^.]+\$",
    flags = re.IGNORECASE,
    regex = True,
)
df_sentences.loc[mask1, 'target_class'] = "deposit"

### Setting Closing

In [221]:
# Flag sentences in which a closing term is followed, before any period, by a
# time period or a date; matching rows are labeled "closing".
mask1 = df_sentences["sentence"].str.contains(
    pat   = closing_regex + first_date_or_timeperiod_insentence,
    flags = re.IGNORECASE,
    regex = True,
)
df_sentences.loc[mask1, "target_class"] = "closing"

### Setting Due Diligence

In [223]:
# Flag sentences in which a due-diligence term is followed, before any period, by a
# time period or a date; matching rows are labeled "dd".
# NOTE(review): the bare "dd" alternative in dd_regex is not word-anchored, so with
# IGNORECASE it can also match inside ordinary words (e.g. "address") - worth checking
# against the false positives seen in the labeled output.
mask1 = df_sentences["sentence"].str.contains(
    pat   = dd_regex + first_date_or_timeperiod_insentence,
    flags = re.IGNORECASE,
    regex = True,
)
df_sentences.loc[mask1, "target_class"] = "dd"

  return func(self, *args, **kwargs)


### Documents Having Lots of False Positive Matches

In [242]:
# Number of labeled sentences per document, showing the 30 documents with the most labels.
df_sentences.loc[df_sentences['target_class'].notnull(), 'document'].value_counts().head(30)

032 SARES REGIS LOI 11-7-11.pdf.png.processed.pdf                          11
030 SARES REGIS LOI 10-18-11.pdf.png.processed.pdf                          9
LOI 400 Divusadero 10.19.16.pdf.png.processed.pdf                           8
012 Pulte LOI 2016-8-5.pdf.png.processed.pdf                                8
051 TrueLife LOI Crescent Heights 2017-11-22.pdf.png.processed.pdf          8
The Heights LOI_Pulte.pdf.png.processed.pdf                                 7
011 Pulte LOI 2016-2-27.pdf.png.processed.pdf                               7
LOI Caltrain Bldgs 4 and 1.pdf.png.processed.pdf                            7
081 Khov LOI Crescent Heights 2017-11-13.pdf.png.processed.pdf              7
030 MCRT LOI_400 Divis final_REDLINE 011420.pdf.png.processed.pdf           7
170217_Alameda Point 6_7_NWHM  LOI.pdf.png.processed.pdf                    7
082 Khov LOI Crescent Heights 2017-11-27.pdf.png.processed.pdf              7
010 Pulte LOI 2016-2-12.pdf.png.processed.pdf                   

In [243]:
# Labeled-sentence count per document.
subset = df_sentences.loc[df_sentences['target_class'].notnull(), 'document'].value_counts()

# Export every sentence of the documents that have more than 4 labeled sentences.
df_sentences[df_sentences['document'].isin(subset[subset > 4].index)].to_csv(
    "documents_with_complex_chunk_varients.csv",
    encoding = "utf8",
)

In [248]:
# Number of entries in `subset` whose count is greater than 4.
(subset > 4).sum()

72

In [249]:
# Display the document-level frame built from the processed PDFs.
df

Unnamed: 0,document_index,document,text,vocabulary
0,0,010 400 Divisadero SHAC LOI 09.10.18.pdf.png.p...,SUMMERHILL . . COMMUNITIES OF DISTINCTION Sept...,"[{'x0': 182.057, 'x1': 416.573, 'top': 115.213..."
1,1,010 AvalonBay LOI - Gateway - 2019-11-7.pdf.pn...,"AvalonBay COMMUNITIES November 7, 2019 Hunter ...","[{'x0': 1293.943, 'x1': 1505.830, 'top': 289.4..."
2,2,010 Cambridge Capital LOI 2016-04-07.pdf.png.p...,"CAMBRIDGE CAPITAL INVESTORS, LLC 1620 Park Vis...","[{'x0': 556.457, 'x1': 758.058, 'top': 220.534..."
3,3,010 DR Horton LOI Crescent Heights 2017-11-1.p...,"ll-R-HOHIDN 35% ﬂmer/ca’s W November 1, 2017 B...","[{'x0': 740.571, 'x1': 956.571, 'top': 173.093..."
4,4,010 Dutchints LOI 2018-11-17.pdf.png.processed...,DocuSign E nvelope ID: C431689D-FF59-4CAD-BG34...,"[{'x0': 50.400, 'x1': 146.058, 'top': 40.242, ..."
...,...,...,...,...
333,333,Waterford Place JPM LOI 101129.pdf.png.process...,"JPMorgan 0 Asset Management November 30, 2010 ...","[{'x0': 691.200, 'x1': 993.601, 'top': 178.809..."
334,334,Waterford Place JPM LOI 101202.pdf.png.process...,"JPMorgan 0 Asset Management December 2, 2010 V...","[{'x0': 705.600, 'x1': 1007.999, 'top': 181.36..."
335,335,Waterford Place Signed LOI 12-15-10.pdf.png.pr...,UBS Realty Investors LLC UB S Global Asset Man...,"[{'x0': 1192.114, 'x1': 1231.200, 'top': 127.6..."
336,336,Waterford Place UBS LOI (12.2.2010).pdf.png.pr...,UBS Realty Investors LLC UBS Global Asset Mana...,"[{'x0': 1200.343, 'x1': 1241.486, 'top': 84.18..."


### Save Variants to CSV

Relevant Slack thread:
- https://bid-central.slack.com/archives/C01A261EFGT/p1599251204029700

### Saving Labeled Sentence Data

This is the dataset we will use for our first-generation ML models, which will predict which types of sentences exist in documents for the relevant entity-matching algorithms we develop.

In [253]:
# Persist the sentence frame, including the regex-assigned target_class labels, as a UTF-8 CSV.
df_sentences.to_csv("./documents_to_sentences_labeled-09-04-2020.csv", encoding="utf8")