In [19]:
import itertools
import json
import os
import re
import requests
import scipy

from pathlib import Path

import numpy as np
import pandas as pd
import sklearn as sk
import tika

from IPython.display import display, HTML  # public import path; importing these from IPython.core.display is deprecated

# Widen the notebook container so wide DataFrames are easier to read.
display(HTML("<style>.container { width: 90% !important; }</style>"))

# Pandas display options: never truncate cell text and allow large frames to render.
pd.set_option('display.max_colwidth', None)  # None = no limit; the old -1 sentinel is rejected by recent pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Driver functions

In [20]:
def get_consent_agenda_df(text: str) -> pd.DataFrame:
    """
    Get all the consent agenda items out into a parsed format.

    Parameters
    ----------
    text : str
        Full text of one set of meeting minutes.

    Returns
    -------
    pd.DataFrame
        One row per consent agenda item with the columns ``raw_text``,
        ``name``, ``type``, ``dollar_amount`` and ``status``. Items whose first
        paragraph contains 'Minutes' are dropped. If parsing raises
        ``AttributeError`` or ``IndexError`` an empty, column-less DataFrame is
        returned instead.
    """
    try:
        consent_agenda = get_consent_agenda(text)
        consent_df = pd.DataFrame(consent_agenda, columns=['raw_text'])
        # The first paragraph of each item is its heading ("<type>: <name>").
        consent_df['name'] = consent_df['raw_text'].str.split('\n\n').apply(lambda x: x[0])
        # .copy() makes the filtered frame independent, so the column
        # assignments below never write into a slice of the unfiltered frame.
        consent_df = consent_df[~consent_df['name'].str.contains('Minutes')].copy()
        # Split the heading once and reuse it for both `type` and `name`.
        heading_parts = consent_df['name'].str.replace('\n', '', regex=False).str.split(': ')
        consent_df['type'] = heading_parts.apply(lambda x: x[0])
        consent_df['name'] = heading_parts.apply(lambda x: ' '.join(x[1:]))
        consent_df['dollar_amount'] = consent_df['name'].apply(extract_dollar_amount)
        # No amount in the heading: fall back to searching the whole item text.
        consent_df.loc[consent_df['dollar_amount'] == '', 'dollar_amount'] = consent_df['raw_text'].apply(extract_dollar_amount)
        # Strip the amount and any dashes out of the display name.
        consent_df['name'] = consent_df.apply(
            lambda x: x['name'].replace(x['dollar_amount'], ''), axis=1
        ).str.replace('-', '', regex=False)
        consent_df['status'] = consent_df['raw_text'].apply(extract_status)
    except (AttributeError, IndexError):
        return pd.DataFrame([])
    return consent_df

def get_other_items_df(text: str) -> pd.DataFrame:
    """
    Get the other discussion items out into a parsed format.

    Parameters
    ----------
    text : str
        Full text of one set of meeting minutes.

    Returns
    -------
    pd.DataFrame
        One row per discussion item with the columns ``type``, ``raw_text``,
        ``name``, ``dollar_amount`` and ``status``. Rows whose cleaned name is
        five characters or shorter are dropped. When no items are found an
        empty frame with those same columns is returned.
    """
    columns = ['type', 'raw_text', 'name', 'dollar_amount', 'status']
    other_items = get_other_items(text)
    if not other_items:
        # The row-wise apply(..., axis=1) calls below are not reliable on a
        # zero-row frame, so hand back an empty, correctly shaped frame here.
        return pd.DataFrame(columns=columns)
    other_items_df = pd.DataFrame(other_items, columns=['type', 'raw_text'])

    def _first_chunk(separator: str) -> pd.Series:
        # Text before the first `separator`, cleaned and flattened to one line.
        # regex=False: ': ', '*' and '\n' are meant literally ('*' is a regex
        # metacharacter), independent of the installed pandas default.
        return (
            other_items_df['raw_text'].str.split(separator).apply(lambda x: x[0])
            .str.replace(': ', '', regex=False)
            .str.replace('*', '', regex=False)
            .str.replace('\n', ' ', regex=False)
        )

    other_items_df['name'] = _first_chunk('\n \n')
    # If the '\n \n' split produced a name over 200 characters, use the
    # '\n\n' split for that row instead.
    other_items_df.loc[other_items_df['name'].str.len() > 200, 'name'] = _first_chunk('\n\n')
    other_items_df['dollar_amount'] = other_items_df['name'].apply(extract_dollar_amount)
    other_items_df['name'] = other_items_df.apply(
        lambda x: x['name'].replace(x['dollar_amount'], ''), axis=1
    ).str.replace('-', '', regex=False)
    # The status is read from the name and then removed from it.
    other_items_df['status'] = other_items_df['name'].apply(extract_status)
    other_items_df['name'] = other_items_df.apply(lambda x: x['name'].replace(x['status'], ''), axis=1)
    other_items_df['name'] = other_items_df['name'].str.replace('OTHER BUSINESS', '', regex=False)
    # .copy() so the `type` clean-up below writes to an independent frame
    # rather than to a slice of the unfiltered one.
    other_items_df = other_items_df[other_items_df['name'].str.len() > 5].copy()
    other_items_df['type'] = other_items_df['type'].str.replace('\n', '', regex=False).str.strip()
    return other_items_df


## Helper functions

In [21]:
from itertools import * 
import re
from typing import List, Tuple

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    shared_iterator = iter(iterable)
    # Passing the SAME iterator n times makes every output tuple consume n
    # consecutive items; a short final tuple is padded with `fillvalue`.
    return itertools.zip_longest(*itertools.repeat(shared_iterator, n), fillvalue=fillvalue)

def get_consent_agenda(text: str) -> List[str]:
    """Split `text` into its lettered items ('a. ', 'b. ', ...).

    Everything before the first lettered marker is discarded, and the last
    item is cut at its first blank line. Returns an empty list when no
    lettered marker is present.
    """
    # A marker is a newline, one lowercase letter, a period and one whitespace character.
    items = re.split(r'\n{1}[a-z]\.\s', text)[1:]
    if not items:
        return []
    # The final piece runs to the end of the text; keep only its first paragraph.
    items[-1] = items[-1].split('\n\n')[0]
    return items

def get_manager_response(text: str, consent_agenda: str) -> str:
    """Return the text between the last consent agenda entry and '\\nCOMMUNITY MATTERS'.

    The last element of `consent_agenda` is used as the split marker; the
    result is whatever follows its first occurrence in `text`, up to the next
    occurrence of that marker or of '\\nCOMMUNITY MATTERS'. Raises
    ``IndexError`` when the marker does not occur in `text`.

    NOTE(review): `consent_agenda` is annotated as ``str`` but is indexed with
    ``[-1]``; presumably the list returned by `get_consent_agenda` is
    expected here. Confirm against callers.
    """
    marker = consent_agenda[-1]
    after_marker = text.split(marker)[1]
    return after_marker.split('\nCOMMUNITY MATTERS')[0]

def get_other_items(text: str) -> List[Tuple[str, str]]:
    """Split `text` into (heading, body) pairs at each known section heading.

    A heading is a newline followed by one known heading word, or by two
    different known headings joined with a slash (with or without spaces
    around it). The heading in each pair keeps its leading newline. The body
    of the last pair is cut at 'OTHER BUSINESS'. Returns an empty list when
    no heading is found.
    """
    known_headings = ['APPROPRIATION', 'RESOLUTION', 'REPORT', 'ORDINANCE', 'PUBLIC HEARING', 'A RESOLUTION']
    heading_pairs = list(itertools.permutations(known_headings, 2))
    # Minutes are very lax about how they join together, so accept every
    # slash spelling. Combined headings come first so the regex tries them
    # before the single-word alternatives.
    combined_headings = [
        joiner.join(pair)
        for joiner in ('/', ' / ', '/ ', ' /')
        for pair in heading_pairs
    ]
    alternatives = ['\n' + heading for heading in combined_headings + known_headings]
    pattern = '(' + '|'.join(alternatives) + ')'
    # The capturing group keeps the headings in the split result; dropping the
    # first piece leaves alternating heading, body, heading, body, ...
    pieces = re.split(pattern, text)[1:]
    labeled = list(grouper(pieces, 2, 'xxx'))
    if not labeled:
        return []
    last_heading, last_body = labeled[-1]
    labeled[-1] = (last_heading, last_body.split('OTHER BUSINESS')[0])
    return labeled

def replace_newlines(text: str) -> str:
    """Return `text` with every newline character removed."""
    return ''.join(text.split('\n'))

def extract_dollar_amount(text: str) -> str:
    """Return the first dollar amount found in `text`, or '' if there is none.

    An amount is a '$' followed by digits, any number of comma-separated digit
    groups and an optional decimal part, e.g. '$500', '$1,234,567' or
    '$12,000.50'. Trailing punctuation such as a sentence-ending '.' or ','
    is not part of the result. `text` is passed through ``str()`` first, so
    non-string values (e.g. ``None`` or NaN) simply yield ''.
    """
    # (?:,\d+)* accepts every thousands group, not just the first one, and
    # (?:\.\d+)? only takes a '.' when digits follow it.
    match = re.search(r'\$\d+(?:,\d+)*(?:\.\d+)?', str(text))
    return match.group(0) if match else ''
    
def extract_status(text: str) -> str:
    """Return the first parenthesised status note in `text`, or '' if none.

    Two shapes are tried in order: a parenthesis opening with lowercase
    letters, then one opening with digits followed by a lowercase letter
    (e.g. '(2nd reading)'). Matching is greedy, so the result extends to the
    last ')' on the same line. `text` is converted with ``str()`` first.
    """
    candidate = str(text)
    for pattern in (r'\([a-z]+.*\)', r'\(\d+[a-z].*\)'):
        hit = re.search(pattern, candidate)
        if hit is not None:
            return hit.group(0)
    return ''
    
def extract_all_caps(text: str) -> str:
    """Return the first run of capital letters and whitespace in `text`.

    The run may begin or end with whitespace, and may consist of whitespace
    only, because whitespace is part of the matched character class. Returns
    '' when `text` contains neither capitals nor whitespace. `text` is
    converted with ``str()`` first.
    """
    # No capturing groups and re.search, so the result is always the matched
    # string itself (findall with groups yields tuples, not strings).
    match = re.search(r'[A-Z\s]+', str(text))
    return match.group(0) if match else ''
    

In [22]:
# Match every .txt file in the minutes directory. Path.glob() returns a lazy,
# single-use generator: it is exhausted after one full iteration.
minutes_path = Path('../data/cville_pdfs/minutes/').glob('*.txt')

In [23]:
# Read one minutes file into memory and run both driver functions on it.
with open('../data/cville_pdfs/minutes/36377.txt') as file:  # Use your path to the file!
    text = file.read()
consent_agenda_df = get_consent_agenda_df(text)  # parsed consent agenda items
other_items_df = get_other_items_df(text)  # parsed other discussion items


In [25]:
# Create the output directory if needed, then write both parsed tables as
# pipe-delimited CSV files (the DataFrame index is written as the first column).
# NOTE(review): the output names say minutes 28434, but the text parsed earlier
# in this notebook is read from 36377.txt. Confirm which id is intended.
Path("../intermediate").mkdir(parents=True, exist_ok=True)
consent_agenda_df.to_csv('../intermediate/minutes_28434_consent_agenda.csv', sep='|')
other_items_df.to_csv('../intermediate/minutes_28434_other_discussion_items.csv', sep='|')

In [26]:
# Print the `name` column of the parsed consent agenda items.
print(consent_agenda_df.name)

1     Virginia Juvenile Community Crime Control Act Grant                                                  
2     Adult Drug Treatment Court Grant Award   (2nd                                                        
3     State Criminal Alien Assistance Program 2015 Grant                                                   
4     Charlottesville Area Transit FY2016 Grants   (2nd                                                    
5     Runaway Emergency Shelter Program Grant                                                              
6     Sidewalk Waiver Request for 219 Lankford Avenue   Sidewalk Waiver Request for 219 Lankford Avenue    
7     Initiate ZTA for Microbreweries   Initiate ZTA for Microbreweries                                    
8     VDOT Transportation Alternative Program Grant Application for                                        
9     Short Term Rental Tax amendment (2nd reading)  Short Term Rental Tax amendment                       
10    Increase Limit of Maxi