In [6]:
import datetime
import math
import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup

In [7]:
original_raw_filename = "UNDEFINED"
original_upleveled_filename = "UNDEFINED"
original_upleveled_sorted_filename = "UNDEFINED"
cleaned_raw_filename = "UNDEFINED"
cleaned_raw_bill_id_replaced_filename="UNDEFINED"
cleaned_upleveled_filename = "UNDEFINED"
bill_start_end_times_all_filename = "UNDEFINED"
bill_start_end_times_longest_filename = "UNDEFINED"

with open("CONSTANTS") as constants_file:
    for line in constants_file:
        line_splits = line.rstrip("\n").split("=")
        
        if (line_splits[0] == "ORIGINAL_RAW"):
            original_raw_filename = line_splits[1]
        elif (line_splits[0] == "ORIGINAL_UPLEVELED"):
            original_upleveled_filename = line_splits[1]
        elif (line_splits[0] == "ORIGINAL_UPLEVELED_SORTED"):
            original_upleveled_sorted_filename = line_splits[1]
        elif (line_splits[0] == "CLEANED_RAW"):
            cleaned_raw_filename = line_splits[1]
        elif (line_splits[0] == "CLEANED_RAW_BILL_ID_REPLACED"):
            cleaned_raw_bill_id_replaced_filename = line_splits[1]
        elif (line_splits[0] == "CLEANED_UPLEVELED"):
            cleaned_upleveled_filename = line_splits[1]
        elif (line_splits[0] == "BILL_START_END_TIMES_ALL"):
            bill_start_end_times_all_filename = line_splits[1]
        elif (line_splits[0] == "BILL_START_END_TIMES_LONGEST"):
            bill_start_end_times_longest_filename = line_splits[1]

# Raw Processing

In [8]:
raw = pd.read_table(original_raw_filename, sep='~~~~~', engine='python')
raw.head()

Unnamed: 0,video_id,raw_transcript
0,4221,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
1,4229,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
2,4228,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
3,4226,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
4,4222,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."


In [9]:
# parse a string 00:00:00.470 to hours, minutes, seconds
# return time in seconds
def parse_time(time):
    time = time.split(":")
    hours = int(time[0])
    minutes = int(time[1])
    seconds = int(float(time[2])) 
    
    return (hours*360)+(minutes*60)+seconds

In [10]:
def parse_raw_data(raw):
    r = raw['raw_transcript']
    ids = raw['video_id']
    res = {'start':[], 'end':[], 'text':[], 'video_id': []}
    for transcript, vid in zip(r, ids):
        soup = BeautifulSoup(transcript, "lxml")
        letters = soup.find_all("p")

        for p in letters[1:]:
            res['start'].append(parse_time(p.get('begin')))
            res['end'].append(parse_time(p.get('end')))
            res['text'].append(p.contents[0])
            res['video_id'].append(vid)

    tidy = pd.DataFrame(res, columns=['start', 'end', 'text', 'video_id'])
    return (tidy)

In [11]:
cleaned_raw = parse_raw_data(raw)
cleaned_raw.sort_values(["video_id", "start"]).to_csv(cleaned_raw_filename, sep="~", index=False)
cleaned_raw.head()

Unnamed: 0,start,end,text,video_id
0,0,2,We don't have a quorum yet I don't believe.,4221
1,6,8,We don't have a quorum yet.,4221
2,8,13,We'll ask the sergeants to please call the mem...,4221
3,13,21,that we can establish a quorum for this partic...,4221
4,21,26,This is the Assembly's 2nd Extraordinary Sessi...,4221


## Text Formatting and Bill Id Replacement

In [12]:
bill_id_pattern_1_1 = "ab[0-9]+"
bill_id_pattern_1_2 = "sb[0-9]+"
bill_id_pattern_1_3 = "aca[0-9]+"
bill_id_pattern_1_4 = "acr[0-9]+"
bill_id_pattern_1_5 = "ajr[0-9]+"
bill_id_pattern_1_6 = "ar[0-9]+"
bill_id_pattern_1_7 = "hr[0-9]+"
bill_id_pattern_1_8 = "sca[0-9]+"
bill_id_pattern_1_9 = "scr[0-9]+"
bill_id_pattern_1_10 = "sjr[0-9]+"

bill_id_pattern_2_1 = ["ab", "[0-9]+"]
bill_id_pattern_2_2 = ["sb", "[0-9]+"]
bill_id_pattern_2_3 = ["aca", "[0-9]+"]
bill_id_pattern_2_4 = ["acr", "[0-9]+"]
bill_id_pattern_2_5 = ["ajr", "[0-9]+"]
bill_id_pattern_2_6 = ["ar", "[0-9]+"]
bill_id_pattern_2_7 = ["hr", "[0-9]+"]
bill_id_pattern_2_8 = ["sca", "[0-9]+"]
bill_id_pattern_2_9 = ["scr", "[0-9]+"]
bill_id_pattern_2_10 = ["sjr", "[0-9]+"]

bill_id_pattern_3_1 = ["assembly", "bill", "[0-9]+"]
bill_id_pattern_3_2 = ["senate", "bill", "[0-9]+"]

bill_id_pattern_4_1 = ["assembly", "bill", "number", "[0-9]+"]
bill_id_pattern_4_2 = ["senate", "bill", "number", "[0-9]+"]

In [13]:
def re_match_lists_helper(pattern_list, word_list):
    for p in range(len(pattern_list)):
        if not (re.match(pattern_list[p], word_list[p])):
            return False
    return True

def re_match_lists(pattern_list_list, word_list):
    for pl in range(len(pattern_list_list)):
        if (re_match_lists_helper(pattern_list_list[pl], word_list)):
            return True
    return False

def matches_any_4_word_pattern(word1, word2, word3, word4):
    pattern_list_list = [bill_id_pattern_4_1, bill_id_pattern_4_2]
    word_list = [word1, word2, word3, word4]
    
    return re_match_lists(pattern_list_list, word_list)

def matches_any_3_word_pattern(word1, word2, word3):
    pattern_list_list = [bill_id_pattern_3_1, bill_id_pattern_3_2]
    word_list = [word1, word2, word3]
    
    return re_match_lists(pattern_list_list, word_list)
    
def matches_any_2_word_pattern(word1, word2):
    pattern_list_list = [bill_id_pattern_2_1, bill_id_pattern_2_2,
                         bill_id_pattern_2_3, bill_id_pattern_2_4,
                         bill_id_pattern_2_5, bill_id_pattern_2_6,
                         bill_id_pattern_2_7, bill_id_pattern_2_8,
                         bill_id_pattern_2_9, bill_id_pattern_2_10]
    word_list = [word1, word2]
    
    return re_match_lists(pattern_list_list, word_list)

def matches_any_1_word_pattern(word):
    return (re.match(bill_id_pattern_1_1, word) or
            re.match(bill_id_pattern_1_2, word) or
            re.match(bill_id_pattern_1_3, word) or
            re.match(bill_id_pattern_1_4, word) or
            re.match(bill_id_pattern_1_5, word) or
            re.match(bill_id_pattern_1_6, word) or
            re.match(bill_id_pattern_1_7, word) or
            re.match(bill_id_pattern_1_8, word) or
            re.match(bill_id_pattern_1_9, word) or
            re.match(bill_id_pattern_1_10, word))

In [14]:
def shift_words_over(words, word_ix, shift_amount):
    words_length = len(words)
    
    for i in range(word_ix, words_length - shift_amount):
        words[i] = words[i+shift_amount]
    while(len(words) > (words_length-shift_amount)):
        del words[-1]
        
    return words

In [15]:
def replace_bill_ids_in_utterance(utterance, last_bill_number, t1, t2, t3, t4):
    words = utterance.lower().split()
    utterance_length = len(words)
    word_ix = 0
    bill_id_replaced = False
    while(word_ix < utterance_length):
        if (word_ix < (utterance_length-3) and
            matches_any_4_word_pattern(words[word_ix],
                                         words[word_ix+1],
                                         words[word_ix+2],
                                         words[word_ix+3])):
            last_bill_number = words[word_ix+3]
            words[word_ix] = "<BILL_ID>"
            words = shift_words_over(words, word_ix+1, 3)
            utterance_length -= 3
            bill_id_replaced = True
            t4 += 1
        elif (word_ix < (utterance_length-2) and
              matches_any_3_word_pattern(words[word_ix],
                                         words[word_ix+1],
                                         words[word_ix+2])):
            last_bill_number = words[word_ix+2]
            words[word_ix] = "<BILL_ID>"
            words = shift_words_over(words, word_ix+1, 2)
            utterance_length -= 2
            bill_id_replaced = True
            t3 += 1
        elif (word_ix < (utterance_length-1) and
            matches_any_2_word_pattern(words[word_ix],
                                         words[word_ix+1])):
            last_bill_number = words[word_ix+1]
            words[word_ix] = "<BILL_ID>"
            words = shift_words_over(words, word_ix+1, 1)
            utterance_length -= 1
            bill_id_replaced = True
            t2 += 1
        elif (matches_any_1_word_pattern(words[word_ix])):
            last_bill_number = words[word_ix].split("[a-z]+")[-1]
            words[word_ix] = "<BILL_ID>"
            bill_id_replaced = True
            t1 += 1

        word_ix += 1
            
    return (" ".join(words), last_bill_number, bill_id_replaced, t1, t2, t3, t4)

In [16]:
def replace_bill_ids(old, new):
    t1 = 0  #keeps track of how many bill id replacements there were
    t2 = 0
    t3 = 0
    t4 = 0
    
    last_bill_number = ""
    last_bill_number_line = 0
    transition_window_list = []
    line_number = 0
    for line in old:
        line_splits = line.lower().rstrip("\n").split("~")
        
        (new_text, current_bill_number, bill_id_replaced, t1, t2, t3, t4) = replace_bill_ids_in_utterance(line_splits[2], last_bill_number, t1, t2, t3, t4)
        
        if (bill_id_replaced):
            if (current_bill_number != last_bill_number):
                transition_window_list.append((last_bill_number_line, line_number))
                last_bill_number = current_bill_number
                last_bill_number_line = line_number
            elif (current_bill_number == last_bill_number):
                last_bill_number_line = line_number
        
        new.write(line_splits[0] + "~" + line_splits[1] + "~" + new_text + "~" + line_splits[3] + "\n")
        line_number += 1
        
    print("Length of Bill Patterns Replaced\n1: " + str(t1) + "\n2: " + str(t2) + "\n3: " + str(t3) + "\n4: " + str(t4))
    return transition_window_list

In [17]:
transition_window_list = [] #not currently used, but is available for use

with open(cleaned_raw_filename, 'r') as old:
    with open(cleaned_raw_bill_id_replaced_filename, 'w') as new:
        # consume/write headings
        h = old.readline()
        new.write(h)
            
        #actually iterate through the file
        transition_window_list = replace_bill_ids(old, new)
        print(transition_window_list)

Length of Bill Patterns Replaced
1: 1350
2: 16744
3: 5355
4: 4
[(0, 364), (364, 436), (436, 481), (481, 484), (484, 628), (628, 824), (824, 874), (874, 907), (907, 909), (909, 914), (914, 1122), (1122, 1124), (1124, 1128), (1156, 1208), (1208, 1209), (1209, 1212), (1212, 1248), (1248, 1252), (1252, 1285), (1285, 1290), (1290, 1306), (1306, 1331), (1331, 1342), (1342, 1344), (1344, 1348), (1348, 1370), (1370, 1401), (1401, 1419), (1419, 1444), (1444, 1505), (1505, 1507), (1507, 1516), (1518, 1521), (1521, 1537), (1537, 1553), (1553, 1560), (1560, 1562), (1562, 1572), (1572, 1595), (1595, 1606), (1606, 1608), (1608, 1635), (1635, 1636), (1640, 1652), (1652, 1669), (1669, 1671), (1675, 1695), (1748, 1780), (1780, 1781), (1781, 1785), (1785, 1803), (1803, 1830), (1830, 1832), (1832, 1839), (1845, 1857), (1857, 1858), (1858, 1914), (1914, 1915), (1919, 1941), (1941, 1942), (1942, 1947), (1947, 2017), (2017, 2041), (2041, 2053), (2053, 2055), (2055, 2057), (2057, 2059), (2059, 2061), (2061, 

# Upleveled Processing

In [None]:
upleveled = pd.read_table(original_upleveled_filename, sep='~~~~~', engine='python')
upleveled = upleveled.sort_values(["video_id", "hearing_id", "speaker_start_time"])
upleveled.to_csv(original_upleveled_sorted_filename, sep="~", index=False)

In [None]:
def tag_bill_change_lines(original, cleaned):
    line = original.readline()
    current_bill_id = line.split("~")[0]
    i = 0
    cleaned.write(line.rstrip("\n") + "~0\n")
    
    for line in original:
        line_splits = line.split("~")
        
        if (line_splits[0] != current_bill_id):
            current_bill_id = line_splits[0]
            i += 1
        
        cleaned.write(line.rstrip("\n") + "~" + str(i) + "\n")

In [None]:
with open(original_upleveled_sorted_filename, 'r') as original:
    with open(cleaned_upleveled_filename, 'w') as cleaned:
        #consume/write headings
        h = original.readline()
        cleaned.write(h.rstrip("\n") + "~bill_change_tag\n")
            
        tag_bill_change_lines(original, cleaned)

In [None]:
tagged_upleveled = pd.read_table(cleaned_upleveled_filename, sep='~')

In [None]:
bill_start_times = tagged_upleveled.groupby(["bill_change_tag"]).head(1)
bill_end_times = tagged_upleveled.groupby(["bill_change_tag"]).tail(1)
bill_start_end_times = pd.merge(bill_start_times[["bill_id", "hearing_id", "video_id", "speaker_start_time", "bill_change_tag"]],
                                bill_end_times[["speaker_end_time", "bill_change_tag"]],
                                on=["bill_change_tag"]).drop(["bill_change_tag"], axis=1)
bill_start_end_times["length"] = bill_start_end_times["speaker_end_time"] - bill_start_end_times["speaker_start_time"]
bill_start_end_times = bill_start_end_times.sort_values(["video_id", "speaker_start_time"])

In [None]:
longest_bill_discussions = bill_start_end_times.sort_values(["bill_id", "length"]).groupby(["bill_id"]).tail(1)
longest_bill_discussions = longest_bill_discussions.sort_values(["video_id", "speaker_start_time"])

In [None]:
bill_start_end_times.to_csv(bill_start_end_times_all_filename, sep="~", index=False)
longest_bill_discussions.to_csv(bill_start_end_times_longest_filename, sep="~", index=False)