In [1]:
import datetime
import math
import os

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Root directory holding the capstone data files.
# Override it by setting the DD_CAPSTONE_DATA_DIR environment variable; when the
# variable is unset the original author's local path is used, so existing
# behavior is unchanged.
# os.path.join(..., "") guarantees a trailing separator, which the cells that
# build file names by string concatenation rely on.
data_dir = os.path.join(
    os.environ.get(
        "DD_CAPSTONE_DATA_DIR",
        "/Users/soniamannan/Documents/DATA401/capstone/digitaldemocracy_ds_capstone_2018/",
    ),
    "",
)

In [4]:
# Input files: the original "~~~~~"-delimited dumps.
original_raw_filename = f"{data_dir}dd_capstone_raw_transcripts.txt"
original_upleveled_filename = f"{data_dir}dd_capstone_data.txt"

# Output files written by this notebook.
cleaned_raw_filename = f"{data_dir}raw.csv"
bill_start_end_times_filename = f"{data_dir}bill_start_end_times.csv"

# Raw Processing

In [5]:
# Load the raw transcript dump: one row per video, fields separated by "~~~~~".
raw = pd.read_csv(
    original_raw_filename,
    sep='~~~~~',
    engine='python',  # multi-character separators require the Python parser
)
raw.head()

Unnamed: 0,video_id,raw_transcript
0,4221,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
1,4229,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
2,4228,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
3,4226,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."
4,4222,"?<?xml version=""1.0"" encoding=""utf-8""?><tt xml..."


In [6]:
def parse_time(time):
    """Convert a timestamp string such as "00:00:00.470" to whole seconds.

    Parameters
    ----------
    time : str
        Timestamp in "HH:MM:SS" or "HH:MM:SS.fff" form.

    Returns
    -------
    int
        Number of seconds since 00:00:00. The fractional part of the seconds
        field is truncated, not rounded.
    """
    parts = time.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])
    # Seconds may carry a fractional part ("00.470"); drop it.
    seconds = int(float(parts[2]))

    # One hour is 3600 seconds (the previous factor of 360 under-counted every
    # timestamp at or beyond the one-hour mark).
    return (hours * 3600) + (minutes * 60) + seconds

In [7]:
def parse_raw_data(raw):
    """Flatten the per-video transcript markup into one row per <p> element.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must provide the columns 'raw_transcript' (markup text) and 'video_id'.

    Returns
    -------
    pandas.DataFrame
        Columns 'start', 'end', 'text', 'video_id'. 'start' and 'end' are the
        element's 'begin' / 'end' attributes converted by parse_time; 'text' is
        the first child of the element.
    """
    starts, ends, texts, video_ids = [], [], [], []

    for markup, vid in zip(raw['raw_transcript'], raw['video_id']):
        paragraphs = BeautifulSoup(markup, "lxml").find_all("p")

        # The first <p> of every transcript is skipped.
        # NOTE(review): presumably it is not a caption line - confirm against the data.
        for paragraph in paragraphs[1:]:
            starts.append(parse_time(paragraph.get('begin')))
            ends.append(parse_time(paragraph.get('end')))
            texts.append(paragraph.contents[0])
            video_ids.append(vid)

    tidy = pd.DataFrame(
        {'start': starts, 'end': ends, 'text': texts, 'video_id': video_ids},
        columns=['start', 'end', 'text', 'video_id'],
    )
    return tidy

In [8]:
# Parse every transcript into tidy rows, then persist them as a "~"-separated
# file without the index column.
cleaned_raw = parse_raw_data(raw)
cleaned_raw.to_csv(
    cleaned_raw_filename,
    index=False,
    sep="~",
)
cleaned_raw.head()

Unnamed: 0,start,end,text,video_id
0,0,2,We don't have a quorum yet I don't believe.,4221
1,6,8,We don't have a quorum yet.,4221
2,8,13,We'll ask the sergeants to please call the mem...,4221
3,13,21,that we can establish a quorum for this partic...,4221
4,21,26,This is the Assembly's 2nd Extraordinary Sessi...,4221


# Upleveled Processing

In [9]:
# Load the upleveled dataset, which uses the same "~~~~~" field separator.
upleveled = pd.read_csv(
    original_upleveled_filename,
    sep='~~~~~',
    engine='python',  # multi-character separators require the Python parser
)
upleveled.head()

Unnamed: 0,bill_id,hearing_id,video_id,video_start_time,video_end_time,speaker_start_time,speaker_end_time,text
0,CA_201520162AB15,539,4221,0,1981,0,2,We don't have a quorum yet I don't believe.
1,CA_201520162AB15,539,4221,0,1981,6,8,We don't have a quorum yet.
2,CA_201520162AB15,539,4221,0,1981,8,13,We'll ask the sergeants to please call the mem...
3,CA_201520162AB15,539,4221,0,1981,13,21,that we can establish a quorum for this partic...
4,CA_201520162AB15,539,4221,0,1981,21,26,This is the Assembly's 2nd Extraordinary Sessi...


In [10]:
# Columns that identify one group, and the ordering applied before grouping.
bill_group_keys = ["bill_id", "hearing_id", "video_id"]
bill_sort_order = ["video_id", "hearing_id", "bill_id", "speaker_start_time"]

# Sort once so that, inside every group, rows run from the smallest to the
# largest speaker_start_time; the first / last row of each group then carries
# the start / end time that is kept.
bills_grouped = upleveled.sort_values(bill_sort_order).groupby(bill_group_keys)
bill_start_times = bills_grouped.head(1)
bill_end_times = bills_grouped.tail(1)

# Join the first row's start time with the last row's end time per group.
bill_start_end_times = bill_start_times[bill_group_keys + ["speaker_start_time"]].merge(
    bill_end_times[bill_group_keys + ["speaker_end_time"]],
    on=bill_group_keys,
)

In [11]:
# Persist the per-group start/end times as a "~"-separated file (no index
# column) and preview the first rows.
bill_start_end_times.to_csv(
    bill_start_end_times_filename,
    index=False,
    sep="~",
)
bill_start_end_times.head()

Unnamed: 0,bill_id,hearing_id,video_id,speaker_start_time,speaker_end_time
0,CA_201520160SB114,542,4161,91,430
1,CA_201520160SB313,542,4161,1516,1800
2,CA_201520160SB322,542,4161,991,1234
3,CA_201520160SB376,542,4161,1,91
4,CA_201520160SB660,542,4161,446,991
