# Scrape DC hearing decisions



## Imports and functions

In [14]:

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## link processing
import urllib
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests 
import itertools
import re
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [16]:
print("Loaded imports successfully")

path_write = "/Users/raj2/Dropbox/dph_hearing_decisions/data/dc/hearings/"

Loaded imports successfully


## Define scraping functions

In [18]:
def parse_https_page(link):
    
    parsed_page = BeautifulSoup(urlopen(Request(link, headers={'User-Agent': 'Mozilla/5.0'})).read(),
                    "html.parser")
    return(parsed_page)
    
def extract_links_frompage(parsed_page, pattern_tosearch):
    
    empty_list = []
    for link in parsed_page.findAll('a', attrs={'href': re.compile(pattern_tosearch)}):
        empty_list.append(link.get('href'))
    
    return(empty_list)

# Navigate pages to download 2018 and 2019 hearing pdfs

In [19]:
## navigate to main hearing page
all_years = "https://osse.dc.gov/service/hearing-officer-determinations"

## get links to all years
all_years_page = parse_https_page(all_years)
all_years_links = extract_links_frompage(parsed_page = all_years_page, 
                                        pattern_tosearch = "^http(s)?://osse.dc.gov/page/20[0-9][0-9]")

## 2018 missing; manual links
manual_links = ['https://osse.dc.gov/page/' + str(i) + "-hearing-officers-determinations" for i
               in np.arange(2014, 2017)]
manual_links

"https://osse.dc.gov/page/2014-hearing-officer-determinations"
    


['https://osse.dc.gov/page/2014-hearing-officers-determinations',
 'https://osse.dc.gov/page/2015-hearing-officers-determinations',
 'https://osse.dc.gov/page/2016-hearing-officers-determinations']

In [22]:
## iterate through links and parse pdfs 
months_acrossyears = []
for year_link in manual_links:
    try: 
        one_year = parse_https_page(year_link)
        months_links = extract_links_frompage(one_year, pattern_tosearch = "node|publication")
        months_links_final = ["https://osse.dc.gov" + str(link) if  re.search(r'^/', str(link)) 
                             else str(link)
                             for link in months_links] # appendix prefix if it starts with suffix
        all_hod = []
        for one_month in months_links_final:
            one_month_page = parse_https_page(one_month)
            pdfs_onpage = extract_links_frompage(parsed_page = one_month_page, 
                                                     pattern_tosearch = "\\.pdf")
            hod_pdfs = [pdf for pdf in pdfs_onpage if "HOD" in pdf]
            if hod_pdfs != []:
                for hod in hod_pdfs:
                    resp = requests.get(hod, verify=False,stream=True)
                    filename = path_write + re.sub('.*attachments/', '', hod) 
                    fd = open(filename, 'wb')
                    fd.write(resp.content)
                    fd.close() 
    except:
        pass 


788153

1485159

906500

801620

793382

894995

771592

798350

969293

854161

660700

621561

699899

766697

684198

817498

684183

708363

507750

740246

615741

538946

546525

649294

696667

642781

634717

492806

630018

646175

643365

639697

640470

618142

610668

501101

685266

604060

492734

605721

622628

488988

602356

522404

670790

626373

504147

504126

536693

537494

622453

645680

536572

684844

595452

637617

718283

500636

488466

614199

1161149

661907

524661

524652

638696

693487

508144

663613

646443

644566

507417

658055

531866

667211

631165

682535

518649

518947

646584

668968

500205

506922

529211

1112415

572485

533189

736265

571025

1213190

675831

684320

675847

506048

624738

639849

681593

7133048

501339

655333

626935

632909

632730

627548

520020

765813

514656

624731

518029

628179

690542

609772

349626

630111

8344230

547641

789080

527710

525925

499193

499178

582364

512325

659478

698877

667991

679622

521717

['https://osse.dc.gov/page/2014-hearing-officers-determinations',
 'https://osse.dc.gov/page/2015-hearing-officers-determinations',
 'https://osse.dc.gov/page/2016-hearing-officers-determinations']

In [24]:
final_manual = ["https://osse.dc.gov/page/2014-hearing-officer-determinations",
               "https://osse.dc.gov/page/2015-hearing-officer-determinations",
                "https://osse.dc.gov/page/2017-hearing-officers-determinations"]

In [25]:
## remaining manual links
months_acrossyears = []
for year_link in final_manual:
    try: 
        one_year = parse_https_page(year_link)
        months_links = extract_links_frompage(one_year, pattern_tosearch = "node|publication")
        months_links_final = ["https://osse.dc.gov" + str(link) if  re.search(r'^/', str(link)) 
                             else str(link)
                             for link in months_links] # appendix prefix if it starts with suffix
        all_hod = []
        for one_month in months_links_final:
            one_month_page = parse_https_page(one_month)
            pdfs_onpage = extract_links_frompage(parsed_page = one_month_page, 
                                                     pattern_tosearch = "\\.pdf")
            hod_pdfs = [pdf for pdf in pdfs_onpage if "HOD" in pdf]
            if hod_pdfs != []:
                for hod in hod_pdfs:
                    resp = requests.get(hod, verify=False,stream=True)
                    filename = path_write + re.sub('.*attachments/', '', hod) 
                    fd = open(filename, 'wb')
                    fd.write(resp.content)
                    fd.close() 
    except:
        pass 

788153

1485159

906500

801620

793382

894995

771592

798350

969293

854161

492289

694525

621347

650130

497036

669635

620745

520108

586973

482197

536839

672397

639891

657351

518425

627352

481648

651338

40402

681443

676706

687291

507842

609764

518433

606524

698775

696993

505414

614420

551806

589166

672920

521678

478458

643177

497339

643440

578089

527676

666344

497451

504799

488343

512601

516803

871769

495443

610974

492100

570706

539279

655248

662666

571649

533650

485175

686309

611538

652807

484042

905563

579179

546662

691886

514321

532516

675061

601120

524207

696270

529581

488318

663311

661188

494236

529696

511844

552948

487266

784706

552265

604276

535155

740577

703774

496364

507443

672211

670602

517018

727901

688868

693769

680296

653689

489027

581690

663649

505174

548141

505847

516732

505888

488800

661093

510318

584382

489191

674521

483718

620925

681777

590146

886326

499250

496902

678552

685699

707346

710215

584642

687117

632907

492434

711688

672791

536169

687283

573894

486451

653666

793777

671033

672542

696856

678540

509160

784861

707523

555010

507695

670415

484245

547361

501162

575913

544258

607761

632980

812310

509073

604835

625727

484961

676507

628689

735644

688892

638369

566162

889155

969452

520574

496340

499392

527040

676497

621287

605545

490153

652744

701386

495757

618637

684083

695962

537339

522990

665542

648313

574454

497893

491901

496116

491070

497332

500674

721334

555228

575252

651291

669865

692613

485089

480915

670474

681696

565442

498399

863578

480533

663218

686850

733527

536183

695231

730554

738719

502985

805519

555675

662637

669349

511634

493604

650905

519360

651910

693684

694035

514611

484079

678495

788153

1485159

906500

801620

793382

894995

771592

798350

969293

854161

665875

691767

681588

476831

657899

703719

639056

510433

540418

531329

685850

671600

494632

530411

526671

655122

476729

551250

648786

524185

699373

683640

842596

506678

648662

680890

608308

641329

702222

497508

540967

489216

540554

664825

511949

539127

703115

488601

615655

676779

659255

751523

749499

506109

513769

637511

1216091

843976

496909

746405

533509

615421

527535

754458

532569

603209

740789

733121

501129

637498

499729

652434

631760

519737

523062

633620

595940

488135

502505

735875

535972

995877

666627

590508

502977

776549

596226

720132

698021

737999

721670

658492

491815

505718

815539

484743

513109

560592

859674

499236

988320

599410

605663

623731

506885

499670

639878

647772

735296

750811

545580

490100

499584

515800

679690

620143

1005338

499194

618556

734894

483340

524371

631085

794301

662086

483497

614428

502142

561765

625852

504750

594606

882263

881819

641714

693542

525525

607650

701209

496472

497849

684238

498466

480751

626544

531966

719399

495427

486995

788153

1485159

906500

801620

793382

894995

771592

798350

969293

854161

747931

693739

693887

1599841

507059

971638

1134208

705140

535183

2674102

651177

571969

1067084

542737

2575650

3110368

1062296

1055128

528500

553418

606036

685175

494846

730063

2153532

1717271

696288

897270

755094

2154878

1624467

515631

620841

489005

526547

644822

661901

547559

2056592

1093323

493612

549627

2090836

2119191

1227095

629258

732259

530365

1697143

1190087

734855

1616814

2871938

847449

529291

838574

1110597

508500

731104

1139584

543446

624269

510380

1147568

544670

696253

1045431

747054

698888

535386

711488

711583

631043

531093

1513645

642258

731004

518058

512093

653472

649235

770493

712916

568117

553730

529220

642900

696245

529687

666944

663776

486309

627433

648576

645536

529498

529313

639078

800745

667095

644228

635981

646636

654006

481773

633779

664691

617935

711675

690887

675758

495402

637231

734395

718534

595701

690291

594542

597432

739665

658356

583303

718654

588729

583787

601051

806453

607853

740848

587452

588314

705951

794809

741414

698017

800407

514652

732483

515629

590949

643675

743998