Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 204 lines (180 sloc) 4.93 KB
#!/usr/bin/env perl
# Copyright 2014 Michal Špaček <tupinek@gmail.com>
# Pragmas.
use strict;
use warnings;
# Modules.
use Database::DumpTruck;
use Encode qw(decode_utf8 encode_utf8);
use English;
use HTML::TreeBuilder;
use LWP::UserAgent;
use POSIX qw(strftime);
use URI;
use URI::QueryParam;
use Time::Local;
# Flush output immediately instead of buffering it.
$OUTPUT_AUTOFLUSH = 1;

# Processing mode (0 - process all items, 1 - stop at the first item which
# is already in the database).
my $MODE = 1;

# Number of the listing page to start with.
my $PAGE = 1;

# Number of seconds to sleep after each processed listing page.
my $TIMEOUT = 1;

# Czech month words (decoded to characters) mapped to month numbers 1-12.
my $DATE_WORD_HR = {};
{
    my @month_words = qw(
        leden únor březen duben květen červen
        červenec srpen září říjen listopad prosinec
    );
    foreach my $month_index (0 .. $#month_words) {
        my $month_word = $month_words[$month_index];
        $DATE_WORD_HR->{decode_utf8($month_word)} = $month_index + 1;
    }
}

# District ids mapped to district names (decoded to characters).
my $DISTRICT_IDS_HR = {};
{
    my @districts = (
        [3701, 'Blansko'],
        [3702, 'Brno-město'],
        [3703, 'Brno-venkov'],
        [3704, 'Břeclav'],
        [3706, 'Hodonín'],
        [3712, 'Vyškov'],
        [3713, 'Znojmo'],
    );
    foreach my $district_ar (@districts) {
        my ($district_id, $district_name) = @{$district_ar};
        $DISTRICT_IDS_HR->{$district_id} = decode_utf8($district_name);
    }
}

# URI of the first listing page of the service.
my $base_uri = URI->new(
    'http://www.firebrno.cz/modules/incidents/index.php?page='.$PAGE,
);

# Database handle.
my %dt_params = (
    'dbname' => 'data.sqlite',
    'table' => 'data',
);
my $dt = Database::DumpTruck->new(\%dt_params);

# User agent used for all HTTP requests.
my %ua_params = (
    'agent' => 'Mozilla/5.0',
);
my $ua = LWP::UserAgent->new(%ua_params);

# Walk the listing pages until process_page() returns no next page.
my $page_uri = $base_uri;
while ($page_uri) {
    my $next_uri = process_page($page_uri);
    sleep $TIMEOUT;
    $page_uri = $next_uri;
}
# Get database datetime from web datetime.
#
# Takes the date text from the web in the form
# '<day>. <month word> <year>, <hour>:<min>' (decoded characters) and
# returns it as 'YYYY-MM-DD HH:MM'. The month word is translated to a number
# via $DATE_WORD_HR.
#
# Dies with a descriptive message if the text cannot be parsed or the month
# word is unknown. Out-of-range values are rejected by timegm().
sub get_db_datetime {
    my $date_web = shift;
    if (! defined $date_web) {
        die 'Cannot parse undefined datetime.';
    }
    my ($date, $time_web) = split m/,/ms, $date_web;

    # Date part: day number, dot, month word and year.
    my ($day, $mon_word, $year);
    if (defined $date
        && $date =~ m/^\s*(\d+)\s*\.\s*(\w+)\s*(\d+)\s*$/ms) {

        ($day, $mon_word, $year) = ($1, $2, $3);
    } else {
        die "Cannot parse date from '".encode_utf8($date_web)."'.";
    }
    my $mon = $DATE_WORD_HR->{lc $mon_word};
    if (! defined $mon) {
        die "Unknown month '".encode_utf8($mon_word)."' in '".
            encode_utf8($date_web)."'.";
    }

    # Time part: hour and minute separated by a colon.
    my ($hour, $min);
    if (defined $time_web
        && $time_web =~ m/\A\s*(\d+)\s*:\s*(\d+)/ms) {

        ($hour, $min) = ($1, $2);
    } else {
        die "Cannot parse time from '".encode_utf8($date_web)."'.";
    }

    # timegm()/gmtime() are used only to validate and normalize the values.
    # Unlike timelocal()/localtime() the result does not depend on the
    # timezone of the machine, so a wall-clock time from the web is never
    # shifted by a daylight saving gap of the host.
    my $time = timegm(0, $min, $hour, $day, $mon - 1, $year - 1900);
    return strftime('%Y-%m-%d %H:%M', gmtime($time));
}
# Get root of HTML::TreeBuilder object.
#
# Fetches the page at $uri (object with as_string() method) with the
# file-level user agent $ua, decodes the content as UTF-8 and parses it.
#
# Returns the root element of the parsed tree.
# Dies if the HTTP request is not successful.
sub get_root {
    my $uri = shift;
    my $get = $ua->get($uri->as_string);
    if (! $get->is_success) {
        die "Cannot GET '".$uri->as_string."' page: ".$get->status_line;
    }
    my $tree = HTML::TreeBuilder->new;
    $tree->parse(decode_utf8($get->content));

    # Signal end of input, otherwise text still buffered by the parser
    # may not get into the tree.
    $tree->eof;

    return $tree->elementify;
}
# Process page.
#
# Fetches the listing page at $uri and for each element with class
# 'inc-item' fetches its detail page and inserts the incident into the
# database, unless a row with the same ID is already there.
#
# Returns the value of next_link() for this page, or nothing if $MODE is 1
# and an incident which is already in the database was found.
sub process_page {
    my $uri = shift;
    print 'Page: '.$uri->as_string."\n";
    my $root = get_root($uri);
    my @items = $root->find_by_attribute('class', 'inc-item');
    foreach my $item (@items) {

        # Date and time, in database form and as written on the web.
        my $date_div = $item->find_by_attribute('class', 'inc-date');
        my $date = $date_div->as_text;
        my $datetime = get_db_datetime($date);
        remove_trailing(\$datetime);
        remove_trailing(\$date);

        # Link to the detail page. Id of the incident and id of the
        # district are read from its query string.
        my $link = URI->new($base_uri->scheme.'://'.$base_uri->host.
            '/modules/incidents/'.
            $item->find_by_tag_name('a')->attr('href'));
        my $id = $link->query_param('filter[id]');
        my $district_id = $link->query_param('district_id');
        my $district = defined $district_id
            ? $DISTRICT_IDS_HR->{$district_id}
            : undef;

        # Summary is the text of the fourth 'div' found in the item,
        # without the date text.
        my @divs = $item->find_by_tag_name('div');
        my $inc_content_div = $divs[3];
        my $summary = $inc_content_div->as_text;
        remove_trailing(\$summary);

        # Date text contains regexp metacharacters (e.g. '.'), so it has
        # to be quoted to be matched literally.
        $summary =~ s/\Q$date\E//ms;

        # Type and details from the detail page.
        my $detail_root = get_root($link);
        my $inc_info = $detail_root->find_by_attribute('class', 'inc-info');
        my $type = $inc_info->find_by_tag_name('h2')->as_text;
        remove_trailing(\$type);
        my $inc_content = $detail_root->find_by_attribute('class',
            'inc-content col-md-12 col-sm-12 col-xs-12');
        my @ps = $inc_content->find_by_tag_name('p');
        my $details = '';
        foreach my $p (@ps) {

            # Paragraphs without 'strong' element are skipped.
            my $strong = $p->find_by_tag_name('strong');
            if (defined $strong && $strong->as_text eq 'Popis') {
                $details = $p->as_text;
                last;
            }
        }
        $details =~ s/^Popis: //ms;
        remove_trailing(\$details);

        # Everything needed is copied to strings, release the detail tree.
        $detail_root->delete;

        # Save.
        my $ret_ar = eval {
            $dt->execute('SELECT COUNT(*) FROM data WHERE ID = ?',
                $id);
        };

        # Failed SELECT (e.g. table doesn't exist yet) or zero count mean
        # that the incident is not stored.
        if ($EVAL_ERROR || ! @{$ret_ar}
            || ! exists $ret_ar->[0]->{'count(*)'}
            || ! defined $ret_ar->[0]->{'count(*)'}
            || $ret_ar->[0]->{'count(*)'} == 0) {

            print "ID: $id - ".encode_utf8($summary)."\n";
            $dt->insert({
                'Summary' => $summary,
                'Details' => $details,
                'ID' => $id,
                'District' => $district,
                'Link' => $link->as_string,
                'Datetime' => $datetime,
                'Type' => $type,
            });
            # TODO Move to begin with create_table().
            $dt->create_index(['ID'], 'data', 1, 1);
        } else {
            if ($MODE == 1) {
                return;
            }
        }
    }

    # Find the next page before the listing tree is released.
    my $next_uri = next_link($uri, $root);
    $root->delete;

    return $next_uri;
}
# Get next link.
#
# Looks for the element with class 'pager' in $root and goes through its
# 'a' elements. $uri is the URI object of the current page, its scheme and
# host are used to build the absolute URI from 'href' of the link.
#
# Returns a URI object for the last matching link, or nothing if there is
# no pager or no matching link.
sub next_link {
    my ($uri, $root) = @_;

    # Page without pager has no next page.
    my $pager = $root->find_by_attribute('class', 'pager');
    if (! defined $pager) {
        return;
    }

    my @pag_a = $pager->find_by_tag_name('a');
    my $next_uri;
    foreach my $pag_a (@pag_a) {

        # NOTE(review): Compared literal is empty as seen here. It may have
        # held a character which was lost (e.g. icon glyph) - confirm.
        if ($pag_a->as_text eq decode_utf8('')) {

            # Link without 'href' cannot be followed.
            my $href = $pag_a->attr('href');
            if (! defined $href) {
                next;
            }
            $next_uri = URI->new($uri->scheme.'://'.$uri->host.$href);
        }
    }

    return $next_uri;
}
# Remove leading and trailing whitespace.
#
# Takes a reference to a string and modifies the string in place.
# Undefined value is changed to an empty string.
# Returns nothing.
sub remove_trailing {
    my $string_sr = shift;
    if (! defined ${$string_sr}) {
        ${$string_sr} = '';
        return;
    }

    # \A and \z anchor to the real begin and end of the string. '^' and '$'
    # with the /m modifier match at line boundaries, so whitespace at the
    # end of a multi-line string would be kept.
    ${$string_sr} =~ s/\A\s+//ms;
    ${$string_sr} =~ s/\s+\z//ms;

    return;
}