Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

refactored DataFlow::Proc::HTMLFilter out of core

Signed-off-by: Alexei Znamensky <russoz@cpan.org>
  • Loading branch information...
commit 9e5fa369895c623f14623ab55e3709db7acb5850 1 parent fc8f680
Alexei Znamensky authored
164 lib/DataFlow/Proc/HTMLFilter.pm
... ... @@ -1,164 +0,0 @@
1   -package DataFlow::Proc::HTMLFilter;
2   -
3   -use strict;
4   -use warnings;
5   -
6   -# ABSTRACT: An HTML filtering processor
7   -
8   -# VERSION
9   -
10   -use Moose;
11   -extends 'DataFlow::Proc';
12   -
13   -use namespace::autoclean;
14   -use DataFlow::Types qw(HTMLFilterTypes);
15   -use HTML::TreeBuilder::XPath;
16   -use MooseX::Aliases;
17   -
18   -has 'search_xpath' => (
19   - 'is' => 'ro',
20   - 'isa' => 'Str',
21   - 'required' => 1,
22   - 'alias' => 'xpath',
23   -);
24   -
25   -has 'result_type' => (
26   - 'is' => 'ro',
27   - 'isa' => 'HTMLFilterTypes',
28   - 'default' => 'HTML',
29   - 'alias' => 'type',
30   -);
31   -
32   -has 'ref_result' => (
33   - 'is' => 'ro',
34   - 'isa' => 'Bool',
35   - 'default' => 0,
36   -);
37   -
38   -has 'nochomp' => (
39   - 'is' => 'ro',
40   - 'isa' => 'Bool',
41   - 'default' => 0,
42   -);
43   -
44   -has '+p' => (
45   - 'lazy' => 1,
46   - 'default' => sub {
47   - my $self = shift;
48   -
49   - my $proc = sub {
50   - my $html = HTML::TreeBuilder::XPath->new_from_content($_);
51   -
52   - #warn 'xpath is built';
53   - #warn 'values if VALUES';
54   - return $html->findvalues( $self->search_xpath )
55   - if $self->result_type eq 'VALUE';
56   -
57   - #warn 'not values, find nodes';
58   - my @result = $html->findnodes( $self->search_xpath );
59   -
60   - #use Data::Dumper; warn 'result = '.Dumper(\@result);
61   - return () unless @result;
62   - return @result if $self->result_type eq 'NODE';
63   -
64   - #warn 'wants HTML';
65   - return map { $_->as_HTML } @result;
66   - };
67   -
68   - my $proc2 = $self->nochomp ? $proc : sub { return chomp $proc->(@_) };
69   - my $proc3 =
70   - $self->ref_result ? sub { return [ $proc2->(@_) ] } : $proc2;
71   -
72   - return $self->ref_result ? sub { return [ $proc->(@_) ] } : $proc;
73   - },
74   -);
75   -
76   -__PACKAGE__->meta->make_immutable;
77   -
78   -1;
79   -
80   -__END__
81   -
82   -=pod
83   -
84   -=head1 SYNOPSIS
85   -
86   - use DataFlow::Proc::HTMLFilter;
87   -
88   - my $filter_html = DataFlow::Proc::HTMLFilter->new(
89   - search_xpath => '//td',
90   - result_type => 'HTML',
91   - );
92   -
93   - my $filter_value = DataFlow::Proc::HTMLFilter->new(
94   - search_xpath => '//td',
95   - result_type => 'VALUE',
96   - );
97   -
98   - my $input = <<EOM;
99   - <html><body>
100   - <table>
101   - <tr><td>Line 1</td><td>L1, Column 2</td>
102   - <tr><td>Line 2</td><td>L2, Column 2</td>
103   - </table>
104   - </body></html>
105   - EOM
106   -
107   - $filter_html->process( $input );
108   - # @result == '<td>Line 1</td>', ... '<td>L2, Column 2</td>'
109   -
110   - $filter_value->process( $input );
111   - # @result == q{Line 1}, ... q{L2, Column 2}
112   -
113   -=head1 DESCRIPTION
114   -
115   -This processor type provides a filter for HTML content.
116   -Each item will be treated as HTML content and will be filtered
117   -using L<HTML::TreeBuilder::XPath>.
118   -
119   -=attr search_xpath
120   -
121   -This attribute is an XPath string used to filter down the HTML content.
122   -The C<search_xpath> attribute is mandatory.
123   -
124   -=attr result_type
125   -
126   -This attribute is a string, but its value B<must> be one of:
127   -C<HTML>, C<VALUE>, C<NODE>. The default is C<HTML>.
128   -
129   -=over
130   -
131   -=item HTML
132   -
133   -The result will be the HTML content specified by C<search_xpath>.
134   -
135   -=item VALUE
136   -
137   -The result will be the literal value enclosed by the tag and/or attribute
138   -specified by C<search_xpath>.
139   -
140   -=item NODE
141   -
142   -The result will be a list of L<HTML::Element> objects, as returned by the
143   -C<findnodes> method of L<HTML::TreeBuilder::XPath> class.
144   -
145   -=back
146   -
147   -Most people will probably use C<HTML> or C<VALUE>, but this option is also
148   -provided in case someone wants to manipulate the HTML elements directly.
149   -
150   -=attr ref_result
151   -
152   -This attribute is a boolean, and it signals whether the result list should be
153   -added as a list of items to the output queue, or as a reference to an array
154   -of items. The default is 0 (false).
155   -
156   -There is a semantic subtlety here: if C<ref_result> is 1 (true),
157   -then one HTML item (input) generates at most one ArrayRef item (output),
158   -i.e. it is a one-to-one mapping.
159   -On the other hand, by keeping C<ref_result> as 0 (false), one HTML item
160   -may produce any number of items as result,
161   -i.e. it is a one-to-many mapping.
162   -
163   -=cut
164   -
20 lib/DataFlow/Types.pm
@@ -143,10 +143,6 @@ coerce 'Encoder' => from 'Str' => via {
143 143 return sub { return encode( $encoding, shift ) };
144 144 };
145 145
146   -# subtype for DataFlow::Proc::HTMLFilter ######################
147   -
148   -enum 'HTMLFilterTypes', [qw(NODE HTML VALUE)];
149   -
150 146 1;
151 147
152 148 =pod
@@ -300,21 +296,5 @@ encoding.
300 296 It will automagically create a C<sub> that uses function C<< encode() >> from
301 297 module L<Encode> to encode to a named encoding.
302 298
303   -=head2 HTMLFilterTypes
304   -
305   -An enumeration used by type L<DataFlow::Proc::HTMLFilter>,
306   -containing three elements, representing the type of result the HTMLFilter
307   -object will provide:
308   -
309   -=for :list
310   -* NODE
311   -Results will be L<HTML::Element> objects
312   -* HTML
313   -Results will be HTML content.
314   -* VALUE
315   -Results will be literal values
316   -
317   -See DataFlow::Proc::HTMLFilter for more information.
318   -
319 299 =cut
320 300
60 t/64-htmlfilter.t
... ... @@ -1,60 +0,0 @@
1   -
2   -use Test::More tests => 12;
3   -
4   -BEGIN {
5   - use_ok('DataFlow::Proc::HTMLFilter');
6   -}
7   -
8   -my $fail = eval q{DataFlow::Proc::HTMLFilter->new};
9   -ok($@);
10   -
11   -my $filter1 = DataFlow::Proc::HTMLFilter->new( search_xpath => '//td', );
12   -ok($filter1);
13   -ok( !defined( $filter1->process() ) );
14   -
15   -my $html = <<HTML_END;
16   -<html>
17   - <body>
18   - <table>
19   - <tr>
20   - <th>A</th>
21   - <th>B</th>
22   - <th>C</th>
23   - </tr>
24   - <tr>
25   - <td>a1 yababaga </td>
26   - <td>b1 bugalu</td>
27   - <td>c1 potatoes</td>
28   - </tr>
29   - </table>
30   - </body>
31   -</html>
32   -HTML_END
33   -
34   -my @res = $filter1->process($html);
35   -is( scalar @res, 3, 'result has the right size' );
36   -is( $res[2], '<td>c1 potatoes</td>' );
37   -
38   -my $filter2 = DataFlow::Proc::HTMLFilter->new(
39   - search_xpath => '//td',
40   - result_type => 'VALUE',
41   -);
42   -ok($filter2);
43   -
44   -my @res2 = $filter2->process($html);
45   -is( scalar @res2, 3, 'result has the right size' );
46   -is( $res2[1], 'b1 bugalu' );
47   -
48   -my $filter3 = DataFlow::Proc::HTMLFilter->new(
49   - search_xpath => '//th',
50   - result_type => 'VALUE',
51   - ref_result => 1,
52   -);
53   -ok($filter3);
54   -
55   -my @res3 = $filter3->process($html);
56   -is( scalar @{ $res3[0] }, 3, 'result has the right size' );
57   -is_deeply( $res3[0], [qw/A B C/], 'produces the expected result' );
58   -
59   -# TODO: add tests to check the 'nochomp' option
60   -

0 comments on commit 9e5fa36

Please sign in to comment.
Something went wrong with that request. Please try again.