]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/) + list.each do |block| + next unless block + block.strip! + next if has_only_tags(block) + continuous /= continuous_factor if body.length > 0 + + # リンク除外&リンクリスト判定 + notlinked = eliminate_link(block) + next if notlinked.length < min_length + + # スコア算出 + c = (notlinked.length + notlinked.scan(punctuations).length * punctuation_weight) * factor + factor *= decay_factor + not_body_rate = block.scan(waste_expressions).length + block.scan(/amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0 + c *= (0.72 ** not_body_rate) if not_body_rate>0 + c1 = c * continuous + puts "----- #{c}*#{continuous}=#{c1} #{notlinked.length} \n#{strip_tags(block)[0,100]}\n" if debug + + # ブロック抽出&スコア加算 + if c1 > threshold + body += block + "\n" + score += c1 + continuous = continuous_factor + elsif c > threshold # continuous block end + bodylist << [body, score] + body = block + "\n" + score = c + continuous = continuous_factor + end + end + bodylist << [body, score] + body = bodylist.inject{|a,b| if a[1]>=b[1] then a else b end } + [strip_tags(body[0]), title] + end + + # Extracts title. + def self.extract_title(st) + if st =~ /