Permalink
Browse files

Convert HTML parsing from html-conduit to xml-html-conduit-lens

Lens magic \o/
  • Loading branch information...
sdroege committed Aug 29, 2016
1 parent 610fdee commit 7dacdab658651c633aa02f168c51077cdb1a791f
Showing with 241 additions and 275 deletions.
  1. +3 −3 html-rss-proxy.cabal
  2. +2 −0 src/Main.hs
  3. +59 −65 src/MakThes.hs
  4. +75 −102 src/ThePressProject.hs
  5. +89 −93 src/ToVima.hs
  6. +13 −12 src/Utils.hs
View
@@ -38,12 +38,12 @@ executable html-rss-proxy
, containers
, transformers
, scotty
, conduit
, http-conduit
, xml-types
, xml-conduit
, xml-conduit-writer
, html-conduit
, xml-html-conduit-lens
, split
, lens
, cereal
, cereal-text
, filelock
View
@@ -4,6 +4,7 @@ import Types
import Rss
import Db
import Config
import Utils
import Data.Monoid
@@ -101,6 +102,7 @@ updateChannels path errorChannels = getCurrentMonotonicTime >>= go
handlers name = [ Handler (\(e :: IOException) -> storeException name (T.pack (show e)))
, Handler (\(e :: HttpException) -> storeException name (T.pack (show e)))
, Handler (\(e :: XmlException) -> storeException name (T.pack (show e)))
, Handler (\(e :: ParsingException) -> storeException name (T.pack (show e)))
, Handler (\(e :: SomeException) -> putStrLn ("Exception while updating " ++ T.unpack name ++ ": " ++ show e) >> exitFailure)
]
View
@@ -12,16 +12,19 @@ import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Catch
import Data.String
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Conduit
import Network.HTTP.Simple (httpSink)
import Network.HTTP.Conduit (parseUrl)
import qualified Data.XML.Types as XT
import qualified Text.XML.Stream.Parse as XP
import qualified Text.HTML.DOM as HTML
import Data.List.Split
import qualified Data.ByteString.Lazy as BL
import Control.Lens
import Text.Xml.Lens
import qualified Network.HTTP.Simple as HTTP
data MakThesArticle = MakThesArticle
{ articleTitle :: Text
@@ -35,67 +38,58 @@ urlBase, url :: String
urlBase = "http://www.makthes.gr"
url = "http://www.makthes.gr/ItemsFlow/GR/Roi_Eidiseon"
getChannel :: (MonadIO m) => m Channel
getChannel :: (MonadIO m, MonadThrow m) => m Channel
getChannel = do
articles <- fmap (fmap toArticle) $ do
req <- liftIO $ parseUrl url
liftIO $ httpSink req $ \_ ->
HTML.eventConduit =$= parseHtml
pure Channel { channelTitle = "ΜακΘεσ"
, channelLink = "http://www.makthes.gr"
, channelDescription = "Εφημερίδα Μακεδονία της Θεσσαλονίκης"
, channelArticles = articles
}
resp <- HTTP.httpLBS (fromString url)
let body = HTTP.getResponseBody resp
articles = fmap (fmap toArticle) (parseHtml body)
case articles of
Nothing -> throwM ParsingException
Just articles -> pure Channel { channelTitle = "ΜακΘεσ"
, channelLink = "http://www.makthes.gr"
, channelDescription = "Εφημερίδα Μακεδονία της Θεσσαλονίκης"
, channelArticles = articles
}
-- FIXME: The date without a proper time is not useful
toArticle :: MakThesArticle -> Article
toArticle (MakThesArticle title date text link) = Article title link description Nothing
toArticle (MakThesArticle title date summary link) = Article title (T.pack urlBase <> link) description Nothing
where
description = date <> "<br/><br/>" <> text
-- FIXME: How to make this a Conduit XT.Event m MakThesArticle instead, i.e.
-- produce the articles lazily?
parseHtml :: (MonadThrow m) => ConduitM XT.Event o m [MakThesArticle]
parseHtml = XP.force "html" $
XP.tagName "html" XP.ignoreAttrs $ \_ -> do
void (XP.many (XP.ignoreTree (/= "body")))
articles <- XP.force "body" $ XP.tagName "body" XP.ignoreAttrs $ \_ ->
mconcat <$> XP.many' parseForm1
void (XP.many XP.ignoreAllTreesContent)
pure articles
parseForm1 :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
parseForm1 = tagNameWithAttrValue "form" "id" "form1" $
mconcat <$> XP.many' parseDivMainPlaceholder
parseDivMainPlaceholder :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
parseDivMainPlaceholder = tagNameWithAttrValue "div" "id" "mainplaceholder" $
mconcat <$> XP.many' parseDivInnerPage
parseDivInnerPage :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
parseDivInnerPage = tagNameWithAttrValue "div" "id" "innerpage" $
mconcat <$> XP.many' parseDivMainContet
parseDivMainContet :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
parseDivMainContet = tagNameWithAttrValue "div" "class" "maincontet" $
mconcat <$> XP.many' parseDivItemsFlow
parseDivItemsFlow :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
parseDivItemsFlow = tagNameWithAttrValue "div" "id" "divitemsflow" $
XP.many parseItem
parseItem :: (MonadThrow m) => ConduitM XT.Event o m (Maybe MakThesArticle)
parseItem = do
date <- tagNameWithAttrValue "div" "class" "date red" XP.content
case date of
Nothing -> pure Nothing
Just realDate -> do
(title, link) <- XP.force "div class title" $ tagNameWithAttrValue "div" "class" "title" $
XP.force "a title" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ \link -> do
title <- XP.content
pure (title, T.pack urlBase <> link)
text <- XP.force "text" $ tagNameWithAttrValue "div" "class" "text" XP.content
void (XP.ignoreTagName "hr")
pure (Just (MakThesArticle title realDate text link))
description = date <> "<br/><br/>" <> summary
parseHtml :: BL.ByteString -> Maybe [MakThesArticle]
parseHtml body = case articles of
Nothing -> Nothing
Just [] -> Nothing
Just xs -> Just xs
where
-- Extract the list of divs that contain what we're interested in
extractedArticleDivs =
body ^.. html ...
named (only "body") ...
named (only "form") . withAttribute "id" "form1" ...
named (only "div") . withAttribute "id" "mainplaceholder" ...
named (only "div") . withAttribute "id" "innerpage" ...
named (only "div") . withAttribute "class" "maincontet" ...
named (only "div") . withAttribute "id" "divitemsflow" .
plate
-- Group them, each article is 3 divs followed by an hr
groupedArticleDivs = (split . dropFinalBlank . dropDelims . whenElt) (\a -> a ^. name == "hr") extractedArticleDivs
-- Convert the groups of divs to our data structure
articles = sequenceA $ fmap divGroupToArticle groupedArticleDivs
divGroupToArticle :: [Element] -> Maybe MakThesArticle
divGroupToArticle [dateRedDiv, divTitle, divText] = MakThesArticle <$> title <*> date <*> summary <*> link
where
date = dateRedDiv ^? withAttribute "class" "date red" . text
titleLinkA = divTitle ^? withAttribute "class" "title" ...
named (only "a")
link = titleLinkA ^? _Just . attr "href" . _Just
title = titleLinkA ^? _Just . text
summary = Just $ divText ^. withAttribute "class" "text" . text
divGroupToArticle _ = Nothing
View
@@ -12,16 +12,19 @@ import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Catch
import Data.String
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Conduit
import Network.HTTP.Simple (httpSink)
import Network.HTTP.Conduit (parseUrl)
import qualified Data.XML.Types as XT
import qualified Text.XML.Stream.Parse as XP
import qualified Text.HTML.DOM as HTML
import Data.List.Split
import qualified Data.ByteString.Lazy as BL
import Control.Lens
import Text.Xml.Lens
import qualified Network.HTTP.Simple as HTTP
data ThePressProjectArticle = ThePressProjectArticle
{ articleTitle :: Text
@@ -36,106 +39,76 @@ urlBase, url :: String
urlBase = "http://www.thepressproject.gr"
url = "http://www.thepressproject.gr/list_en.php"
getChannel :: (MonadIO m) => m Channel
getChannel :: (MonadIO m, MonadThrow m) => m Channel
getChannel = do
articles <- fmap (fmap toArticle) $ do
req <- liftIO $ parseUrl url
liftIO $ httpSink req $ \_ ->
HTML.eventConduit =$= parseHtml
resp <- HTTP.httpLBS (fromString url)
let body = HTTP.getResponseBody resp
articles = fmap (fmap toArticle) (parseHtml body)
case articles of
Nothing -> throwM ParsingException
Just articles -> pure Channel { channelTitle = "ThePressProject"
, channelLink = "http://www.thepressproject.gr"
, channelDescription = "ThePressProject"
, channelArticles = articles
}
pure Channel { channelTitle = "ThePressProject"
, channelLink = "http://www.thepressproject.gr"
, channelDescription = "ThePressProject"
, channelArticles = articles
}
-- FIXME: The date without a proper time is not useful
toArticle :: ThePressProjectArticle -> Article
toArticle (ThePressProjectArticle title date text link img) = Article title link description Nothing
toArticle (ThePressProjectArticle title date summary link img) = Article title (T.pack urlBase <> "/" <> link) description Nothing
where
description = "<img src=\"" <> T.pack urlBase <> T.drop 2 img <> "\"/>" <> date <> "<br/><br/>" <> summary
parseHtml :: BL.ByteString -> Maybe [ThePressProjectArticle]
parseHtml body = case articles of
Nothing -> Nothing
Just [] -> Nothing
Just xs -> Just xs
where
description = "<img src=\"" <> img <> "\"/>" <> date <> "<br/><br/>" <> text
-- FIXME: How to make this a Conduit XT.Event m ThePressProjectArticle instead, i.e.
-- produce the articles lazily?
parseHtml :: (MonadThrow m) => ConduitM XT.Event o m [ThePressProjectArticle]
parseHtml = XP.force "html" $
XP.tagName "html" XP.ignoreAttrs $ \_ -> do
void (XP.many (XP.ignoreTree (/= "body")))
articles <- XP.force "body" $ XP.tagName "body" XP.ignoreAttrs $ \_ ->
mconcat <$> XP.many' parseDivFull
void (XP.many XP.ignoreAllTreesContent)
pure articles
parseDivFull :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
parseDivFull = tagNameWithAttrValue "div" "class" "full" $
mconcat <$> XP.many' parseDivMainLeft
parseDivMainLeft :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
parseDivMainLeft = tagNameWithAttrValue "div" "class" "mainleft" $
mconcat <$> XP.many' parseDivC70
parseDivC70 :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
parseDivC70 = tagNameWithAttrValue "div" "class" "c70" $
XP.force "div inner c70" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
mconcat <$> XP.many' parseDivSemiRight
parseDivSemiRight :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
parseDivSemiRight = tagNameWithAttrValue "div" "class" "semiright" $
mconcat <$> XP.many' parseDivList
parseDivList :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
parseDivList = tagNameWithAttrValue "div" "class" "list" $
XP.many parseItem
parseItem :: (MonadThrow m) => ConduitM XT.Event o m (Maybe ThePressProjectArticle)
parseItem = do
article <- XP.tagName "div" XP.ignoreAttrs $ \_ -> do
(date, img, link) <- parseDateImgLink
(title, text) <- parseText
pure (ThePressProjectArticle title date text link img)
case article of
Nothing -> pure Nothing
Just _ -> do
XP.force "div clear" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
void (XP.many XP.ignoreAllTreesContent)
pure article
parseDateImgLink :: (MonadThrow m) => ConduitM XT.Event o m (Text, Text, Text)
parseDateImgLink = XP.force "div date" $ XP.tagName "div" XP.ignoreAttrs $ \_ -> do
(link, img) <- XP.force "div line" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
XP.force "a" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ \href -> do
img <- XP.force "img" $ XP.tagName "img" (XP.requireAttr "src" <* XP.ignoreAttrs) pure
pure (T.pack urlBase <> "/" <> href, T.pack urlBase <> T.drop 2 img)
date <- XP.force "div" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
XP.force "div open" $ tagNameWithAttrValue "div" "class" "open" $
XP.force "table" $ XP.tagName "table" XP.ignoreAttrs $ \_ ->
XP.force "tr" $ XP.tagName "tr" XP.ignoreAttrs $ \_ ->
XP.force "td" $ XP.tagName "td" XP.ignoreAttrs $ const XP.content
pure (date, img, link)
parseText :: (MonadThrow m) => ConduitM XT.Event o m (Text, Text)
parseText = XP.force "div text" $ XP.tagName "div" XP.ignoreAttrs $ \_ -> do
title <- XP.force "div title link" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
XP.force "a title" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ const XP.content
XP.force "style" $ XP.tagName "style" XP.ignoreAttrs $ \_ ->
void (XP.many XP.ignoreAllTreesContent)
XP.force "div gap" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
void (XP.many XP.ignoreAllTreesContent)
text <- XP.force "div text" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
XP.force "a text" $ XP.tagName "a" XP.ignoreAttrs $ \_ -> do
void $ tagNameWithAttrValue "div" "class" "ticker" $
void XP.ignoreAllTreesContent
XP.content
XP.force "div open" $ tagNameWithAttrValue "div" "class" "open" $
void (XP.many XP.ignoreAllTreesContent)
pure (title, text)
-- Extract the list of divs that contain what we're interested in
extractedArticleDivs =
body ^.. html ...
named (only "body") ...
named (only "div") . withAttribute "class" "full" ...
named (only "div") . withAttribute "class" "mainleft" ...
named (only "div") . withAttribute "class" "c70" ...
named (only "div") ...
named (only "div") . withAttribute "class" "semiright" ...
named (only "div") . withAttribute "class" "list" ...
filtered (\e -> has (named (only "div")) e && hasn't (withAttribute "style" "clear:both;") e)
-- Convert the groups of divs to our data structure
articles = sequenceA $ fmap divToArticle extractedArticleDivs
divToArticle :: Element -> Maybe ThePressProjectArticle
divToArticle divArticle = ThePressProjectArticle <$> title <*> date <*> summary <*> link <*> img
where
leftDiv = divArticle ^? plate .
named (only "div") . withAttribute "style" "float:left; width:25%;"
imgA = leftDiv ^? traverse . ix 0 .
named (only "div") ...
named (only "a") . filtered (has (node "img"))
link = imgA ^? traverse . attr "href" . _Just
img = imgA ^? traverse . ix 0 .
named (only "img") . attr "src" . _Just
date = leftDiv ^? traverse . ix 1 .
named (only "div") ...
named (only "div") . withAttribute "class" "open" ...
named (only "table") ...
named (only "tr") ...
named (only "td") . text
rightDiv = divArticle ^? plate .
named (only "div") . withAttribute "style" "float:left; width:70%;"
title = rightDiv ^? traverse . ix 0 ...
named (only "a") . text
summary = rightDiv ^? traverse . ix 3 ...
named (only "a") . text
Oops, something went wrong.

0 comments on commit 7dacdab

Please sign in to comment.