Permalink
Browse files

Convert HTML parsing from html-conduit to xml-html-conduit-lens

Lens magic \o/
  • Loading branch information...
1 parent 610fdee commit 7dacdab658651c633aa02f168c51077cdb1a791f @sdroege committed Aug 29, 2016
Showing with 241 additions and 275 deletions.
  1. +3 −3 html-rss-proxy.cabal
  2. +2 −0 src/Main.hs
  3. +59 −65 src/MakThes.hs
  4. +75 −102 src/ThePressProject.hs
  5. +89 −93 src/ToVima.hs
  6. +13 −12 src/Utils.hs
@@ -38,12 +38,12 @@ executable html-rss-proxy
, containers
, transformers
, scotty
- , conduit
, http-conduit
- , xml-types
, xml-conduit
, xml-conduit-writer
- , html-conduit
+ , xml-html-conduit-lens
+ , split
+ , lens
, cereal
, cereal-text
, filelock
View
@@ -4,6 +4,7 @@ import Types
import Rss
import Db
import Config
+import Utils
import Data.Monoid
@@ -101,6 +102,7 @@ updateChannels path errorChannels = getCurrentMonotonicTime >>= go
handlers name = [ Handler (\(e :: IOException) -> storeException name (T.pack (show e)))
, Handler (\(e :: HttpException) -> storeException name (T.pack (show e)))
, Handler (\(e :: XmlException) -> storeException name (T.pack (show e)))
+ , Handler (\(e :: ParsingException) -> storeException name (T.pack (show e)))
, Handler (\(e :: SomeException) -> putStrLn ("Exception while updating " ++ T.unpack name ++ ": " ++ show e) >> exitFailure)
]
View
@@ -12,16 +12,19 @@ import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Catch
+import Data.String
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
-import Data.Conduit
-import Network.HTTP.Simple (httpSink)
-import Network.HTTP.Conduit (parseUrl)
-import qualified Data.XML.Types as XT
-import qualified Text.XML.Stream.Parse as XP
-import qualified Text.HTML.DOM as HTML
+import Data.List.Split
+
+import qualified Data.ByteString.Lazy as BL
+
+import Control.Lens
+import Text.Xml.Lens
+
+import qualified Network.HTTP.Simple as HTTP
data MakThesArticle = MakThesArticle
{ articleTitle :: Text
@@ -35,67 +38,58 @@ urlBase, url :: String
urlBase = "http://www.makthes.gr"
url = "http://www.makthes.gr/ItemsFlow/GR/Roi_Eidiseon"
-getChannel :: (MonadIO m) => m Channel
+getChannel :: (MonadIO m, MonadThrow m) => m Channel
getChannel = do
- articles <- fmap (fmap toArticle) $ do
- req <- liftIO $ parseUrl url
- liftIO $ httpSink req $ \_ ->
- HTML.eventConduit =$= parseHtml
-
- pure Channel { channelTitle = "ΜακΘεσ"
- , channelLink = "http://www.makthes.gr"
- , channelDescription = "Εφημερίδα Μακεδονία της Θεσσαλονίκης"
- , channelArticles = articles
- }
+ resp <- HTTP.httpLBS (fromString url)
+ let body = HTTP.getResponseBody resp
+ articles = fmap (fmap toArticle) (parseHtml body)
+
+ case articles of
+ Nothing -> throwM ParsingException
+ Just articles -> pure Channel { channelTitle = "ΜακΘεσ"
+ , channelLink = "http://www.makthes.gr"
+ , channelDescription = "Εφημερίδα Μακεδονία της Θεσσαλονίκης"
+ , channelArticles = articles
+ }
-- FIXME: The date without a proper time is not useful
toArticle :: MakThesArticle -> Article
-toArticle (MakThesArticle title date text link) = Article title link description Nothing
+toArticle (MakThesArticle title date summary link) = Article title (T.pack urlBase <> link) description Nothing
where
- description = date <> "<br/><br/>" <> text
-
--- FIXME: How to make this a Conduit XT.Event m MakThesArticle instead, i.e.
--- produce the articles lazily?
-parseHtml :: (MonadThrow m) => ConduitM XT.Event o m [MakThesArticle]
-parseHtml = XP.force "html" $
- XP.tagName "html" XP.ignoreAttrs $ \_ -> do
- void (XP.many (XP.ignoreTree (/= "body")))
- articles <- XP.force "body" $ XP.tagName "body" XP.ignoreAttrs $ \_ ->
- mconcat <$> XP.many' parseForm1
- void (XP.many XP.ignoreAllTreesContent)
- pure articles
-
-parseForm1 :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
-parseForm1 = tagNameWithAttrValue "form" "id" "form1" $
- mconcat <$> XP.many' parseDivMainPlaceholder
-
-parseDivMainPlaceholder :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
-parseDivMainPlaceholder = tagNameWithAttrValue "div" "id" "mainplaceholder" $
- mconcat <$> XP.many' parseDivInnerPage
-
-parseDivInnerPage :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
-parseDivInnerPage = tagNameWithAttrValue "div" "id" "innerpage" $
- mconcat <$> XP.many' parseDivMainContet
-
-parseDivMainContet :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
-parseDivMainContet = tagNameWithAttrValue "div" "class" "maincontet" $
- mconcat <$> XP.many' parseDivItemsFlow
-
-parseDivItemsFlow :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [MakThesArticle])
-parseDivItemsFlow = tagNameWithAttrValue "div" "id" "divitemsflow" $
- XP.many parseItem
-
-parseItem :: (MonadThrow m) => ConduitM XT.Event o m (Maybe MakThesArticle)
-parseItem = do
- date <- tagNameWithAttrValue "div" "class" "date red" XP.content
- case date of
- Nothing -> pure Nothing
- Just realDate -> do
-
- (title, link) <- XP.force "div class title" $ tagNameWithAttrValue "div" "class" "title" $
- XP.force "a title" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ \link -> do
- title <- XP.content
- pure (title, T.pack urlBase <> link)
- text <- XP.force "text" $ tagNameWithAttrValue "div" "class" "text" XP.content
- void (XP.ignoreTagName "hr")
- pure (Just (MakThesArticle title realDate text link))
+ description = date <> "<br/><br/>" <> summary
+
+parseHtml :: BL.ByteString -> Maybe [MakThesArticle]
+parseHtml body = case articles of
+ Nothing -> Nothing
+ Just [] -> Nothing
+ Just xs -> Just xs
+ where
+ -- Extract the list of divs that contain what we're interested in
+ extractedArticleDivs =
+ body ^.. html ...
+ named (only "body") ...
+ named (only "form") . withAttribute "id" "form1" ...
+ named (only "div") . withAttribute "id" "mainplaceholder" ...
+ named (only "div") . withAttribute "id" "innerpage" ...
+ named (only "div") . withAttribute "class" "maincontet" ...
+ named (only "div") . withAttribute "id" "divitemsflow" .
+ plate
+
+ -- Group them, each article is 3 divs followed by an hr
+ groupedArticleDivs = (split . dropFinalBlank . dropDelims . whenElt) (\a -> a ^. name == "hr") extractedArticleDivs
+
+ -- Convert the groups of divs to our data structure
+ articles = sequenceA $ fmap divGroupToArticle groupedArticleDivs
+
+divGroupToArticle :: [Element] -> Maybe MakThesArticle
+divGroupToArticle [dateRedDiv, divTitle, divText] = MakThesArticle <$> title <*> date <*> summary <*> link
+ where
+ date = dateRedDiv ^? withAttribute "class" "date red" . text
+ titleLinkA = divTitle ^? withAttribute "class" "title" ...
+ named (only "a")
+ link = titleLinkA ^? _Just . attr "href" . _Just
+ title = titleLinkA ^? _Just . text
+ summary = Just $ divText ^. withAttribute "class" "text" . text
+
+divGroupToArticle _ = Nothing
+
View
@@ -12,16 +12,19 @@ import Control.Monad
import Control.Monad.IO.Class
import Control.Monad.Catch
+import Data.String
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
-import Data.Conduit
-import Network.HTTP.Simple (httpSink)
-import Network.HTTP.Conduit (parseUrl)
-import qualified Data.XML.Types as XT
-import qualified Text.XML.Stream.Parse as XP
-import qualified Text.HTML.DOM as HTML
+import Data.List.Split
+
+import qualified Data.ByteString.Lazy as BL
+
+import Control.Lens
+import Text.Xml.Lens
+
+import qualified Network.HTTP.Simple as HTTP
data ThePressProjectArticle = ThePressProjectArticle
{ articleTitle :: Text
@@ -36,106 +39,76 @@ urlBase, url :: String
urlBase = "http://www.thepressproject.gr"
url = "http://www.thepressproject.gr/list_en.php"
-getChannel :: (MonadIO m) => m Channel
+getChannel :: (MonadIO m, MonadThrow m) => m Channel
getChannel = do
- articles <- fmap (fmap toArticle) $ do
- req <- liftIO $ parseUrl url
- liftIO $ httpSink req $ \_ ->
- HTML.eventConduit =$= parseHtml
+ resp <- HTTP.httpLBS (fromString url)
+ let body = HTTP.getResponseBody resp
+ articles = fmap (fmap toArticle) (parseHtml body)
+
+ case articles of
+ Nothing -> throwM ParsingException
+ Just articles -> pure Channel { channelTitle = "ThePressProject"
+ , channelLink = "http://www.thepressproject.gr"
+ , channelDescription = "ThePressProject"
+ , channelArticles = articles
+ }
- pure Channel { channelTitle = "ThePressProject"
- , channelLink = "http://www.thepressproject.gr"
- , channelDescription = "ThePressProject"
- , channelArticles = articles
- }
-- FIXME: The date without a proper time is not useful
toArticle :: ThePressProjectArticle -> Article
-toArticle (ThePressProjectArticle title date text link img) = Article title link description Nothing
+toArticle (ThePressProjectArticle title date summary link img) = Article title (T.pack urlBase <> "/" <> link) description Nothing
+ where
+ description = "<img src=\"" <> T.pack urlBase <> T.drop 2 img <> "\"/>" <> date <> "<br/><br/>" <> summary
+
+
+parseHtml :: BL.ByteString -> Maybe [ThePressProjectArticle]
+parseHtml body = case articles of
+ Nothing -> Nothing
+ Just [] -> Nothing
+ Just xs -> Just xs
where
- description = "<img src=\"" <> img <> "\"/>" <> date <> "<br/><br/>" <> text
-
--- FIXME: How to make this a Conduit XT.Event m ThePressProjectArticle instead, i.e.
--- produce the articles lazily?
-parseHtml :: (MonadThrow m) => ConduitM XT.Event o m [ThePressProjectArticle]
-parseHtml = XP.force "html" $
- XP.tagName "html" XP.ignoreAttrs $ \_ -> do
- void (XP.many (XP.ignoreTree (/= "body")))
- articles <- XP.force "body" $ XP.tagName "body" XP.ignoreAttrs $ \_ ->
- mconcat <$> XP.many' parseDivFull
- void (XP.many XP.ignoreAllTreesContent)
- pure articles
-
-parseDivFull :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
-parseDivFull = tagNameWithAttrValue "div" "class" "full" $
- mconcat <$> XP.many' parseDivMainLeft
-
-parseDivMainLeft :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
-parseDivMainLeft = tagNameWithAttrValue "div" "class" "mainleft" $
- mconcat <$> XP.many' parseDivC70
-
-parseDivC70 :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
-parseDivC70 = tagNameWithAttrValue "div" "class" "c70" $
- XP.force "div inner c70" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- mconcat <$> XP.many' parseDivSemiRight
-
-parseDivSemiRight :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
-parseDivSemiRight = tagNameWithAttrValue "div" "class" "semiright" $
- mconcat <$> XP.many' parseDivList
-
-parseDivList :: (MonadThrow m) => ConduitM XT.Event o m (Maybe [ThePressProjectArticle])
-parseDivList = tagNameWithAttrValue "div" "class" "list" $
- XP.many parseItem
-
-parseItem :: (MonadThrow m) => ConduitM XT.Event o m (Maybe ThePressProjectArticle)
-parseItem = do
- article <- XP.tagName "div" XP.ignoreAttrs $ \_ -> do
- (date, img, link) <- parseDateImgLink
- (title, text) <- parseText
-
- pure (ThePressProjectArticle title date text link img)
-
- case article of
- Nothing -> pure Nothing
- Just _ -> do
- XP.force "div clear" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- void (XP.many XP.ignoreAllTreesContent)
- pure article
-
-parseDateImgLink :: (MonadThrow m) => ConduitM XT.Event o m (Text, Text, Text)
-parseDateImgLink = XP.force "div date" $ XP.tagName "div" XP.ignoreAttrs $ \_ -> do
- (link, img) <- XP.force "div line" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- XP.force "a" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ \href -> do
- img <- XP.force "img" $ XP.tagName "img" (XP.requireAttr "src" <* XP.ignoreAttrs) pure
- pure (T.pack urlBase <> "/" <> href, T.pack urlBase <> T.drop 2 img)
-
- date <- XP.force "div" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- XP.force "div open" $ tagNameWithAttrValue "div" "class" "open" $
- XP.force "table" $ XP.tagName "table" XP.ignoreAttrs $ \_ ->
- XP.force "tr" $ XP.tagName "tr" XP.ignoreAttrs $ \_ ->
- XP.force "td" $ XP.tagName "td" XP.ignoreAttrs $ const XP.content
-
- pure (date, img, link)
-
-parseText :: (MonadThrow m) => ConduitM XT.Event o m (Text, Text)
-parseText = XP.force "div text" $ XP.tagName "div" XP.ignoreAttrs $ \_ -> do
- title <- XP.force "div title link" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- XP.force "a title" $ XP.tagName "a" (XP.requireAttr "href" <* XP.ignoreAttrs) $ const XP.content
-
- XP.force "style" $ XP.tagName "style" XP.ignoreAttrs $ \_ ->
- void (XP.many XP.ignoreAllTreesContent)
-
- XP.force "div gap" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- void (XP.many XP.ignoreAllTreesContent)
-
- text <- XP.force "div text" $ XP.tagName "div" XP.ignoreAttrs $ \_ ->
- XP.force "a text" $ XP.tagName "a" XP.ignoreAttrs $ \_ -> do
- void $ tagNameWithAttrValue "div" "class" "ticker" $
- void XP.ignoreAllTreesContent
- XP.content
-
- XP.force "div open" $ tagNameWithAttrValue "div" "class" "open" $
- void (XP.many XP.ignoreAllTreesContent)
-
- pure (title, text)
+ -- Extract the list of divs that contain what we're interested in
+ extractedArticleDivs =
+ body ^.. html ...
+ named (only "body") ...
+ named (only "div") . withAttribute "class" "full" ...
+ named (only "div") . withAttribute "class" "mainleft" ...
+ named (only "div") . withAttribute "class" "c70" ...
+ named (only "div") ...
+ named (only "div") . withAttribute "class" "semiright" ...
+ named (only "div") . withAttribute "class" "list" ...
+ filtered (\e -> has (named (only "div")) e && hasn't (withAttribute "style" "clear:both;") e)
+
+ -- Convert the groups of divs to our data structure
+ articles = sequenceA $ fmap divToArticle extractedArticleDivs
+
+divToArticle :: Element -> Maybe ThePressProjectArticle
+divToArticle divArticle = ThePressProjectArticle <$> title <*> date <*> summary <*> link <*> img
+ where
+ leftDiv = divArticle ^? plate .
+ named (only "div") . withAttribute "style" "float:left; width:25%;"
+
+ imgA = leftDiv ^? traverse . ix 0 .
+ named (only "div") ...
+ named (only "a") . filtered (has (node "img"))
+
+ link = imgA ^? traverse . attr "href" . _Just
+ img = imgA ^? traverse . ix 0 .
+ named (only "img") . attr "src" . _Just
+
+ date = leftDiv ^? traverse . ix 1 .
+ named (only "div") ...
+ named (only "div") . withAttribute "class" "open" ...
+ named (only "table") ...
+ named (only "tr") ...
+ named (only "td") . text
+
+ rightDiv = divArticle ^? plate .
+ named (only "div") . withAttribute "style" "float:left; width:70%;"
+
+ title = rightDiv ^? traverse . ix 0 ...
+ named (only "a") . text
+
+ summary = rightDiv ^? traverse . ix 3 ...
+ named (only "a") . text
Oops, something went wrong.

0 comments on commit 7dacdab

Please sign in to comment.