Permalink
Browse files

Initial commit

  • Loading branch information...
1 parent 536c811 commit 94f1811f26b8322c78653273bf2514e3ce61e0c5 @robstewart57 committed Jul 29, 2012
Showing with 398 additions and 3 deletions.
  1. +14 −0 LICENSE
  2. +56 −3 README.md
  3. +3 −0 Setup.lhs
  4. +118 −0 Text/Bibtex/DBLPToBibtex.hs
  5. +25 −0 Text/Bibtex/ListPapers.hs
  6. +82 −0 Text/Bibtex/Main.hs
  7. +44 −0 Text/Bibtex/SearchDBLPURIs.hs
  8. +56 −0 dblp2bibtex.cabal
View
@@ -0,0 +1,14 @@
+Copyright 2012 Rob Stewart
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
View
@@ -1,4 +1,57 @@
-dblp2bibtex
-===========
+# dblp2bibtex
-Generates bibtex files for authors identified in the DBLP database
+This `dblp2bibtex` tool provides 3 features, primarily generating bibtex files for authors identified in the DBLP database. Authors are disambiguated with Uniform Resource Identifiers (URIs), which are used extensively in this utility.
+
+* Search for a DBLP URI given a name (e.g. "Simon Peyton Jones")
+* List the titles of publications for a given author URI
+* Generate a bibtex file for all publications authored by the given author URI
+
+## Usage
+
+dblp2bibtex [OPTIONS]
+ A Haskell utility to generate bibtex files for an author identified with a DBLP URI
+
+Common flags:
+ -g --generatebibtex=Author URI Get bibtex file for given URI
+ -s --search=Author name Search for URI by name (e.g. "Joe Bloggs")
+ -l --listpapers=Author URI List papers for an author URI
+ -o --outfile=Bibtex filename (default 'export.bib')
+ -h --help Display help message
+ -v --version Print version information
+
+### Examples
+
+```
+$ dblp2bibtex -s "Simon Marlow"
+http://dblp.l3s.de/d2r/resource/authors/Simon_Marlow
+
+$ dblp2bibtex -l http://dblp.l3s.de/d2r/resource/authors/Simon_Marlow
+Developing High-Performance Server Applications in Haskell - Case Study: A Haskell Web Server.
+A Semantics for Imprecise Exceptions.
+Deforestation for Higher-Order Functions.
+Composable memory transactions.
+...
+
+$ dblp2bibtex -g http://dblp.l3s.de/d2r/resource/authors/Simon_Marlow -o my_papers.bib
+```
+
+
+## Installation
+
+The `dblp2bibtex` utility is written in Haskell, and depends on some packages that are not currently on Hackage. The Haskell Platform is needed to resolve dependencies and to install `dblp2bibtex`. It can be installed easily using the package manager on most Linux distributions, or downloaded directly from the [Haskell Platform](http://hackage.haskell.org/platform/) site.
+
+```
+cabal update
+cd ~/path
+git clone git://github.com/robstewart57/RDF4H.git
+cd RDF4H
+cabal install
+cd ~/path
+git clone git://github.com/robstewart57/dblp2bibtex.git
+cd dblp2bibtex
+cabal install
+```
+
+## Issues?
+
+This can be regarded as experimental software, and will probably have edge-case bugs. Contributions are welcome! Please report issues on the GitHub issue tracker, [here](https://github.com/robstewart57/dblp2bibtex/issues).
View
@@ -0,0 +1,3 @@
+#!/usr/bin/env runhaskell
+> import Distribution.Simple
+> -- Build entry point: hands control to 'defaultMain' from Distribution.Simple.
+> main = defaultMain
@@ -0,0 +1,118 @@
+module Text.Bibtex.DBLPToBibtex (
+ dblpToBibtex -- :: String -> IO [String]
+ ) where
+
+import Database.HSparql.Connection
+import Database.HSparql.QueryGenerator
+import Data.Text.Lazy (pack,unpack,stripPrefix)
+import Data.Text.Lazy.Internal (Text)
+import Data.Maybe (mapMaybe,fromJust)
+import Network.HTTP
+import Text.XML.HaXml.Parse
+import Text.XML.HaXml.Posn
+import Text.XML.HaXml.Combinators
+import Text.XML.HaXml
+import Text.BibTeX.Format
+import Text.BibTeX.Entry
+
+-- | Produce one formatted BibTeX entry for every publication key that
+-- 'getDBLPKeys' returns for the given author URI.
+--
+-- Each key is fetched and converted by 'getEntry'; the entries are returned
+-- in the same order as the keys.
+dblpToBibtex :: String -> IO [String]
+dblpToBibtex authorURI =
+  getDBLPKeys authorURI >>= mapM (getEntry . unpack)
+
+-- | SPARQL query selecting every resource whose @foaf:maker@ is the given
+-- author URI. The single selected variable is the publication resource.
+selectPublications :: String -> Query SelectQuery
+selectPublications authorURI = do
+  foafNs <- prefix "foaf" (iriRef "http://xmlns.com/foaf/0.1/")
+  pub <- var
+  triple pub (foafNs .:. "maker") (iriRef authorURI)
+  return SelectQuery { queryVars = [pub] }
+
+-- | Look up the DBLP publication keys of an author.
+--
+-- Runs 'selectPublications' against the Sindice SPARQL endpoint and turns
+-- each returned publication URI into a DBLP key by stripping the
+-- @http://dblp.l3s.de/d2r/resource/publications/@ prefix.
+--
+-- Result rows that are not a single URI binding, and URIs that do not start
+-- with the DBLP publication prefix, are skipped rather than crashing the
+-- program, since they have no DBLP key to download.
+--
+-- Raises an 'IOError' with a descriptive message when the endpoint returns
+-- no result at all.
+getDBLPKeys :: String -> IO [Text]
+getDBLPKeys authorURI = do
+  response <- selectQuery "http://sparql.sindice.com/sparql" (selectPublications authorURI)
+  case response of
+    Nothing ->
+      ioError (userError "getDBLPKeys: SPARQL query to http://sparql.sindice.com/sparql returned no result")
+    Just rows -> return (mapMaybe dblpKey rows)
+  where
+    dblpPrefix = pack "http://dblp.l3s.de/d2r/resource/publications/"
+
+    -- Only a row holding exactly one URI binding can yield a key.
+    dblpKey [URI uri] = stripPrefix dblpPrefix (pack uri)
+    dblpKey _ = Nothing
+
+-- | Download the XML record for a DBLP key and render it as a BibTeX entry.
+getEntry :: String -> IO String
+getEntry key = fmap (entryFromXML key) (downloadXML key)
+
+-- | Fetch @http://dblp.uni-trier.de/rec/bibtex/\<key\>.xml@ and return the
+-- response body. The request is sent with the @dblp2bib-client@ user agent.
+downloadXML :: String -> IO String
+downloadXML key =
+  simpleHTTP request >>= getResponseBody
+  where
+    recordURL = "http://dblp.uni-trier.de/rec/bibtex/" ++ key ++ ".xml"
+    request = replaceHeader HdrUserAgent "dblp2bib-client" (getRequest recordURL)
+
+-- | Convert the XML record of one publication into a formatted BibTeX entry.
+--
+-- The entry is identified as @DBLP:\<key\>@ and its type is chosen by
+-- 'entryType''. Its fields are, in order: a fixed @bibsource@, the @author@
+-- list, an optional @crossref@ (prefixed with @DBLP:@), and whatever
+-- 'entryTuples' finds in the record.
+entryFromXML :: String -> String -> String
+entryFromXML dblpKey xml = entry bibtexEntry
+  where
+    -- NOTE(review): "error.log" is only the first argument to xmlParse;
+    -- presumably it labels parse errors and no file is written. Confirm.
+    Document _ _ root _ = xmlParse "error.log" xml
+    rootElem = CElem root noPos
+    dblpEntry = tag "dblp" /> elm
+
+    -- Text content of every element with the given tag inside the entry.
+    childTexts name = extractTxt (dblpEntry /> tag name /> txt) rootElem
+
+    sourceField = ("bibsource", "DBLP, http://dblp.uni-trier.de")
+    authorField = ("author", authorList (childTexts "author"))
+    -- At most one crossref field, built from the first crossref text found.
+    crossRefField = [("crossref", "DBLP:" ++ ref) | ref <- take 1 (childTexts "crossref")]
+
+    bibtexEntry =
+      Cons
+        { entryType = entryType' rootElem
+        , identifier = "DBLP:" ++ dblpKey
+        , fields = sourceField : authorField : crossRefField ++ entryTuples rootElem dblpEntry
+        }
+
+
+-- | 'True' when the document has an element named @typeStr@ directly under
+-- its @dblp@ element.
+--
+-- Uses 'null' instead of comparing 'length' with zero, so the match list is
+-- only inspected up to its first element.
+confirmEntryType :: Content i -> String -> Bool
+confirmEntryType rootElem typeStr =
+  not (null ((tag "dblp" /> tag typeStr) rootElem))
+
+-- | Name of the first known publication type present in the document.
+--
+-- The candidates are tried in the order listed below; calls 'error' when
+-- none of them matches.
+entryType' :: Content i -> String
+entryType' rootElem =
+  case filter (confirmEntryType rootElem) knownTypes of
+    (firstMatch : _) -> firstMatch
+    [] -> error "Unexpected entry type"
+  where
+    knownTypes = ["inproceedings","article","misc","book","phdthesis","incollection"]
+
+
+-- | Collect the simple BibTeX fields available in a publication record.
+--
+-- For each field name in the list below, the first text found under the
+-- matching child element of the entry is used; names with no text are
+-- omitted. Each name is listed once so that no field (previously @pages@)
+-- is emitted twice in the resulting entry.
+entryTuples :: Content a -> CFilter a -> [(String, String)]
+entryTuples rootElem dblpEntry = mapMaybe entryTuple fieldNames
+  where
+    fieldNames = ["title","pages","year","booktitle","volume","journal","ee","number"]
+
+    -- Pair a field name with its first text value, if there is one.
+    entryTuple :: String -> Maybe (String, String)
+    entryTuple key =
+      case extractTxt (dblpEntry /> tag key /> txt) rootElem of
+        (firstValue : _) -> Just (key, firstValue)
+        [] -> Nothing
+
+
+-- | Apply a filter to a piece of content and keep the text of every
+-- 'CString' node in the result; all other kinds of node are dropped.
+extractTxt :: CFilter a -> Content a -> [String]
+extractTxt xmlFilter event =
+  [str | CString _ str _ <- xmlFilter event]
@@ -0,0 +1,25 @@
+module Text.Bibtex.ListPapers (
+ publicationTitlesForAuthor -- :: String -> IO [String]
+ ) where
+
+import Database.HSparql.Connection
+import Database.HSparql.QueryGenerator
+
+-- | List the titles of all publications made by the given author URI.
+--
+-- Runs 'titlesQuery' against the Sindice SPARQL endpoint. A title is
+-- accepted whether it comes back as a typed, language-tagged or plain
+-- literal; rows of any other shape are skipped instead of aborting with a
+-- pattern-match failure.
+--
+-- Raises an 'IOError' with a descriptive message when the endpoint returns
+-- no result at all.
+publicationTitlesForAuthor :: String -> IO [String]
+publicationTitlesForAuthor authorURI = do
+  response <- selectQuery "http://sparql.sindice.com/sparql" (titlesQuery authorURI)
+  case response of
+    Nothing ->
+      ioError (userError "publicationTitlesForAuthor: SPARQL query to http://sparql.sindice.com/sparql returned no result")
+    Just rows -> return (concatMap titleOf rows)
+  where
+    -- The query selects a single variable, so a valid row has one binding.
+    titleOf [TypedLiteral title _] = [title]
+    titleOf [LangLiteral title _] = [title]
+    titleOf [Literal title] = [title]
+    titleOf _ = []
+
+
+-- | SPARQL query selecting the distinct @dc:title@ values of every resource
+-- whose @foaf:maker@ is the given author URI.
+titlesQuery :: String -> Query SelectQuery
+titlesQuery authorURI = do
+  foafNs <- prefix "foaf" (iriRef "http://xmlns.com/foaf/0.1/")
+  dcNs <- prefix "dc" (iriRef "http://purl.org/dc/elements/1.1/")
+  paper <- var
+  paperTitle <- var
+  triple paper (foafNs .:. "maker") (iriRef authorURI)
+  triple paper (dcNs .:. "title") paperTitle
+  distinct
+  return SelectQuery { queryVars = [paperTitle] }
View
@@ -0,0 +1,82 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+
+import Text.Bibtex.DBLPToBibtex (dblpToBibtex)
+import Text.Bibtex.SearchDBLPURIs (findURIByName)
+import Text.Bibtex.ListPapers (publicationTitlesForAuthor)
+import System.Console.CmdArgs
+import Control.Monad (when)
+
+
+-- | Values of the command-line flags. An empty string means the flag was
+-- not supplied (see how 'main' tests the fields).
+data MyOptions = MyOptions
+  { generatebibtex :: String -- ^ author URI to generate a bibtex file for
+  , search :: String -- ^ author name to search a URI for
+  , listpapers :: String -- ^ author URI whose papers are listed
+  , outfile :: String -- ^ bibtex output filename
+  }
+  deriving (Data, Typeable, Show, Eq)
+
+-- | Name of the executable.
+_PROGRAM_NAME :: String
+_PROGRAM_NAME = "dblp2bibtex"
+
+-- | Version string of the executable.
+_PROGRAM_VERSION :: String
+_PROGRAM_VERSION = "0.0.1"
+
+-- | Program name followed by its version.
+_PROGRAM_INFO :: String
+_PROGRAM_INFO = concat [_PROGRAM_NAME, " version ", _PROGRAM_VERSION]
+
+-- | One-line description of the program.
+_PROGRAM_ABOUT :: String
+_PROGRAM_ABOUT = "A Haskell utility to generate bibtex files for an author identified with a DBLP URI"
+
+-- | Copyright notice.
+_COPYRIGHT :: String
+_COPYRIGHT = "(C) Rob Stewart 2012"
+
+-- | Option record in which every field is set to 'def' and annotated with
+-- the 'typ' and 'help' text for its command-line flag.
+--
+-- NOTE(review): the cmdargs documentation warns that '&=' annotations can
+-- stop working when the expression is refactored (e.g. shared sub-terms
+-- pulled into helpers); confirm before restructuring this definition.
+myProgOpts :: MyOptions
+myProgOpts = MyOptions
+ { generatebibtex = def &= typ "Author URI" &= help "Get bibtex file for given URI",
+ search = def &= typ "Author name" &= help "Search for URI by name (e.g. \"Joe Bloggs\")",
+ listpapers = def &= typ "Author URI" &= help "List papers for an author URI",
+ outfile = def &= typ "Bibtex filename" &= help "(default 'export.bib')"
+ }
+
+-- | Command-line mode built from 'myProgOpts'.
+--
+-- On top of the per-field flags it attaches: explicit @--version@ / @-v@
+-- flags with '_PROGRAM_INFO' as their summary, a summary made of the program
+-- info plus '_COPYRIGHT', the about text as help, explicit @--help@ / @-h@
+-- flags, and the program name.
+run :: Mode (CmdArgs MyOptions)
+run = cmdArgsMode $ myProgOpts
+ &= versionArg [explicit, name "version", name "v", summary _PROGRAM_INFO]
+ &= summary (_PROGRAM_INFO ++ ", " ++ _COPYRIGHT)
+ &= help _PROGRAM_ABOUT
+ &= helpArg [explicit, name "help", name "h"]
+ &= program _PROGRAM_NAME
+
+-- | File the generated bibtex is written to when no output filename is
+-- supplied on the command line.
+defaultFilename :: String
+defaultFilename = "export.bib"
+
+-- | Program entry point.
+--
+-- Parses the command line, insists that exactly one of the search,
+-- generate-bibtex and list-papers options was supplied (aborting with
+-- 'error' otherwise), and then runs the selected action.
+main :: IO ()
+main = do
+  opts <- cmdArgsRun run
+  let nameQuery = search opts
+      bibURI = generatebibtex opts
+      papersURI = listpapers opts
+      usedCount = length (filter optUsed [nameQuery, bibURI, papersURI])
+  when (usedCount /= 1) $
+    error "Exactly one option must be used. Try --help option"
+
+  case () of
+    _ | optUsed nameQuery ->
+          -- Search Sindice for URIs
+          findURIByName nameQuery >>= mapM_ putStrLn
+      | optUsed bibURI -> do
+          -- Generate bibtex from DBLP URI
+          bibtex <- dblpToBibtex bibURI
+          let target
+                | null (outfile opts) = defaultFilename
+                | otherwise = outfile opts
+          writeFile target (unlines bibtex)
+      | otherwise ->
+          -- List papers for a URI
+          publicationTitlesForAuthor papersURI >>= mapM_ putStrLn
+
+
+-- | 'True' when an option value was supplied, i.e. the string is non-empty.
+--
+-- Tests emptiness with 'null' rather than computing the full 'length'.
+optUsed :: String -> Bool
+optUsed = not . null
@@ -0,0 +1,44 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Text.Bibtex.SearchDBLPURIs (
+ findURIByName -- :: String -> IO [String]
+ ) where
+
+import Network.HTTP hiding (Done)
+import Data.RDF
+import Data.RDF.TriplesGraph
+import Text.RDF.RDF4H.XmlParser
+import qualified Data.ByteString.Lazy.Char8 as B
+import Data.List.Split
+import System.IO (stdout, stderr, hSetBuffering, BufferMode(..))
+
+-- | Sindice search URL for an author name: the fixed query prefix below
+-- followed by the URL-encoded name.
+queryURI :: String -> String
+queryURI name = searchPrefix ++ urlEncode name
+  where
+    searchPrefix = "http://api.sindice.com/v3/search?fq=class:foaf:Agent&format=rdfxml&fq=domain:dblp.l3s.de&field=link&q="
+
+-- | Search Sindice for author URIs matching a name.
+--
+-- Switches stdout and stderr to line buffering, requests 'queryURI' with the
+-- @dblp2bib-client@ user agent, parses the response as RDF/XML and returns
+-- the URIs extracted by 'getURIs'.
+--
+-- Raises an 'IOError' carrying the parser's failure when the response cannot
+-- be parsed, instead of leaving a lazy pattern-match failure to surface
+-- later when the result is consumed.
+findURIByName :: String -> IO [String]
+findURIByName name = do
+  hSetBuffering stdout LineBuffering
+  hSetBuffering stderr LineBuffering
+
+  let request = replaceHeader HdrUserAgent "dblp2bib-client" (getRequest (queryURI name))
+  body <- simpleHTTP request >>= getResponseBody
+  -- hack: the first line of the response is dropped before parsing
+  -- (see elimXmlHeader)
+  case parseXmlRDF Nothing Nothing (B.pack (elimXmlHeader body)) of
+    Right rdf -> return (getURIs rdf)
+    Left failure ->
+      ioError (userError ("findURIByName: could not parse Sindice response as RDF/XML: " ++ show failure))
+
+
+-- | Extract the objects of all triples whose predicate is the @link@ node.
+--
+-- Only URI-node objects are returned; objects of any other node kind are
+-- skipped rather than triggering a pattern-match failure.
+--
+-- NOTE(review): the unpacked value is reversed before being returned --
+-- presumably this RDF4H version stores node text reversed; confirm against
+-- the library before changing.
+getURIs :: TriplesGraph -> [String]
+getURIs rdf =
+  [ (reverse . b2s . value) s
+  | UNode s <- map objectOf (query rdf Nothing (Just (unode "link")) Nothing)
+  ]
+
+-- | Drop the first line of the document and re-join the remaining lines
+-- with 'unlines'.
+--
+-- This is a hack. This should be resolved by:
+-- https://github.com/UweSchmidt/hxt/issues/4
+elimXmlHeader :: String -> String
+elimXmlHeader = unlines . drop 1 . splitOn "\n"
Oops, something went wrong.

0 comments on commit 94f1811

Please sign in to comment.