Permalink
Browse files

First update

  • Loading branch information...
0 parents commit afc595ea30ba3613767ea279e1957a44cfec5483 @ragrawal committed Mar 30, 2012
@@ -0,0 +1,57 @@
+require 'curb'
+require 'cgi'
+
+require_relative 'MementoException'
+require_relative 'parser/ParserManager'
+require_relative 'writer/WriterManager'
+
+module Memento
+ def self.transform(input_format, output_format, value)
+ #Sanity Checks
+ raise MementoException, "Error: Missing required parameter: input_format" if input_format.nil? or input_format.empty?
+ raise MementoException, "Error: Missing required parameter: output_format" if output_format.nil? or output_format.empty?
+ raise MementoException, "Error: Missing required parameter: text" if value.nil? or value.empty?
+
+ #if input_format = 'site', then its a website and use UrlParser to get text
+ parser = nil
+ if ['site'].include?(input_format.downcase.strip)
+ parser = Memento::ParserManager.get_url_parser(value)
+ else
+ parser = Memento::ParserManager.get_text_parser(input_format)
+ end
+ raise MementoException, "Unable to find required parser" if parser.nil?
+
+ parser.value = value
+ data = parser.get_data
+
+ writer = Memento::WriterManager.get_writer(output_format)
+ return writer.export(data)
+ end
+
+ def self.get_page(url, parameters = {}, referer = nil)
+ c = Curl::Easy.new(url)
+ c.follow_location = true
+ c.header_in_body = false
+ c.useragent='Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5'
+ c.enable_cookies = true
+
+ if parameters and !parameters.empty?
+ c.http_post parameters.map{|k,v| "#{k}=#{CGI.escape(v)}"}.join('&')
+ end
+
+ #sometimes especially in the case pubmed url, c.perform
+ # falsely gives PartialFileError but successfully retrieves body
+ begin
+ c.perform
+ rescue
+
+ end
+ return c.body_str
+
+ end
+
+
+
+
+
+end
@@ -0,0 +1,8 @@
+# Error class Memento raises for missing input and for unsupported formats or websites.
+# It subclasses StandardError rather than Exception because StandardError covers application-level
+# errors, whereas Exception covers both application- and environment-level errors.
+
+class MementoException < StandardError
+ # Intentionally empty: the class only gives Memento failures their own type.
+
+end
@@ -0,0 +1,82 @@
+module Memento
+ module Parser
+ class TextParser
+
+ attr_accessor :value
+
+ #constructor
+ def validate
+ raise MementoException, "Error: Missing required text" if text.nil? or text.empty?
+ end
+
+ # extract citation information form the input string
+ # and return an array of BibTexEntry object
+ def get_data
+ raise 'calling abstract method: get_data'
+ end
+
+ end
+
+ class UrlParser
+   # URL of the page whose citation should be extracted; assigned by the caller.
+   attr_accessor :value
+
+   # Checks that a URL has been assigned.
+   #
+   # Raises MementoException ('Invalid Url') when @value is nil or empty.
+   def validate
+     raise MementoException, 'Invalid Url' if @value.nil? || @value.empty?
+   end
+
+   # Downloads the citation that belongs to the page in @value and parses it.
+   #
+   # The subclass decides where to fetch from (get_citation_url), which form
+   # fields to send (get_form_parameters) and what format the downloaded text
+   # has (get_citation_format); the text is then handed to the text parser
+   # registered for that format.
+   #
+   # Returns whatever that text parser's get_data returns.
+   # Raises MementoException when the URL is missing or nothing could be fetched.
+   def get_data
+     validate
+     to = get_citation_url
+     params = get_form_parameters
+     referrer = get_referrer
+
+     citation = Memento.get_page(to, params, referrer)
+     raise MementoException, 'Error: Unable to fetch citation details' if citation.to_s.strip.empty?
+
+     text_parser = Memento::ParserManager.get_text_parser(get_citation_format)
+     text_parser.value = citation
+     text_parser.get_data
+   end
+
+   protected
+
+   # Form fields to submit when fetching the citation requires filling in a form.
+   # The default is an empty Hash, i.e. no form submission.
+   def get_form_parameters
+     {}
+   end
+
+   # Referrer passed along with the citation request: the page URL the user
+   # supplied, which is stored in @value.
+   def get_referrer
+     @value
+   end
+
+   #============ ABSTRACT METHODS ============#
+   # Subclasses of UrlParser need to implement at least these two methods.
+   #==========================================
+
+   # Returns the URL from which the citation details can be fetched.
+   def get_citation_url
+     raise MementoException, 'Called abstract method: get_citation_url'
+   end
+
+   # Returns the format name of the fetched citation (e.g. 'bibtex').
+   def get_citation_format
+     raise MementoException, 'Called abstract method: get_citation_format'
+   end
+ end
+
+ end
+end
+
+
+
@@ -0,0 +1,45 @@
+require_relative 'Parser'
+
+#Text Parser
+require_relative 'text/BibTexParser'
+
+#Url Parser
+require_relative 'url/ASMParser'
+require_relative 'url/HubMedParser'
+require_relative 'url/ACMPortalParser'
+require_relative 'url/BlackwellSynergyParser'
+require_relative 'url/PubmedParser'
+
+
+module Memento
+ module ParserManager
+ TEXT_PARSER = {
+ 'bibtex' => {'name' => 'BibTeX', 'parser' => 'BibTexParser'}
+ }
+
+ WEBSITES = {
+ 'asm.org' => {'name' => 'ASM Journals', 'link' => 'http://journals.asm.org/', 'parser' => 'ASMParser'},
+ 'hubmed.org' => {'name' => 'Hubmed', 'link' => "http://www.hubmed.org", 'parser' => 'HubMedParser'},
+ 'dl.acm.org' => {'name' => 'ACM Digital Library', 'link' => 'http://dl.acm.org/', 'parser' => 'ACMPortalParser'},
+ 'ncbi.nlm.nih.gov' => {'name' => 'PubMed', 'link' => 'http://www.pubmed.gov', 'parser'=>'PubmedParser'},
+ 'onlinelibrary.wiley.com' => {'name' =>'Wiley Online Library', 'link' => "http://onlinelibrary.wiley.com", 'parser' => 'BlackwellSynergyParser'}
+
+ }
+
+ def self.get_text_parser(format)
+ raise MementoException, "Error: Missing require parameter: format" if format.nil? or format.empty?
+ info = TEXT_PARSER[format.to_s.downcase.strip]
+ raise MementoException, "Error: unsupported text format: #{format}" if info.nil?
+ Kernel.const_get(info['parser']).new
+ end
+
+ def self.get_url_parser(url)
+ raise MementoException, "Error: Missing required parameter url" if url.nil? or url.empty?
+ WEBSITES.each do |key, value|
+ return Kernel.const_get(value['parser']).new if url =~ /#{key}/
+ end
+ raise MementoException, "Error: Parsing is not supported for this website"
+ end #get_parser
+
+ end
+end
@@ -0,0 +1,50 @@
+require 'libxml'
+
+class AmazonXMLParser < Memento::Parser::UrlParser # Work-in-progress port of a PHP Amazon book importer.
+ # NOTE(review): only the first statements of get_data are Ruby; the rest is still the PHP source, so this file does not parse as Ruby.
+ def get_data # Presumably meant to turn the Amazon XML held in @value into citation data - TODO confirm once the port is finished.
+ validate # inherited UrlParser#validate: raises MementoException when @value is nil or empty
+ doc = XML::Parser.string(@value)
+ item = doc.parse
+ item = item.root.find('./Items/Item')
+ if('Book' != item.attributes) # NOTE(review): dangling condition with no body or matching end; the PHP below compares ItemAttributes->ProductGroup instead
+ # ---- un-ported PHP from here to the closing brace ----
+ $item = $xml->Items->Item;
+ if('Book' != (string)$item->ItemAttributes->ProductGroup)
+ throw new Exception("Currently only books can be imported from Amazon");
+ # Copy document type, detail page URL, title, publisher and page count into $article.
+ $article['doctype'] = 'book';
+ $article['url'] = trim($item->DetailPageURL);
+ $article['title'] = trim($item->ItemAttributes->Title);
+ $article['publisher']=trim($item->ItemAttributes->Publisher);
+ $article['pages'] = trim($item->ItemAttributes->NumberOfPages);
+ $imgUrl = trim($item->SmallImage->URL);
+ # When a small image URL is present, download it into ARTICLE_ICON under a random 10-character name and record its public URL.
+ if(!empty($imgUrl)){
+ $ch = curl_init($imgUrl);
+ $ext = strtolower(end(explode('.', $imgUrl)));
+ uses('neat_string');
+ $neat = new NeatString();
+ $filename = $neat->randomPassword(10) . '.' . $ext;
+ $fp = fopen(ARTICLE_ICON . $filename, 'w');
+ curl_setopt($ch, CURLOPT_FILE, $fp);
+ curl_setopt($ch, CURLOPT_HEADER, 0);
+ curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
+ curl_exec($ch);
+ curl_close($ch);
+ fclose($fp);
+ $article['img'] = ARTICLE_ICON_URL.$filename;
+ }
+
+
+ # Split PublicationDate into year, month and day via DateUtil::getCleanDate.
+ list($article['year'], $article['month'], $article['day']) = DateUtil::getCleanDate((string)$item->ItemAttributes->PublicationDate);
+ # Collect every Author element as a string.
+ foreach($item->ItemAttributes->Author as $author)
+ $authors[] = (string) $author;
+ # Result in the PHP version: a one-element array holding the article fields and the author list.
+ $data[0] = array('Article'=>$article, 'Author' => $authors);
+ return $data;
+ }
+ end
+end
@@ -0,0 +1,7 @@
+require 'bibtex'
+
+class BibTexParser < Memento::Parser::TextParser
+ def get_data
+ return BibTeX.parse @value
+ end
+end
@@ -0,0 +1,19 @@
+# Builds the BibTeX export URL for a page of the ACM Digital Library.
+class ACMPortalParser < Memento::Parser::UrlParser
+  # Patterns that capture the numeric article id from the page URL.
+  # At least one digit is required: with '\d*' a bare "id=" matched and
+  # produced an export URL without any id.
+  PATTERNS = [
+    Regexp.new('id=(\d+)', Regexp::IGNORECASE)
+  ]
+  BASE_URL = 'http://dl.acm.org/exportformats.cfm?expformat=bibtex&id='
+
+  # Returns BASE_URL followed by the article id found in @value.
+  #
+  # Raises MementoException when @value is nil/empty or contains no id.
+  def get_citation_url
+    validate
+    PATTERNS.each do |pattern|
+      m = @value.match(pattern)
+      return BASE_URL + m[1] if m
+    end
+    raise MementoException, "Error: Unable to find citation information"
+  end
+
+  # The export URL returns BibTeX.
+  def get_citation_format
+    'bibtex'
+  end
+end
@@ -0,0 +1,24 @@
+require 'uri'
+# Builds the BibTeX download URL for an article page on an asm.org journal site.
+class ASMParser < Memento::Parser::UrlParser
+
+  # Each pattern captures (1) the journal sub-domain and (2) the article path.
+  # The view names are written as an alternation. The earlier bracket form
+  # "[abstract|full|reprint]" was a character class that matched any single
+  # one of those letters, so it could anchor on the wrong path segment.
+  # NOTE(review): other view names whose first letter happened to match
+  # before must now be listed explicitly if they are to be supported.
+  PATTERNS = [
+    Regexp.new('http:\/\/(.*)\.asm\.org.*(?:abstract|full|reprint)\/(.*)\?', Regexp::IGNORECASE),
+    Regexp.new('http:\/\/(.*)\.asm\.org\/content\/(.*)\.(?:abstract|full)')
+  ].freeze
+  BASE_URL = 'http://DOMAIN.asm.org/citmgr?type=bibtex&gca='
+
+  # Returns the citation-manager URL for the article in @value: DOMAIN is
+  # replaced by the journal sub-domain and "sub-domain;path" is appended as
+  # the gca value.
+  #
+  # Raises MementoException when @value is nil/empty or matches no pattern.
+  def get_citation_url
+    validate
+    PATTERNS.each do |pattern|
+      match = @value.match(pattern)
+      next unless match
+
+      # URI.escape was removed in Ruby 3.0; the default parser's escape
+      # applies the same escaping rules.
+      gca = URI::DEFAULT_PARSER.escape("#{match[1]};#{match[2]}")
+      return BASE_URL.gsub('DOMAIN', match[1]) + gca
+    end
+    raise MementoException, "Error: Unable to find link to bibtex"
+  end
+
+  # The citation manager is asked for BibTeX (type=bibtex in BASE_URL).
+  def get_citation_format
+    "bibtex"
+  end
+end
@@ -0,0 +1,44 @@
+class AmazonParser < Memento::Parser::UrlParser
+ @BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=1E6W7G64405195A1J702&Operation=ItemLookup&ResponseGroup=Medium&ItemId='
+
+ PATTERNS = [
+ Regexp.new('/gp/product/(\d*)', Regexp::IGNORECASE),
+ Regexp.new('/ASIN/(\d*)', Regexp::IGNORECASE),
+ Regexp.new('/dp/(\w*)', Regexp::IGNORECASE)
+ ].freeze
+
+ if(preg_match("#/gp/product/(\d*)#", $url, $matches))
+ return $matches[1];
+ if(preg_match("#/ASIN/(\d*)#", $url, $matches))
+ return $matches[1];
+ if(preg_match("#/dp/(\w*)#", $url, $matches))
+ return $matches[1];
+
+ def get_data
+
+ end
+
+ protected
+
+ def get_citation_url
+ asin = get_asin()
+ raise MementoException, 'Unable to get Amazon Standard Identification Number (ASIN).' if(asin == -1)
+ return @BASE_URL + asin
+ end
+
+ #TODO:
+ def get_asin
+ PATTERNS.each do |pattern|
+ matches = @value.match(pattern)
+ return matches[1] if matches and matches.length >= 2
+ end
+ end
+
+
+ def get_citation_format
+ raise MementoException, 'Called abstract method: get_citation_format'
+ end
+
+
+
+end
@@ -0,0 +1,32 @@
+# Fetches BibTeX from the Wiley Online Library citation download form.
+class BlackwellSynergyParser < Memento::Parser::UrlParser
+
+  # Captures the DOI that sits between "doi/" and the page view name.
+  # The view names are an alternation. The earlier bracket form
+  # "[abstract|full]" was a character class matching a single letter, which
+  # let the greedy capture run on to a later "/" followed by any such letter.
+  # NOTE(review): other view names that matched by accident before must now
+  # be listed explicitly if they are to be supported.
+  PATTERNS = [
+    Regexp.new('doi/(.*)/(?:abstract|full)', Regexp::IGNORECASE)
+  ]
+  BASE_URL = 'http://onlinelibrary.wiley.com/documentcitationdownloadformsubmit'
+
+  # Returns the form fields for the citation download: fixed options plus the
+  # DOI extracted from @value.
+  #
+  # Raises MementoException when no DOI can be found in @value.
+  def get_form_parameters
+    params = {'hasAbstract' => 'CITATION_AND_ABSTRACT', 'fileFormat' => 'BIBTEX', 'submit' => 'Submit'}
+
+    PATTERNS.each do |pattern|
+      match = @value.match(pattern)
+      next unless match
+
+      params['doi'] = match[1]
+      break
+    end
+
+    raise MementoException, "Error: Unable to find DOI" unless params.has_key?('doi')
+
+    params
+  end
+
+  # The download form always lives at the same address.
+  def get_citation_url
+    BASE_URL
+  end
+
+  # The form is asked for BIBTEX (see get_form_parameters).
+  def get_citation_format
+    'bibtex'
+  end
+end
Oops, something went wrong.

0 comments on commit afc595e

Please sign in to comment.