First update

ragrawal · Mar 30, 2012 · afc595e · afc595e
commit afc595e
Show file tree

Hide file tree

Showing 27 changed files with 1,050 additions and 0 deletions.
diff --git a/Memento.rb b/Memento.rb
@@ -0,0 +1,57 @@
+require 'curb'
+require 'cgi'
+
+require_relative 'MementoException'
+require_relative 'parser/ParserManager'
+require_relative 'writer/WriterManager'
+
+module Memento
+  def self.transform(input_format, output_format, value)
+        #Sanity Checks
+        raise MementoException, "Error: Missing required parameter: input_format" if input_format.nil? or input_format.empty?
+        raise MementoException, "Error: Missing required parameter: output_format" if output_format.nil? or output_format.empty?
+        raise MementoException, "Error: Missing required parameter: text" if value.nil? or value.empty?
+
+        #if input_format = 'site', then its a website and use UrlParser to get text
+        parser = nil
+        if ['site'].include?(input_format.downcase.strip)
+            parser = Memento::ParserManager.get_url_parser(value)
+        else
+            parser = Memento::ParserManager.get_text_parser(input_format)
+        end
+        raise MementoException, "Unable to find required parser" if parser.nil?
+
+        parser.value = value
+        data = parser.get_data
+
+        writer = Memento::WriterManager.get_writer(output_format)
+        return writer.export(data)
+  end
+
+  def self.get_page(url, parameters = {}, referer = nil)
+ 		c = Curl::Easy.new(url)
+ 		c.follow_location = true
+    c.header_in_body = false
+    c.useragent='Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5'
+    c.enable_cookies = true
+
+    if parameters and !parameters.empty?
+      c.http_post parameters.map{|k,v| "#{k}=#{CGI.escape(v)}"}.join('&')
+    end
+
+    #sometimes especially in the case pubmed url, c.perform 
+    # falsely gives PartialFileError but successfully retrieves body
+    begin
+      c.perform
+    rescue
+
+    end
+    return c.body_str
+
+ 	end
+
+
+
+
+
+end
diff --git a/MementoException.rb b/MementoException.rb
@@ -0,0 +1,8 @@
+# Note: subclass StandardError rather than Exception. StandardError covers
+# application-level errors, whereas Exception covers both application-level
+# and environment-level errors.
+
+# Error type raised by the Memento code for missing parameters, unsupported
+# formats/websites and failed citation lookups.
+class MementoException < StandardError; end
diff --git a/parser/Parser.rb b/parser/Parser.rb
@@ -0,0 +1,82 @@
+module Memento
+  module Parser
+    class TextParser 
+
+      attr_accessor :value
+
+      #constructor
+      def validate
+        raise MementoException, "Error: Missing required text" if text.nil? or text.empty?
+      end
+
+      # extract citation information form the input string 
+      # and return an array of BibTexEntry object
+      def get_data
+        raise 'calling abstract method: get_data'
+      end
+
+    end
+
+    class UrlParser 
+      attr_accessor :value
+
+
+      def validate
+        raise MementoException, 'Invalid Url' if @value.nil? or @value.empty?
+      end
+
+      #function: get_data
+      #@description: processes url and returns citation information as an array of BibTeX entries.
+      #
+      def get_data
+        validate
+        to = get_citation_url()
+        params = get_form_parameters
+        referrer = get_referrer()
+
+        citation = Memento.get_page(to, params, referrer)
+        raise MementoException, 'Error: Unable to fetch citation details' if citation.to_s.strip.empty?
+        puts citation
+        text_parser = Memento::ParserManager.get_text_parser(get_citation_format)
+        text_parser.value = citation
+        data = text_parser.get_data()
+
+        return data
+
+      end
+
+      protected
+       	#If to fetch citation details requires filling form, then provide form parameters
+       	def get_form_parameters
+       	  {}
+        end
+
+        def get_referrer
+      	  return @url
+      	end
+
+        #============ ABSTRACT METHODS ============#
+        # Subclass of UrlParser will need to atleast implement these 
+        # two functions
+        #==========================================
+
+     	  #returns Url from where the citation detalils can be fetched
+       	def get_citation_url
+       	  raise MementoException, 'Called abstract method: get_citation_url'
+       	end
+
+
+       	 # return the format of citation 	
+       	def get_citation_format
+       	  raise MementoException, 'Called abstract method: get_citation_format'
+       	end
+
+
+
+    end
+
+  end
+end
+
+
+
diff --git a/parser/ParserManager.rb b/parser/ParserManager.rb
@@ -0,0 +1,45 @@
+require_relative 'Parser'
+
+#Text Parser
+require_relative 'text/BibTexParser'
+
+#Url Parser
+require_relative 'url/ASMParser'
+require_relative 'url/HubMedParser'
+require_relative 'url/ACMPortalParser'
+require_relative 'url/BlackwellSynergyParser'
+require_relative 'url/PubmedParser'
+
+
+module Memento
+  module ParserManager
+      TEXT_PARSER = {
+        'bibtex' => {'name' => 'BibTeX', 'parser' => 'BibTexParser'}
+      }
+
+      WEBSITES = {
+        'asm.org' => {'name' => 'ASM Journals', 'link' => 'http://journals.asm.org/', 'parser' => 'ASMParser'},
+        'hubmed.org' => {'name' => 'Hubmed', 'link' => "http://www.hubmed.org", 'parser' => 'HubMedParser'},
+        'dl.acm.org' => {'name' => 'ACM Digital Library', 'link' => 'http://dl.acm.org/', 'parser' => 'ACMPortalParser'},
+        'ncbi.nlm.nih.gov' => {'name' => 'PubMed', 'link' => 'http://www.pubmed.gov', 'parser'=>'PubmedParser'},
+        'onlinelibrary.wiley.com' => {'name' =>'Wiley Online Library', 'link' => "http://onlinelibrary.wiley.com", 'parser' => 'BlackwellSynergyParser'}
+
+      }
+
+      def self.get_text_parser(format)
+     		raise MementoException, "Error: Missing require parameter: format" if format.nil? or format.empty?
+     		info = TEXT_PARSER[format.to_s.downcase.strip]
+     		raise MementoException, "Error: unsupported text format: #{format}" if info.nil?
+     		Kernel.const_get(info['parser']).new
+      end
+
+      def self.get_url_parser(url)
+        raise MementoException, "Error: Missing required parameter url" if url.nil? or url.empty?
+        WEBSITES.each do |key, value|
+          return Kernel.const_get(value['parser']).new if url =~ /#{key}/
+        end  
+        raise MementoException, "Error: Parsing is not supported for this website"
+      end #get_parser
+
+  end
+end
diff --git a/parser/text/AmazonXMLParser.rb b/parser/text/AmazonXMLParser.rb
@@ -0,0 +1,50 @@
+require 'libxml'
+
+# Parser for an Amazon item-lookup XML document held in @value.
+#
+# NOTE(review): this class looks like an unfinished port from PHP. Only the
+# first statements of get_data are Ruby; the rest of the body still uses PHP
+# syntax ($variables, ->, "throw new Exception", curl_* calls), so this file
+# cannot be parsed by Ruby as it stands.
+# NOTE(review): the file lives under parser/text but the class inherits from
+# UrlParser rather than TextParser - confirm which base class is intended.
+class AmazonXMLParser < Memento::Parser::UrlParser
+
+  # Overrides UrlParser#get_data. The Ruby part validates @value, parses it
+  # as XML and looks up './Items/Item' under the document root.
+  def get_data
+      validate
+      doc = XML::Parser.string(@value)
+      item = doc.parse
+      item = item.root.find('./Items/Item')
+      if('Book' != item.attributes)
+
+      # ---- untranslated PHP starts here (not valid Ruby) ----
+      # The PHP code rejects items whose ProductGroup is not 'Book', copies
+      # url/title/publisher/pages into $article, downloads the small image to
+      # a randomly named file, splits the publication date into
+      # year/month/day, collects the authors and returns a one-element array
+      # holding the 'Article' and 'Author' data.
+  		$item = $xml->Items->Item;
+  		if('Book' != (string)$item->ItemAttributes->ProductGroup)
+  			throw new Exception("Currently only books can be imported from Amazon");
+
+  		$article['doctype'] = 'book';
+  		$article['url'] = trim($item->DetailPageURL);
+  		$article['title'] = trim($item->ItemAttributes->Title);
+  		$article['publisher']=trim($item->ItemAttributes->Publisher);
+  		$article['pages'] = trim($item->ItemAttributes->NumberOfPages);
+  		$imgUrl = trim($item->SmallImage->URL);
+
+  		if(!empty($imgUrl)){
+  			$ch = curl_init($imgUrl);
+  			$ext = strtolower(end(explode('.', $imgUrl)));
+  			uses('neat_string');
+  			$neat = new NeatString();
+  			$filename = $neat->randomPassword(10) . '.' . $ext;
+  			$fp = fopen(ARTICLE_ICON . $filename, 'w');
+  			curl_setopt($ch, CURLOPT_FILE, $fp);
+  			curl_setopt($ch, CURLOPT_HEADER, 0);
+  			curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
+  			curl_exec($ch);
+  			curl_close($ch);
+  			fclose($fp);
+  			$article['img'] = ARTICLE_ICON_URL.$filename;
+  		}
+
+
+
+  		list($article['year'], $article['month'], $article['day']) = DateUtil::getCleanDate((string)$item->ItemAttributes->PublicationDate);
+
+  		foreach($item->ItemAttributes->Author as $author)
+  			$authors[] = (string) $author;
+
+  		$data[0] = array('Article'=>$article, 'Author' => $authors);
+  		return $data;
+  	}
+  end
+end
diff --git a/parser/text/BibTexParser.rb b/parser/text/BibTexParser.rb
@@ -0,0 +1,7 @@
+require 'bibtex'
+
+class BibTexParser < Memento::Parser::TextParser
+  def get_data
+    return BibTeX.parse @value
+  end
+end
diff --git a/parser/url/ACMPortalParser.rb b/parser/url/ACMPortalParser.rb
@@ -0,0 +1,19 @@
+# URL parser for the ACM Digital Library: turns an article URL into the URL
+# of the site's BibTeX export for the same article id.
+class ACMPortalParser < Memento::Parser::UrlParser
+  # The id must contain at least one digit, and "id=" must not be the tail of
+  # a longer parameter name (e.g. "CFID=").
+  PATTERNS = [
+    /(?<![a-z0-9_])id=(\d+)/i
+  ].freeze
+  BASE_URL = 'http://dl.acm.org/exportformats.cfm?expformat=bibtex&id='
+
+  # Builds the BibTeX export URL from the numeric id found in +value+.
+  #
+  # @return [String] BASE_URL followed by the article id
+  # @raise [MementoException] if +value+ is missing or contains no article id
+  def get_citation_url
+    validate
+    PATTERNS.each do |pattern|
+      m = @value.match(pattern)
+      return BASE_URL + m[1] if m
+    end
+    raise MementoException, "Error: Unable to find citation information"
+  end
+
+  # The export endpoint is asked for BibTeX.
+  def get_citation_format
+    'bibtex'
+  end
+end
diff --git a/parser/url/ASMParser.rb b/parser/url/ASMParser.rb
@@ -0,0 +1,24 @@
+require 'uri'
+# URL parser for ASM journals (*.asm.org): maps an article URL to the
+# journal's citation-manager URL that serves the BibTeX record.
+class ASMParser < Memento::Parser::UrlParser
+  # Capture 1 is the journal sub-domain, capture 2 the article path.
+  # The view names are alternatives; written inside [...] they would form a
+  # character class that matches any single one of their letters.
+  PATTERNS = [
+    %r{http://(.*)\.asm\.org.*(?:abstract|full|reprint)/(.*)\?}i,
+    %r{http://(.*)\.asm\.org/content/(.*)\.(?:abstract|full)}
+  ].freeze
+  BASE_URL = 'http://DOMAIN.asm.org/citmgr?type=bibtex&gca='
+
+  # Builds the citation-manager URL for the article named in +value+.
+  #
+  # @return [String] BASE_URL with DOMAIN replaced by the journal sub-domain,
+  #   followed by the escaped "<sub-domain>;<article path>"
+  # @raise [MementoException] if +value+ matches none of PATTERNS
+  def get_citation_url
+    PATTERNS.each do |pattern|
+      match = @value.match(pattern)
+      next unless match
+
+      # URI.escape was removed in Ruby 3.0; the default parser's escape is
+      # the equivalent call.
+      gca = URI::DEFAULT_PARSER.escape("#{match[1]};#{match[2]}")
+      return BASE_URL.gsub('DOMAIN', match[1]) + gca
+    end
+    raise MementoException, "Error: Unable to find link to bibtex"
+  end
+
+  # The citation manager is asked for BibTeX.
+  def get_citation_format
+    "bibtex"
+  end
+end
diff --git a/parser/url/AmazonParser.rb b/parser/url/AmazonParser.rb
@@ -0,0 +1,44 @@
+class AmazonParser < Memento::Parser::UrlParser
+    @BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=1E6W7G64405195A1J702&Operation=ItemLookup&ResponseGroup=Medium&ItemId='
+
+    PATTERNS = [
+        Regexp.new('/gp/product/(\d*)', Regexp::IGNORECASE),
+        Regexp.new('/ASIN/(\d*)', Regexp::IGNORECASE),
+        Regexp.new('/dp/(\w*)', Regexp::IGNORECASE)
+      ].freeze
+
+      if(preg_match("#/gp/product/(\d*)#", $url, $matches))
+   			return $matches[1];
+   		if(preg_match("#/ASIN/(\d*)#", $url, $matches))
+   			return $matches[1];
+   		if(preg_match("#/dp/(\w*)#", $url, $matches))
+   			return $matches[1];
+
+   	def get_data
+
+   	end
+
+    protected
+
+      def get_citation_url
+        asin = get_asin()
+        raise MementoException, 'Unable to get Amazon Standard Identification Number (ASIN).' if(asin == -1) 
+        return @BASE_URL + asin
+      end
+
+      #TODO: 
+      def get_asin
+        PATTERNS.each do |pattern|
+          matches = @value.match(pattern)
+          return matches[1] if matches and matches.length >= 2
+        end
+      end
+
+
+     	def get_citation_format
+     	  raise MementoException, 'Called abstract method: get_citation_format'
+     	end
+
+
+
+end
diff --git a/parser/url/BlackwellSynergyParser.rb b/parser/url/BlackwellSynergyParser.rb
@@ -0,0 +1,32 @@
+class BlackwellSynergyParser < Memento::Parser::UrlParser
+
+  PATTERNS = [
+    Regexp.new('doi/(.*)/[abstract|full]', Regexp::IGNORECASE)
+  ]
+  BASE_URL = 'http://onlinelibrary.wiley.com/documentcitationdownloadformsubmit'
+
+
+ 	def get_form_parameters
+ 	  params = {'hasAbstract' => 'CITATION_AND_ABSTRACT', 'fileFormat' => 'BIBTEX', 'submit' => 'Submit'}
+
+ 	  PATTERNS.each do |pattern|
+      match = @value.match(pattern)
+      if match and match.length == 2
+        params['doi'] = match[1]
+        break
+      end
+    end
+
+    raise MementoException, "Error: Unable to find DOI" unless params.has_key?('doi')
+
+    return params
+ 	end
+
+  def get_citation_url
+    BASE_URL
+  end
+
+  def get_citation_format
+    'bibtex'
+  end
+end