Skip to content

Commit

Permalink
First update
Browse files Browse the repository at this point in the history
  • Loading branch information
ragrawal committed Mar 30, 2012
0 parents commit afc595e
Show file tree
Hide file tree
Showing 27 changed files with 1,050 additions and 0 deletions.
57 changes: 57 additions & 0 deletions Memento.rb
@@ -0,0 +1,57 @@
require 'curb'
require 'cgi'

require_relative 'MementoException'
require_relative 'parser/ParserManager'
require_relative 'writer/WriterManager'

module Memento
# Converts citation data from one representation to another.
#
# input_format  - name of a text format, or 'site' when +value+ is a URL
#                 that should be resolved through a website-specific parser.
# output_format - name of the writer format handed to WriterManager.
# value         - the citation text, or the URL when input_format is 'site'.
#
# Returns whatever the selected writer's #export returns.
# Raises MementoException when an argument is missing or no parser is found.
def self.transform(input_format, output_format, value)
  # Sanity checks: every argument must be present and non-empty.
  [
    ['input_format', input_format],
    ['output_format', output_format],
    ['text', value]
  ].each do |label, argument|
    raise MementoException, "Error: Missing required parameter: #{label}" if argument.nil? || argument.empty?
  end

  # 'site' means the value is a website URL, so a URL parser is needed.
  parser =
    if input_format.downcase.strip == 'site'
      Memento::ParserManager.get_url_parser(value)
    else
      Memento::ParserManager.get_text_parser(input_format)
    end
  raise MementoException, "Unable to find required parser" if parser.nil?

  parser.value = value
  citations = parser.get_data

  Memento::WriterManager.get_writer(output_format).export(citations)
end

# Downloads a page and returns its body.
#
# url        - address to request.
# parameters - optional Hash of form fields; when non-empty the request is
#              sent as a form POST, otherwise as a plain GET.
# referer    - optional value for the Referer request header.
#
# Returns the response body as reported by Curl::Easy#body_str (may be nil
# or empty when the transfer failed; callers are expected to check).
def self.get_page(url, parameters = {}, referer = nil)
  c = Curl::Easy.new(url)
  c.follow_location = true
  c.header_in_body = false
  c.useragent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5'
  c.enable_cookies = true
  # The referer argument used to be silently ignored.
  c.headers['Referer'] = referer unless referer.to_s.empty?

  # sometimes, especially in the case of pubmed urls, the transfer
  # falsely reports PartialFileError but successfully retrieves the body,
  # so transport errors are tolerated and whatever body arrived is returned.
  begin
    if parameters && !parameters.empty?
      # http_post performs the transfer itself; calling perform afterwards
      # would send the request a second time.
      form_body = parameters.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }.join('&')
      c.http_post(form_body)
    else
      c.perform
    end
  rescue Curl::Err::CurlError
    # Best effort: fall through and return the (possibly partial) body.
  end

  c.body_str
end





end
8 changes: 8 additions & 0 deletions MementoException.rb
@@ -0,0 +1,8 @@
# Error type raised by Memento for application-level failures.
#
# It derives from StandardError rather than Exception on purpose:
# StandardError covers application-level errors, whereas Exception also
# covers environment-level errors that callers should not rescue by accident.
class MementoException < StandardError; end
82 changes: 82 additions & 0 deletions parser/Parser.rb
@@ -0,0 +1,82 @@
module Memento
module Parser
# Base class for parsers that read citation data from a piece of text.
# Concrete subclasses override #get_data.
class TextParser
  # The raw text to parse; assigned by the caller before #get_data is used.
  attr_accessor :value

  # Checks that some text has been assigned.
  #
  # Raises MementoException when the value is nil or empty.
  def validate
    # The original read an undefined local `text`, which raised NameError.
    raise MementoException, "Error: Missing required text" if @value.nil? || @value.empty?
  end

  # Extracts citation information from the input string and returns an
  # array of BibTexEntry objects. Abstract: subclasses must override it.
  def get_data
    raise 'calling abstract method: get_data'
  end
end

# Base class for parsers that start from the URL of an article page, fetch
# the citation export offered by that website and hand it to a text parser.
# Subclasses must implement #get_citation_url and #get_citation_format.
class UrlParser
  # URL of the page whose citation should be retrieved.
  attr_accessor :value

  # Raises MementoException when no URL has been assigned.
  def validate
    raise MementoException, 'Invalid Url' if @value.nil? || @value.empty?
  end

  # Processes the URL and returns the citation information produced by the
  # text parser that matches #get_citation_format.
  #
  # Raises MementoException when the URL is missing or the downloaded
  # citation is blank.
  def get_data
    validate
    to = get_citation_url
    params = get_form_parameters
    referrer = get_referrer

    citation = Memento.get_page(to, params, referrer)
    raise MementoException, 'Error: Unable to fetch citation details' if citation.to_s.strip.empty?

    # (A leftover debug `puts citation` used to write the download to stdout.)
    text_parser = Memento::ParserManager.get_text_parser(get_citation_format)
    text_parser.value = citation
    text_parser.get_data
  end

  protected

  # Form fields to POST when fetching the citation requires submitting a
  # form; the default is an empty Hash.
  def get_form_parameters
    {}
  end

  # Referrer passed along with the citation request: the original page URL.
  # The original returned @url, which is never assigned, so it was always nil.
  def get_referrer
    @value
  end

  #============ ABSTRACT METHODS ============#
  # Subclasses of UrlParser need to implement at least these two methods.
  #==========================================

  # Returns the URL from which the citation details can be fetched.
  def get_citation_url
    raise MementoException, 'Called abstract method: get_citation_url'
  end

  # Returns the format of the citation (a name understood by
  # Memento::ParserManager.get_text_parser).
  def get_citation_format
    raise MementoException, 'Called abstract method: get_citation_format'
  end
end

end
end



45 changes: 45 additions & 0 deletions parser/ParserManager.rb
@@ -0,0 +1,45 @@
require_relative 'Parser'

#Text Parser
require_relative 'text/BibTexParser'

#Url Parser
require_relative 'url/ASMParser'
require_relative 'url/HubMedParser'
require_relative 'url/ACMPortalParser'
require_relative 'url/BlackwellSynergyParser'
require_relative 'url/PubmedParser'


module Memento
# Registry that maps text formats and website domains to parser classes and
# instantiates the matching parser on request.
module ParserManager
  # Supported text formats, keyed by lower-case format name.
  TEXT_PARSER = {
    'bibtex' => {'name' => 'BibTeX', 'parser' => 'BibTexParser'}
  }

  # Supported websites, keyed by a domain fragment looked for in the URL.
  WEBSITES = {
    'asm.org' => {'name' => 'ASM Journals', 'link' => 'http://journals.asm.org/', 'parser' => 'ASMParser'},
    'hubmed.org' => {'name' => 'Hubmed', 'link' => "http://www.hubmed.org", 'parser' => 'HubMedParser'},
    'dl.acm.org' => {'name' => 'ACM Digital Library', 'link' => 'http://dl.acm.org/', 'parser' => 'ACMPortalParser'},
    'ncbi.nlm.nih.gov' => {'name' => 'PubMed', 'link' => 'http://www.pubmed.gov', 'parser'=>'PubmedParser'},
    'onlinelibrary.wiley.com' => {'name' =>'Wiley Online Library', 'link' => "http://onlinelibrary.wiley.com", 'parser' => 'BlackwellSynergyParser'}
  }

  # Returns a new parser instance for the given text format name
  # (case and surrounding whitespace are ignored).
  #
  # Raises MementoException when the format is missing or unsupported.
  def self.get_text_parser(format)
    raise MementoException, "Error: Missing require parameter: format" if format.nil? || format.empty?

    info = TEXT_PARSER[format.to_s.downcase.strip]
    raise MementoException, "Error: unsupported text format: #{format}" if info.nil?

    Object.const_get(info['parser']).new
  end

  # Returns a new parser instance for the first registered website whose
  # domain occurs in the URL.
  #
  # Raises MementoException when the URL is missing or no website matches.
  def self.get_url_parser(url)
    raise MementoException, "Error: Missing required parameter url" if url.nil? || url.empty?

    # Compare literally and case-insensitively. The previous regex match
    # treated '.' in the domain as a wildcard (e.g. "asmXorg" matched).
    lowered = url.downcase
    _domain, info = WEBSITES.find { |domain, _| lowered.include?(domain) }
    raise MementoException, "Error: Parsing is not supported for this website" if info.nil?

    Object.const_get(info['parser']).new
  end
end
end
50 changes: 50 additions & 0 deletions parser/text/AmazonXMLParser.rb
@@ -0,0 +1,50 @@
require 'libxml'

# Parser for an Amazon item-lookup XML document held in @value.
#
# NOTE(review): this class is an unfinished port. Only the first few lines of
# get_data are Ruby; everything after the `if('Book' != item.attributes)`
# line is still the PHP it was copied from, so this file does not parse as
# Ruby and must not be required until the port is completed.
# NOTE(review): the file lives under parser/text/ but the class subclasses
# UrlParser - confirm which base class is intended.
class AmazonXMLParser < Memento::Parser::UrlParser

# Parses the XML in @value and looks up the './Items/Item' node.
# NOTE(review): the remainder is PHP kept as a porting reference.
def get_data
validate
doc = XML::Parser.string(@value)
item = doc.parse
item = item.root.find('./Items/Item')
if('Book' != item.attributes)

# --- unported PHP starts here ---
# Rejects anything whose ProductGroup is not 'Book'.
$item = $xml->Items->Item;
if('Book' != (string)$item->ItemAttributes->ProductGroup)
throw new Exception("Currently only books can be imported from Amazon");

# Copies the basic book attributes into the $article record.
$article['doctype'] = 'book';
$article['url'] = trim($item->DetailPageURL);
$article['title'] = trim($item->ItemAttributes->Title);
$article['publisher']=trim($item->ItemAttributes->Publisher);
$article['pages'] = trim($item->ItemAttributes->NumberOfPages);
$imgUrl = trim($item->SmallImage->URL);

# When a small image URL is present, downloads it into ARTICLE_ICON under a
# random 10-character name (keeping the extension) and records its URL.
if(!empty($imgUrl)){
$ch = curl_init($imgUrl);
$ext = strtolower(end(explode('.', $imgUrl)));
uses('neat_string');
$neat = new NeatString();
$filename = $neat->randomPassword(10) . '.' . $ext;
$fp = fopen(ARTICLE_ICON . $filename, 'w');
curl_setopt($ch, CURLOPT_FILE, $fp);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_exec($ch);
curl_close($ch);
fclose($fp);
$article['img'] = ARTICLE_ICON_URL.$filename;
}



# Splits the publication date into year / month / day.
list($article['year'], $article['month'], $article['day']) = DateUtil::getCleanDate((string)$item->ItemAttributes->PublicationDate);

# Collects every author name as a string.
foreach($item->ItemAttributes->Author as $author)
$authors[] = (string) $author;

# Returns a one-element list holding the article record and its authors.
$data[0] = array('Article'=>$article, 'Author' => $authors);
return $data;
}
end
end
7 changes: 7 additions & 0 deletions parser/text/BibTexParser.rb
@@ -0,0 +1,7 @@
require 'bibtex'

# Text parser for citations supplied in BibTeX format.
class BibTexParser < Memento::Parser::TextParser
  # Hands the raw text stored in @value to BibTeX.parse and returns its result.
  def get_data
    BibTeX.parse(@value)
  end
end
19 changes: 19 additions & 0 deletions parser/url/ACMPortalParser.rb
@@ -0,0 +1,19 @@
# URL parser for the ACM Digital Library: extracts the numeric article id
# from the page URL and points at the site's BibTeX export for that id.
class ACMPortalParser < Memento::Parser::UrlParser
  # Patterns whose first capture group is the article id.
  # \b keeps "id=" from matching inside longer parameter names such as
  # "CFID=" or "doid=", and \d+ rejects an empty id (the original \d* did not).
  PATTERNS = [
    Regexp.new('\bid=(\d+)', Regexp::IGNORECASE)
  ]
  BASE_URL = 'http://dl.acm.org/exportformats.cfm?expformat=bibtex&id='

  # Returns the BibTeX export URL for the article id found in @value.
  #
  # Raises MementoException when @value is missing or contains no id.
  def get_citation_url
    validate
    PATTERNS.each do |pattern|
      m = @value.match(pattern)
      return BASE_URL + m[1] if m
    end
    raise MementoException, "Error: Unable to find citation information"
  end

  # The export fetched from get_citation_url is BibTeX.
  def get_citation_format
    'bibtex'
  end
end
24 changes: 24 additions & 0 deletions parser/url/ASMParser.rb
@@ -0,0 +1,24 @@
require 'uri'
# URL parser for ASM journals (*.asm.org): derives the journal sub-domain and
# the article path from the page URL and builds the site's BibTeX export URL.
class ASMParser < Memento::Parser::UrlParser
  # Capture 1 is the journal sub-domain, capture 2 the article path.
  #
  # The view names are real alternations now. The original wrote them as
  # character classes ("[abstract|full]"), which match any single one of
  # those letters; that let the first pattern match ".../content/1/2/3.abstract?x"
  # via the "t/" in "content/" and capture the ".abstract" suffix as well.
  # "long" is listed because ".long" URLs matched the old character class
  # ('l'); "short" is added alongside it.
  # NOTE(review): confirm ".long"/".short" are the view names the site uses.
  PATTERNS = [
    Regexp.new('http:\/\/(.*)\.asm\.org.*(?:abstract|full|reprint)\/(.*)\?', Regexp::IGNORECASE),
    Regexp.new('http:\/\/(.*)\.asm\.org\/content\/(.*)\.(?:abstract|full|long|short)')
  ].freeze
  BASE_URL = 'http://DOMAIN.asm.org/citmgr?type=bibtex&gca='

  # Returns the BibTeX export URL for the article referenced by @value.
  #
  # Raises MementoException when @value is missing or matches no pattern.
  def get_citation_url
    validate
    PATTERNS.each do |pattern|
      match = @value.match(pattern)
      next unless match

      # URI.escape was removed in Ruby 3.0; the default parser's #escape
      # applies the same escaping rules.
      gca = URI::DEFAULT_PARSER.escape("#{match[1]};#{match[2]}")
      return BASE_URL.gsub('DOMAIN', match[1]) + gca
    end
    raise MementoException, "Error: Unable to find link to bibtex"
  end

  # The export fetched from get_citation_url is BibTeX.
  def get_citation_format
    "bibtex"
  end
end
44 changes: 44 additions & 0 deletions parser/url/AmazonParser.rb
@@ -0,0 +1,44 @@
# URL parser for Amazon product pages: extracts the ASIN from the page URL
# and builds the product-lookup request for it.
#
# NOTE(review): this parser is unfinished - get_data is still a stub and
# get_citation_format is not implemented.
class AmazonParser < Memento::Parser::UrlParser
  # NOTE(review): an AWS access key id is embedded in this URL; it should be
  # moved to configuration rather than kept in source.
  # This was a class-level instance variable (@BASE_URL), which is nil when
  # read from instance methods; a constant is visible from both.
  BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=1E6W7G64405195A1J702&Operation=ItemLookup&ResponseGroup=Medium&ItemId='

  # Patterns whose first capture group is the ASIN. \w+ is used throughout
  # so that alphanumeric identifiers are not truncated or captured as "".
  PATTERNS = [
    Regexp.new('/gp/product/(\w+)', Regexp::IGNORECASE),
    Regexp.new('/ASIN/(\w+)', Regexp::IGNORECASE),
    Regexp.new('/dp/(\w+)', Regexp::IGNORECASE)
  ].freeze

  # Original PHP logic that PATTERNS / get_asin replace, kept for reference
  # (as live code it made this file unparsable):
  #   if(preg_match("#/gp/product/(\d*)#", $url, $matches))
  #   return $matches[1];
  #   if(preg_match("#/ASIN/(\d*)#", $url, $matches))
  #   return $matches[1];
  #   if(preg_match("#/dp/(\w*)#", $url, $matches))
  #   return $matches[1];

  # TODO: not implemented yet; currently returns nil.
  def get_data

  end

  protected

  # Returns the product-lookup URL for the ASIN found in @value.
  #
  # Raises MementoException when no ASIN can be extracted.
  def get_citation_url
    asin = get_asin
    raise MementoException, 'Unable to get Amazon Standard Identification Number (ASIN).' if asin.nil?

    BASE_URL + asin
  end

  # Returns the ASIN captured by the first matching pattern, or nil when
  # @value matches none. (The original fell through and returned the
  # PATTERNS array, so the caller's "== -1" check could never fire.)
  def get_asin
    PATTERNS.each do |pattern|
      matches = @value.to_s.match(pattern)
      return matches[1] if matches
    end
    nil
  end

  # TODO: decide which text format the lookup response should be parsed as.
  def get_citation_format
    raise MementoException, 'Called abstract method: get_citation_format'
  end
end
32 changes: 32 additions & 0 deletions parser/url/BlackwellSynergyParser.rb
@@ -0,0 +1,32 @@
# URL parser for Wiley Online Library: extracts the DOI from the page URL
# and submits the site's citation-download form asking for BibTeX.
class BlackwellSynergyParser < Memento::Parser::UrlParser

  # Capture 1 is the DOI. The view names are a real alternation now; the
  # original "[abstract|full]" was a character class matching any single one
  # of those letters, so a URL such as ".../doi/10.1002/art.1/pdf" yielded
  # the truncated DOI "10.1002".
  PATTERNS = [
    Regexp.new('doi/(.*)/(?:abstract|full)', Regexp::IGNORECASE)
  ]
  BASE_URL = 'http://onlinelibrary.wiley.com/documentcitationdownloadformsubmit'

  # Returns the form fields for the citation-download request, including the
  # DOI extracted from @value.
  #
  # Raises MementoException when @value is missing or contains no DOI.
  def get_form_parameters
    validate
    params = {'hasAbstract' => 'CITATION_AND_ABSTRACT', 'fileFormat' => 'BIBTEX', 'submit' => 'Submit'}

    PATTERNS.each do |pattern|
      match = @value.match(pattern)
      next unless match

      params['doi'] = match[1]
      break
    end

    raise MementoException, "Error: Unable to find DOI" unless params.has_key?('doi')

    params
  end

  # The citation form is always submitted to the same address.
  def get_citation_url
    BASE_URL
  end

  # The form is asked for BibTeX (see 'fileFormat' in get_form_parameters).
  def get_citation_format
    'bibtex'
  end
end

0 comments on commit afc595e

Please sign in to comment.