Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit afc595e
Showing
27 changed files
with
1,050 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
require 'curb' | ||
require 'cgi' | ||
|
||
require_relative 'MementoException' | ||
require_relative 'parser/ParserManager' | ||
require_relative 'writer/WriterManager' | ||
|
||
module Memento
  # Converts citation data from one representation to another.
  #
  # input_format  - 'site' (any case, surrounding whitespace ignored) when
  #                 +value+ is a URL to scrape; otherwise the name of a text
  #                 format passed to ParserManager.get_text_parser.
  # output_format - name of the writer passed to WriterManager.get_writer.
  # value         - the URL or the raw citation text.
  #
  # Returns whatever the selected writer's +export+ returns.
  # Raises MementoException when an argument is nil/empty or no parser is found.
  def self.transform(input_format, output_format, value)
    # Sanity checks
    raise MementoException, "Error: Missing required parameter: input_format" if input_format.nil? || input_format.empty?
    raise MementoException, "Error: Missing required parameter: output_format" if output_format.nil? || output_format.empty?
    raise MementoException, "Error: Missing required parameter: text" if value.nil? || value.empty?

    # 'site' means +value+ is a website URL: pick the parser from the URL.
    parser =
      if input_format.downcase.strip == 'site'
        Memento::ParserManager.get_url_parser(value)
      else
        Memento::ParserManager.get_text_parser(input_format)
      end
    raise MementoException, "Unable to find required parser" if parser.nil?

    parser.value = value
    data = parser.get_data

    Memento::WriterManager.get_writer(output_format).export(data)
  end

  # Fetches +url+ and returns the response body.
  #
  # url        - address to request.
  # parameters - Hash of form fields; when non-empty the request is sent as a
  #              form-encoded POST, otherwise as a GET.
  # referer    - accepted for interface compatibility; it is currently not
  #              sent with the request.
  #
  # Returns the body string reported by curb (may be nil if nothing was read).
  def self.get_page(url, parameters = {}, referer = nil)
    c = Curl::Easy.new(url)
    c.follow_location = true
    c.header_in_body = false
    c.useragent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 GTB5'
    c.enable_cookies = true

    post_body = nil
    if parameters && !parameters.empty?
      # Escape both keys and values; to_s lets callers pass symbols/numbers.
      post_body = parameters.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }.join('&')
    end

    # Sometimes, especially for PubMed URLs, the transfer falsely reports
    # PartialFileError although the body was retrieved, so errors are
    # deliberately ignored and whatever body was received is returned.
    begin
      if post_body
        # http_post performs the request itself; calling perform afterwards
        # would send the request a second time.
        c.http_post(post_body)
      else
        c.perform
      end
    rescue StandardError
      # best effort: fall through and return what was read
    end

    c.body_str
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Error type raised by Memento for invalid input and failed lookups.
#
# It derives from StandardError rather than Exception: StandardError covers
# application-level errors, whereas Exception also covers environment-level
# errors.
class MementoException < StandardError
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
module Memento
  module Parser
    # Base class for parsers that work on raw citation text held in +value+.
    class TextParser
      attr_accessor :value

      # Raises MementoException when no text has been assigned to +value+.
      def validate
        raise MementoException, "Error: Missing required text" if @value.nil? || @value.empty?
      end

      # Abstract: subclasses extract citation information from +value+ and
      # return the parsed entries. The base implementation always raises.
      def get_data
        raise 'calling abstract method: get_data'
      end
    end

    # Base class for parsers that start from a web page URL held in +value+,
    # download the citation export for that page and delegate to a text parser.
    class UrlParser
      attr_accessor :value

      # Raises MementoException when no URL has been assigned to +value+.
      def validate
        raise MementoException, 'Invalid Url' if @value.nil? || @value.empty?
      end

      # Downloads the citation for the URL in +value+ and parses it.
      #
      # Returns whatever the text parser for get_citation_format returns from
      # its own get_data.
      # Raises MementoException when the URL is missing or the download
      # yields no content.
      def get_data
        validate
        to = get_citation_url
        params = get_form_parameters
        referrer = get_referrer

        citation = Memento.get_page(to, params, referrer)
        raise MementoException, 'Error: Unable to fetch citation details' if citation.to_s.strip.empty?

        text_parser = Memento::ParserManager.get_text_parser(get_citation_format)
        text_parser.value = citation
        text_parser.get_data
      end

      protected

      # Form fields to POST when fetching the citation requires submitting a
      # form. An empty Hash (the default) means a plain GET.
      def get_form_parameters
        {}
      end

      # Referrer for the citation request: the page the user supplied.
      def get_referrer
        @value
      end

      #============ ABSTRACT METHODS ============#
      # Subclasses of UrlParser need to implement at least these two methods.
      #==========================================#

      # Returns the URL from which the citation details can be fetched.
      def get_citation_url
        raise MementoException, 'Called abstract method: get_citation_url'
      end

      # Returns the format of the downloaded citation (a text parser name).
      def get_citation_format
        raise MementoException, 'Called abstract method: get_citation_format'
      end
    end
  end
end
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
require_relative 'Parser' | ||
|
||
#Text Parser | ||
require_relative 'text/BibTexParser' | ||
|
||
#Url Parser | ||
require_relative 'url/ASMParser' | ||
require_relative 'url/HubMedParser' | ||
require_relative 'url/ACMPortalParser' | ||
require_relative 'url/BlackwellSynergyParser' | ||
require_relative 'url/PubmedParser' | ||
|
||
|
||
module Memento
  # Registry that maps text formats and website domains to parser classes.
  module ParserManager
    # Text format name (lower case) => display name and parser class name.
    TEXT_PARSER = {
      'bibtex' => { 'name' => 'BibTeX', 'parser' => 'BibTexParser' }
    }

    # Domain fragment looked for in a URL => site info and parser class name.
    WEBSITES = {
      'asm.org' => { 'name' => 'ASM Journals', 'link' => 'http://journals.asm.org/', 'parser' => 'ASMParser' },
      'hubmed.org' => { 'name' => 'Hubmed', 'link' => "http://www.hubmed.org", 'parser' => 'HubMedParser' },
      'dl.acm.org' => { 'name' => 'ACM Digital Library', 'link' => 'http://dl.acm.org/', 'parser' => 'ACMPortalParser' },
      'ncbi.nlm.nih.gov' => { 'name' => 'PubMed', 'link' => 'http://www.pubmed.gov', 'parser' => 'PubmedParser' },
      'onlinelibrary.wiley.com' => { 'name' => 'Wiley Online Library', 'link' => "http://onlinelibrary.wiley.com", 'parser' => 'BlackwellSynergyParser' }
    }

    # Returns a new parser instance for the given text format.
    #
    # format - format name; matched case-insensitively, whitespace trimmed.
    #
    # Raises MementoException when +format+ is missing or not registered.
    def self.get_text_parser(format)
      raise MementoException, "Error: Missing required parameter: format" if format.nil? || format.empty?

      info = TEXT_PARSER[format.to_s.downcase.strip]
      raise MementoException, "Error: unsupported text format: #{format}" if info.nil?

      Object.const_get(info['parser']).new
    end

    # Returns a new parser instance for the first registered website whose
    # domain occurs in +url+.
    #
    # Raises MementoException when +url+ is missing or no website matches.
    def self.get_url_parser(url)
      raise MementoException, "Error: Missing required parameter url" if url.nil? || url.empty?

      WEBSITES.each do |domain, info|
        # Escape the domain so its dots match literally; host names are
        # case-insensitive, hence the /i flag.
        return Object.const_get(info['parser']).new if url =~ /#{Regexp.escape(domain)}/i
      end
      raise MementoException, "Error: Parsing is not supported for this website"
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
require 'libxml' | ||
|
||
# Parser for an Amazon item-lookup XML document held in +value+.
#
# NOTE(review): this class is not valid Ruby as written. Only the first few
# statements of get_data are Ruby; everything from `$item = $xml->Items->Item;`
# onward uses PHP syntax (`->`, `throw new Exception`, `curl_init`, `foreach`).
# It looks like an unfinished port of a PHP implementation -- confirm before
# loading this file.
class AmazonXMLParser < Memento::Parser::UrlParser | ||
|
||
# Validates +value+, parses it as XML and looks up './Items/Item'.
# NOTE(review): the `if` below has no body or matching `end` before the PHP
# section starts, and it compares the string 'Book' with `item.attributes`;
# the PHP reference compares against ItemAttributes->ProductGroup instead.
def get_data | ||
validate | ||
doc = XML::Parser.string(@value) | ||
item = doc.parse | ||
item = item.root.find('./Items/Item') | ||
if('Book' != item.attributes) | ||
|
||
# ---- PHP reference code below (not Ruby) ----
# It rejects non-book items, copies url/title/publisher/pages into $article,
# downloads the small cover image to a randomly named file, splits the
# publication date into year/month/day, collects the authors and returns
# them as $data[0] = array('Article' => ..., 'Author' => ...).
||
$item = $xml->Items->Item; | ||
if('Book' != (string)$item->ItemAttributes->ProductGroup) | ||
throw new Exception("Currently only books can be imported from Amazon"); | ||
|
||
$article['doctype'] = 'book'; | ||
$article['url'] = trim($item->DetailPageURL); | ||
$article['title'] = trim($item->ItemAttributes->Title); | ||
$article['publisher']=trim($item->ItemAttributes->Publisher); | ||
$article['pages'] = trim($item->ItemAttributes->NumberOfPages); | ||
$imgUrl = trim($item->SmallImage->URL); | ||
|
||
if(!empty($imgUrl)){ | ||
$ch = curl_init($imgUrl); | ||
$ext = strtolower(end(explode('.', $imgUrl))); | ||
uses('neat_string'); | ||
$neat = new NeatString(); | ||
$filename = $neat->randomPassword(10) . '.' . $ext; | ||
$fp = fopen(ARTICLE_ICON . $filename, 'w'); | ||
curl_setopt($ch, CURLOPT_FILE, $fp); | ||
curl_setopt($ch, CURLOPT_HEADER, 0); | ||
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); | ||
curl_exec($ch); | ||
curl_close($ch); | ||
fclose($fp); | ||
$article['img'] = ARTICLE_ICON_URL.$filename; | ||
} | ||
|
||
|
||
|
||
list($article['year'], $article['month'], $article['day']) = DateUtil::getCleanDate((string)$item->ItemAttributes->PublicationDate); | ||
|
||
foreach($item->ItemAttributes->Author as $author) | ||
$authors[] = (string) $author; | ||
|
||
$data[0] = array('Article'=>$article, 'Author' => $authors); | ||
return $data; | ||
} | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
require 'bibtex' | ||
|
||
# Text parser for BibTeX input.
class BibTexParser < Memento::Parser::TextParser
  # Hands the raw text stored in +value+ to the bibtex library and returns
  # its parse result unchanged.
  def get_data
    BibTeX.parse(@value)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# URL parser for the ACM Digital Library (dl.acm.org).
class ACMPortalParser < Memento::Parser::UrlParser
  # Patterns that capture the numeric article id from a page URL.
  # The id must be a parameter of its own (start of string or after ?, & or ;)
  # so that names merely ending in "id" (doid=, CFID=) are not mistaken for
  # it, and it must contain at least one digit.
  PATTERNS = [
    Regexp.new('(?:\A|[?&;])id=(\d+)', Regexp::IGNORECASE)
  ].freeze
  BASE_URL = 'http://dl.acm.org/exportformats.cfm?expformat=bibtex&id='

  # Returns the BibTeX export URL for the article id found in +value+.
  #
  # Raises MementoException when +value+ is missing or contains no id.
  def get_citation_url
    validate
    PATTERNS.each do |pattern|
      m = @value.match(pattern)
      return BASE_URL + m[1] if m
    end
    raise MementoException, "Error: Unable to find citation information"
  end

  # The export URL serves BibTeX.
  def get_citation_format
    'bibtex'
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
require 'uri' | ||
# URL parser for ASM journals (<journal>.asm.org).
class ASMParser < Memento::Parser::UrlParser
  # Each pattern captures (1) the journal sub-domain and (2) the article path.
  # The page types are written as alternations; a bracket expression such as
  # [abstract|full] would only match a single character from that set.
  PATTERNS = [
    Regexp.new('https?:\/\/(.*)\.asm\.org.*(?:abstract|full|reprint)\/(.*)\?', Regexp::IGNORECASE),
    Regexp.new('https?:\/\/(.*)\.asm\.org\/content\/(.*)\.(?:abstract|full)')
  ].freeze
  # DOMAIN is replaced with the journal sub-domain taken from the page URL.
  BASE_URL = 'http://DOMAIN.asm.org/citmgr?type=bibtex&gca='

  # Returns the citation-manager URL that serves BibTeX for the article in
  # +value+.
  #
  # Raises MementoException when +value+ is missing or matches no pattern.
  def get_citation_url
    validate
    PATTERNS.each do |pattern|
      match = @value.match(pattern)
      next unless match

      journal = match[1]
      article = match[2]
      # URI.escape was removed in Ruby 3.0; the default parser's escape
      # keeps the same escaping rules.
      return BASE_URL.gsub('DOMAIN', journal) + URI::DEFAULT_PARSER.escape("#{journal};#{article}")
    end
    raise MementoException, "Error: Unable to find link to bibtex"
  end

  # The citation manager is asked for BibTeX (type=bibtex).
  def get_citation_format
    "bibtex"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# URL parser for Amazon product pages.
#
# TODO: unfinished -- get_data is an empty stub and get_citation_format is
# still abstract, so this parser cannot produce citation data yet.
class AmazonParser < Memento::Parser::UrlParser
  # Item-lookup endpoint; the ASIN is appended to it.
  # NOTE(review): the AWS access key is hard-coded in this URL -- consider
  # moving it to configuration.
  BASE_URL = 'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService&AWSAccessKeyId=1E6W7G64405195A1J702&Operation=ItemLookup&ResponseGroup=Medium&ItemId='

  # Patterns that capture the ASIN from a product URL; each requires at
  # least one character so an empty id is never accepted.
  PATTERNS = [
    Regexp.new('/gp/product/(\d+)', Regexp::IGNORECASE),
    Regexp.new('/ASIN/(\d+)', Regexp::IGNORECASE),
    Regexp.new('/dp/(\w+)', Regexp::IGNORECASE)
  ].freeze

  # TODO: not implemented. This overrides UrlParser#get_data and currently
  # returns nil.
  def get_data
  end

  protected

  # Returns the item-lookup URL for the ASIN found in +value+.
  #
  # Raises MementoException when +value+ is missing or contains no ASIN.
  def get_citation_url
    validate
    asin = get_asin
    raise MementoException, 'Unable to get Amazon Standard Identification Number (ASIN).' if asin.nil?

    BASE_URL + asin
  end

  # Returns the ASIN captured by the first matching pattern, or nil when
  # none of the patterns match.
  def get_asin
    PATTERNS.each do |pattern|
      matches = @value.match(pattern)
      return matches[1] if matches
    end
    nil
  end

  # TODO: decide which text parser handles the lookup response.
  def get_citation_format
    raise MementoException, 'Called abstract method: get_citation_format'
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# URL parser for Wiley Online Library (onlinelibrary.wiley.com).
class BlackwellSynergyParser < Memento::Parser::UrlParser
  # Captures the DOI from .../doi/<DOI>/abstract or .../doi/<DOI>/full.
  # The page type is an alternation; a bracket expression such as
  # [abstract|full] would only match a single character from that set.
  PATTERNS = [
    Regexp.new('doi/(.+)/(?:abstract|full)', Regexp::IGNORECASE)
  ].freeze
  BASE_URL = 'http://onlinelibrary.wiley.com/documentcitationdownloadformsubmit'

  # Builds the form fields for the citation download form, including the
  # DOI extracted from +value+.
  #
  # Raises MementoException when +value+ is missing or contains no DOI.
  def get_form_parameters
    validate
    params = { 'hasAbstract' => 'CITATION_AND_ABSTRACT', 'fileFormat' => 'BIBTEX', 'submit' => 'Submit' }

    PATTERNS.each do |pattern|
      match = @value.match(pattern)
      next unless match

      params['doi'] = match[1]
      break
    end

    raise MementoException, "Error: Unable to find DOI" unless params.key?('doi')

    params
  end

  # The citation is obtained by submitting the download form at BASE_URL.
  def get_citation_url
    BASE_URL
  end

  # The form is submitted with fileFormat=BIBTEX.
  def get_citation_format
    'bibtex'
  end
end
Oops, something went wrong.