Skip to content

Commit

Permalink
WINNING
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob McGrail committed Oct 17, 2012
1 parent 1ca22ea commit 84c40fa
Show file tree
Hide file tree
Showing 15 changed files with 140 additions and 97 deletions.
6 changes: 3 additions & 3 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ directories:
xml: './output/xml'

ids:
homepage: 2
images: '1b26c0454b09bb49dfb1b9190ffd67cb'
files: '0b113a208f7890f9ad3c24444ff5988c'
homepage: '547cc4ebe61632474a33700a26e8b21b'
images: 'e7ff633c6b8e0fd3531e74c6e712bead'
files: '732a5acd01b51a6fe6eab448ad4138a9'
start: 10000

db: 3
19 changes: 17 additions & 2 deletions lib/helpers/string_from_path.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,28 @@ def self.get_case_insensitive(path)


# Read the contents of +path+ as a string.
#
# When +path+ exists and is not a directory, its contents are returned.
# Otherwise the contents of "+path+/index.htm" are returned, when that
# file exists.
#
# Returns the file contents as a String, or nil when neither file is
# present.
def self.get(path)
  # File.exist? replaces File.exists?, which is deprecated and no longer
  # available in current Ruby releases. File.read opens, reads and closes
  # in one call, so no handle is left open if reading raises.
  return File.read(path) if File.exist?(path) && !File.directory?(path)

  # Checking for index.htm file.
  index = path + '/index.htm'
  File.file?(index) ? File.read(index) : nil
end
end
33 changes: 13 additions & 20 deletions lib/inports/ezpub/file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class File < EzPub::Handler
extend StaticCopy
extend NameMaker
extend MediaPathHelper
extend ImportPathHelper

def self.priority
# I should run nearly last,
Expand All @@ -26,32 +27,24 @@ def self.mine?(path)
raise BadHandlerOrder, "#{path} flagged as image from generic File handler."
end

# ::binary? method from ptools gem.
#
# It performs a "best guess" based on a simple test
# of the first +File.blksize+ characters.

exts = /\.#{EZP_ICON_BINARY_EXTENSIONS.join('|')}$/

if ::File.binary?(path)

# Use MediaPathHelper module to create a hierarchy of media library
# folders for this path, if needed.

unless has_media_path? path, 'files'
create_media_path(path, 'files')
end
# Use MediaPathHelper module to create a hierarchy of media library
# folders for this path, if needed.

if exts.match(path.downcase)
true
else
Logger.warning path, 'Unknown ext for file'
false
end
unless has_media_path? path, 'files'
create_media_path(path, 'files')
end

if exts.match(path.downcase)
true
else
Logger.warning path, 'Unknown ext for file'
false
end

else
false
end
end

Expand All @@ -74,7 +67,7 @@ def self.store(path)

$r.hset path, 'fields', 'file:ezbinaryfile,name:ezstring'

$r.hset path, 'field_file', dest
$r.hset path, 'field_file', trim_for_ezp(dest)
$r.hset path, 'field_name', pretify_filename(dest)
end
end
Expand Down
11 changes: 1 addition & 10 deletions lib/inports/ezpub/general_content.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def self.priority


def self.mine?(path)
if page?(path) || ::File.exists?(path + '/index.htm')
if page?(path)
true
else
false
Expand All @@ -28,15 +28,6 @@ def self.mine?(path)
def self.store(path)
filepath = path

unless page? path
if ::File.exists? path + '/index.htm'
filepath = path + '/index.htm'
else
raise JustAFolder, "#{path} has no index.htm!"
end
end


$r.log_key(path)

$r.hset path, 'id', $r.get_id
Expand Down
3 changes: 2 additions & 1 deletion lib/inports/ezpub/image.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class Image < EzPub::Handler
extend StaticCopy
extend NameMaker
extend MediaPathHelper
extend ImportPathHelper

def self.priority
99
Expand Down Expand Up @@ -64,7 +65,7 @@ def self.store(path)

$r.hset path, 'fields', 'image:ezimage,name:ezstring'

$r.hset path, 'field_image', dest
$r.hset path, 'field_image', trim_for_ezp(dest)
$r.hset path, 'field_name', pretify_filename(dest)
end
end
Expand Down
33 changes: 1 addition & 32 deletions lib/inports/ezpub/media_folder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,38 +35,7 @@ def self.store(path)

$r.hset path, 'id', $r.get_id

begin
parent = parent_id path

# Since orphanity, in this case, could just mean that we've bottomed out
# on the non-standard /files and /images folders, we test for that before
# raising a general exception.

rescue Orphanity

# This hard codes the expectation that input paths will begin with '.'.
#
# It checks that this is legitmately a bottom level path.

if path._parentize =~ /^media:\w+:\.$/

case /media:([^:]+)/.match(path)[1]

# Case switch for appropriate folder values in
# config file.

when 'images'
parent = CONFIG['ids']['images']
when 'files'
parent = CONFIG['ids']['files']
else
raise BadPath, "Unhandled MediaFolder parent due to no matches in predefined folders."
end

else
raise Orphanity, "Unhandled MediaFolder parent. Unhandled path didn't bottom out as expected."
end
end
parent = parent_id path

$r.hset path, 'parent', parent

Expand Down
2 changes: 1 addition & 1 deletion lib/inports/ezpub/mine_modules/ezp_extensions.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# File extensions (without the leading dot) grouped by kind.
# Frozen so the shared lists cannot be mutated by accident.
EZP_ICON_BINARY_EXTENSIONS = %w[doc docx mp3 pdf ppt pps ppsx pptx rtf wav xls xlsx zip pub rar].freeze
EZP_ICON_IMAGE_EXTENSIONS = %w[jpg jpeg gif bmp png].freeze
19 changes: 13 additions & 6 deletions lib/inports/ezpub/mine_modules/include_or_page.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
# Path predicates for telling pages and includes apart.
module IncludeOrPage
  # Is +path+ a page?
  #
  # * A path ending in ".htm" or ".html" is a page, unless the file itself
  #   is named index.htm / index.html.
  # * Any other path is a page only when "+path+/index.htm" exists on disk.
  #
  # Returns true or false.
  def page?(path)
    if path =~ /\.html?\z/
      # Test the basename only, so names such as "reindex.htm" still count
      # as pages.
      ::File.basename(path) !~ /\Aindex\.html?\z/
    else
      # File.exist? replaces File.exists?, which is deprecated and no
      # longer available in current Ruby releases.
      ::File.exist?(path + '/index.htm')
    end
  end


  # Is +path+ an include, i.e. does it end in ".cfm"?
  #
  # Returns true or false.
  def include?(path)
    path =~ /\.cfm\z/ ? true : false
  end
end
13 changes: 13 additions & 0 deletions lib/inports/ezpub/mine_modules/is_a_redirect.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Detects documents that declare a 301 redirect through <cfheader> tags.
module IsARedirect
  # Does the document at +path+ declare a 301 redirect?
  #
  # Returns the value attribute of the first <cfheader name="Location">
  # tag when the document also contains a <cfheader statuscode="301"> tag.
  # Returns false when there is no 301 header, no Location header, or no
  # readable content.
  def redirect?(path)
    str = StringFromPath.get_case_insensitive(path)

    # NOTE(review): presumably nil is returned for an unreadable path —
    # confirm against StringFromPath.get_case_insensitive.
    return false if str.nil?

    doc = Nokogiri::HTML(str)

    # xpath returns a node set, which is truthy even when empty, so the
    # match has to be tested with empty? rather than used as a condition.
    return false if doc.xpath("//cfheader[@statuscode='301']").empty?

    # Return the destination.
    location = doc.xpath("//cfheader[@name='Location']").first
    (location && location[:value]) || false
  end
end
36 changes: 36 additions & 0 deletions lib/inports/ezpub/redirect.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
module EzPub
  # Handler that claims redirect pages and stores nothing for them.
  class Redirect < EzPub::Handler
    EzPub::HandlerSets::All << self
    EzPub::HandlerSets::Content << self

    extend IsARedirect

    # Identifies redirects, and then ignores them.
    # Runs before content classes so they don't have to deal
    # with this nonsense.

    def self.priority
      1
    end


    # True when +path+ is a page and that page is a redirect.
    #
    # NOTE(review): page? is not provided by IsARedirect; presumably it is
    # made available through EzPub::Handler — confirm.
    def self.mine?(path)
      page?(path) && redirect?(path) ? true : false
    end


    # Nothing is stored for a redirect; it is only logged.
    def self.store(path)
      # The comma after +path+ was missing, so this line never called
      # Logger.warning with the intended three arguments.
      Logger.warning path, 'ignored redirect', 'shhh'
    end
  end
end
5 changes: 5 additions & 0 deletions lib/inports/ezpub/store_modules/import_path_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Helpers for preparing local paths before they are stored for import.
module ImportPathHelper
  # Remove a single leading "./" from +path+.
  #
  # Anchored with \A so only the very start of the string is trimmed; a
  # "./" at the start of a later line in the string is left untouched.
  #
  # Returns a new String; +path+ itself is not modified.
  def trim_for_ezp(path)
    path.sub(%r{\A\./}, '')
  end
end
24 changes: 18 additions & 6 deletions lib/inports/processor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ class Processor

# Pass in an optional input folder and handlers constant.

@@runs = 0

def initialize(opts = {})
@root = opts[:root] || CONFIG['directories']['input']
@handlers = opts[:handlers] || EzPub::HandlerSets::All
Expand All @@ -26,25 +28,35 @@ def ingest
unless handle path

# Given multiple runs of #ingest, we track unhandled items in
# a set, adding and removing paths as they are handled or vice
# versa.
# a set, adding paths as they fail handling.
#
# These are then logged in Processor#log_unhandled

$r.sadd 'unhandled', path
else
$r.srem 'unhandled', path
$r.sadd "unhandled-#{@@runs}", path
end
end

# Increment run count to distinguish unhandled sets.
@@runs += 1
end


# See comments for Processor#ingest

# Log every path that was left unhandled by all runs of #ingest, then
# delete the per-run "unhandled-<n>" sets.
def log_unhandled
  # Use @@runs count to collect up our unhandled set keys.
  sets = Array.new(@@runs) { |i| "unhandled-#{i}" }

  # No run has happened yet, so there are no keys to intersect or delete;
  # calling sinter/del without keys would be an error.
  return if sets.empty?

  # Iterate through the intersection of these sets.
  # A path is in the intersection only when every run failed to handle it,
  # i.e. it was not handled by any run.

  $r.sinter(*sets).each do |k|
    Logger.warning k, 'Unhandled', 'shh'
  end

  $r.del sets
end


Expand Down
10 changes: 8 additions & 2 deletions lib/inports/redis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def kill_keys

$r.del 'post_process'
$r.del 'keys'
$r.del 'unhandled'

# Attempt to delete unhandled sets in the event of a broken run.
5.times {|i| $r.del "unhandled-#{i}"}
end
end

Expand All @@ -46,5 +48,9 @@ def kill_keys
# Set node id incrementer to our safe offset.
$r.set 'idcount', CONFIG['ids']['start']

# Set input directory path as having the eZPublish homepage node id.
# Set input directory path as having the eZPublish homepage remote id.
$r.hset CONFIG['directories']['input'], 'id', CONFIG['ids']['homepage']

# Set media folders paths as having the appropriate remote ids.
$r.hset 'media:files:.', 'id', CONFIG['ids']['files']
$r.hset 'media:images:.', 'id', CONFIG['ids']['images']
17 changes: 3 additions & 14 deletions scratch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,8 @@

$r.kill_keys

$r.hset './input/curriculum-support', 'id', '123'
# puts EzPub::File.mine?('./input/curriculum-support/pdfs/tl-learning-progression-diagrams-oct-2010.pdf')

EzPub::GeneralContent.store('./input/curriculum-support/index.htm')
include IsARedirect



# CONFIG['directories']['input'] = './test/mocks'

# result = resolve_includes './test/mocks/has_includes.htm'




#puts result #if $verbose

# puts Crawler.new.list
redirect?('./input/curriculum-support/CSP/index.htm')
6 changes: 6 additions & 0 deletions test/test_include_of_page.rb → test/test_include_or_page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,10 @@ def test_page_correctly_identifies_page_paths
refute page? './some/path/thing.cfm'
refute page? './some/path/thing.jpg'
end


# page? must reject paths whose file is index.htm or index.html.
def test_page_doesnt_acknowledge_indexs
refute page? './some/path/index.htm'
refute page? './some/path/index.html'
end
end

0 comments on commit 84c40fa

Please sign in to comment.