Skip to content

Commit

Permalink
Merge pull request jekyll#368 from jacius/wordpress
Browse files Browse the repository at this point in the history
Greatly improved and extended the wordpress.rb migrator.
  • Loading branch information
mojombo committed Jan 22, 2012
2 parents 1292c02 + 1b2b5ff commit 22159b3
Showing 1 changed file with 273 additions and 41 deletions.
314 changes: 273 additions & 41 deletions lib/jekyll/migrators/wordpress.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,52 +11,284 @@

module Jekyll
module WordPress
def self.process(dbname, user, pass, host = 'localhost', table_prefix = 'wp_')
db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8')

# Main migrator function. Call this to perform the migration.
#
# dbname:: The name of the database
# user:: The database user name
# pass:: The database user's password
# host:: The address of the MySQL database host. Default: 'localhost'
# options:: A hash table of configuration options.
#
# Supported options are:
#
# :table_prefix:: Prefix of database tables used by WordPress.
# Default: 'wp_'
# :clean_entities:: If true, convert non-ASCII characters to HTML
# entities in the posts, comments, titles, and
# names. Requires the 'htmlentities' gem to
# work. Default: true.
# :comments:: If true, migrate post comments too. Comments
# are saved in the post's YAML front matter.
# Default: true.
# :categories:: If true, save the post's categories in its
# YAML front matter.
# :tags:: If true, save the post's tags in its
# YAML front matter.
# :more_excerpt:: If true, when a post has no excerpt but
# does have a <!-- more --> tag, use the
# preceding post content as the excerpt.
# Default: true.
# :more_anchor:: If true, convert a <!-- more --> tag into
# two HTML anchors with ids "more" and
# "more-NNN" (where NNN is the post number).
# Default: true.
# :status:: Array of allowed post statuses. Only
# posts with matching status will be migrated.
# Known statuses are :publish, :draft, :private,
# and :revision. If this is nil or an empty
# array, all posts are migrated regardless of
# status. Default: [:publish].
#
def self.process(dbname, user, pass, host='localhost', options={})
options = {
:table_prefix => 'wp_',
:clean_entities => true,
:comments => true,
:categories => true,
:tags => true,
:more_excerpt => true,
:more_anchor => true,
:status => [:publish] # :draft, :private, :revision
}.merge(options)

if options[:clean_entities]
begin
require 'htmlentities'
rescue LoadError
STDERR.puts "Could not require 'htmlentities', so the " +
":clean_entities option is now disabled."
options[:clean_entities] = false
end
end

FileUtils.mkdir_p("_posts")

# Reads a MySQL database via Sequel and creates a post file for each
# post in wp_posts that has post_status = 'publish'. This restriction is
# made because 'draft' posts are not guaranteed to have valid dates.
query = "SELECT post_title, \
post_name, \
post_date, \
post_content, \
post_excerpt, \
ID, \
guid \
FROM #{table_prefix}posts \
WHERE post_status = 'publish' AND \
post_type = 'post'"

db[query].each do |post|
# Get required fields and construct Jekyll compatible name.
title = post[:post_title]
slug = post[:post_name]
date = post[:post_date]
content = post[:post_content]
name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, date.day,
slug]

# Get the relevant fields as a hash, delete empty fields and convert
# to YAML for the header.
data = {
'layout' => 'post',
'title' => title.to_s,
'excerpt' => post[:post_excerpt].to_s,
'wordpress_id' => post[:ID],
'wordpress_url' => post[:guid],
'date' => date
}.delete_if { |k,v| v.nil? || v == '' }.to_yaml

# Write out the data and content to file
File.open("_posts/#{name}", "w") do |f|
f.puts data
f.puts "---"
f.puts content
db = Sequel.mysql(dbname, :user => user, :password => pass,
:host => host, :encoding => 'utf8')

px = options[:table_prefix]

posts_query = "
SELECT
posts.ID AS `id`,
posts.guid AS `guid`,
posts.post_type AS `type`,
posts.post_status AS `status`,
posts.post_title AS `title`,
posts.post_name AS `slug`,
posts.post_date AS `date`,
posts.post_content AS `content`,
posts.post_excerpt AS `excerpt`,
posts.comment_count AS `comment_count`,
users.display_name AS `author`,
users.user_login AS `author_login`,
users.user_email AS `author_email`,
users.user_url AS `author_url`
FROM #{px}posts AS `posts`
LEFT JOIN #{px}users AS `users`
ON posts.post_author = users.ID"

if options[:status] and not options[:status].empty?
status = options[:status][0]
posts_query << "
WHERE posts.post_status = '#{status.to_s}'"
options[:status][1..-1].each do |status|
posts_query << " OR
posts.post_status = '#{status.to_s}'"
end
end

db[posts_query].each do |post|
process_post(post, db, options)
end
end


def self.process_post(post, db, options)
px = options[:table_prefix]

title = post[:title]
if options[:clean_entities]
title = clean_entities(title)
end

slug = post[:slug]
if !slug or slug.empty?
slug = sluggify(title)
end

date = post[:date] || Time.now
name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month,
date.day, slug]
content = post[:content].to_s
if options[:clean_entities]
content = clean_entities(content)
end

excerpt = post[:excerpt].to_s

more_index = content.index(/<!-- *more *-->/)
more_anchor = nil
if more_index
if options[:more_excerpt] and
(post[:excerpt].nil? or post[:excerpt].empty?)
excerpt = content[0...more_index]
end
if options[:more_anchor]
more_link = "more"
content.sub!(/<!-- *more *-->/,
"<a id=\"more\"></a>" +
"<a id=\"more-#{post[:id]}\"></a>")
end
end

categories = []
tags = []

if options[:categories] or options[:tags]

cquery =
"SELECT
terms.name AS `name`,
ttax.taxonomy AS `type`
FROM
#{px}terms AS `terms`,
#{px}term_relationships AS `trels`,
#{px}term_taxonomy AS `ttax`
WHERE
trels.object_id = '#{post[:id]}' AND
trels.term_taxonomy_id = ttax.term_taxonomy_id AND
terms.term_id = ttax.term_id"

db[cquery].each do |term|
if options[:categories] and term[:type] == "category"
if options[:clean_entities]
categories << clean_entities(term[:name])
else
categories << term[:name]
end
elsif options[:tags] and term[:type] == "post_tag"
if options[:clean_entities]
tags << clean_entities(term[:name])
else
tags << term[:name]
end
end
end
end

comments = []

if options[:comments] and post[:comment_count].to_i > 0
cquery =
"SELECT
comment_ID AS `id`,
comment_author AS `author`,
comment_author_email AS `author_email`,
comment_author_url AS `author_url`,
comment_date AS `date`,
comment_date_gmt AS `date_gmt`,
comment_content AS `content`
FROM #{px}comments
WHERE
comment_post_ID = '#{post[:id]}' AND
comment_approved != 'spam'"


db[cquery].each do |comment|

comcontent = comment[:content].to_s
if comcontent.respond_to?(:force_encoding)
comcontent.force_encoding("UTF-8")
end
if options[:clean_entities]
comcontent = clean_entities(comcontent)
end
comauthor = comment[:author].to_s
if options[:clean_entities]
comauthor = clean_entities(comauthor)
end

comments << {
'id' => comment[:id].to_i,
'author' => comauthor,
'author_email' => comment[:author_email].to_s,
'author_url' => comment[:author_url].to_s,
'date' => comment[:date].to_s,
'date_gmt' => comment[:date_gmt].to_s,
'content' => comcontent,
}
end

comments.sort!{ |a,b| a['id'] <=> b['id'] }
end

# Get the relevant fields as a hash, delete empty fields and
# convert to YAML for the header.
data = {
'layout' => post[:type].to_s,
'status' => post[:status].to_s,
'published' => (post[:status].to_s == "publish"),
'title' => title.to_s,
'author' => post[:author].to_s,
'author_login' => post[:author_login].to_s,
'author_email' => post[:author_email].to_s,
'author_url' => post[:author_url].to_s,
'excerpt' => excerpt,
'more_anchor' => more_anchor,
'wordpress_id' => post[:id],
'wordpress_url' => post[:guid].to_s,
'date' => date,
'categories' => options[:categories] ? categories : nil,
'tags' => options[:tags] ? tags : nil,
'comments' => options[:comments] ? comments : nil,
}.delete_if { |k,v| v.nil? || v == '' }.to_yaml

# Write out the data and content to file
File.open("_posts/#{name}", "w") do |f|
f.puts data
f.puts "---"
f.puts content
end
end


def self.clean_entities( text )
if text.respond_to?(:force_encoding)
text.force_encoding("UTF-8")
end
text = HTMLEntities.new.encode(text, :named)
# We don't want to convert these, it would break all
# HTML tags in the post and comments.
text.gsub!("&amp;", "&")
text.gsub!("&lt;", "<")
text.gsub!("&gt;", ">")
text.gsub!("&quot;", '"')
text.gsub!("&apos;", "'")
text
end


def self.sluggify( title )
begin
require 'unidecode'
title = title.to_ascii
rescue LoadError
STDERR.puts "Could not require 'unidecode'. If your post titles have non-ASCII characters, you could get nicer permalinks by installing unidecode."
end
title.downcase.gsub(/[^0-9A-Za-z]+/, " ").strip.gsub(" ", "-")
end

end
end

0 comments on commit 22159b3

Please sign in to comment.