Merge pull request jekyll#368 from jacius/wordpress

Greatly improved and extended the wordpress.rb migrator.
sillylogger · Jan 22, 2012 · 22159b3 · 22159b3
2 parents 1292c02 + 1b2b5ff
commit 22159b3
Showing 1 changed file with 273 additions and 41 deletions.
diff --git a/lib/jekyll/migrators/wordpress.rb b/lib/jekyll/migrators/wordpress.rb
@@ -11,52 +11,284 @@
 
 module Jekyll
   module WordPress
-    def self.process(dbname, user, pass, host = 'localhost', table_prefix = 'wp_')
-      db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8')
+
+    # Main migrator function. Call this to perform the migration.
+    # 
+    # dbname::  The name of the database
+    # user::    The database user name
+    # pass::    The database user's password
+    # host::    The address of the MySQL database host. Default: 'localhost'
+    # options:: A hash table of configuration options.
+    # 
+    # Supported options are:
+    # 
+    # :table_prefix::   Prefix of database tables used by WordPress.
+    #                   Default: 'wp_'
+    # :clean_entities:: If true, convert non-ASCII characters to HTML
+    #                   entities in the posts, comments, titles, and
+    #                   names. Needs the 'htmlentities' gem; if it cannot
+    #                   be loaded, a warning is printed to STDERR and
+    #                   this option is turned off. Default: true.
+    # :comments::       If true, migrate post comments too. Comments
+    #                   are saved in the post's YAML front matter.
+    #                   Default: true.
+    # :categories::     If true, save the post's categories in its
+    #                   YAML front matter. Default: true.
+    # :tags::           If true, save the post's tags in its
+    #                   YAML front matter. Default: true.
+    # :more_excerpt::   If true, when a post has no excerpt but
+    #                   does have a <!-- more --> tag, use the
+    #                   preceding post content as the excerpt.
+    #                   Default: true.
+    # :more_anchor::    If true, convert a <!-- more --> tag into
+    #                   two HTML anchors with ids "more" and
+    #                   "more-NNN" (where NNN is the post number).
+    #                   Default: true.
+    # :status::         Array of allowed post statuses. Only
+    #                   posts with matching status will be migrated.
+    #                   Known statuses are :publish, :draft, :private,
+    #                   and :revision. If this is nil or an empty
+    #                   array, all posts are migrated regardless of
+    #                   status. Default: [:publish].
+    # 
+    def self.process(dbname, user, pass, host='localhost', options={})
+      options = {
+        :table_prefix   => 'wp_',
+        :clean_entities => true,
+        :comments       => true,
+        :categories     => true,
+        :tags           => true,
+        :more_excerpt   => true,
+        :more_anchor    => true,
+        :status         => [:publish] # :draft, :private, :revision
+      }.merge(options)
+
+      if options[:clean_entities]
+        begin
+          require 'htmlentities'
+        rescue LoadError
+          STDERR.puts "Could not require 'htmlentities', so the " +
+                      ":clean_entities option is now disabled."
+          options[:clean_entities] = false
+        end
+      end
 
       FileUtils.mkdir_p("_posts")
 
-      # Reads a MySQL database via Sequel and creates a post file for each
-      # post in wp_posts that has post_status = 'publish'. This restriction is
-      # made because 'draft' posts are not guaranteed to have valid dates.
-      query = "SELECT post_title, \
-                      post_name, \
-                      post_date, \
-                      post_content, \
-                      post_excerpt, \
-                      ID, \
-                      guid \
-               FROM #{table_prefix}posts \
-               WHERE post_status = 'publish' AND \
-                     post_type = 'post'"
-
-      db[query].each do |post|
-        # Get required fields and construct Jekyll compatible name.
-        title = post[:post_title]
-        slug = post[:post_name]
-        date = post[:post_date]
-        content = post[:post_content]
-        name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, date.day,
-                                               slug]
-
-        # Get the relevant fields as a hash, delete empty fields and convert
-        # to YAML for the header.
-        data = {
-           'layout' => 'post',
-           'title' => title.to_s,
-           'excerpt' => post[:post_excerpt].to_s,
-           'wordpress_id' => post[:ID],
-           'wordpress_url' => post[:guid],
-           'date' => date
-         }.delete_if { |k,v| v.nil? || v == '' }.to_yaml
-
-        # Write out the data and content to file
-        File.open("_posts/#{name}", "w") do |f|
-          f.puts data
-          f.puts "---"
-          f.puts content
+      db = Sequel.mysql(dbname, :user => user, :password => pass,
+                        :host => host, :encoding => 'utf8')
+
+      px = options[:table_prefix]
+
+      posts_query = "
+         SELECT
+           posts.ID            AS `id`,
+           posts.guid          AS `guid`,
+           posts.post_type     AS `type`,
+           posts.post_status   AS `status`,
+           posts.post_title    AS `title`,
+           posts.post_name     AS `slug`,
+           posts.post_date     AS `date`,
+           posts.post_content  AS `content`,
+           posts.post_excerpt  AS `excerpt`,
+           posts.comment_count AS `comment_count`,
+           users.display_name  AS `author`,
+           users.user_login    AS `author_login`,
+           users.user_email    AS `author_email`,
+           users.user_url      AS `author_url`
+         FROM #{px}posts AS `posts`
+           LEFT JOIN #{px}users AS `users`
+             ON posts.post_author = users.ID"
+
+      if options[:status] and not options[:status].empty?
+        status = options[:status][0]
+        posts_query << "
+         WHERE posts.post_status = '#{status.to_s}'"
+        options[:status][1..-1].each do |status|
+          posts_query << " OR
+           posts.post_status = '#{status.to_s}'"
+        end
+      end
+
+      db[posts_query].each do |post|
+        process_post(post, db, options)
+      end
+    end
+
+
+    def self.process_post(post, db, options)
+      px = options[:table_prefix]
+
+      title = post[:title]
+      if options[:clean_entities]
+        title = clean_entities(title)
+      end
+
+      slug = post[:slug]
+      if !slug or slug.empty?
+        slug = sluggify(title)
+      end
+
+      date = post[:date] || Time.now
+      name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month,
+                                             date.day, slug]
+      content = post[:content].to_s
+      if options[:clean_entities]
+        content = clean_entities(content)
+      end
+
+      excerpt = post[:excerpt].to_s
+
+      more_index = content.index(/<!-- *more *-->/)
+      more_anchor = nil
+      if more_index
+        if options[:more_excerpt] and
+            (post[:excerpt].nil? or post[:excerpt].empty?)
+          excerpt = content[0...more_index]
+        end
+        if options[:more_anchor]
+          more_link = "more"
+          content.sub!(/<!-- *more *-->/,
+                       "<a id=\"more\"></a>" + 
+                       "<a id=\"more-#{post[:id]}\"></a>")
+        end
+      end
+
+      categories = []
+      tags = []
+
+      if options[:categories] or options[:tags]
+
+        cquery =
+          "SELECT
+             terms.name AS `name`,
+             ttax.taxonomy AS `type`
+           FROM
+             #{px}terms AS `terms`,
+             #{px}term_relationships AS `trels`,
+             #{px}term_taxonomy AS `ttax`
+           WHERE
+             trels.object_id = '#{post[:id]}' AND
+             trels.term_taxonomy_id = ttax.term_taxonomy_id AND
+             terms.term_id = ttax.term_id"
+
+        db[cquery].each do |term|
+          if options[:categories] and term[:type] == "category"
+            if options[:clean_entities]
+              categories << clean_entities(term[:name])
+            else
+              categories << term[:name]
+            end
+          elsif options[:tags] and term[:type] == "post_tag"
+            if options[:clean_entities]
+              tags << clean_entities(term[:name])
+            else
+              tags << term[:name]
+            end
+          end
         end
       end
+
+      comments = []
+
+      if options[:comments] and post[:comment_count].to_i > 0
+        cquery =
+          "SELECT
+             comment_ID           AS `id`,
+             comment_author       AS `author`,
+             comment_author_email AS `author_email`,
+             comment_author_url   AS `author_url`,
+             comment_date         AS `date`,
+             comment_date_gmt     AS `date_gmt`,
+             comment_content      AS `content`
+           FROM #{px}comments
+           WHERE
+             comment_post_ID = '#{post[:id]}' AND
+             comment_approved != 'spam'"
+
+
+        db[cquery].each do |comment|
+
+          comcontent = comment[:content].to_s
+          if comcontent.respond_to?(:force_encoding)
+            comcontent.force_encoding("UTF-8")
+          end
+          if options[:clean_entities]
+            comcontent = clean_entities(comcontent)
+          end
+          comauthor = comment[:author].to_s
+          if options[:clean_entities]
+            comauthor = clean_entities(comauthor)
+          end
+
+          comments << {
+            'id'           => comment[:id].to_i,
+            'author'       => comauthor,
+            'author_email' => comment[:author_email].to_s,
+            'author_url'   => comment[:author_url].to_s,
+            'date'         => comment[:date].to_s,
+            'date_gmt'     => comment[:date_gmt].to_s,
+            'content'      => comcontent,
+          }
+        end
+
+        comments.sort!{ |a,b| a['id'] <=> b['id'] }
+      end
+
+      # Get the relevant fields as a hash, delete empty fields and
+      # convert to YAML for the header.
+      data = {
+        'layout'        => post[:type].to_s,
+        'status'        => post[:status].to_s,
+        'published'     => (post[:status].to_s == "publish"),
+        'title'         => title.to_s,
+        'author'        => post[:author].to_s,
+        'author_login'  => post[:author_login].to_s,
+        'author_email'  => post[:author_email].to_s,
+        'author_url'    => post[:author_url].to_s,
+        'excerpt'       => excerpt,
+        'more_anchor'   => more_anchor,
+        'wordpress_id'  => post[:id],
+        'wordpress_url' => post[:guid].to_s,
+        'date'          => date,
+        'categories'    => options[:categories] ? categories : nil,
+        'tags'          => options[:tags] ? tags : nil,
+        'comments'      => options[:comments] ? comments : nil,
+      }.delete_if { |k,v| v.nil? || v == '' }.to_yaml
+
+      # Write out the data and content to file
+      File.open("_posts/#{name}", "w") do |f|
+        f.puts data
+        f.puts "---"
+        f.puts content
+      end
+    end
+
+
+    def self.clean_entities( text )
+      if text.respond_to?(:force_encoding)
+        text.force_encoding("UTF-8")
+      end
+      text = HTMLEntities.new.encode(text, :named)
+      # We don't want these five converted: leaving them as entities
+      # would break all HTML tags in the posts and comments.
+      text.gsub!("&amp;", "&")
+      text.gsub!("&lt;", "<")
+      text.gsub!("&gt;", ">")
+      text.gsub!("&quot;", '"')
+      text.gsub!("&apos;", "'")
+      text
+    end
+
+
+    def self.sluggify( title )
+      begin
+        require 'unidecode'
+        title = title.to_ascii
+      rescue LoadError
+        STDERR.puts "Could not require 'unidecode'. If your post titles have non-ASCII characters, you could get nicer permalinks by installing unidecode."
+      end
+      title.downcase.gsub(/[^0-9A-Za-z]+/, " ").strip.gsub(" ", "-")
     end
+
   end
 end