From 581e725d1ce1ede297677c77ef23f3674c73a197 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Thu, 16 Oct 2025 17:28:58 +0900 Subject: [PATCH 01/28] Initial implementation of raw mail importer --- bin/import_mails | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100755 bin/import_mails diff --git a/bin/import_mails b/bin/import_mails new file mode 100755 index 0000000..0c966e7 --- /dev/null +++ b/bin/import_mails @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby + +require 'optparse' +require 'mail' + +BASE_DIR = Rails.root.join('tmp') + +params = {} +OptionParser.new do |opts| + opts.on('--list LIST') + opts.on('--from FROM', Integer) + opts.on('--to TO', Integer) +end.parse!(into: params) + +list = List.find_by_name(params[:list]) + +Message.transaction do + (params[:from]..params[:to]).each do |seq| + begin + str = File.binread BASE_DIR.join(list.name, seq.to_s) + mail = Mail.read_from_string str + message = Message.new list_id: list.id, list_seq: seq, body: mail.body.decoded, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id + message.save! + rescue ActiveRecord::RecordNotUnique + STDERR.puts("#{list}:#{seq} already exists in Postgres") + rescue StandardError => e + STDERR.puts("failed to import #{list}:#{seq}: #{e}") + end + end +end From 352f5f557a27b421efb7ac91e2be8d344f7e7557 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Thu, 16 Oct 2025 17:59:30 +0900 Subject: [PATCH 02/28] Extract Message.from_mail to a method --- app/models/message.rb | 4 ++++ bin/import_mails | 2 +- test/models/message_test.rb | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 1f5151b..e309011 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -11,6 +11,10 @@ class Message < ApplicationRecord self.skip_time_zone_conversion_for_attributes = [:published_at] class << self + def from_mail(mail, list, list_seq) + new list_id: list.id, list_seq: list_seq, body: mail.body.decoded, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id + end + def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION)) obj = s3_client.get_object(bucket: BLADE_BUCKET_NAME, key: "#{list_name}/#{list_seq}") diff --git a/bin/import_mails b/bin/import_mails index 0c966e7..c9bcc4c 100755 --- a/bin/import_mails +++ b/bin/import_mails @@ -19,7 +19,7 @@ Message.transaction do begin str = File.binread BASE_DIR.join(list.name, seq.to_s) mail = Mail.read_from_string str - message = Message.new list_id: list.id, list_seq: seq, body: mail.body.decoded, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id + message = Message.from_mail mail, list, seq message.save! rescue ActiveRecord::RecordNotUnique STDERR.puts("#{list}:#{seq} already exists in Postgres") diff --git a/test/models/message_test.rb b/test/models/message_test.rb index ce97450..02c1f5c 100644 --- a/test/models/message_test.rb +++ b/test/models/message_test.rb @@ -1,6 +1,21 @@ require "test_helper" class MessageTest < ActiveSupport::TestCase + test 'from_mail' do + mail = Mail.read_from_string(< Date: Thu, 16 Oct 2025 18:05:54 +0900 Subject: [PATCH 03/28] Mail file might not exist or might be a blank file --- bin/import_mails | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/import_mails b/bin/import_mails index c9bcc4c..0cb8cfd 100755 --- a/bin/import_mails +++ b/bin/import_mails @@ -17,7 +17,12 @@ list = List.find_by_name(params[:list]) Message.transaction do (params[:from]..params[:to]).each do |seq| begin - str = File.binread BASE_DIR.join(list.name, seq.to_s) + filepath = BASE_DIR.join(list.name, seq.to_s) + next unless filepath.exist? + + str = File.binread filepath + next if str.blank? + mail = Mail.read_from_string str message = Message.from_mail mail, list, seq message.save! From 0ad4d14962c7c5ed0936b4074015e6f19a1c9a06 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Thu, 16 Oct 2025 20:17:34 +0900 Subject: [PATCH 04/28] Let's assume that all mail body are encoded in ISO-2022-JP and let's convert all of them to UTF-8 --- app/models/message.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index e309011..43b461d 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -12,7 +12,8 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) - new list_id: list.id, list_seq: list_seq, body: mail.body.decoded, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id + body = mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP + new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id end def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION)) From 4e1c01ec01eaab1355cacb3cd39c82a6dca70527 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Thu, 16 Oct 2025 20:24:14 +0900 Subject: [PATCH 05/28] Use unparsed From value because so far we're just showing them on HTML, not machine-processing --- app/models/message.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 43b461d..4f11091 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -13,7 +13,8 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) body = mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP - new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: mail.from, published_at: mail.date, message_id_header: mail.message_id + from = mail.from_address.decoded + new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: from, published_at: mail.date, message_id_header: mail.message_id end def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION)) From 4e62345aa1317614673a1b2b341ea6e13e0b94cb Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Thu, 16 Oct 2025 20:59:01 +0900 Subject: [PATCH 06/28] Mail body may sometimes be broken, but let's proceed anyway e.g. ruby-dev 1553, 2320, 2321, 2322, 4361, etc. --- app/models/message.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 4f11091..e52366e 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -12,7 +12,12 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) - body = mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP + body = begin + mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP + rescue Encoding::InvalidByteSequenceError + mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP, invalid: :replace, undef: :replace + end + from = mail.from_address.decoded new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: from, published_at: mail.date, message_id_header: mail.message_id end From a4d60668fcb25abe184a8ec740b418a7f8e584be Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 05:50:43 +0900 Subject: [PATCH 07/28] Fall back to Mail's default encoding handling for some ruby-dev mails, e.g. 2320 --- app/models/message.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index e52366e..0faab0d 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -16,6 +16,8 @@ def from_mail(mail, list, list_seq) mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP rescue Encoding::InvalidByteSequenceError mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP, invalid: :replace, undef: :replace + rescue Encoding::UndefinedConversionError + mail.decoded end from = mail.from_address.decoded From 1e1122c3d89f7984b84b19afc330673b51d27045 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 06:49:56 +0900 Subject: [PATCH 08/28] Turned out that it's hard to do this without Kconv... e.g. ruby-dev 2655 --- app/models/message.rb | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/app/models/message.rb b/app/models/message.rb index 0faab0d..fd7d778 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -12,14 +12,7 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) - body = begin - mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP - rescue Encoding::InvalidByteSequenceError - mail.body.decoded.encode Encoding::UTF_8, Encoding::ISO_2022_JP, invalid: :replace, undef: :replace - rescue Encoding::UndefinedConversionError - mail.decoded - end - + body = Kconv.toutf8 mail.body.raw_source from = mail.from_address.decoded new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: from, published_at: mail.date, message_id_header: mail.message_id end From 6365a1ff48ee33145b7648a2e1d14a74dd333b4f Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 06:51:14 +0900 Subject: [PATCH 09/28] Decode subject --- app/models/message.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index fd7d778..0189d84 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -13,8 +13,9 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) body = Kconv.toutf8 mail.body.raw_source + subject = Kconv.toutf8 mail.subject from = mail.from_address.decoded - new list_id: list.id, list_seq: list_seq, body: body, subject: mail.subject, from: from, published_at: mail.date, message_id_header: mail.message_id + new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: mail.message_id end def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION)) From 5e4ea8de74f9779f7a3670cf65794b90ae3b592f Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 08:00:47 +0900 Subject: [PATCH 10/28] Retrieve parent message-id from in-reply-to, references header --- app/models/message.rb | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 0189d84..000cec2 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -15,7 +15,24 @@ def from_mail(mail, list, list_seq) body = Kconv.toutf8 mail.body.raw_source subject = Kconv.toutf8 mail.subject from = mail.from_address.decoded - new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: mail.message_id + + # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value + parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) + parent_message = Message.find_by message_id_header: parent_message_id if parent_message_id + if !parent_message && (String === mail.references) + parent_message = Message.find_by message_id_header: mail.references + end + if !parent_message && (Array === mail.references) + mail.references.compact.each do |ref| + break if (parent_message = Message.find_by message_id_header: ref) + end + end + + new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: mail.message_id, parent_id: parent_message&.id + end + + private def extract_message_id_from_in_reply_to(header) + header && header.strip.scan(/<([^>]+)>/).flatten.first end def from_s3(list_name, list_seq, s3_client = Aws::S3::Client.new(region: BLADE_BUCKET_REGION)) From 4110f59ed355fb55bfba502a227938a82e4d6d0b Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 01:15:29 +0900 Subject: [PATCH 11/28] rails g migration add_index_messages_message_id_header --- .../20251017161507_add_index_messages_message_id_header.rb | 4 ++++ db/schema.rb | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 db/migrate/20251017161507_add_index_messages_message_id_header.rb diff --git a/db/migrate/20251017161507_add_index_messages_message_id_header.rb b/db/migrate/20251017161507_add_index_messages_message_id_header.rb new file mode 100644 index 0000000..b92d7d6 --- /dev/null +++ b/db/migrate/20251017161507_add_index_messages_message_id_header.rb @@ -0,0 +1,4 @@ +class AddIndexMessagesMessageIdHeader < ActiveRecord::Migration[8.0] + def change + end +end diff --git a/db/schema.rb b/db/schema.rb index 4ebda35..9b743e7 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.0].define(version: 2025_10_10_175060) do +ActiveRecord::Schema[8.0].define(version: 2025_10_17_161507) do # These are extensions that must be enabled in order to support this database enable_extension "pg_catalog.plpgsql" enable_extension "pg_trgm" From ca6f1508ca50344f5dee17d7d25110934524f129 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 19:27:53 +0900 Subject: [PATCH 12/28] Workaround "string contains null byte" on ruby-dev: 13859 --- app/models/message.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index 000cec2..2a03c66 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -13,6 +13,9 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) body = Kconv.toutf8 mail.body.raw_source + if (list.name == 'ruby-dev') && (list_seq == 13859) + body.gsub!("\u0000", '') + end subject = Kconv.toutf8 mail.subject from = mail.from_address.decoded From 7281ed409e30dde9ffb51e5afb6889757999f87c Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 00:23:03 +0900 Subject: [PATCH 13/28] Some more null bytes --- app/models/message.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 2a03c66..8f7852a 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -13,7 +13,7 @@ class Message < ApplicationRecord class << self def from_mail(mail, list, list_seq) body = Kconv.toutf8 mail.body.raw_source - if (list.name == 'ruby-dev') && (list_seq == 13859) + if ((list.name == 'ruby-dev') && list_seq.in?([13859, 26229, 39731, 39734])) || ((list.name == 'ruby-core') && list_seq.in?([5231])) || ((list.name == 'ruby-list') && list_seq.in?([29637, 29711, 30148])) || ((list.name == 'ruby-talk') && list_seq.in?([5198, 61316])) body.gsub!("\u0000", '') end subject = Kconv.toutf8 mail.subject From 7feef359fe20aaa18965d42f509e8eae66e86c17 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Fri, 17 Oct 2025 21:43:02 +0900 Subject: [PATCH 14/28] Properly encode from to UTF-8 this still warns "Encoding conversion failed code converter not found (ISO-2022-JP-2 to UTF-8)" when fetching `from` from mail, but it seems like it's working anyway --- app/models/message.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index 8f7852a..e8395ba 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -18,6 +18,9 @@ def from_mail(mail, list, list_seq) end subject = Kconv.toutf8 mail.subject from = mail.from_address.decoded + if (list.name == 'ruby-dev') && (list_seq == 13859) + from = Kconv.toutf8 from + end # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) From e9d9a7719248a44c6c758eb741ea9235fc5716e5 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 00:58:29 +0900 Subject: [PATCH 15/28] Mail#from_address can be nil --- app/models/message.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index e8395ba..51fd928 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -17,7 +17,7 @@ def from_mail(mail, list, list_seq) body.gsub!("\u0000", '') end subject = Kconv.toutf8 mail.subject - from = mail.from_address.decoded + from = mail.from_address&.decoded if (list.name == 'ruby-dev') && (list_seq == 13859) from = Kconv.toutf8 from end From 5b5a144ef0c33f69169a4a08cd9438b0a9547511 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 01:00:49 +0900 Subject: [PATCH 16/28] Work around ruby-core: 161 mojibake --- app/models/message.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index 51fd928..721db68 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -18,6 +18,9 @@ def from_mail(mail, list, list_seq) end subject = Kconv.toutf8 mail.subject from = mail.from_address&.decoded + if !from && (list.name == 'ruby-core') && (list_seq == 161) + from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R + end if (list.name == 'ruby-dev') && (list_seq == 13859) from = Kconv.toutf8 from end From d5ffd4a0d96c96d90fa64bc6f9852765a17e53d3 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 01:06:55 +0900 Subject: [PATCH 17/28] There can be emails without a subject e.g. ruby-core: 290 --- app/models/message.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 721db68..aa20131 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -16,7 +16,8 @@ def from_mail(mail, list, list_seq) if ((list.name == 'ruby-dev') && list_seq.in?([13859, 26229, 39731, 39734])) || ((list.name == 'ruby-core') && list_seq.in?([5231])) || ((list.name == 'ruby-list') && list_seq.in?([29637, 29711, 30148])) || ((list.name == 'ruby-talk') && list_seq.in?([5198, 61316])) body.gsub!("\u0000", '') end - subject = Kconv.toutf8 mail.subject + subject = mail.subject + subject = Kconv.toutf8 subject if subject from = mail.from_address&.decoded if !from && (list.name == 'ruby-core') && (list_seq == 161) from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R From 59f4ee8e982b3b6958d3e95d92f34bb9fcdf51c3 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Tue, 21 Oct 2025 11:09:12 +0900 Subject: [PATCH 18/28] Kconv does everything almost properly, indeed --- app/models/message.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/app/models/message.rb b/app/models/message.rb index aa20131..7080ad4 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -18,13 +18,10 @@ def from_mail(mail, list, list_seq) end subject = mail.subject subject = Kconv.toutf8 subject if subject - from = mail.from_address&.decoded + from = Kconv.toutf8 mail.from_address.raw if !from && (list.name == 'ruby-core') && (list_seq == 161) from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R end - if (list.name == 'ruby-dev') && (list_seq == 13859) - from = Kconv.toutf8 from - end # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) From ff38b43b048b0dd16bc0f22e689e1da706cdc0ed Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Tue, 21 Oct 2025 11:09:41 +0900 Subject: [PATCH 19/28] Mail#from_address could return nil e.g. ruby-core: 161 --- app/models/message.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 7080ad4..5bae69a 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -18,7 +18,7 @@ def from_mail(mail, list, list_seq) end subject = mail.subject subject = Kconv.toutf8 subject if subject - from = Kconv.toutf8 mail.from_address.raw + from = Kconv.toutf8 mail.from_address&.raw if !from && (list.name == 'ruby-core') && (list_seq == 161) from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R end From 6ecace8df6f9b5d4b1f22f783e7609c0a363ec4a Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 19:46:41 +0900 Subject: [PATCH 20/28] subject can be nil --- app/helpers/messages_helper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/helpers/messages_helper.rb b/app/helpers/messages_helper.rb index 94ef84a..66beb1e 100644 --- a/app/helpers/messages_helper.rb +++ b/app/helpers/messages_helper.rb @@ -1,6 +1,6 @@ module MessagesHelper def without_list_prefix(subject) - subject.sub(/^\[.+?\]\s*/, '') + subject&.sub(/^\[.+?\]\s*/, '') end MARGIN = 50 From ff5ff6aae3e2e36b4b11639dc4d029b8585cfe44 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 21:05:15 +0900 Subject: [PATCH 21/28] Work around "Encoding::CompatibilityError: incompatible character encodings: UTF-8 and BINARY (ASCII-8BIT)" on ruby-list: 37565, 38116, 43106 --- app/models/message.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index 5bae69a..8adddde 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -16,6 +16,9 @@ def from_mail(mail, list, list_seq) if ((list.name == 'ruby-dev') && list_seq.in?([13859, 26229, 39731, 39734])) || ((list.name == 'ruby-core') && list_seq.in?([5231])) || ((list.name == 'ruby-list') && list_seq.in?([29637, 29711, 30148])) || ((list.name == 'ruby-talk') && list_seq.in?([5198, 61316])) body.gsub!("\u0000", '') end + if (list.name == 'ruby-list') && list_seq.in?([37565, 38116, 43106]) + mail.header[:subject].value.chop! + end subject = mail.subject subject = Kconv.toutf8 subject if subject from = Kconv.toutf8 mail.from_address&.raw From 12e446ad45bc5bceb43cc5c58d8cf1812254191c Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sat, 18 Oct 2025 21:39:57 +0900 Subject: [PATCH 22/28] Workaround broken subject on ruby-list: 41850, 43710 --- app/models/message.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/models/message.rb b/app/models/message.rb index 8adddde..aff91cf 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -19,6 +19,9 @@ def from_mail(mail, list, list_seq) if (list.name == 'ruby-list') && list_seq.in?([37565, 38116, 43106]) mail.header[:subject].value.chop! end + if (list.name == 'ruby-list') && (list_seq.in?([41850, 43710])) + mail.header[:subject].value = Kconv.toutf8 mail.header[:subject].value + end subject = mail.subject subject = Kconv.toutf8 subject if subject from = Kconv.toutf8 mail.from_address&.raw From ebf615240e1189d38af591e7bde4e2f506a702be Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Sun, 19 Oct 2025 02:11:58 +0900 Subject: [PATCH 23/28] message_id could include a broken mojibake char e.g. ruby-talk: 10751 --- app/models/message.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index aff91cf..54e779f 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -29,6 +29,8 @@ def from_mail(mail, list, list_seq) from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R end + message_id = mail.message_id.encode Encoding::UTF_8, invalid: :replace, undef: :replace + # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) parent_message = Message.find_by message_id_header: parent_message_id if parent_message_id @@ -41,7 +43,7 @@ def from_mail(mail, list, list_seq) end end - new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: mail.message_id, parent_id: parent_message&.id + new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: message_id, parent_id: parent_message&.id end private def extract_message_id_from_in_reply_to(header) From 2fed22122277d44b141a69103fd14e438b64d66c Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Mon, 20 Oct 2025 16:12:15 +0900 Subject: [PATCH 24/28] message-id can be nil e.g. ruby-dev: 93-108 --- app/models/message.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/models/message.rb b/app/models/message.rb index 54e779f..8927944 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -29,7 +29,7 @@ def from_mail(mail, list, list_seq) from = mail.from.encode Encoding::UTF_8, Encoding::KOI8_R end - message_id = mail.message_id.encode Encoding::UTF_8, invalid: :replace, undef: :replace + message_id = mail.message_id&.encode Encoding::UTF_8, invalid: :replace, undef: :replace # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) From 48bee57b80d07e6a370eed7826f211aac002a3fc Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Mon, 20 Oct 2025 06:05:13 +0900 Subject: [PATCH 25/28] Perf improvement by not creating AR objects --- app/models/message.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/app/models/message.rb b/app/models/message.rb index 8927944..c6d051f 100644 --- a/app/models/message.rb +++ b/app/models/message.rb @@ -32,18 +32,18 @@ def from_mail(mail, list, list_seq) message_id = mail.message_id&.encode Encoding::UTF_8, invalid: :replace, undef: :replace # mail.in_reply_to returns strange Array object in some cases (?), so let's use the raw value - parent_message_id = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) - parent_message = Message.find_by message_id_header: parent_message_id if parent_message_id - if !parent_message && (String === mail.references) - parent_message = Message.find_by message_id_header: mail.references + parent_message_id_header = extract_message_id_from_in_reply_to(mail.header[:in_reply_to]&.value) + parent_message_id = Message.where(message_id_header: parent_message_id_header).pick(:id) if parent_message_id_header + if !parent_message_id && (String === mail.references) + parent_message_id = Message.where(message_id_header: mail.references).pick(:id) end - if !parent_message && (Array === mail.references) + if !parent_message_id && (Array === mail.references) mail.references.compact.each do |ref| - break if (parent_message = Message.find_by message_id_header: ref) + break if (parent_message_id = Message.where(message_id_header: ref).pick(:id)) end end - new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: message_id, parent_id: parent_message&.id + new list_id: list.id, list_seq: list_seq, body: body, subject: subject, from: from, published_at: mail.date, message_id_header: message_id, parent_id: parent_message_id end private def extract_message_id_from_in_reply_to(header) From 2d44e5c4d60b8b431c83f47b4d3ee20a0c56b029 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Mon, 20 Oct 2025 06:07:40 +0900 Subject: [PATCH 26/28] Report errors at once --- bin/import_mails | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/import_mails b/bin/import_mails index 0cb8cfd..515fbfb 100755 --- a/bin/import_mails +++ b/bin/import_mails @@ -14,6 +14,8 @@ end.parse!(into: params) list = List.find_by_name(params[:list]) +errors = [] + Message.transaction do (params[:from]..params[:to]).each do |seq| begin @@ -29,7 +31,10 @@ Message.transaction do rescue ActiveRecord::RecordNotUnique STDERR.puts("#{list}:#{seq} already exists in Postgres") rescue StandardError => e + errors << [seq, e] STDERR.puts("failed to import #{list}:#{seq}: #{e}") end end end + +pp errors if errors.any? From c92551ae41e7067ed48f632420f7cad8af366422 Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Mon, 20 Oct 2025 06:29:04 +0900 Subject: [PATCH 27/28] Perf improvement by not logging debug logs --- bin/import_mails | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/import_mails b/bin/import_mails index 515fbfb..a5e2a2f 100755 --- a/bin/import_mails +++ b/bin/import_mails @@ -16,6 +16,8 @@ list = List.find_by_name(params[:list]) errors = [] +Rails.logger.level = Logger::INFO + Message.transaction do (params[:from]..params[:to]).each do |seq| begin From 7fdc6583c66c3d68b97da0cccd9630337389f4ac Mon Sep 17 00:00:00 2001 From: Akira Matsuda Date: Tue, 21 Oct 2025 11:38:10 +0900 Subject: [PATCH 28/28] Mail converts linebreaks to CRLF --- test/models/message_test.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/models/message_test.rb b/test/models/message_test.rb index 02c1f5c..58b647d 100644 --- a/test/models/message_test.rb +++ b/test/models/message_test.rb @@ -9,9 +9,8 @@ class MessageTest < ActiveSupport::TestCase Hello, world! END_OF_BODY - m = Message.from_mail(mail, List.find_by_name('ruby-list'), 1) - assert_equal "Hello, world!\n", m.body + assert_equal "Hello, world!\r\n", m.body assert_equal DateTime.parse('2005-12-15T19:32:40+09:00'), m.published_at end