Add nagoya-university-conversation-corpus (#168)

Fix GH-48
red-data-tools · May 8, 2023 · 706b288 · 706b288
1 parent d70b0e0
commit 706b288
Show file tree

Hide file tree

Showing 5 changed files with 269 additions and 0 deletions.
diff --git a/example/nagoya-university-conversation-corpus.rb b/example/nagoya-university-conversation-corpus.rb
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require 'datasets'
+
+nagoya_university_conversation_corpus = Datasets::NagoyaUniversityConversationCorpus.new
+
+nagoya_university_conversation_corpus.each do |data|
+  data.sentences.each do |sentence|
+    p [
+        sentence.participant_id,
+        sentence.content
+      ]
+  end
+end
diff --git a/lib/datasets/lazy.rb b/lib/datasets/lazy.rb
@@ -65,6 +65,8 @@ def const_missing(name)
   LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
   LAZY_LOADER.register(:MNIST, "datasets/mnist")
   LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
+  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
+                       "datasets/nagoya-university-conversation-corpus")
   LAZY_LOADER.register(:Penguins, "datasets/penguins")
   LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
   LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")

diff --git a/lib/datasets/nagoya-university-conversation-corpus.rb b/lib/datasets/nagoya-university-conversation-corpus.rb
@@ -0,0 +1,109 @@
+require_relative 'dataset'
+require_relative 'zip-extractor'
+
+module Datasets
+  class NagoyaUniversityConversationCorpus < Dataset
+    Data = Struct.new(
+      :name,
+      :date,
+      :place,
+      :participants,
+      :relationships,
+      :note,
+      :sentences
+    )
+
+    Participant = Struct.new(
+      :id,
+      :attribute,
+      :birthplace,
+      :residence
+    )
+
+    Sentence = Struct.new(:participant_id, :content) do
+      def end?
+        participant_id.nil? and content.nil?
+      end
+    end
+
+    def initialize
+      super()
+      @metadata.id = 'nagoya-university-conversation-curpus'
+      @metadata.name = 'Nagoya University Conversation Curpus'
+      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
+      @metadata.licenses = ['CC-BY-NC-ND-4.0']
+      @metadata.description = <<~DESCRIPTION
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |input_stream|
+        yield(parse_file(input_stream))
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + 'nucc.zip'
+      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
+      download(data_path, data_url)
+
+      extractor = ZipExtractor.new(data_path)
+      extractor.extract_files do |input_stream|
+        yield(input_stream)
+      end
+    end
+
+    def parse_file(input_stream)
+      data = Data.new
+      participants = []
+      sentences = []
+
+      input_stream.each do |input|
+        input.each_line(chomp: true) do |line|
+          line.force_encoding('utf-8')
+          if line.start_with?('＠データ')
+            data.name = line[4..]
+          elsif line.start_with?('＠収集年月日')
+            # mixed cases with and without'：'
+            data.date = line[6..].delete_prefix('：')
+          elsif line.start_with?('＠場所')
+            data.place = line[4..]
+          elsif line.start_with?('＠参加者の関係')
+            data.relationships = line.split('：', 2)[1]
+          elsif line.start_with?('＠参加者')
+            participant = Participant.new
+            participant.id, profiles = line[4..].split('：', 2)
+            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
+
+            participants << participant
+          elsif line.start_with?('％ｃｏｍ')
+            data.note = line.split('：', 2)[1]
+          elsif line == '＠ＥＮＤ'
+            sentence = Sentence.new
+            sentence.participant_id = nil
+            sentence.content = nil
+
+            sentences << sentence
+          else
+            sentence = Sentence.new
+            sentence.participant_id, sentence.content = line.split('：', 2)
+
+            sentences << sentence
+          end
+        end
+      end
+
+      data.participants = participants
+      data.sentences = sentences
+
+      data
+    end
+  end
+end
diff --git a/lib/datasets/zip-extractor.rb b/lib/datasets/zip-extractor.rb
@@ -32,5 +32,17 @@ def extract_file(file_path)
       end
       nil
     end
+
+    def extract_files
+      Zip::File.open(@path) do |zip_file|
+        zip_file.each do |entry|
+          next unless entry.file?
+
+          entry.get_input_stream do |input|
+            yield(input)
+          end
+        end
+      end
+    end
   end
 end
diff --git a/test/test-nagoya-university-conversation-corpus.rb b/test/test-nagoya-university-conversation-corpus.rb
@@ -0,0 +1,132 @@
+# Tests for Datasets::NagoyaUniversityConversationCorpus.
+#
+# Each "each" test materializes every record with @dataset.each.to_a and then
+# compares values taken from the first and the last record only.
+class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
+  def setup
+    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
+  end
+
+  sub_test_case("each") do
+    # Checks the sentence count plus the first and last sentence of the first
+    # and last records. The last sentence of each is the end marker, whose
+    # fields are both nil.
+    test("#sentences") do
+      records = @dataset.each.to_a
+      first_sentences = records[0].sentences
+      last_sentences = records[-1].sentences
+      assert_equal([
+                     856,
+                     {
+                       participant_id: 'F107',
+                       content: '＊＊＊の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても１時間ぐらいですよね。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     },
+                     603,
+                     {
+                       participant_id: 'F007',
+                       content: 'それでは話を始めまーす。'
+                     },
+                     {
+                       participant_id: nil,
+                       content: nil
+                     }
+                   ],
+                   [
+                     first_sentences.size,
+                     first_sentences[0].to_h,
+                     first_sentences[-1].to_h,
+                     last_sentences.size,
+                     last_sentences[0].to_h,
+                     last_sentences[-1].to_h,
+                   ])
+    end
+
+    # Checks the participant count plus the first and last participant of the
+    # first and last records.
+    test("#participants") do
+      records = @dataset.each.to_a
+      first_participants = records[0].participants
+      last_participants = records[-1].participants
+      assert_equal([
+                     4,
+                     {
+                       id: 'F107',
+                       attribute: '女性３０代後半',
+                       birthplace: '愛知県幡豆郡出身',
+                       residence: '愛知県幡豆郡在住'
+                     },
+                     {
+                       id: 'F128',
+                       attribute: '女性２０代前半',
+                       birthplace: '愛知県西尾市出身',
+                       residence: '西尾市在住'
+                     },
+                     2,
+                     {
+                       id: 'F007',
+                       attribute: '女性５０代後半',
+                       birthplace: '東京都出身',
+                       residence: '東京都国分寺市在住'
+                     },
+                     {
+                       id: 'F003',
+                       attribute: '女性８０代後半',
+                       birthplace: '栃木県宇都宮市出身',
+                       residence: '国分寺市在住'
+                     }
+                   ],
+                   [
+                     first_participants.size,
+                     first_participants[0].to_h,
+                     first_participants[-1].to_h,
+                     last_participants.size,
+                     last_participants[0].to_h,
+                     last_participants[-1].to_h
+                   ])
+    end
+
+    # Checks the number of records and the scalar header fields (name, date,
+    # place, relationships, note) of the first and last records. The first
+    # record is expected to have no note.
+    test("others") do
+      records = @dataset.each.to_a
+      assert_equal([
+                     129,
+                     [
+                       '１（約３５分）',
+                       '２００１年１０月１６日',
+                       'ファミリーレストラン',
+                       '英会話教室の友人',
+                       nil
+                     ],
+                     [
+                       '１２９（３６分）',
+                       '２００３年２月１６日',
+                       '二人の自宅',
+                       '母と娘',
+                       'F007は東京に３８年、F003は東京に６０年居住。'
+                    ]
+                   ],
+                   [
+                     records.size,
+                     [
+                       records[0].name,
+                       records[0].date,
+                       records[0].place,
+                       records[0].relationships,
+                       records[0].note
+                     ],
+                     [
+                       records[-1].name,
+                       records[-1].date,
+                       records[-1].place,
+                       records[-1].relationships,
+                       records[-1].note
+                     ]
+                   ])
+    end
+  end
+
+  sub_test_case("#metadata") do
+    # The description must match the heredoc below exactly.
+    test("#description") do
+      description = @dataset.metadata.description
+      assert_equal(<<~DESCRIPTION, description)
+        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
+        total about 100 hours of chatting among native speakers of Japanese,
+        which is converted into text.
+      DESCRIPTION
+    end
+  end
+end