Skip to content

Commit

Permalink
Add nagoya-university-conversation-corpus (#168)
Browse files Browse the repository at this point in the history
Fix GH-48
  • Loading branch information
tmatsuura1 committed May 8, 2023
1 parent d70b0e0 commit 706b288
Show file tree
Hide file tree
Showing 5 changed files with 269 additions and 0 deletions.
14 changes: 14 additions & 0 deletions example/nagoya-university-conversation-corpus.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require 'datasets'

# Print every sentence of every conversation as a [participant_id, content] pair.
corpus = Datasets::NagoyaUniversityConversationCorpus.new

corpus.each do |conversation|
  conversation.sentences.each do |sentence|
    p [sentence.participant_id, sentence.content]
  end
end
2 changes: 2 additions & 0 deletions lib/datasets/lazy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def const_missing(name)
LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
LAZY_LOADER.register(:MNIST, "datasets/mnist")
LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
"datasets/nagoya-university-conversation-corpus")
LAZY_LOADER.register(:Penguins, "datasets/penguins")
LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
Expand Down
109 changes: 109 additions & 0 deletions lib/datasets/nagoya-university-conversation-corpus.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
require_relative 'dataset'
require_relative 'zip-extractor'

module Datasets
  # Dataset for the Nagoya University Conversation Corpus.
  #
  # Downloads nucc.zip, then yields one Data record per file entry found in
  # the archive. Each record carries the header fields of a conversation
  # together with its participants and sentences.
  class NagoyaUniversityConversationCorpus < Dataset
    # One parsed file: header fields plus participants and sentences.
    Data = Struct.new(
      :name,
      :date,
      :place,
      :participants,
      :relationships,
      :note,
      :sentences
    )

    # One "@参加者" header line: an ID followed by up to three profile fields.
    Participant = Struct.new(
      :id,
      :attribute,
      :birthplace,
      :residence
    )

    # One line of conversation, split at the first ':'.
    Sentence = Struct.new(:participant_id, :content) do
      # @return [Boolean] true when both fields are nil, which is how the
      #   record created for an "@END" line is represented.
      def end?
        participant_id.nil? && content.nil?
      end
    end

    def initialize
      super()
      # "corpus" was previously misspelled as "curpus" in both values.
      # NOTE(review): if the cache directory is derived from metadata.id,
      # an existing cache under the old spelling will be downloaded again.
      @metadata.id = 'nagoya-university-conversation-corpus'
      @metadata.name = 'Nagoya University Conversation Corpus'
      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
      @metadata.licenses = ['CC-BY-NC-ND-4.0']
      @metadata.description = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end

    # Yields a Data record for every file entry in the downloaded archive.
    #
    # @yieldparam data [Data] the parsed content of one archive entry.
    # @return [Enumerator] when called without a block.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input_stream|
        yield(parse_file(input_stream))
      end
    end

    private

    # Downloads nucc.zip into the cache directory and yields an input stream
    # for each file entry of the archive.
    def open_data
      data_path = cache_dir_path + 'nucc.zip'
      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
      download(data_path, data_url)

      extractor = ZipExtractor.new(data_path)
      extractor.extract_files do |input_stream|
        yield(input_stream)
      end
    end

    # Parses one archive entry into a Data record.
    #
    # Lines starting with a known "@..." or "%com" marker fill the header
    # fields; "@END" appends an all-nil Sentence; every other line is split
    # at the first ':' into participant ID and content.
    #
    # @param input_stream [#each] stream for one archive entry.
    # @return [Data]
    def parse_file(input_stream)
      data = Data.new
      participants = []
      sentences = []

      # The stream's #each presumably yields one line (terminator included)
      # at a time; String#each_line(chomp: true) then strips the terminator.
      input_stream.each do |input|
        input.each_line(chomp: true) do |line|
          line.force_encoding('utf-8')
          if line.start_with?('@データ')
            data.name = line[4..]
          elsif line.start_with?('@収集年月日')
            # The date is sometimes preceded by ':' and sometimes not.
            data.date = line[6..].delete_prefix(':')
          elsif line.start_with?('@場所')
            data.place = line[4..]
          elsif line.start_with?('@参加者の関係')
            # Must be tested before '@参加者', which is a prefix of this marker.
            data.relationships = line.split(':', 2)[1]
          elsif line.start_with?('@参加者')
            participants << parse_participant(line)
          elsif line.start_with?('%com')
            data.note = line.split(':', 2)[1]
          elsif line == '@END'
            sentences << Sentence.new(nil, nil)
          else
            participant_id, content = line.split(':', 2)
            sentences << Sentence.new(participant_id, content)
          end
        end
      end

      data.participants = participants
      data.sentences = sentences

      data
    end

    # Builds a Participant from an "@参加者" line.
    #
    # A line without the ':' separator leaves the profile fields nil
    # instead of raising NoMethodError on nil.
    def parse_participant(line)
      id, profiles = line[4..].split(':', 2)
      attribute, birthplace, residence = (profiles || '').split('、', 3)
      Participant.new(id, attribute, birthplace, residence)
    end
  end
end
12 changes: 12 additions & 0 deletions lib/datasets/zip-extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,17 @@ def extract_file(file_path)
end
nil
end

# Opens the archive at @path and yields an input stream for each entry
# for which entry.file? is true; all other entries are skipped.
def extract_files
  Zip::File.open(@path) do |archive|
    archive.each do |member|
      if member.file?
        member.get_input_stream { |stream| yield(stream) }
      end
    end
  end
end
end
end
132 changes: 132 additions & 0 deletions test/test-nagoya-university-conversation-corpus.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Tests for Datasets::NagoyaUniversityConversationCorpus: checks the first and
# last records produced by #each, and the metadata description.
class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
  end

  sub_test_case("each") do
    # Size, first sentence and last sentence of the first and last records.
    test("#sentences") do
      conversations = @dataset.each.to_a
      head = conversations.first.sentences
      tail = conversations.last.sentences
      expected = [
        856,
        {
          participant_id: 'F107',
          content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
        },
        {
          participant_id: nil,
          content: nil
        },
        603,
        {
          participant_id: 'F007',
          content: 'それでは話を始めまーす。'
        },
        {
          participant_id: nil,
          content: nil
        }
      ]
      actual = [
        head.size,
        head.first.to_h,
        head.last.to_h,
        tail.size,
        tail.first.to_h,
        tail.last.to_h
      ]
      assert_equal(expected, actual)
    end

    # Count, first participant and last participant of the first and last records.
    test("#participants") do
      conversations = @dataset.each.to_a
      head = conversations.first.participants
      tail = conversations.last.participants
      expected = [
        4,
        {
          id: 'F107',
          attribute: '女性30代後半',
          birthplace: '愛知県幡豆郡出身',
          residence: '愛知県幡豆郡在住'
        },
        {
          id: 'F128',
          attribute: '女性20代前半',
          birthplace: '愛知県西尾市出身',
          residence: '西尾市在住'
        },
        2,
        {
          id: 'F007',
          attribute: '女性50代後半',
          birthplace: '東京都出身',
          residence: '東京都国分寺市在住'
        },
        {
          id: 'F003',
          attribute: '女性80代後半',
          birthplace: '栃木県宇都宮市出身',
          residence: '国分寺市在住'
        }
      ]
      actual = [
        head.size,
        head.first.to_h,
        head.last.to_h,
        tail.size,
        tail.first.to_h,
        tail.last.to_h
      ]
      assert_equal(expected, actual)
    end

    # Record count plus the header fields of the first and last records.
    test("others") do
      conversations = @dataset.each.to_a
      header_fields = lambda do |record|
        [
          record.name,
          record.date,
          record.place,
          record.relationships,
          record.note
        ]
      end
      expected = [
        129,
        [
          '1(約35分)',
          '2001年10月16日',
          'ファミリーレストラン',
          '英会話教室の友人',
          nil
        ],
        [
          '129(36分)',
          '2003年2月16日',
          '二人の自宅',
          '母と娘',
          'F007は東京に38年、F003は東京に60年居住。'
        ]
      ]
      actual = [
        conversations.size,
        header_fields.call(conversations.first),
        header_fields.call(conversations.last)
      ]
      assert_equal(expected, actual)
    end
  end

  sub_test_case("#metadata") do
    test("#description") do
      expected = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
      assert_equal(expected, @dataset.metadata.description)
    end
  end
end

0 comments on commit 706b288

Please sign in to comment.