Add nagoya-university-conversation-corpus #168

Merged
14 changes: 14 additions & 0 deletions example/nagoya-university-conversation-corpus.rb
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require 'datasets'

nagoya_university_conversation_corpus = Datasets::NagoyaUniversityConversationCorpus.new

nagoya_university_conversation_corpus.each do |data|
  data.sentences.each do |sentence|
    p [
      sentence.participant_id,
      sentence.content
    ]
  end
end
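For readers skimming the diff: the example above prints a [participant_id, content] pair for every sentence of every conversation. Below is a minimal companion sketch, not part of this pull request, that collects the utterances of a single participant using only the accessors added here ('F107' is taken from the test fixtures further down; any participant ID works):

require 'datasets'

corpus = Datasets::NagoyaUniversityConversationCorpus.new

# Gather every utterance spoken by participant 'F107' across all conversations.
# Sentences with a nil participant_id are the '@END' markers; the filter drops them.
utterances = corpus.each.flat_map do |conversation|
  conversation.sentences.select { |sentence| sentence.participant_id == 'F107' }
end

utterances.each do |sentence|
  puts sentence.content
end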
2 changes: 2 additions & 0 deletions lib/datasets/lazy.rb
@@ -65,6 +65,8 @@ def const_missing(name)
  LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
  LAZY_LOADER.register(:MNIST, "datasets/mnist")
  LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
  LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
                       "datasets/nagoya-university-conversation-corpus")
  LAZY_LOADER.register(:Penguins, "datasets/penguins")
  LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
  LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
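The registration above only records the constant name and its feature path; the file itself is not required until the constant is first referenced, at which point the const_missing hook shown in the hunk context loads it. A tiny illustrative session, assuming the gem is installed and required as 'datasets':

require 'datasets'

# lib/datasets/nagoya-university-conversation-corpus.rb has not been loaded yet;
# referencing the constant goes through const_missing, which requires the
# registered feature on demand and then resolves the constant.
corpus = Datasets::NagoyaUniversityConversationCorpus.new
p corpus.metadata.name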
109 changes: 109 additions & 0 deletions lib/datasets/nagoya-university-conversation-corpus.rb
@@ -0,0 +1,109 @@
require_relative 'dataset'
require_relative 'zip-extractor'

module Datasets
  class NagoyaUniversityConversationCorpus < Dataset
    Data = Struct.new(
      :name,
      :date,
      :place,
      :participants,
      :relationships,
      :note,
      :sentences
    )

    Participant = Struct.new(
      :id,
      :attribute,
      :birthplace,
      :residence
    )

    Sentence = Struct.new(:participant_id, :content) do
      def end?
        participant_id.nil? && content.nil?
      end
    end

    def initialize
      super()
      @metadata.id = 'nagoya-university-conversation-corpus'
      @metadata.name = 'Nagoya University Conversation Corpus'
      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
      @metadata.licenses = ['CC-BY-NC-ND-4.0']
      @metadata.description = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end

    def each
      return to_enum(__method__) unless block_given?

      open_data do |input_stream|
        yield(parse_file(input_stream))
      end
    end

    private

    def open_data
      data_path = cache_dir_path + 'nucc.zip'
      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
      download(data_path, data_url)

      extractor = ZipExtractor.new(data_path)
      extractor.extract_files do |input_stream|
        yield(input_stream)
      end
    end

    def parse_file(input_stream)
      data = Data.new
      participants = []
      sentences = []

      input_stream.each do |input|
        input.each_line(chomp: true) do |line|
          line.force_encoding('utf-8')
          if line.start_with?('@データ')
            data.name = line[4..]
          elsif line.start_with?('@収集年月日')
            # Some files have ':' after this tag and some do not.
            data.date = line[6..].delete_prefix(':')
          elsif line.start_with?('@場所')
            data.place = line[4..]
          elsif line.start_with?('@参加者の関係')
            data.relationships = line.split(':', 2)[1]
          elsif line.start_with?('@参加者')
            participant = Participant.new
            participant.id, profiles = line[4..].split(':', 2)
            participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)

            participants << participant
          elsif line.start_with?('%com')
            data.note = line.split(':', 2)[1]
          elsif line == '@END'
            # '@END' marks the end of a conversation; store it as an empty
            # sentinel sentence so that Sentence#end? returns true for it.
            sentence = Sentence.new
            sentence.participant_id = nil
            sentence.content = nil

            sentences << sentence
          else
            sentence = Sentence.new
            sentence.participant_id, sentence.content = line.split(':', 2)

            sentences << sentence
          end
        end
      end

      data.participants = participants
      data.sentences = sentences

      data
    end
  end
end
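Each archive entry parses into one Data record whose sentences array gains a sentinel Sentence (both fields nil) whenever an '@END' line is encountered; Sentence#end? exists to detect that marker. A hedged sketch, not part of the change, of how a caller might use it to tally utterances per participant (variable names are illustrative):

require 'datasets'

corpus = Datasets::NagoyaUniversityConversationCorpus.new

corpus.each do |conversation|
  # Skip the '@END' sentinel entries and count utterances per participant.
  counts = Hash.new(0)
  conversation.sentences.reject(&:end?).each do |sentence|
    counts[sentence.participant_id] += 1
  end
  p [conversation.name, counts]
end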
12 changes: 12 additions & 0 deletions lib/datasets/zip-extractor.rb
@@ -32,5 +32,17 @@ def extract_file(file_path)
      end
      nil
    end

    def extract_files
      Zip::File.open(@path) do |zip_file|
        zip_file.each do |entry|
          next unless entry.file?

          entry.get_input_stream do |input|
            yield(input)
          end
        end
      end
    end
  end
end
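Unlike the existing extract_file, which looks up a single named entry, the new extract_files yields an input stream for every regular file entry in the archive, which is what lets the corpus class above parse each conversation file in turn. A minimal direct-use sketch (the local nucc.zip path is illustrative, and this assumes zip-extractor.rb can be required on its own):

require 'datasets/zip-extractor'

extractor = Datasets::ZipExtractor.new('nucc.zip')
extractor.extract_files do |input|
  # The block runs once per file entry; read just the first line of each here.
  p input.gets
end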
132 changes: 132 additions & 0 deletions test/test-nagoya-university-conversation-corpus.rb
@@ -0,0 +1,132 @@
class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
  end

  sub_test_case("each") do
    test("#sentences") do
      records = @dataset.each.to_a
      first_sentences = records[0].sentences
      last_sentences = records[-1].sentences
      assert_equal([
                     856,
                     {
                       participant_id: 'F107',
                       content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
                     },
                     {
                       participant_id: nil,
                       content: nil
                     },
                     603,
                     {
                       participant_id: 'F007',
                       content: 'それでは話を始めまーす。'
                     },
                     {
                       participant_id: nil,
                       content: nil
                     }
                   ],
                   [
                     first_sentences.size,
                     first_sentences[0].to_h,
                     first_sentences[-1].to_h,
                     last_sentences.size,
                     last_sentences[0].to_h,
                     last_sentences[-1].to_h,
                   ])
    end

    test("#participants") do
      records = @dataset.each.to_a
      first_participants = records[0].participants
      last_participants = records[-1].participants
      assert_equal([
                     4,
                     {
                       id: 'F107',
                       attribute: '女性30代後半',
                       birthplace: '愛知県幡豆郡出身',
                       residence: '愛知県幡豆郡在住'
                     },
                     {
                       id: 'F128',
                       attribute: '女性20代前半',
                       birthplace: '愛知県西尾市出身',
                       residence: '西尾市在住'
                     },
                     2,
                     {
                       id: 'F007',
                       attribute: '女性50代後半',
                       birthplace: '東京都出身',
                       residence: '東京都国分寺市在住'
                     },
                     {
                       id: 'F003',
                       attribute: '女性80代後半',
                       birthplace: '栃木県宇都宮市出身',
                       residence: '国分寺市在住'
                     }
                   ],
                   [
                     first_participants.size,
                     first_participants[0].to_h,
                     first_participants[-1].to_h,
                     last_participants.size,
                     last_participants[0].to_h,
                     last_participants[-1].to_h
                   ])
    end

    test("others") do
      records = @dataset.each.to_a
      assert_equal([
                     129,
                     [
                       '1(約35分)',
                       '2001年10月16日',
                       'ファミリーレストラン',
                       '英会話教室の友人',
                       nil
                     ],
                     [
                       '129(36分)',
                       '2003年2月16日',
                       '二人の自宅',
                       '母と娘',
                       'F007は東京に38年、F003は東京に60年居住。'
                     ]
                   ],
                   [
                     records.size,
                     [
                       records[0].name,
                       records[0].date,
                       records[0].place,
                       records[0].relationships,
                       records[0].note
                     ],
                     [
                       records[-1].name,
                       records[-1].date,
                       records[-1].place,
                       records[-1].relationships,
                       records[-1].note
                     ]
                   ])
    end
  end

  sub_test_case("#metadata") do
    test("#description") do
      description = @dataset.metadata.description
      assert_equal(<<~DESCRIPTION, description)
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end
  end
end