-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add nagoya-university-conversation-corpus (#168)
Fix GH-48
- Loading branch information
1 parent
d70b0e0
commit 706b288
Showing
5 changed files
with
269 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env ruby | ||
|
||
require 'datasets' | ||
|
||
# Walk every conversation in the corpus and dump each utterance as a
# [participant_id, content] pair.
corpus = Datasets::NagoyaUniversityConversationCorpus.new

corpus.each do |conversation|
  conversation.sentences.each do |utterance|
    pair = [utterance.participant_id, utterance.content]
    p pair
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
require_relative 'dataset' | ||
require_relative 'zip-extractor' | ||
|
||
module Datasets
  # Reader for the "Nagoya University Conversation Corpus" (NUCC).
  #
  # The corpus is fetched as a single zip archive. #each yields one Data
  # record for every input stream that ZipExtractor#extract_files hands back.
  class NagoyaUniversityConversationCorpus < Dataset
    # One parsed corpus file: the header fields plus the collected
    # participants and sentences.
    Data = Struct.new(
      :name,
      :date,
      :place,
      :participants,
      :relationships,
      :note,
      :sentences
    )

    # A speaker declared by an "@参加者" header line.
    Participant = Struct.new(
      :id,
      :attribute,
      :birthplace,
      :residence
    )

    # One body line, split into the speaker id and what was said.
    Sentence = Struct.new(:participant_id, :content) do
      # @return [Boolean] true when both fields are nil, which is the state
      #   recorded when an "@END" line is read.
      def end?
        participant_id.nil? && content.nil?
      end
    end

    def initialize
      super()
      # "corpus" used to be misspelled "curpus" in both of these fields.
      # NOTE(review): if the base class derives the cache directory from the
      # id, an archive cached under the old spelling will be downloaded
      # again -- confirm this is acceptable.
      @metadata.id = 'nagoya-university-conversation-corpus'
      @metadata.name = 'Nagoya University Conversation Corpus'
      @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
      @metadata.licenses = ['CC-BY-NC-ND-4.0']
      @metadata.description = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION
    end

    # Iterates over the corpus, one parsed record per extracted file.
    #
    # @yieldparam data [Data] the parsed record
    # @return [Enumerator] when called without a block
    def each
      return to_enum(__method__) unless block_given?

      open_data do |input_stream|
        yield(parse_file(input_stream))
      end
    end

    private

    # Downloads nucc.zip into the cache directory and yields every input
    # stream produced by ZipExtractor#extract_files.
    def open_data
      data_path = cache_dir_path + 'nucc.zip'
      data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
      download(data_path, data_url)

      extractor = ZipExtractor.new(data_path)
      extractor.extract_files do |input_stream|
        yield(input_stream)
      end
    end

    # Parses one corpus file into a Data record.
    #
    # Lines starting with "@" or "%com" fill header fields; "@END" appends
    # an empty Sentence (see Sentence#end?); every other line is split at
    # the first ':' into participant_id and content.
    #
    # @param input_stream [#each] yields the chunks of text of one file
    # @return [Data]
    def parse_file(input_stream)
      data = Data.new
      participants = []
      sentences = []

      input_stream.each do |input|
        # Each yielded chunk is re-split with chomp so that line
        # terminators never end up in the parsed values.
        input.each_line(chomp: true) do |line|
          line.force_encoding('utf-8')
          if line.start_with?('@データ')
            data.name = line[4..]
          elsif line.start_with?('@収集年月日')
            # mixed cases with and without ':'
            data.date = line[6..].delete_prefix(':')
          elsif line.start_with?('@場所')
            data.place = line[4..]
          elsif line.start_with?('@参加者の関係')
            # Must be tested before '@参加者', which is a prefix of it.
            data.relationships = line.split(':', 2)[1]
          elsif line.start_with?('@参加者')
            participants << parse_participant(line)
          elsif line.start_with?('%com')
            data.note = line.split(':', 2)[1]
          elsif line == '@END'
            sentences << Sentence.new(nil, nil)
          else
            # Missing pieces (no ':' or an empty line) are left as nil.
            sentences << Sentence.new(*line.split(':', 2))
          end
        end
      end

      data.participants = participants
      data.sentences = sentences

      data
    end

    # Builds a Participant from an "@参加者<id>:<attribute>、<birthplace>、<residence>"
    # line. A line with no ':' separator yields a Participant whose profile
    # fields are nil instead of raising NoMethodError on a nil profile.
    def parse_participant(line)
      id, profiles = line[4..].split(':', 2)
      attribute, birthplace, residence = profiles.to_s.split('、', 3)
      Participant.new(id, attribute, birthplace, residence)
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
# Checks the records produced by Datasets::NagoyaUniversityConversationCorpus
# by sampling the first and last conversations yielded by #each.
class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
  def setup
    @dataset = Datasets::NagoyaUniversityConversationCorpus.new
  end

  sub_test_case("each") do
    # Size plus first/last sentence of the first and last conversations.
    test("#sentences") do
      conversations = @dataset.each.to_a
      head = conversations.first.sentences
      tail = conversations.last.sentences

      expected = [
        856,
        {
          participant_id: 'F107',
          content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。'
        },
        { participant_id: nil, content: nil },
        603,
        {
          participant_id: 'F007',
          content: 'それでは話を始めまーす。'
        },
        { participant_id: nil, content: nil }
      ]
      actual = [
        head.size,
        head.first.to_h,
        head.last.to_h,
        tail.size,
        tail.first.to_h,
        tail.last.to_h
      ]

      assert_equal(expected, actual)
    end

    # Count plus first/last participant of the first and last conversations.
    test("#participants") do
      conversations = @dataset.each.to_a
      head = conversations.first.participants
      tail = conversations.last.participants

      expected = [
        4,
        {
          id: 'F107',
          attribute: '女性30代後半',
          birthplace: '愛知県幡豆郡出身',
          residence: '愛知県幡豆郡在住'
        },
        {
          id: 'F128',
          attribute: '女性20代前半',
          birthplace: '愛知県西尾市出身',
          residence: '西尾市在住'
        },
        2,
        {
          id: 'F007',
          attribute: '女性50代後半',
          birthplace: '東京都出身',
          residence: '東京都国分寺市在住'
        },
        {
          id: 'F003',
          attribute: '女性80代後半',
          birthplace: '栃木県宇都宮市出身',
          residence: '国分寺市在住'
        }
      ]
      actual = [
        head.size,
        head.first.to_h,
        head.last.to_h,
        tail.size,
        tail.first.to_h,
        tail.last.to_h
      ]

      assert_equal(expected, actual)
    end

    # Record count and the header fields of the first and last conversations.
    test("others") do
      conversations = @dataset.each.to_a
      header_fields = lambda do |record|
        [
          record.name,
          record.date,
          record.place,
          record.relationships,
          record.note
        ]
      end

      expected = [
        129,
        [
          '1(約35分)',
          '2001年10月16日',
          'ファミリーレストラン',
          '英会話教室の友人',
          nil
        ],
        [
          '129(36分)',
          '2003年2月16日',
          '二人の自宅',
          '母と娘',
          'F007は東京に38年、F003は東京に60年居住。'
        ]
      ]
      actual = [
        conversations.size,
        header_fields.call(conversations.first),
        header_fields.call(conversations.last)
      ]

      assert_equal(expected, actual)
    end
  end

  sub_test_case("#metadata") do
    test("#description") do
      expected = <<~DESCRIPTION
        The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
        total about 100 hours of chatting among native speakers of Japanese,
        which is converted into text.
      DESCRIPTION

      assert_equal(expected, @dataset.metadata.description)
    end
  end
end