In [3]:
import os
import sys
import glob
from os.path import basename, dirname

In [37]:
lang1 = "en"
lang2 = "ja"

ids_folder = "./06_ids"
cleaned_folder = "./07_parallel_by_ids"

test_folder = "./08_mkdataset/test"
test_lst = "./08_mkdataset/test_lst.txt"
dev_folder = "./08_mkdataset/dev"
dev_lst = "./08_mkdataset/dev_lst.txt"
train_folder = "./08_mkdataset/train"
train_lst = "./08_mkdataset/train_lst.txt"
train_large_folder = "./08_mkdataset/train_large"
train_large_lst = "./08_mkdataset/train_large_lst.txt"

test_dev_lst = "./08_mkdataset/test_dev_lst.txt"

analysis_file = "./08_mkdataset/analysis.txt"

combined_folder = "./08_mkdataset/combine"


if (os.path.exists(test_folder) == False):
    os.makedirs(test_folder)
    
if (os.path.exists(dev_folder) == False):
    os.makedirs(dev_folder)
    
if (os.path.exists(train_folder) == False):
    os.makedirs(train_folder)
    
if (os.path.exists(train_large_folder) == False):
    os.makedirs(train_large_folder)
    
if (os.path.exists(combined_folder) == False):
    os.makedirs(combined_folder)
    
test_num = 2000
dev_num = 500
train_num = 3000

In [38]:
def get_ids_file(name):
    name = name + '.ids.txt'
    name = os.path.join(ids_folder, name)
    return name


def get_cleaned_ja_file(name):
    name = name + '.ja.txt'
    name = os.path.join(cleaned_folder, name)
    return name

def get_cleaned_en_file(name):
    name = name + '.en.txt'
    name = os.path.join(cleaned_folder, name)
    return name


def get_similarity(ids_file):
    with open(ids_file, "r") as f:
        lines = f.readlines()
    tot_len = len(lines)
    if (tot_len == 0):
        print (ids_file)
        return -1
    similarity = 0
    for line in lines:
        line_similarity = float(line.strip().split()[2])
        similarity += line_similarity
    return similarity/tot_len

def get_file_len(name):
    with open(name, "r") as f:
        lines = f.readlines()
    return len(lines)

def combine(folder1, lang1, lang2, folder2, tag):
    # test_folder en ja combined_folder test
    en_names = glob.glob("{}/*.{}.txt".format(folder1, lang1))
    en_names.sort()
    ja_names = glob.glob("{}/*.{}.txt".format(folder1, lang2))
    ja_names.sort()
    en_combined_file = os.path.join(folder2, "{}.{}".format(tag, lang1))
    ja_combined_file = os.path.join(folder2, "{}.{}".format(tag, lang2))
    for en_name, ja_name in zip(en_names, ja_names):
        en_base = basename(en_name).split('.')[0]
        ja_base = basename(ja_name).split('.')[0]
        if (en_base != ja_base):
            print (en_base, ja_base)
            exit(0)
    with open(en_combined_file, "w") as en_file:
        for name in en_names:
            with open(name, "r") as f:
                lines = f.readlines()
            for line in lines:
                line = line.strip() + '\n'
                en_file.write(line)
    with open(ja_combined_file, "w") as ja_file:
        for name in ja_names:
            with open(name, "r") as f:
                lines = f.readlines()
            for line in lines:
                line = line.strip() + '\n'
                ja_file.write(line)

In [39]:
names = glob.glob("{}/*.txt".format(ids_folder))
base_names = [basename(name).split('.')[0] for name in names]
base_names.sort()

similarity_lst = []
for i, name in enumerate(base_names):
    ids_file = get_ids_file(name)
    cleaned_ja_file = get_cleaned_ja_file(name)
    similarity = get_similarity(ids_file)
    file_len = get_file_len(cleaned_ja_file)
    
    similarity_lst.append((name, file_len, similarity))
similarity_lst.sort(key = lambda x: -x[2])

In [23]:
i = 0
test_line = 0
dev_line = 0
train_line = 0
train_large_line = 0

with open(test_lst, "w") as f:
    while (test_line < test_num):
        current_file, file_len, similarity = similarity_lst[i]
        f.write(current_file+'\n')
        ja_file = get_cleaned_ja_file(current_file)
        en_file = get_cleaned_en_file(current_file)
        command = "cp {} {}".format(ja_file, test_folder)
        os.system(command)
        command = "cp {} {}".format(en_file, test_folder)
        os.system(command)

        test_line += file_len
        i += 1
        
with open(dev_lst, "w") as f:
    while (dev_line < dev_num):
        current_file, file_len, similarity = similarity_lst[i]
        f.write(current_file+'\n')
        ja_file = get_cleaned_ja_file(current_file)
        en_file = get_cleaned_en_file(current_file)
        command = "cp {} {}".format(ja_file, dev_folder)
        os.system(command)
        command = "cp {} {}".format(en_file, dev_folder)
        os.system(command)

        dev_line += file_len
        i += 1
        
        
train_position = i
        
with open(train_lst, "w") as f:
    while (train_line < train_num):
        current_file, file_len, similarity = similarity_lst[i]
        if (similarity <= 0): continue
        f.write(current_file+'\n')
        ja_file = get_cleaned_ja_file(current_file)
        en_file = get_cleaned_en_file(current_file)
        command = "cp {} {}".format(ja_file, train_folder)
        os.system(command)
        command = "cp {} {}".format(en_file, train_folder)
        os.system(command)

        train_line += file_len
        i += 1
        
i = train_position
with open(train_large_lst, "w") as f:
    while (1):
        current_file, file_len, similarity = similarity_lst[i]
        if (similarity <= 0): break
        f.write(current_file+'\n')
        ja_file = get_cleaned_ja_file(current_file)
        en_file = get_cleaned_en_file(current_file)
        command = "cp {} {}".format(ja_file, train_large_folder)
        os.system(command)
        command = "cp {} {}".format(en_file, train_large_folder)
        os.system(command)

        train_large_line += file_len
        i += 1

    
command = "cat {} {} > {}".format(test_lst, dev_lst, test_dev_lst)
os.system(command)
    
with open(analysis_file, "w") as f:
    f.write("# of test line: {}\n".format(test_line))
    f.write("# of dev line: {}\n".format(dev_line))
    f.write("# of train line: {}\n".format(train_line))
    f.write("# of train large line: {}\n".format(train_large_line))
    
combine(test_folder, lang1, lang2, combined_folder, "test")
combine(dev_folder, lang1, lang2, combined_folder, "dev")
combine(train_folder, lang1, lang2, combined_folder, "train")
combine(train_large_folder, lang1, lang2, combined_folder, "train_large")

IndexError: list index out of range

[('1252_09_shoe-tower-assignment', 1, 0.985279234734),
 ('874_09_xie-ta-zuo-ye', 1, 0.985279234734),
 ('5655_01_characteristics-of-a-good-question', 57, 0.9809870328938946),
 ('5661_02_populations-come-in-many-forms', 32, 0.9802990830447188),
 ('5653_02_stages-of-data-analysis', 6, 0.9802817838308333),
 ('5668_01_routine-communication-in-data-analysis', 46, 0.9795872515403912),
 ('5654_01_six-types-of-questions', 55, 0.9795342573041274),
 ('5669_02_making-a-data-analysis-presentation', 44, 0.9793664739384998),
 ('5651_01_what-this-course-is-about', 19, 0.9786900499246315),
 ('5662_03_inference-what-can-go-wrong', 63, 0.9785276326159523),
 ('5658_02_using-statistical-models-to-explore-your-data-part-2',
  53,
  0.9770070690408301),
 ('5659_01_exploratory-data-analysis-when-to-stop', 63, 0.976622708182508),
 ('5663_01_general-framework', 72, 0.9757482124322503),
 ('5660_01_making-inferences-from-data-introduction', 61, 0.9753806354189836),
 ('5657_01_using-statistical-models-to-explore-y

In [66]:
with open("/share03/song/NICT_dataset/coursera/coursera_test_dev_02/new_lst.txt", "w") as f:
    for line in similarity_lst:
        line = line[0].strip()+'\n'
        f.write(line)

In [63]:
ori_path = "/share03/song/coursera_test_dev/ori/dev_test"
notdone_path = "/share03/song/NICT_dataset/coursera/en-ja_raw_02/07_parallel_by_ids"
similarity_lst
test_dev_names = glob.glob("/share03/song/coursera_test_dev/ori/test/*") #+ glob.glob("/share03/song/coursera_test_dev/ori/dev/*")
def get_title(name):
    return basename(name).split('.')[0]
td_names = [get_title(name) for name in test_dev_names]
tot_line_num = 0
i = 0
while (1):
    (title, num_line, similarity) = similarity_lst[i]
    i += 1
    if title in td_names:
        print (i, title)
    else:
        print ("Not in", i, title)
    tot_line_num += int(num_line)

    print (tot_line_num)


('Not in', 1, '1252_09_shoe-tower-assignment')
1
('Not in', 2, '874_09_xie-ta-zuo-ye')
2
(3, '5655_01_characteristics-of-a-good-question')
59
(4, '5661_02_populations-come-in-many-forms')
91
(5, '5653_02_stages-of-data-analysis')
97
(6, '5668_01_routine-communication-in-data-analysis')
143
(7, '5654_01_six-types-of-questions')
198
(8, '5669_02_making-a-data-analysis-presentation')
242
(9, '5651_01_what-this-course-is-about')
261
(10, '5662_03_inference-what-can-go-wrong')
324
(11, '5658_02_using-statistical-models-to-explore-your-data-part-2')
377
(12, '5659_01_exploratory-data-analysis-when-to-stop')
440
(13, '5663_01_general-framework')
512
(14, '5660_01_making-inferences-from-data-introduction')
573
(15, '5657_01_using-statistical-models-to-explore-your-data-part-1')
722
(16, '5665_03_prediction-analyses')
820
(17, '5633_01_summary-and-thank-you')
849
(18, '1075_01_the-lazy-rule')
861
(19, '5666_01_inference-vs-prediction')
965
(20, '6252_09_3-14-summary-of-module-3-and-preview-of-m

IndexError: list index out of range

In [57]:
tot_line_num = 0
i = 0
while (1):
    (title, num_line, similarity) = similarity_lst[i]
    i += 1
    if title in td_names:
        print (i, title)
        new_name = os.path.join(ori_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/test/done/"))
        new_name = os.path.join(ori_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/test/done/"))
    else:
        print ("Not in", i, title)
        new_name = os.path.join(notdone_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/test/notdone/"))
        new_name = os.path.join(notdone_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/test/notdone/"))
    tot_line_num += int(num_line)
    if (tot_line_num > 2200):
        break
        
tot_line_num = 0
while (1):
    (title, num_line, similarity)=similarity_lst[i]
    i+=1
    if title in td_names:
        print (i, title)
        new_name = os.path.join(ori_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/dev/done/"))
        new_name = os.path.join(ori_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/dev/done/"))
    else:
        print ("Not in", i, title)
        new_name = os.path.join(notdone_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/dev/notdone/"))
        new_name = os.path.join(notdone_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/dev/notdone/"))
    tot_line_num += int(num_line)
    if (tot_line_num > 600):
        break   
        
while (1):
    (title, num_line, similarity)=similarity_lst[i]
    i+=1
    if title in td_names:
        print (i, title)
        new_name = os.path.join(ori_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/train/"))
        new_name = os.path.join(ori_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/train/"))
    else:
        print ("Not in", i, title)
        new_name = os.path.join(notdone_path, title + '.ja.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/train/"))
        new_name = os.path.join(notdone_path, title + '.en.txt')
        os.system("cp {} {}".format(new_name, "/share03/song/NICT_dataset/coursera/en-ja_raw_02/08_mkdataset/refined/train/"))
    tot_line_num += int(num_line)

('Not in', 1, '1252_09_shoe-tower-assignment')
('Not in', 2, '874_09_xie-ta-zuo-ye')
(3, '5655_01_characteristics-of-a-good-question')
(4, '5661_02_populations-come-in-many-forms')
(5, '5653_02_stages-of-data-analysis')
(6, '5668_01_routine-communication-in-data-analysis')
(7, '5654_01_six-types-of-questions')
(8, '5669_02_making-a-data-analysis-presentation')
(9, '5651_01_what-this-course-is-about')
(10, '5662_03_inference-what-can-go-wrong')
(11, '5658_02_using-statistical-models-to-explore-your-data-part-2')
(12, '5659_01_exploratory-data-analysis-when-to-stop')
(13, '5663_01_general-framework')
(14, '5660_01_making-inferences-from-data-introduction')
(15, '5657_01_using-statistical-models-to-explore-your-data-part-1')
(16, '5665_03_prediction-analyses')
(17, '5633_01_summary-and-thank-you')
(18, '1075_01_the-lazy-rule')
(19, '5666_01_inference-vs-prediction')
(20, '6252_09_3-14-summary-of-module-3-and-preview-of-module-4')
('Not in', 21, '4130_03_why-is-sliding-a-box-across-the-flo

('Not in', 160, '181_01_user-notifications-part-1')
(161, '5616_02_anomaly-detection-using-the-multivariate-gaussian-distribution')
('Not in', 162, '5577_07_putting-it-together')
('Not in', 163, '1354_04_introduction-to-github')
('Not in', 164, '1389_04_introduction-to-github')
(165, '804_01_food-is-more-than-just-nutrition')
(166, '5836_02_searching-for-images-a-case-study-in-deep-learning')
('Not in', 167, '4127_01_seesaws-summary')
('Not in', 168, '7581_01_introduction-and-execution')
('Not in', 169, '7644_01_1-jian-jie-he-zhi-xing-2-09')
('Not in', 170, '7734_01_introduction-and-execution')
('Not in', 171, '7746_01_1-jian-jie-he-zhi-xing-2-09')
('Not in', 172, '1514_02_overview-of-cloud-computing')
('Not in', 173, '1428_02_the-role-of-sas')
('Not in', 174, '7586_02_marketing-101-building-strong-brands-part-ii')
('Not in', 175, '7681_02_marketing-101-building-strong-brands-part-ii')
('Not in', 176, '5869_02_the-great-divide')
('Not in', 177, '1430_04_the-role-of-python')
('Not in', 

('Not in', 297, '5980_01_analyzing-the-negotiation')
('Not in', 298, '7378_01_2-1-el-mito-del-equilibrio-entre-trabajo-y-vida-personal')
('Not in', 299, '7426_01_2-1-mif-pro-balans-roboti-ta-osobistogho-zhittia')
('Not in', 300, '7777_01_2-1-wakuraihubaransunoshen-hua')
('Not in', 301, '7784_01_2-1-the-myth-of-work-life-balance')
('Not in', 302, '5566_01_model-representation-i')
('Not in', 303, '7571_01_marketing-101-construyendo-marcas-fuertes-parte-i')
('Not in', 304, '7724_01_marketing-101-construyendo-marcas-fuertes-parte-i')
('Not in', 305, '4128_01_wheels-introduction')
('Not in', 306, '7634_01_1-a-shi-chang-ying-xiao-ji-chu-ke-cheng-shu-li-qiang-shi-pin-pai-di-bu-fen-15-10')
('Not in', 307, '7736_01_1-a-shi-chang-ying-xiao-ji-chu-ke-cheng-shu-li-qiang-shi-pin-pai-di-bu-fen-15-10')
('Not in', 308, '814_02_carbohydrates')
('Not in', 309, '1412_04_making-business-defining-decisions-using-data-analytics')
('Not in', 310, '841_02_understanding-taste')
('Not in', 311, '1062_01_welcome

('Not in', 444, '6443_18_subsetting-basics')
('Not in', 445, '6562_18_subsetting-basics')
('Not in', 446, '1528_01_nodered-on-raspberry-pi-part-1')
('Not in', 447, '4115_03_why-does-a-sidewalk-perfectly-support-a-wagon')
('Not in', 448, '143_05_sorting-complexity')
('Not in', 449, '138_06_generics')
('Not in', 450, '1422_04_pwc-s-perspective-on-big-data')
('Not in', 451, '5580_02_evaluating-a-hypothesis')
('Not in', 452, '133_07_union-find-applications')
('Not in', 453, '7756_02_course-overview')
('Not in', 454, '4109_01_how-does-a-falling-ball-move-after-it-is-dropped')
('Not in', 455, '6436_11_data-types-data-frames')
('Not in', 456, '6555_11_data-types-data-frames')
('Not in', 457, '7615_03_brand-elements-color-taglines')
('Not in', 458, '7710_03_brand-elements-color-taglines')
('Not in', 459, '1515_06_ibm-cloud-summary')
('Not in', 460, '5527_01_model-representation')
('Not in', 461, '7761_03_real-time-social-data')
('Not in', 462, '1436_03_course-recap-with-amity-and-mike')
('Not 

('Not in', 595, '5542_05_gradient-descent-in-practice-i-feature-scaling')
('Not in', 596, '5555_03_hypothesis-representation')
('Not in', 597, '4362_05_as-opcoes-de-acordes-dominantes')
('Not in', 598, '4372_05_7a-de-dominante')
('Not in', 599, '4920_05_dominant-7-scale-choices')
('Not in', 600, '5581_04_model-selection-and-train-validation-test-sets')
('Not in', 601, '158_01_contract-law-part-1')
('Not in', 602, '7609_09_influence-and-how-information-spreads')
('Not in', 603, '7704_09_influence-and-how-information-spreads')
('Not in', 604, '828_02_dealing-with-picky-eaters')
('Not in', 605, '7602_02_online-offline-competition')
('Not in', 606, '7697_02_online-offline-competition')
('Not in', 607, '812_03_vegetable-stir-fry')
('Not in', 608, '7135_01_organizing-your-pitch-3-tells')
('Not in', 609, '6333_01_course-introduction')
('Not in', 610, '4116_01_how-does-a-wagon-move-as-you-let-it-roll-freely-on-a-ramp')
('Not in', 611, '136_04_resizing-arrays')
('Not in', 612, '5548_01_basic-op

('Not in', 751, '5538_09_matrix-multiplication-properties')
('Not in', 752, '6459_03_loop-functions-apply')
('Not in', 753, '6578_03_loop-functions-apply')
('Not in', 754, '5539_11_inverse-and-transpose')
('Not in', 755, '5254_01_welcome-video')
('Not in', 756, '7532_01_1-3-1-debit-and-credit-bookkeeping-i')
('Not in', 757, '7653_01_1-3-1-debit-and-credit-bookkeeping-i')
('Not in', 758, '6434_09_data-types-factors')
('Not in', 759, '6553_09_data-types-factors')
('Not in', 760, '830_02_cupcakes')
('Not in', 761, '7611_11_pricing-strategies-2-customer-factors')
('Not in', 762, '7706_11_pricing-strategies-2-customer-factors')
('Not in', 763, '1443_03_hyperparameters-tuning-in-practice-pandas-vs-caviar')
('Not in', 764, '6430_05_r-console-input-and-evaluation')
('Not in', 765, '6549_05_r-console-input-and-evaluation')
('Not in', 766, '5835_02_optimal-recommenders')
('Not in', 767, '7577_04_tres-hurras-por-la-marketing-directa')
('Not in', 768, '7640_04_4-wei-zhi-xiao-he-cai-san-sheng-3-51'

IndexError: list index out of range

In [17]:
similarity_names = [name[0] for name in similarity_lst]

In [18]:
similarity_names

['1252_09_shoe-tower-assignment',
 '874_09_xie-ta-zuo-ye',
 '5655_01_characteristics-of-a-good-question',
 '5661_02_populations-come-in-many-forms',
 '5653_02_stages-of-data-analysis',
 '5668_01_routine-communication-in-data-analysis',
 '5654_01_six-types-of-questions',
 '5669_02_making-a-data-analysis-presentation',
 '5651_01_what-this-course-is-about',
 '5662_03_inference-what-can-go-wrong',
 '5658_02_using-statistical-models-to-explore-your-data-part-2',
 '5659_01_exploratory-data-analysis-when-to-stop',
 '5663_01_general-framework',
 '5660_01_making-inferences-from-data-introduction',
 '5657_01_using-statistical-models-to-explore-your-data-part-1',
 '5665_03_prediction-analyses',
 '5633_01_summary-and-thank-you',
 '1075_01_the-lazy-rule',
 '5666_01_inference-vs-prediction',
 '6252_09_3-14-summary-of-module-3-and-preview-of-module-4',
 '4130_03_why-is-sliding-a-box-across-the-floor-usually-hardest-at-the-start',
 '18_02_on-demand-manufacturing',
 '5656_01_exploratory-data-analysis-g

In [19]:
for name in td_names:
    if name not in similarity_names:
        print (name)

7122_01_introduction-to-course
7122_01_introduction-to-course


('Not in', 0, '1252_09_shoe-tower-assignment')
('Not in', 1, '874_09_xie-ta-zuo-ye')
(2, '5655_01_characteristics-of-a-good-question')
(3, '5661_02_populations-come-in-many-forms')
(4, '5653_02_stages-of-data-analysis')
(5, '5668_01_routine-communication-in-data-analysis')
(6, '5654_01_six-types-of-questions')
(7, '5669_02_making-a-data-analysis-presentation')
(8, '5651_01_what-this-course-is-about')
(9, '5662_03_inference-what-can-go-wrong')
(10, '5658_02_using-statistical-models-to-explore-your-data-part-2')
(11, '5659_01_exploratory-data-analysis-when-to-stop')
(12, '5663_01_general-framework')
(13, '5660_01_making-inferences-from-data-introduction')
(14, '5657_01_using-statistical-models-to-explore-your-data-part-1')
(15, '5665_03_prediction-analyses')
(16, '5633_01_summary-and-thank-you')
(17, '1075_01_the-lazy-rule')
(18, '5666_01_inference-vs-prediction')
(19, '6252_09_3-14-summary-of-module-3-and-preview-of-module-4')
('Not in', 20, '4130_03_why-is-sliding-a-box-across-the-floo

In [30]:
similarity_names.sort()
similarity_names

['1062_01_welcome-to-the-course',
 '1063_02_what-to-expect',
 '1064_01_the-3-traps-of-communication',
 '1065_03_first-trap-speaking-in-third-person',
 '1066_04_second-trap-speaking-in-formal-language',
 '1067_06_third-trap-speaking-with-too-many-details',
 '1068_01_misconceptions-pros-cons',
 '1069_01_three-types-of-remote-teams',
 '1075_01_the-lazy-rule',
 '1076_02_picture-superiority-effect',
 '1080_04_uninformed-empathy-pov',
 '1082_02_achievement',
 '1085_01_questions',
 '1089_01_positional-vs-principled-bargaining',
 '1090_01_why',
 '1091_02_how',
 '1096_05_veggies',
 '1100_05_meaningful-numbers',
 '1121_03_motivation',
 '1126_01_introduction',
 '1132_02_adjusting-model-assumptions',
 '1136_02_adjusting-model-assumptions',
 '1143_08_clase-magistral-ritmo-melodico-y-ritmos-de-letra-parte-4',
 '1147_01_1-1-course-introduction',
 '114_01_welcome',
 '115_01_solving-the-sum-of-two-digits-programming-challenge-screencast',
 '116_02_coming-up',
 '117_01_largest-number',
 '119_08_merge-so