In [None]:
import pandas as pd

# Empty accumulator frame; the manifest-loading cell further down adds rows to it.
df = pd.DataFrame(
    columns=[
        # location of the audio clip
        "audio_filepath",
        # identifiers
        "reader_id",
        "book_id",
        # which manifest the row came from
        "audio_quality",
        "split",
        # the three transcript variants
        "text_no_preprocessing",
        "text_normalized",
        "text",
        # per-clip / per-book numbers
        "duration",
        "bandwidth",
    ]
)

In [None]:
# Load the per-book bandwidth table, keep only the three columns used in this
# notebook, and give them lower-case names.
bw_df = (
    pd.read_csv("books_bandwidth.tsv", sep='\t')
    [["BOOK_ID", "BANDWIDTH", "COMMENT"]]
    .rename(columns={
        "BOOK_ID": "book_id",
        "BANDWIDTH": "bandwidth",
        "COMMENT": "comment",
    })
)
bw_df.head()

Unnamed: 0,book_id,bandwidth,comment
0,6973,17657,
1,7967,17140,
2,9288,17312,
3,9783,16709,
4,10425,17915,


In [None]:
# Each entry is "<reader id>_<audio quality>"; the loading cell splits on '_'
# to rebuild the manifest file names.
reader_list = [
    # "clean" manifests
    '92_clean',
    '6097_clean',
    '9017_clean',
    # "other" manifests
    '6097_other',
    '6670_other',
    '6671_other',
    '8051_other',
    '9136_other',
    '11614_other',
    '11697_other',
    '12787_other',
]

In [None]:
import json
import os
from tqdm.auto import tqdm

# Read every "<reader>_manifest_<quality>_<split>.json" file (one JSON object
# per line) and gather all rows into `df`.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` no longer exists in pandas 2.x, and growing a frame
# one row at a time copies the whole frame on every line.

# Column order of the accumulator frame; manifest keys not listed here are
# kept and placed after these columns.
known_columns = list(df.columns)

# First bandwidth listed for each book id, built once so that the lookup per
# manifest line is a dict access instead of a scan over bw_df.
bandwidth_by_book = (
    bw_df.drop_duplicates("book_id")
    .set_index("book_id")["bandwidth"]
    .to_dict()
)

records = []
for r in reader_list:
    rid, aq = r.split('_')
    print(r)

    for split in ("train", "test", "dev"):
        jfname = f"{rid}_manifest_{aq}_{split}.json"

        # Not every reader has all three splits; skip the missing ones.
        if not os.path.exists(jfname):
            continue

        print(jfname)
        with open(jfname, 'r', encoding='utf-8') as j:
            lines = j.readlines()

        for line in tqdm(lines, total=len(lines)):
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline) is not a JSON record.
                continue

            row = json.loads(line)
            row["reader_id"] = rid
            # The book id is the third component of the audio path.
            row["book_id"] = row["audio_filepath"].split('/')[2]
            row["audio_quality"] = aq
            row["split"] = split

            book_key = int(row["book_id"])
            if book_key not in bandwidth_by_book:
                raise KeyError(
                    f"book_id {book_key} (from {jfname}) has no entry in bw_df"
                )
            row["bandwidth"] = bandwidth_by_book[book_key]

            records.append(row)

manifest_df = pd.DataFrame(records)
extra_columns = [c for c in manifest_df.columns if c not in known_columns]
manifest_df = manifest_df.reindex(columns=known_columns + extra_columns)

# If `df` already holds rows (the cell was run before) the new rows are added
# after them; either way the resulting index is a plain 0..n-1 range.
if df.empty:
    df = manifest_df
else:
    df = pd.concat([df, manifest_df], ignore_index=True)

df.head()

92_clean
92_manifest_clean_train.json


  0%|          | 0/35146 [00:00<?, ?it/s]

92_manifest_clean_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

92_manifest_clean_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

6097_clean
6097_manifest_clean_train.json


  0%|          | 0/39373 [00:00<?, ?it/s]

6097_manifest_clean_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

6097_manifest_clean_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

9017_clean
9017_manifest_clean_train.json


  0%|          | 0/51470 [00:00<?, ?it/s]

9017_manifest_clean_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

9017_manifest_clean_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

6097_other
6097_manifest_other_train.json


  0%|          | 0/4200 [00:00<?, ?it/s]

6670_other
6670_manifest_other_train.json


  0%|          | 0/21240 [00:00<?, ?it/s]

6670_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

6670_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

6671_other
6671_manifest_other_train.json


  0%|          | 0/27072 [00:00<?, ?it/s]

6671_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

6671_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

8051_other
8051_manifest_other_train.json


  0%|          | 0/32937 [00:00<?, ?it/s]

8051_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

8051_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

9136_other
9136_manifest_other_train.json


  0%|          | 0/28290 [00:00<?, ?it/s]

9136_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

9136_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

11614_other
11614_manifest_other_train.json


  0%|          | 0/29962 [00:00<?, ?it/s]

11614_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

11614_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

11697_other
11697_manifest_other_train.json


  0%|          | 0/21214 [00:00<?, ?it/s]

11697_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

11697_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

12787_other
12787_manifest_other_train.json


  0%|          | 0/31574 [00:00<?, ?it/s]

12787_manifest_other_test.json


  0%|          | 0/100 [00:00<?, ?it/s]

12787_manifest_other_dev.json


  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,audio_filepath,reader_id,book_id,audio_quality,split,text_no_preprocessing,text_normalized,text,duration,bandwidth
0,audio/92_clean/10425/secretagent_01_conrad_000...,92,10425,clean,train,"going out in the morning,","going out in the morning,",going out in the morning,1.53,17915
0,audio/92_clean/10425/secretagent_01_conrad_000...,92,10425,clean,train,left his shop nominally in charge of his broth...,left his shop nominally in charge of his broth...,left his shop nominally in charge of his broth...,3.3,17915
0,audio/92_clean/10425/secretagent_01_conrad_000...,92,10425,clean,train,because there was very little business at any ...,because there was very little business at any ...,because there was very little business at any ...,2.47,17915
0,audio/92_clean/10425/secretagent_01_conrad_000...,92,10425,clean,train,and practically none at all before the evening.,and practically none at all before the evening.,and practically none at all before the evening,2.72,17915
0,audio/92_clean/10425/secretagent_01_conrad_000...,92,10425,clean,train,Mr Verloc cared but little about his ostensibl...,Mister Verloc cared but little about his osten...,mister verloc cared but little about his osten...,3.7,17915


In [None]:
# Module-level collector that filter_string() fills in.
some_set = set()


def filter_string(some_str):
    """Add every non-alphabetic character of `some_str` to the global `some_set`.

    Values whose type is not exactly `str` are ignored. Always returns None.
    """
    if type(some_str) is str:
        some_set.update(ch for ch in some_str if not ch.isalpha())


# Quick check on a small example.
filter_string("Hello, world!")
some_set

{' ', '!', ','}

In [None]:
# For each text-like column, list the non-alphabetic characters it contains.
for col in ("audio_filepath", "text_no_preprocessing", "text_normalized", "text"):
    print(col)
    # Rebind the global collector so each column starts from an empty set.
    some_set = set()
    df[col].map(filter_string)
    print(sorted(some_set), end="\n\n")

audio_filepath
['.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_']

text_no_preprocessing
[' ', '!', '"', '$', '%', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', '_', '`', '£', '«', '»', '—', '“', '”', '⸺']

text_normalized
[' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?']

text
[' ', "'"]



In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded), so
# the `i` yielded by df.iterrows() in the cells below counts rows from zero.
df = df.reset_index(drop=True)

In [None]:
# Show how normalization rewrote individual words: for each row, print the
# (raw word, normalized word) pairs that remain after the words common to both
# transcripts have been removed.
for i, row in df.iterrows():
    text0 = row["text_no_preprocessing"].strip().split(' ')
    text1 = row["text_normalized"].strip().split(' ')

    # Cancel shared words one occurrence at a time. The loop walks a sorted
    # copy, so removing from text1 inside the loop is safe.
    for word1 in sorted(text1):
        if word1 in text0:
            text0.remove(word1)
            text1.remove(word1)

    # The leftovers are paired up by position.
    for word0, word1 in zip(text0, text1):
        print(word0, word1)

    # Only the rows up to index 200 are sampled. Stop with `break` rather than
    # raising KeyboardInterrupt, so the cell finishes normally and a
    # "Run All" continues with the following cells.
    if i == 200:
        break


Mr Mister
_The The
Torch_, Torch,
_The The
Gong_—rousing Gong
Mr Mister
Mr Mister
Mr Mister
Mr Mister
day—and day-and
early—as early-as
Mr Mister
“business "business
houses” houses"
“Of "Of
mother,” mother,"
Mr Mister
Mr Mister
Mr Mister
address—at address-at
squibs—and squibs-and


KeyboardInterrupt: ignored

In [None]:
# Show how purely numeric tokens were normalized: same word-level diff as in
# the previous cell, but only pairs whose raw word is numeric are printed.
for i, row in df.iterrows():
    text0 = row["text_no_preprocessing"].strip().split(' ')
    text1 = row["text_normalized"].strip().split(' ')

    # Cancel shared words one occurrence at a time. The loop walks a sorted
    # copy, so removing from text1 inside the loop is safe.
    for word1 in sorted(text1):
        if word1 in text0:
            text0.remove(word1)
            text1.remove(word1)

    # The leftovers are paired up by position.
    for word0, word1 in zip(text0, text1):
        if word0.isnumeric():
            print(word0, word1)

    # Only the rows up to index 5000 are sampled. Stop with `break` rather
    # than raising KeyboardInterrupt, so the cell finishes normally and a
    # "Run All" continues with the following cells.
    if i == 5000:
        break


32 thirty


KeyboardInterrupt: ignored

In [None]:
# Print one example sentence (raw / normalized) for each of these special
# characters. A character is dropped from the list once an example for it has
# been printed, and the search ends when the list is empty.
temp_chrs = ['$', '%', '&', '£']
for i, row in df.iterrows():
    text0_ = row["text_no_preprocessing"].strip()
    text1_ = row["text_normalized"].strip()
    text0, text1 = text0_.split(' '), text1_.split(' ')

    # Cancel shared words one occurrence at a time. The loop walks a sorted
    # copy, so removing from text1 inside the loop is safe.
    for word1 in sorted(text1):
        if word1 in text0:
            text0.remove(word1)
            text1.remove(word1)

    for word0, word1 in zip(text0, text1):
        # First still-wanted character found in this raw word, if any.
        found = next((c for c in temp_chrs if c in word0), None)
        if found is not None:
            print(text0_, '/', text1_)
            temp_chrs.remove(found)

    # Stop with `break` rather than raising KeyboardInterrupt, so the cell
    # finishes normally and a "Run All" continues with the following cells.
    if not temp_chrs:
        break


paying the enormous sum of £10 for so exclusive a luxury. / paying the enormous sum of ten pounds for so exclusive a luxury.
and 25% is mountain country, the larger portion in the southern states. / and twenty five percent is mountain country, the larger portion in the southern states.
The cost of a certain plantation made in 1751 is, however, reported as less than $3. / The cost of a certain plantation made in seventeen fifty one is, however, reported as less than three dollars.
after setting forth that  whereas many Parents & Masters, / after setting forth that  whereas many Parents and Masters,


KeyboardInterrupt: ignored

In [None]:
import re

# An apostrophe between two ASCII letters (e.g. "don't").
apostrophe1 = re.compile("([a-zA-Z])(')([a-zA-Z])")
# An apostrophe after a lower-case "s" and before a space (e.g. "boys' ").
apostrophe2 = re.compile("(s)(')( )")


def easy_norm(some_str):
    """Return a lower-cased, punctuation-free version of `some_str`.

    Apostrophes matched by `apostrophe1` or `apostrophe2` are kept; every
    other apostrophe and the characters ! " ( ) : ; ? are deleted. Comma,
    hyphen and full stop become a space. Runs of spaces are collapsed to one
    and the result is stripped and lower-cased.
    """
    # Swap the apostrophes to keep for a placeholder so the deletion step
    # below leaves them alone. apostrophe1 is applied twice because matches
    # cannot overlap: in "sha'n't" the first pass consumes the "n".
    for pattern in (apostrophe1, apostrophe1, apostrophe2):
        some_str = pattern.sub(r"\1따옴표\3", some_str)

    # One pass: , - . -> space; ! " ' ( ) : ; ? -> deleted.
    some_str = some_str.translate(str.maketrans(",-.", "   ", "!\"'():;?"))

    # Put the protected apostrophes back.
    some_str = some_str.replace("따옴표", "'")

    # Collapse every run of spaces into a single space.
    some_str = re.sub(" {2,}", " ", some_str)
    return some_str.strip().lower()

# Print every row whose "text" column is not what easy_norm() produces from
# "text_normalized". A row also counts as matching when the only difference is
# "goodbye" being written as "good bye" in "text".
for i, row in df.iterrows():
    text1 = row["text_normalized"].strip()
    text2 = row["text"].strip()
    norm_text1 = easy_norm(text1)

    accepted = (norm_text1, norm_text1.replace("goodbye", "good bye"))
    if text2 not in accepted:
        print(text1, '\n', text2, "\n\n")


tocontrol them. 
 to control them 


