In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
googleDrivePathPrefix = 'drive/My Drive/Colab Notebooks'

In [3]:
from os import path
import re
import pandas as pd
import numpy as np


In [4]:
data_examples = []
with open(path.join(googleDrivePathPrefix,'data/cmn.txt'), 'r',encoding='utf8') as f:
  for line in f.readlines():
    data_examples.append(line)

In [5]:
english=[]
chinese=[]
for data in data_examples:
  splits=re.split('\t',data)
  english.append(splits[0])
  chinese.append(splits[1])

In [6]:
df = pd.DataFrame(data={'english':english,'chinese':chinese})

In [7]:
df.head()

Unnamed: 0,english,chinese
0,Hi.,嗨。
1,Hi.,你好。
2,Run.,你用跑的。
3,Wait!,等等！
4,Wait!,等一下！


In [8]:
df.count()

english    24697
chinese    24697
dtype: int64

Filter texts with quotation marks `"`

- The intention is to remove cases where there is conversational nature in the the sentence, e.g.

In [9]:
df.loc[24690]

english    "Is that Tom calling again?" "Yes. He calls ev...
chinese            “又是汤姆的电话？” “嗯。最近他每天晚上都会打过来。当时就不该给他我的号码的。”
Name: 24690, dtype: object

In [10]:
conversational_sample_mask = df['english'].str.contains('" "')
print(f'Conversational example counts: {conversational_sample_mask.sum()}')

Conversational example counts: 60


In [11]:
df_conversational_samples = df[conversational_sample_mask]
df_conversational_samples.head()

Unnamed: 0,english,chinese
9689,"""More coffee?"" ""No, thanks.""",“还要咖啡吗？”“不了，谢谢。”
10702,"""She likes music."" ""So do I.""",“她喜欢音乐。”“我也是。”
11583,"""I like traveling."" ""So do I.""",“我喜欢旅游。“ ”我也是。“
11584,"""Who is in the car?"" ""Tom is.""",“谁在车里？” “汤姆。”
14101,"""What's in that box?"" ""Nothing.""",“盒子里有什么？”“什么都没有。”


In [12]:
df_filtered_conversation = df[~conversational_sample_mask]
print(f'Data sample counts (filtered conversations):\n{df_filtered_conversation.count()}')

Data sample counts (filtered conversations):
english    24637
chinese    24637
dtype: int64


In [13]:
df_filtered_conversation.sample(5)

Unnamed: 0,english,chinese
6294,He turned down my offer.,他拒絕了我的提議。
17713,I wonder which way is the shortest.,我想知道哪一條路是最短的。
19272,She substituted margarine for butter.,她用人造黄油代替了黄油。
20698,The meaning of this sentence is obscure.,这句句子意思模糊。
9805,He has more books than I do.,他拥有的书比我的多。


Filter chinese text with english characters, numbers and non-essential punctuation marks excluding `[。，？！]`.

- To simplify separating individual characters in chinese texts, we filter out samples with english characters, numbers, and non-essential punctuation marks.
- Thus, we can simply split each character in the chinese texts and treat them as individual chinese character or punctuation mark.

In [14]:
chinese_with_eng_char_sample_mask = df['chinese'].str.contains("[A-Za-z0-9\-/•（）《》「」“”\[\]\"]")
chinese_with_eng_char_sample_mask.sum()

607

In [15]:
df_chinese_with_eng_char_sample = df_filtered_conversation[chinese_with_eng_char_sample_mask]
df_chinese_with_eng_char_sample.sample(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,english,chinese
17441,He arrived at the station at seven.,他7点到了火车站。
20372,Water freezes at 32 degrees Fahrenheit.,水在華氏32度結成冰。
22596,"""A rolling stone gathers no moss"" is a proverb.",“滚石不生苔”是一句谚语。
13906,Tom's house has three bedrooms.,Tom的房子有三個房間。
18376,CDs have taken the place of records.,CD已经取代了胶木唱片。


In [16]:
df_filtered_final = df_filtered_conversation[~chinese_with_eng_char_sample_mask]
print(f'Data sample counts (filtered chinese text with english characters):\n{df_filtered_final.count()}')

Data sample counts (filtered chinese text with english characters):
english    24089
chinese    24089
dtype: int64


  """Entry point for launching an IPython kernel.


In [17]:
df_filtered_final.sample(5)

Unnamed: 0,english,chinese
19136,I was given a nice watch by my uncle.,我叔叔给了我一块漂亮的手表。
10092,I wish I were a good singer.,但願我是一個好歌手。
9215,It was a great shock to me.,這對我是很大的震撼。
18108,They all looked for the lost child.,他們都在尋找這個走失的孩子。
24402,"In case the shipment is delayed, we have speci...",万一船运迟了，我们有特别的迟到保险。


Replace chinese text with ascii punctuation marks, e.g. `[.,?!]` with chinese unicode punctuation marks `[。，？！]`.

In [18]:
chinese_with_ascii_punctuation_mask = df_filtered_final['chinese'].str.contains("[?.!,]")
chinese_with_ascii_punctuation_mask.sum()

587

In [19]:
df_chinese_with_ascii_punct = df_filtered_final[chinese_with_ascii_punctuation_mask]
ind = np.random.choice(df_chinese_with_ascii_punct.index,5)
df_chinese_with_ascii_punct.loc[ind[:]]

Unnamed: 0,english,chinese
23809,"If you are not going to the concert, then neit...","如果你不去音樂會, 我也不去。"
7992,Does he have any brothers?,他有任何的兄弟嗎?
6326,How's the weather there?,那里的气候怎么样?
753,How about you?,那你呢?
4679,Don't you play tennis?,你不打網球嗎?


In [20]:
def replace_punct(sentence):
  sentence = re.sub(r"[?]", r"？", sentence)
  sentence = re.sub(r"[.]", r"。", sentence)
  sentence = re.sub(r"[!]", r"！", sentence)
  sentence = re.sub(r"[,]", r"，", sentence)
  return sentence

In [21]:
df_replace_punctuation = df_filtered_final
df_replace_punctuation['chinese'] = df_replace_punctuation['chinese'].apply(lambda x: replace_punct(x))
df_replace_punctuation[chinese_with_ascii_punctuation_mask].loc[ind[:]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,english,chinese
23809,"If you are not going to the concert, then neit...",如果你不去音樂會， 我也不去。
7992,Does he have any brothers?,他有任何的兄弟嗎？
6326,How's the weather there?,那里的气候怎么样？
753,How about you?,那你呢？
4679,Don't you play tennis?,你不打網球嗎？


In [22]:
df_replace_punctuation['chinese'].str.contains("[?.!,]").sum()

0

Separate punctuation marks with additional space for the purpose of string splitting.

In [23]:
df_separate_punctuation = df_replace_punctuation.applymap(lambda x: re.sub(r"([.,?!。，？！])", r" \1 ", x))

In [24]:
df_separate_punctuation.sample(5)

Unnamed: 0,english,chinese
8255,I think you look like Tom .,我看你长得像汤姆 。
15742,I've changed my website's layout .,我改了一下我网站的版面设计 。
11017,I started thinking about Tom .,我开始想起汤姆 。
17500,"He seldom , if ever , goes to church .",他很少 ， 如果有的話 ， 去教堂 。
1934,It's frozen hard .,真的很冰冷 。


Mapping function to do the text spliting.

- Split english text with space.
- Split chinese text at every character, discard the first and last element.

In [25]:
df_individual_char = df_separate_punctuation
df_individual_char['english_split']=df_individual_char['english'].map(lambda x: re.split(' ',x))
df_individual_char['chinese_split']=df_individual_char['chinese'].map(lambda x: re.split('',x)[1:-2])

In [26]:
#ind = np.random.choice(df_individual_char['english'].count(),5)
ind = np.array([23215,  7732, 22636,  6434, 20126])  # fix 
ind

array([23215,  7732, 22636,  6434, 20126])

In [27]:
df_individual_char.loc[ind]

Unnamed: 0,english,chinese,english_split,chinese_split
23215,I hope he'll be able to come ! I'd like to se...,我希望他能來 ！ 我想見他 。,"[I, hope, he'll, be, able, to, come, !, , I'd,...","[我, 希, 望, 他, 能, 來, , ！, , 我, 想, 見, 他, , 。]"
7732,Tom is able to swim well .,湯姆游泳可以游得很好 。,"[Tom, is, able, to, swim, well, ., ]","[湯, 姆, 游, 泳, 可, 以, 游, 得, 很, 好, , 。]"
22636,He amazed everyone by passing his driving test .,他通過駕駛考試的事讓每一個人都覺得驚訝 。,"[He, amazed, everyone, by, passing, his, drivi...","[他, 通, 過, 駕, 駛, 考, 試, 的, 事, 讓, 每, 一, 個, 人, 都, ..."
6434,I saw him sawing a tree .,我看见他正在锯一棵树 。,"[I, saw, him, sawing, a, tree, ., ]","[我, 看, 见, 他, 正, 在, 锯, 一, 棵, 树, , 。]"
20126,I got a farewell present from everyone .,每個人都送了我一份歡送禮物 。,"[I, got, a, farewell, present, from, everyone,...","[每, 個, 人, 都, 送, 了, 我, 一, 份, 歡, 送, 禮, 物, , 。]"


Remove undesired characters in the text arrays.

- Remove '' in english text.
- Remove ' ' in chinese text.

In [28]:
def remove_char(x, char):
  x = np.asarray(x)
  mask = x == np.repeat(char, len(x))
  return x[~mask]


In [29]:
tmp=df_individual_char['english_split'].loc[ind[0]]
print(f'Raw input: {tmp}\n')
result = remove_char(tmp, '')
print(f'Removed char \'\': {result}')

Raw input: ['I', 'hope', "he'll", 'be', 'able', 'to', 'come', '!', '', "I'd", 'like', 'to', 'see', 'him', '.', '']

Removed char '': ['I' 'hope' "he'll" 'be' 'able' 'to' 'come' '!' "I'd" 'like' 'to' 'see'
 'him' '.']


In [30]:
tmp=df_individual_char['chinese_split'].loc[ind[2]]
print(f'Raw input: {tmp}\n')
result = remove_char(tmp, ' ')
print(f'Removed char \' \': {result}')

Raw input: ['他', '通', '過', '駕', '駛', '考', '試', '的', '事', '讓', '每', '一', '個', '人', '都', '覺', '得', '驚', '訝', ' ', '。']

Removed char ' ': ['他' '通' '過' '駕' '駛' '考' '試' '的' '事' '讓' '每' '一' '個' '人' '都' '覺' '得' '驚'
 '訝' '。']


In [31]:
df_remove_array_char = df_individual_char
df_remove_array_char['english_split']=df_remove_array_char['english_split'].map(lambda x: remove_char(x,''))
df_remove_array_char['chinese_split']=df_remove_array_char['chinese_split'].map(lambda x: remove_char(x,' '))

In [32]:
df_remove_array_char.loc[ind]

Unnamed: 0,english,chinese,english_split,chinese_split
23215,I hope he'll be able to come ! I'd like to se...,我希望他能來 ！ 我想見他 。,"[I, hope, he'll, be, able, to, come, !, I'd, l...","[我, 希, 望, 他, 能, 來, ！, 我, 想, 見, 他, 。]"
7732,Tom is able to swim well .,湯姆游泳可以游得很好 。,"[Tom, is, able, to, swim, well, .]","[湯, 姆, 游, 泳, 可, 以, 游, 得, 很, 好, 。]"
22636,He amazed everyone by passing his driving test .,他通過駕駛考試的事讓每一個人都覺得驚訝 。,"[He, amazed, everyone, by, passing, his, drivi...","[他, 通, 過, 駕, 駛, 考, 試, 的, 事, 讓, 每, 一, 個, 人, 都, ..."
6434,I saw him sawing a tree .,我看见他正在锯一棵树 。,"[I, saw, him, sawing, a, tree, .]","[我, 看, 见, 他, 正, 在, 锯, 一, 棵, 树, 。]"
20126,I got a farewell present from everyone .,每個人都送了我一份歡送禮物 。,"[I, got, a, farewell, present, from, everyone, .]","[每, 個, 人, 都, 送, 了, 我, 一, 份, 歡, 送, 禮, 物, 。]"


Append `<start>` and `<end>` token to the chinese text.

In [33]:
def append_token(x):
  x=np.array(x,dtype='U7')
  start_token_added = np.insert(x,0,"<start>",axis=0)
  end_token_added = np.concatenate((start_token_added,["<end>"]),axis=0)
  return end_token_added

In [34]:
append_token(df_remove_array_char['chinese_split'].loc[ind[0]])

array(['<start>', '我', '希', '望', '他', '能', '來', '！', '我', '想', '見', '他',
       '。', '<end>'], dtype='<U7')

In [35]:
df_final = df_remove_array_char
df_final['chinese_split']=df_final['chinese_split'].map(lambda x: append_token(x))

In [36]:
for i in ind:
  print(f'{i}-th sample:\n')
  eng = df_final['english_split'].loc[i]
  chn = df_final['chinese_split'].loc[i]
  print(f'english: {eng}\n')
  print(f'chinese: {chn}\n\n')

23215-th sample:

english: ['I' 'hope' "he'll" 'be' 'able' 'to' 'come' '!' "I'd" 'like' 'to' 'see'
 'him' '.']

chinese: ['<start>' '我' '希' '望' '他' '能' '來' '！' '我' '想' '見' '他' '。' '<end>']


7732-th sample:

english: ['Tom' 'is' 'able' 'to' 'swim' 'well' '.']

chinese: ['<start>' '湯' '姆' '游' '泳' '可' '以' '游' '得' '很' '好' '。' '<end>']


22636-th sample:

english: ['He' 'amazed' 'everyone' 'by' 'passing' 'his' 'driving' 'test' '.']

chinese: ['<start>' '他' '通' '過' '駕' '駛' '考' '試' '的' '事' '讓' '每' '一' '個' '人' '都' '覺'
 '得' '驚' '訝' '。' '<end>']


6434-th sample:

english: ['I' 'saw' 'him' 'sawing' 'a' 'tree' '.']

chinese: ['<start>' '我' '看' '见' '他' '正' '在' '锯' '一' '棵' '树' '。' '<end>']


20126-th sample:

english: ['I' 'got' 'a' 'farewell' 'present' 'from' 'everyone' '.']

chinese: ['<start>' '每' '個' '人' '都' '送' '了' '我' '一' '份' '歡' '送' '禮' '物' '。' '<end>']




Save the processed dataset.

In [37]:
df_final.to_json(path.join(googleDrivePathPrefix,'data/cmn-processed.json'))

In [38]:
df_reproduced = pd.read_json(path.join(googleDrivePathPrefix,'data/cmn-processed.json'))
df_reproduced.sample(5)

Unnamed: 0,english,chinese,english_split,chinese_split
13453,It is really time for us to go .,这真是我们该走的时候了 。,"[It, is, really, time, for, us, to, go, .]","[<start>, 这, 真, 是, 我, 们, 该, 走, 的, 时, 候, 了, 。, ..."
15353,He gave his life for his country .,他把他的一生獻給了他的國家 。,"[He, gave, his, life, for, his, country, .]","[<start>, 他, 把, 他, 的, 一, 生, 獻, 給, 了, 他, 的, 國, ..."
831,It's a secret .,它是個秘密 。,"[It's, a, secret, .]","[<start>, 它, 是, 個, 秘, 密, 。, <end>]"
13137,How much is the rent per month ?,一個月的租金多少 ？,"[How, much, is, the, rent, per, month, ?]","[<start>, 一, 個, 月, 的, 租, 金, 多, 少, ？, <end>]"
8932,He has to repair the clock .,他必須修理這個時鐘 。,"[He, has, to, repair, the, clock, .]","[<start>, 他, 必, 須, 修, 理, 這, 個, 時, 鐘, 。, <end>]"
