<a href="https://colab.research.google.com/github/subhobrata/DeepNLP/blob/master/8_5_nmt_munging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from argparse import Namespace
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

In [9]:
# Download NLTK's 'punkt' tokenizer package into the local nltk_data directory.
# NOTE(review): presumably this is the model word_tokenize relies on in the
# tokenisation cell below - confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# Run configuration: where the raw tab-separated corpus lives, where the
# munged CSV is written, the train/val/test fractions, and the RNG seed used
# when shuffling before the split.
args = Namespace(
    source_data_path="/content/gdrive/My Drive/data/nmt/eng-fra.txt",
    output_data_path="/content/gdrive/My Drive/data/nmt/simplest_eng_fra.csv",
    perc_train=0.7,
    perc_val=0.15,
    perc_test=0.15,
    seed=1337
)

# Sanity-check the split fractions. The sum is compared with a tolerance
# rather than `== 1.0`: binary floating-point addition can land one ulp away
# from 1.0 for perfectly valid fractions, which would make an exact
# comparison fail spuriously.
assert args.perc_test > 0, "perc_test must be positive"
assert np.isclose(
    args.perc_train + args.perc_val + args.perc_test, 1.0, rtol=0.0, atol=1e-9
), "perc_train + perc_val + perc_test must sum to 1.0"

In [6]:
# Mount Google Drive under /content/gdrive; the source and output paths in
# `args` both live below that mount point, so this has to run before the
# corpus is read or the CSV is written.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Load the parallel corpus: one sentence pair per line, fields separated by a
# tab. Each row is lower-cased and split into its fields.
#
# * The encoding is pinned to UTF-8 so the accented French text decodes the
#   same way regardless of the machine's locale default.
# * Empty lines are skipped; they would otherwise yield a single-field row and
#   break the two-way unpack in the tokenisation cell.
# * The file is streamed line by line instead of being materialised twice.
with open(args.source_data_path, encoding="utf-8") as fp:
    lines = [
        line.rstrip("\n").lower().split("\t")
        for line in fp
        if line.rstrip("\n")
    ]

In [0]:
# Tokenise both sides of every sentence pair. Each record keeps the English
# and French token lists under separate keys.
data = [
    {
        "english_tokens": word_tokenize(english_text, language="english"),
        "french_tokens": word_tokenize(french_text, language="french"),
    }
    for english_text, french_text in lines
]

In [0]:
# Subject pronouns, each with the full and the contracted form of "to be".
# The expanded (pronoun, verb) pairs are later compared against the first two
# English tokens of each sentence.
_PRONOUN_VERB_FORMS = (
    ("i", ("am", "'m")),
    ("he", ("is", "'s")),
    ("she", ("is", "'s")),
    ("you", ("are", "'re")),
    ("we", ("are", "'re")),
    ("they", ("are", "'re")),
)

# Flatten to a tuple of 2-tuples, keeping pronoun order and, within each
# pronoun, full form before contraction.
filter_phrases = tuple(
    (pronoun, verb)
    for pronoun, verb_forms in _PRONOUN_VERB_FORMS
    for verb in verb_forms
)

In [0]:
# Bucket the tokenised pairs by their first two English tokens, keeping only
# those whose opening matches one of the filter phrases. Every phrase gets its
# own (initially empty) list, so phrases with no matches still appear.
data_subset = dict((phrase, []) for phrase in filter_phrases)
for datum in data:
    bucket = data_subset.get(tuple(datum['english_tokens'][:2]))
    if bucket is not None:
        bucket.append(datum)

In [14]:
# Number of sentence pairs collected per filter phrase, displayed together
# with the overall total.
counts = dict(zip(data_subset.keys(), map(len, data_subset.values())))
(counts, sum(counts.values()))

({('he', "'s"): 787,
  ('he', 'is'): 1069,
  ('i', "'m"): 4760,
  ('i', 'am'): 805,
  ('she', "'s"): 316,
  ('she', 'is'): 504,
  ('they', "'re"): 470,
  ('they', 'are'): 194,
  ('we', "'re"): 1053,
  ('we', 'are'): 181,
  ('you', "'re"): 2474,
  ('you', 'are'): 449},
 13062)

In [0]:
# Stratified split: every filter phrase is shuffled and partitioned on its
# own, so each phrase is represented in train/val/test in roughly the
# configured proportions.
#
# The shuffle is applied to a *copy* of each bucket and the split label is
# written onto a *copy* of each record. Shuffling `data_subset`'s lists in
# place would make a second run of this cell start from an already-permuted
# order (giving a different split despite re-seeding), and labelling the
# shared dicts in place would leak into `data`. With copies the cell can be
# re-run and always yields the same result; the permutation drawn on a fresh
# run is unchanged because it depends only on the seed and the bucket length.
np.random.seed(args.seed)

dataset_stage3 = []
for phrase, datum_list in sorted(data_subset.items()):
    shuffled = list(datum_list)
    np.random.shuffle(shuffled)

    # Sizes are truncated towards zero; whatever is left after train and val
    # becomes the test split (args.perc_test is not read here).
    n_train = int(len(shuffled) * args.perc_train)
    n_val = int(len(shuffled) * args.perc_val)

    for position, datum in enumerate(shuffled):
        if position < n_train:
            split = 'train'
        elif position < n_train + n_val:
            split = 'val'
        else:
            split = 'test'
        # New dict per record: original keys first, then 'split'.
        dataset_stage3.append({**datum, 'split': split})

In [0]:
# Replace each token list with its space-joined sentence, in place: the token
# key is popped from the record and the joined text is stored under the new
# column name. Because the token keys are removed, this cell can only be run
# once per freshly built dataset_stage3.
_TOKEN_KEY_TO_TEXT_KEY = (
    ('english_tokens', 'source_language'),
    ('french_tokens', 'target_language'),
)
for record in dataset_stage3:
    for token_key, text_key in _TOKEN_KEY_TO_TEXT_KEY:
        record[text_key] = " ".join(record.pop(token_key))

In [0]:
# Build the final table from the list of per-sentence record dicts; each dict
# key becomes a column.
nmt_df = pd.DataFrame(dataset_stage3)

In [18]:
# Preview the first rows to eyeball the columns and the joined sentences.
nmt_df.head()

Unnamed: 0,source_language,split,target_language
0,he 's the cutest boy in town .,train,c'est le garçon le plus mignon en ville .
1,he 's a nonsmoker .,train,il est non-fumeur .
2,he 's smarter than me .,train,il est plus intelligent que moi .
3,he 's a lovely young man .,train,c'est un adorable jeune homme .
4,he 's three years older than me .,train,il a trois ans de plus que moi .


In [0]:
# Persist the munged dataset to the configured output path on the mounted
# drive.
# NOTE(review): `index` is left at its default, so the row index is written as
# a leading unnamed column - confirm downstream readers expect that before
# switching to index=False.
nmt_df.to_csv(args.output_data_path)