In [1]:
from os.path import join
from collections import Counter

In [5]:
# Language codes of the parallel corpus; used as file-name suffixes when
# reading and writing corpus/vocab files.
langs = ['en', 'de']
# Root directory under which corpus and vocab paths are resolved.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_dir = '/nfs/team/nlp/users/rgupta/NMT/code/GNN-Semantic-Similarity/local/'

In [6]:
# IPython shell automagic: list the contents of data_dir ($name expands the Python variable).
ls $data_dir

[0m[01;34mfigs[0m/  [01;34mgrid_search[0m/  [01;34mlogs[0m/


In [3]:
# Load the word-tokenised corpus of every language into memory:
# lang2data maps a language code to the list of its lines, each stripped of
# leading/trailing whitespace (including the newline).
lang2data = {}

for lang in langs:
    corpus_path = join(data_dir, f'corpora/iwslt14-en-de-words-full.{lang}')
    # Pin the encoding so non-ASCII text is decoded the same way regardless of
    # the machine's locale (assumes the corpus files are UTF-8 — TODO confirm).
    with open(corpus_path, encoding='utf-8') as f:
        # Iterating the handle yields lines directly; no readlines() copy needed.
        lang2data[lang] = [line.strip() for line in f]

In [4]:
# Display the first five English lines as a quick look at the loaded text.
lang2data['en'][:5]

['It  can  be  a  very  complicated  thing ,  the  ocean .',
 'And  it  can  be  a  very  complicated  thing ,  what  human  health  is .',
 "And  bringing  those  two  together  might  seem  a  very  daunting  task ,  but  what  I ' m  going  to  try  to  say  is  that  even  in  that  complexity ,  there ' s  some  simple  themes  that  I  think ,  if  we  understand ,  we  can  really  move  forward .",
 "And  those  simple  themes  aren ' t  really  themes  about  the  complex  science  of  what ' s  going  on ,  but  things  that  we  all  pretty  well  know .",
 "And  I ' m  going  to  start  with  this  one :  If  mom ma  ain ' t  happy ,  ain ' t  nobody  happy ."]

In [11]:
# Token frequency tables, one Counter per language.
# lang2count keeps tokens exactly as they appear (whitespace-split);
# lang2count_lower lower-cases each token before counting.
lang2count = {
    lang: Counter(token for sentence in sentences for token in sentence.split())
    for lang, sentences in lang2data.items()
}
lang2count_lower = {
    lang: Counter(token.lower() for sentence in sentences for token in sentence.split())
    for lang, sentences in lang2data.items()
}

en
Count 20266
Count norm 18019
Data 166837
de
Count 28767
Count norm 24635
Data 166837


In [19]:
# Vocabulary as plain sets, built independently of the Counters so the two
# approaches can be cross-checked against each other.
# lang2set holds the distinct whitespace-split tokens per language;
# lang2set_lower holds the distinct lower-cased forms of those tokens.
lang2set = {
    lang: {token for sentence in lang2data[lang] for token in sentence.strip().split()}
    for lang in langs
}
lang2set_lower = {
    lang: {token.lower() for sentence in lang2data[lang] for token in sentence.strip().split()}
    for lang in langs
}

In [21]:
# Print, per language, the vocabulary size according to the Counter and the
# set (cased and lower-cased), followed by the number of corpus lines.
for lang in langs:
    report = (
        ("Count", lang2count[lang]),
        ("Set", lang2set[lang]),
        ("Count lower", lang2count_lower[lang]),
        ("Set lower", lang2set_lower[lang]),
        ("Data", lang2data[lang]),
    )
    print(lang)
    for label, collection in report:
        print(label, len(collection))

en
Count 20266
Set 20266
Count lower 18019
Set lower 18019
Data 166837
de
Count 28767
Set 28767
Count lower 24635
Set lower 24635
Data 166837


In [28]:
# Write the vocabulary files: for every language, one file with the cased
# counts ("words-mix") and one with the lower-cased counts ("words-lower").
# Each line is "<token> <count>", in Counter.most_common() order
# (highest count first).
for lang in langs:
    vocab_tables = (
        ("words-mix", lang2count[lang]),
        ("words-lower", lang2count_lower[lang]),
    )
    for vocab_name, counts in vocab_tables:
        vocab_path = join(data_dir, f"vocabs/{vocab_name}.{lang}")
        # Explicit UTF-8 so non-ASCII tokens are written identically on every
        # machine instead of depending on the locale's default encoding.
        with open(vocab_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{token} {count}\n" for token, count in counts.most_common())

## Create dataset with lower-cased words

In [30]:
# For every split and language, copy the "words-mix" corpus file to the
# matching "words-lower" file with each line lower-cased.
for split in ['full', 'train', 'dev', 'test']:
    for lang in langs:
        src_path = join(data_dir, f'corpora/iwslt14-en-de/words-mix/words-mix-{split}.{lang}')
        dst_path = join(data_dir, f'corpora/iwslt14-en-de/words-lower/words-lower-{split}.{lang}')
        # Read and write with an explicit encoding so the result does not
        # depend on the locale (assumes the corpora are UTF-8 — TODO confirm).
        with open(src_path, encoding='utf-8') as f:
            data = f.readlines()
        # The source is read completely before the target is opened, so a
        # failed read never leaves a truncated output file behind.
        with open(dst_path, 'w', encoding='utf-8') as f:
            f.writelines(line.lower() for line in data)