## Use the dictionary from dict.cc to generate our word list

### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

### Print out the first 20 lines of the dictionary

In [2]:
!head -n 20 de-en.txt

# DE-EN vocabulary database	compiled by dict.cc
# Date and time	2016-08-29 23:46
# License	THIS WORK IS PROTECTED BY INTERNATIONAL COPYRIGHT LAWS!
# License	Private use is allowed as long as the data, or parts of it, are not published or given away.
# License	By using this file, you agree to be bound to the Terms of Use published at the following URL:  
# License	http://www.dict.cc/translation_file_request.php
# Contains data from	http://dict.tu-chemnitz.de/ with friendly permission by Frank Richter, TU Chemnitz 
# Brought to you by	Paul Hemetsberger and the users of http://www.dict.cc/, 2002 - 2016

&#945;-Keratin {n}	&#945;-keratin	noun
&#945;-Lactalbumin {n} <&#945;-La>	&#945;-lactalbumin <&#945;-La>	noun
&#946;-Mercaptoethanol {n}	&#946;-mercaptoethanol	noun
&#963;-Algebra {f}	&#963;-field	noun
&#963;-Algebra {f}	sigma algebra	noun
& Co.	and company <& Co.>	
'Die' heißt mein Unterrock, und 'der' hängt im Schrank. [regional] [Satz, mit dem Kinder gerügt werden, die vo

### Insert csv header after licensing information

In [3]:
!sed "9 a GermanWord\tEnglishWord\tWordType" "de-en.txt" > "dictionary-as-csv-file.txt"

### Use pandas library to import csv file

In [4]:
import pandas as pd


dictcc_df = pd.read_csv("dictionary-as-csv-file.txt", sep='\t', header=8)

### display some of the contents of the dictcc dataframe (dictcc_df)

In [5]:
dictcc_df

Unnamed: 0,GermanWord,EnglishWord,WordType
0,&#945;-Keratin {n},&#945;-keratin,noun
1,&#945;-Lactalbumin {n} <&#945;-La>,&#945;-lactalbumin <&#945;-La>,noun
2,&#946;-Mercaptoethanol {n},&#946;-mercaptoethanol,noun
3,&#963;-Algebra {f},&#963;-field,noun
4,&#963;-Algebra {f},sigma algebra,noun
5,& Co.,and company <& Co.>,
6,"'Die' heißt mein Unterrock, und 'der' hängt im...",'She' is the cat's mother. [used to encourage ...,
7,'n Abend allerseits! [ugs.],Evening all! [coll.],
8,'nauf [regional] [hinauf],up,adv
9,'Nduja {f} [auch: Nduja],'nduja [also: nduja],noun


### Convert WordType Column to a pandas.Categorical

In [153]:
word_types = dictcc_df["WordType"].astype('category')
dictcc_df["WordType"] = word_types
# show data types of each column in the dataframe
dictcc_df.dtypes

GermanWord       object
EnglishWord      object
WordType       category
dtype: object

### List the current distribution of word types in our dataframe

In [154]:
dictcc_df["WordType"].value_counts()

noun                  759619
verb                  126806
adj                    94507
adv                    26277
adj past-p             12519
adj pres-p              4907
past-p                  2712
adj adv                 1687
prep                     976
pron                     500
conj                     350
pres-p                   308
prefix                   177
past-p adj                75
suffix                    67
pres-p adj                43
adv adj                   30
adv prep                  27
adj pron                  27
adj.                      16
adv conj                  11
adv noun                  11
adj suffix                10
adj archaic:adv           10
prep conj                 10
noun adv                   6
adv pron                   5
adj coll:adv               5
adv past-p                 4
adj noun                   4
                       ...  
adj archaic:past-p         2
adj attr.                  2
adj adv past-p             2
[none]        

### Filter the results that we want

#### We want results that are less than 'x' letters long

In [155]:
lt_x_letters = dictcc_df_filtered["EnglishWord"].str.len() < 9
dictcc_df_filtered[lt_x_letters]
dictcc_df_filtered = dictcc_df_filtered[lt_x_letters]
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,112422,112422,111436
unique,81982,42674,39
top,knapp,boom,noun
freq,26,35,68793


#### We want to remove all duplicates

In [156]:
dictcc_df_filtered = dictcc_df_filtered.drop_duplicates("EnglishWord")
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,42674,42674,42075
unique,36572,42674,27
top,angemessen,famine,noun
freq,13,1,27744


#### We want to remove words that are difficult to spell (long vowel repitions)

In [172]:
long_vowel_rep_pattern = r'\b\w+[aeiou][aeiou][aeiou]\w+\b'
is_long_vowel_rep = dictcc_df_filtered["EnglishWord"].str.contains(long_vowel_rep_pattern, na=False)
dictcc_df_filtered = dictcc_df_filtered[~is_long_vowel_rep]
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,42359,42359,41765
unique,36339,42359,27
top,angemessen,famine,noun
freq,13,1,27585


### Show the distribution of word types after filtering

In [174]:
dictcc_df_filtered["WordType"].value_counts()

noun                  27585
adj                    7245
adj past-p             2123
verb                   1973
adv                    1231
adj pres-p              683
past-p                  615
adj adv                 132
pron                     44
prep                     42
pres-p                   34
conj                     22
past-p adj                8
prefix                    7
adj pron                  5
pres-p adj                3
adv conj                  2
adj archaic:adv           2
adv dated:adj             1
adj archaic:past-p        1
adj obs:past-p            1
adv prep conj             1
adv prep                  1
adv adj                   1
[none]                    1
adv pron                  1
pres-p archaic:adj        1
[none][none]              0
ad jpast-p                0
rel pron                  0
                      ...  
adj adv past-p            0
prep conj                 0
adj adv pres-p            0
adj adv pron              0
pron adj            

### Apply best practices for wordlists (from BIP-0039)

In [490]:
nouns = dictcc_df_filtered[dictcc_df_filtered["WordType"] == "noun"]
adjectives = dictcc_df_filtered[dictcc_df_filtered["WordType"] == "adj"]
verb = dictcc_df_filtered[dictcc_df_filtered["WordType"] == "verb"]

#### a) smart selection of words

- the wordlist is created in such way that it's enough to type the first four letters to unambiguously identify the word

#### b) similar words avoided

- word pairs like "build" and "built", "woman" and "women", or "quick" and "quickly" not only make remembering the sentence difficult, but are also more error prone and more difficult to guess

In [342]:
# `pip install python-levenshtein`
# used to calculate the Levenshtein distance between words
import Levenshtein as lev

In [511]:
# Maximize the Levenshtein distance between neighbouring words
def spread_words(dataframe_values, min_distance = 50, min_lev = 5):
    words = []
    words.extend(dataframe_values)
    short_distances = 0
    for i in range(len(words)-1):
        next = i + 1
        if lev.distance(words[i],words[next]) < min_lev:
            short_distances = short_distances + 1
            words.append(words[next])
            words.remove(words[next])
    # The value for min_distance was derived
    # by simple trial and error
    if short_distances < min_distance:
        # The remaining words with short distance 
        # will have to be sorted out by hand.
        print("neighbours with short distances: " + str(short_distances))
        return words
    else:
        # Recurse until we minimize short distances
        # as much as possible.
        return spread_words(words)

In [512]:
min_distance_nouns = spread_words(nouns["EnglishWord"].values,25)

neighbours with short distances: 44


In [505]:
min_distance_adjectives = spread_words(adjectives["EnglishWord"].values)
len(min_distance_adjectives)

neighbours with short distances: 44


7245

In [518]:
# The distance between verbs seems to be good enough already
min_distance_verbs = verbs["EnglishWord"].values

#### Write nouns to csv file

In [520]:
import csv
with open("nouns.csv", "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in min_distance_nouns:
        writer.writerow([val])