# Importing our wordlists

## Dictcc

#### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

#### Print out the first 20 lines of the dictionary

In [2]:
!head -n 20 de-en.txt

# DE-EN vocabulary database	compiled by dict.cc
# Date and time	2016-08-29 23:46
# License	THIS WORK IS PROTECTED BY INTERNATIONAL COPYRIGHT LAWS!
# License	Private use is allowed as long as the data, or parts of it, are not published or given away.
# License	By using this file, you agree to be bound to the Terms of Use published at the following URL:  
# License	http://www.dict.cc/translation_file_request.php
# Contains data from	http://dict.tu-chemnitz.de/ with friendly permission by Frank Richter, TU Chemnitz 
# Brought to you by	Paul Hemetsberger and the users of http://www.dict.cc/, 2002 - 2016

&#945;-Keratin {n}	&#945;-keratin	noun
&#945;-Lactalbumin {n} <&#945;-La>	&#945;-lactalbumin <&#945;-La>	noun
&#946;-Mercaptoethanol {n}	&#946;-mercaptoethanol	noun
&#963;-Algebra {f}	&#963;-field	noun
&#963;-Algebra {f}	sigma algebra	noun
& Co.	and company <& Co.>	
'Die' heißt mein Unterrock, und 'der' hängt im Schrank. [regional] [Satz, mit dem Kinder gerügt werden, die vo

#### Insert csv header after licensing information

In [3]:
!sed "9 a GermanWord\tEnglishWord\tWordType" "de-en.txt" > "dictionary-as-csv-file.txt"

#### Use pandas library to import csv file

In [651]:
import pandas as pd


dictcc_df = pd.read_csv("dictionary-as-csv-file.txt", sep='\t', header=8)

#### display some of the contents of the dictcc dataframe (dictcc_df)

In [5]:
dictcc_df

Unnamed: 0,GermanWord,EnglishWord,WordType
0,&#945;-Keratin {n},&#945;-keratin,noun
1,&#945;-Lactalbumin {n} <&#945;-La>,&#945;-lactalbumin <&#945;-La>,noun
2,&#946;-Mercaptoethanol {n},&#946;-mercaptoethanol,noun
3,&#963;-Algebra {f},&#963;-field,noun
4,&#963;-Algebra {f},sigma algebra,noun
5,& Co.,and company <& Co.>,
6,"'Die' heißt mein Unterrock, und 'der' hängt im...",'She' is the cat's mother. [used to encourage ...,
7,'n Abend allerseits! [ugs.],Evening all! [coll.],
8,'nauf [regional] [hinauf],up,adv
9,'Nduja {f} [auch: Nduja],'nduja [also: nduja],noun


#### Convert WordType Column to a pandas.Categorical

In [652]:
word_types = dictcc_df["WordType"].astype('category')
dictcc_df["WordType"] = word_types
# show data types of each column in the dataframe
dictcc_df.dtypes

GermanWord       object
EnglishWord      object
WordType       category
dtype: object

#### List the current distribution of word types in dictcc dataframe

In [653]:
dictcc_df["WordType"].value_counts()

noun                  759619
verb                  126806
adj                    94507
adv                    26277
adj past-p             12519
adj pres-p              4907
past-p                  2712
adj adv                 1687
prep                     976
pron                     500
conj                     350
pres-p                   308
prefix                   177
past-p adj                75
suffix                    67
pres-p adj                43
adv adj                   30
adv prep                  27
adj pron                  27
adj.                      16
adv conj                  11
adv noun                  11
adj suffix                10
adj archaic:adv           10
prep conj                 10
noun adv                   6
adv pron                   5
adj coll:adv               5
adv past-p                 4
adj noun                   4
                       ...  
adj archaic:past-p         2
adj attr.                  2
adj adv past-p             2
[none]        

## Moby

#### Download the corpus from http://icon.shef.ac.uk/Moby/mpos.html

In [774]:
# the readme file gives some information on how to parse the file

result = []
# replace all '\r' with newlines then change encoding to UTF8
moby_words = !cat nltk/corpora/moby/mpos/mobyposi.i | tr -s '\r' '\n' | iconv --from-code=ISO88591 --to-code=UTF8
result.extend(moby_words)

moby_df = pd.DataFrame(data = result, columns = ['Words'])
nouns = moby_df[moby_df["Words"].str.contains('×N$')].copy()
# TODO: remove trailing ×N
nouns["WordType"] = "noun"
verbs = moby_df[moby_df["Words"].str.contains('×V$')].copy()
verbs["WordType"] = "verb"
adjectives = moby_df[moby_df["Words"].str.contains('×A$')].copy()
adjectives["WordType"] = "adj"
# combine nouns, verbs, adjectives
# the results here are quite messy

## Combine wordlists

## Filter the results that we want

#### We want results that are less than 'x' letters long (x+3 for verbs since they are in their infinitive form in the dictcc wordlist)

In [686]:
lt_x_letters = (dictcc_df["EnglishWord"].str.len() < 9) |\
               ((dictcc_df["EnglishWord"].str.contains('^to\s\w+\s')) &\
                (dictcc_df["EnglishWord"].str.len() < 11)\
               )
dictcc_df_filtered = dictcc_df[lt_x_letters]
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,123504,123504,121233
unique,90439,49804,43
top,knapp,boom,noun
freq,26,35,74948


#### We want to remove all duplicates

In [687]:
dictcc_df_filtered = dictcc_df_filtered.drop_duplicates("EnglishWord")
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,49804,49804,48362
unique,42726,49804,31
top,angemessen,famine,noun
freq,13,1,32169


#### We want to remove words that are difficult to spell (long vowel repitions)

In [703]:
long_vowel_rep_pattern = r'\b\w+[aeiou][aeiou][aeiou]\w+\b'
is_long_vowel_rep = dictcc_df_filtered["EnglishWord"].str.contains(long_vowel_rep_pattern, na=False)
dictcc_df_filtered = dictcc_df_filtered[~is_long_vowel_rep]
dictcc_df_filtered.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,49487,49487,48051
unique,42493,49487,31
top,angemessen,famine,noun
freq,13,1,32009


#### We want to remove all names and animals

TODO:

#### We want to avoid similar words situated on neighbouring geo-coordinate boundaries

- Nouns like "cobra" and "domra" should not be located at Geo-Coordinate "55°x11°" and "55°x12°"
- TODO: the spread_words() method doesn't actually solve this problem. We will need to update it by calculating the distance to all its adjacent neighbours

In [676]:
# `pip install python-levenshtein`
# used to calculate the Levenshtein distance between words
import Levenshtein as lev

In [697]:
# Maximize the Levenshtein distance between neighbouring words
def spread_words(dataframe_values, min_distance = 25, min_lev = 5):
    words = []
    words.extend(dataframe_values)
    short_distances = 0
    for i in range(len(words)-1):
        next = i + 1
        if lev.distance(words[i],words[next]) < min_lev:
            short_distances = short_distances + 1
            words.append(words[next])
            words.remove(words[next])
    # The value for min_distance was derived
    # by simple trial and error
    if short_distances < min_distance:
        # The remaining words with short distance 
        # will have to be sorted out by hand.
        return words
    else:
        # Recurse until we minimize short distances
        # as much as possible.
        return spread_words(words)

In [698]:
# Insert distance of neighbour
def insert_neighbour_distance(words):
    result = []
    word_with_neighbour_distance = ()
    for i in range(len(words)-1):
        next = i + 1
        lev_distance = lev.distance(words[i],words[next])
        word_with_neighbour_distance = words[i], lev_distance
        result.append(word_with_neighbour_distance)
    return pd.DataFrame(data = result, columns=['Words', 'NeighbourDistance'])

In [562]:
min_distance_nouns = spread_words(nouns["EnglishWord"].values)
nouns_ready_for_export = insert_neighbour_distance(min_distance_nouns)
nouns_ready_for_export[:10]

Unnamed: 0,Words,NeighbourDistance
0,cubicle,7
1,cachexia,7
2,shield,6
3,tatbebs,7
4,flush,7
5,ectopia,6
6,eversion,8
7,abrachia,5
8,anotia,6
9,query,6


In [702]:
min_distance_adjectives = spread_words(adjectives["EnglishWord"].values,50)
adjectives_ready_for_export = insert_neighbour_distance(min_distance_adjectives)
adjectives_ready_for_export[:10]
len(adjectives_ready_for_export)

7712

In [706]:
# TODO: spread_words() on verbs doesn't seem to work that well
verbs_ready_for_export = insert_neighbour_distance(verbs["EnglishWord"].values)
verbs_ready_for_export[:10]
len(verbs_ready_for_export)

2659

#### Show the distribution of word types after filtering

In [704]:
dictcc_df_filtered["WordType"].value_counts()

noun                  32009
adj                    7713
verb                   2660
adj past-p             2172
adv                    1575
past-p                  723
adj pres-p              683
adj adv                 163
prefix                   76
prep                     70
pron                     58
suffix                   41
pres-p                   34
conj                     30
past-p adj               11
adj pron                  7
adj suffix                5
pres-p adj                3
adv conj                  3
adv prep                  2
adj archaic:adv           2
adv adj                   2
[none]                    1
adv pron                  1
adv prep conj             1
adv dated:adj             1
adv suffix                1
pres-p archaic:adj        1
adj attr.                 1
adj archaic:past-p        1
                      ...  
adj coll:adv              0
adj coll.:adv             0
pron adj                  0
adj adv prep conj         0
adj adv pres-p      

### Export our filtered word lists to csv files

In [707]:
nouns_ready_for_export.to_csv("nouns.csv", index=False)
adjectives_ready_for_export.to_csv("adjectives.csv", index=False)
verbs_ready_for_export.to_csv("verbs.csv", index=False)

# Test pairings

In [787]:
print(nouns_ready_for_export.sample()['Words'].values[0] + ' ' +\
      verbs_ready_for_export.sample()['Words'].values[0] + ' ' +\
      adjectives_ready_for_export.sample()['Words'].values[0] + ' ' +\
      nouns_ready_for_export.sample()['Words'].values[0])

creation to pity sweaty dustbowl
