# Importing our wordlists

Here we import all of our wordlists and add them to an array which me can merge at the end. 

This wordlists should not be filtered at this point. However they should all contain the same columns to make merging easier for later.

In [175]:
wordlists = []

## Dictcc

#### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

#### Print out the first 20 lines of the dictionary

In [176]:
!head -n 20 de-en.txt

# DE-EN vocabulary database	compiled by dict.cc
# Date and time	2016-08-29 23:46
# License	THIS WORK IS PROTECTED BY INTERNATIONAL COPYRIGHT LAWS!
# License	Private use is allowed as long as the data, or parts of it, are not published or given away.
# License	By using this file, you agree to be bound to the Terms of Use published at the following URL:  
# License	http://www.dict.cc/translation_file_request.php
# Contains data from	http://dict.tu-chemnitz.de/ with friendly permission by Frank Richter, TU Chemnitz 
# Brought to you by	Paul Hemetsberger and the users of http://www.dict.cc/, 2002 - 2016

&#945;-Keratin {n}	&#945;-keratin	noun
&#945;-Lactalbumin {n} <&#945;-La>	&#945;-lactalbumin <&#945;-La>	noun
&#946;-Mercaptoethanol {n}	&#946;-mercaptoethanol	noun
&#963;-Algebra {f}	&#963;-field	noun
&#963;-Algebra {f}	sigma algebra	noun
& Co.	and company <& Co.>	
'Die' heißt mein Unterrock, und 'der' hängt im Schrank. [regional] [Satz, mit dem Kinder gerügt werden, die vo

#### Use pandas library to import csv file

In [177]:
import pandas as pd


dictcc_df = pd.read_csv("de-en.txt", 
                        sep='\t',
                        skiprows=8,
                        header=None, 
                        names=["GermanWord","Word","WordType"])

#### Preview a few entries of the wordlist

In [178]:
dictcc_df[90:100]

Unnamed: 0,GermanWord,Word,WordType
90,(aktiv) Werbung machen für,to tout,verb
91,(aktive) Langzeitverbindung {f} [Standverbindu...,nailed-up connection <NUC>,noun
92,(aktuelles) Zeitgeschehen {n},current events {pl},noun
93,(akustisch) verstehen,to hear,verb
94,(akustische) Haarzelle {f},auditory cell,noun
95,(akustischer) Dissipationsgrad {m},(acoustic) dissipation factor,noun
96,(akute) Rückenmuskelnekrose {f},(acute) back muscle necrosis,noun
97,(akuter) Hörsturz {m},acute hearing loss,noun
98,(akuter) Myokardinfarkt {m} <AMI / MI>,(acute) myocardial infarction <AMI / MI>,noun
99,(akutes) Lungenversagen {n},acute respiratory distress syndrome <ARDS>,noun


#### We only need "Word" and "WordType" column

In [179]:
dictcc_df = dictcc_df[["Word", "WordType"]][:].copy()

#### Convert WordType Column to a pandas.Categorical

In [180]:
word_types = dictcc_df["WordType"].astype('category')
dictcc_df["WordType"] = word_types
# show data types of each column in the dataframe
dictcc_df.dtypes

Word          object
WordType    category
dtype: object

#### List the current distribution of word types in dictcc dataframe

In [181]:
dictcc_df["WordType"].value_counts().head()

noun          759619
verb          126806
adj            94507
adv            26277
adj past-p     12519
Name: WordType, dtype: int64

#### Add dictcc corpus to our wordlists array

In [182]:
wordlists.append(dictcc_df)

## Moby

#### Download the corpus from http://icon.shef.ac.uk/Moby/mpos.html

#### Perform some basic cleanup on the wordlist

In [183]:
# the readme file in `nltk/corpora/moby/mpos` gives some information on how to parse the file

result = []
# replace all DOS line endings '\r' with newlines then change encoding to UTF8
moby_words = !cat nltk/corpora/moby/mpos/mobyposi.i | tr -s '\r' '\n' | iconv --from-code=ISO88591 --to-code=UTF8
result.extend(moby_words)
moby_df = pd.DataFrame(data = result, columns = ['Word'])

- sort out the nouns, verbs and adjectives

In [184]:
# Matches nouns
nouns = moby_df[moby_df["Word"].str.contains('×[Np]$')].copy()
nouns["WordType"] = "noun"
# Matches verbs
verbs = moby_df[moby_df["Word"].str.contains('×[Vti]$')].copy()
verbs["WordType"] = "verb"
# Magtches adjectives
adjectives = moby_df[moby_df["Word"].str.contains('×A$')].copy()
adjectives["WordType"] = "adj"

- remove the trailing stuff and concatenate the nouns, verbs and adjectives

In [185]:
nouns["Word"] = nouns["Word"].str.replace(r'×N$','')
verbs["Word"] = verbs["Word"].str.replace(r'×[Vti]$','')
adjectives["Word"] = adjectives["Word"].str.replace(r'×A$','')
# Merge nouns, verbs and adjectives into one dataframe
moby_df = pd.concat([nouns,verbs,adjectives])

#### Add moby corpus to wordlists array

In [186]:
wordlists.append(moby_df)

## Combine all wordlists

In [241]:
wordlist = pd.concat(wordlists)
wordlist.describe()

Unnamed: 0,Word,WordType
count,1277915,1203402
unique,923477,62
top,depression,noun
freq,36,867851


# Filter for results that we want

- We want to remove words that contain non word characters (whitespace, hypens, etc.)

In [242]:
# we choose [a-z] here and not [A-Za-z] because we do _not_
# want to match words starting with uppercase characters.
word_chars = r'^[a-z]+$'
is_word_chars = wordlist["Word"].str.contains(word_chars, na=False)
wordlist_filtered = wordlist[is_word_chars]
wordlist_filtered.describe()

Unnamed: 0,Word,WordType
count,326484,324632
unique,166915,40
top,depression,noun
freq,36,188966


-  We want results that are less than 'x' letters long (x+3 for verbs since they are in their infinitive form in the dictcc wordlist)

In [243]:
lt_x_letters = (wordlist_filtered["Word"].str.len() < 9) |\
               ((wordlist_filtered["Word"].str.contains('^to\s\w+\s')) &\
                (wordlist_filtered["Word"].str.len() < 11)\
               )
wordlist_filtered = wordlist_filtered[lt_x_letters]
wordlist_filtered.describe()

Unnamed: 0,Word,WordType
count,139647,138790
unique,59722,38
top,boom,noun
freq,35,88092


- We want to remove all duplicates

In [245]:
wordlist_filtered = wordlist_filtered.drop_duplicates("Word")
wordlist_filtered.describe()

Unnamed: 0,Word,WordType
count,59722,59244
unique,59722,26
top,unstably,noun
freq,1,37176


- We want to remove words that are difficult to spell

TODO:

In [255]:
# Words with uncommon vowel duplicates (examples: ["piing", "reeject"])

- We want to remove all names and animals

TODO:


- We want to use nltk to perform some more advanced filtering

TODO:


## Maximize distance between neighbouring words

- Nouns like "cobra" and "domra" should not be located at Geo-Coordinate "55°x11°" and "55°x12°"
- TODO: the spread_words() method doesn't actually solve this problem. We will need to update it by calculating the distance to **all 8** of its adjacent neighbours

In [246]:
# `pip install python-levenshtein`
# used to calculate the Levenshtein distance between words
import Levenshtein as lev

# Maximize the Levenshtein distance between neighbouring words
def spread_words(dataframe_values, min_distance = 25, min_lev = 5):
    words = []
    words.extend(dataframe_values)
    short_distances = 0
    for i in range(len(words)-1):
        next = i + 1
        if lev.distance(words[i],words[next]) < min_lev:
            short_distances = short_distances + 1
            words.append(words[next])
            words.remove(words[next])
    # The value for min_distance was derived
    # by simple trial and error
    if short_distances < min_distance:
        # The remaining words with short distance 
        # will have to be sorted out by hand.
        return words
    else:
        # Recurse until we minimize short distances
        # as much as possible.
        return spread_words(words)

# Insert distance of neighbour
def insert_neighbour_distance(words):
    result = []
    word_with_neighbour_distance = ()
    for i in range(len(words)-1):
        next = i + 1
        lev_distance = lev.distance(words[i],words[next])
        word_with_neighbour_distance = words[i], lev_distance
        result.append(word_with_neighbour_distance)
    return pd.DataFrame(data = result, columns=['Words', 'NeighbourDistance'])

### Spread nouns

In [252]:
nouns = wordlist_filtered[wordlist_filtered["WordType"] == "noun"]
# randomize for better performance
nouns = nouns.sample(len(nouns))
min_distance_nouns = spread_words(nouns["Word"].values,50,3)
nouns_ready_for_export = insert_neighbour_distance(min_distance_nouns)
nouns_ready_for_export[:10]

Unnamed: 0,Words,NeighbourDistance
0,excud,5
1,clast,7
2,manpages,6
3,hype,7
4,torments,5
5,grimes,6
6,ending,6
7,logs,5
8,furore,5
9,yerba,7


### Spread adjectives

In [253]:
adjectives = wordlist_filtered[wordlist_filtered["WordType"] == "adj"]
# randomize for better performance
adjectives = adjectives.sample(len(adjectives))
min_distance_adjectives = spread_words(nouns["Word"].values,50,3)
adjectives_ready_for_export = insert_neighbour_distance(min_distance_adjectives)
adjectives_ready_for_export[:10]

Unnamed: 0,Words,NeighbourDistance
0,excud,5
1,clast,7
2,manpages,6
3,hype,7
4,torments,5
5,grimes,6
6,ending,6
7,logs,5
8,furore,5
9,yerba,7


### Spread verbs

In [254]:
# use "adj past-p" as verbs conjugated in the past tense until 
# we use nltk to properly conjugate all verbs in our wordlist
verbs_init = wordlist_filtered[((wordlist_filtered["WordType"] == "adj past-p") | (wordlist_filtered["WordType"] == "verb"))]
verbs = verbs_init.sample(n=len(verbs_init))
min_distance_verbs = spread_words(verbs["Word"].values,50,3)
verbs_ready_for_export = insert_neighbour_distance(min_distance_verbs)
verbs_ready_for_export[:10]

Unnamed: 0,Words,NeighbourDistance
0,pupping,7
1,presaged,6
2,piing,8
3,outboast,8
4,replume,6
5,pupated,4
6,copped,6
7,reeject,5
8,hemmed,6
9,prebeset,7


#### Show the distribution of word types after filtering

In [258]:
wordlist_filtered["WordType"].value_counts().head()

noun          37176
adj           12177
verb           4913
adj past-p     2126
adv            1232
Name: WordType, dtype: int64

### Export our filtered word lists to csv files

In [259]:
nouns_ready_for_export.to_csv("nouns.csv", index=False)
adjectives_ready_for_export.to_csv("adjectives.csv", index=False)
verbs_ready_for_export.to_csv("verbs.csv", index=False)

# Test pairings

In [260]:
print(nouns_ready_for_export.sample()['Words'].values[0] + ' ' +\
      verbs_ready_for_export.sample()['Words'].values[0] + ' ' +\
      adjectives_ready_for_export.sample()['Words'].values[0] + ' ' +\
      nouns_ready_for_export.sample()['Words'].values[0])

obesity prearm autogyro hechsher
