## Use the dictionary from dict.cc to generate our word list

### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

### Print out the first 20 lines of the dictionary

In [4]:
# Peek at the first 20 lines of the raw dict.cc export to see how the file is laid out
# before parsing it.
!head -n 20 de-en.txt

# DE-EN vocabulary database	compiled by dict.cc
# Date and time	2016-08-29 23:46
# License	THIS WORK IS PROTECTED BY INTERNATIONAL COPYRIGHT LAWS!
# License	Private use is allowed as long as the data, or parts of it, are not published or given away.
# License	By using this file, you agree to be bound to the Terms of Use published at the following URL:  
# License	http://www.dict.cc/translation_file_request.php
# Contains data from	http://dict.tu-chemnitz.de/ with friendly permission by Frank Richter, TU Chemnitz 
# Brought to you by	Paul Hemetsberger and the users of http://www.dict.cc/, 2002 - 2016

&#945;-Keratin {n}	&#945;-keratin	noun
&#945;-Lactalbumin {n} <&#945;-La>	&#945;-lactalbumin <&#945;-La>	noun
&#946;-Mercaptoethanol {n}	&#946;-mercaptoethanol	noun
&#963;-Algebra {f}	&#963;-field	noun
&#963;-Algebra {f}	sigma algebra	noun
& Co.	and company <& Co.>	
'Die' heißt mein Unterrock, und 'der' hängt im Schrank. [regional] [Satz, mit dem Kinder gerügt werden, die vo

### Insert a CSV header row after the licensing information

In [6]:
# Append a header row (GermanWord, EnglishWord, WordType, separated by \t) after line 9 of
# de-en.txt and write the result to dictionary-as-csv-file.txt; de-en.txt itself is not
# modified because sed is run without -i and its output is redirected to the new file.
# NOTE(review): the one-line `9 a text` form and the expansion of \t into a tab character
# look like GNU sed behaviour - confirm before running this with BSD/macOS sed.
# NOTE(review): the hard-coded 9 presumably matches the licence preamble plus the blank line
# seen in the `head` output - confirm against any newer download of the file.
!sed "9 a GermanWord\tEnglishWord\tWordType" "de-en.txt" > "dictionary-as-csv-file.txt"

### Use the pandas library to import the CSV file

In [29]:
import csv

import pandas as pd

# Load the header-augmented dict.cc export into a DataFrame.
#   sep="\t"   - columns are separated by tab characters.
#   header=8   - the eight "#" preamble lines occupy rows 0-7; pandas skips blank lines by
#                default, so the header row inserted by sed is row 8.
#   quoting=csv.QUOTE_NONE
#              - the export does not appear to use CSV quoting, so a literal '"' inside an
#                entry must be kept as ordinary text. With the default quoting, a field
#                that starts with '"' opens a quoted field that can swallow tabs and
#                newlines and silently merge several dictionary entries into one cell.
dictcc_df = pd.read_csv(
    "dictionary-as-csv-file.txt",
    sep="\t",
    header=8,
    quoting=csv.QUOTE_NONE,
)

### Display some of the contents of the dictcc dataframe (`dictcc_df`)

In [31]:
# Show only the first ten rows: the frame is large, and a bare `dictcc_df` asks the notebook
# to render the whole thing when a small sample is all that is needed here.
dictcc_df.head(10)

Unnamed: 0,GermanWord,EnglishWord,WordType
0,&#945;-Keratin {n},&#945;-keratin,noun
1,&#945;-Lactalbumin {n} <&#945;-La>,&#945;-lactalbumin <&#945;-La>,noun
2,&#946;-Mercaptoethanol {n},&#946;-mercaptoethanol,noun
3,&#963;-Algebra {f},&#963;-field,noun
4,&#963;-Algebra {f},sigma algebra,noun
5,& Co.,and company <& Co.>,
6,"'Die' heißt mein Unterrock, und 'der' hängt im...",'She' is the cat's mother. [used to encourage ...,
7,'n Abend allerseits! [ugs.],Evening all! [coll.],
8,'nauf [regional] [hinauf],up,adv
9,'Nduja {f} [auch: Nduja],'nduja [also: nduja],noun


### Convert WordType Column to a pandas.Categorical

In [37]:
# Re-encode the word-type labels as a pandas Categorical (unordered, categories inferred
# from the values present) rather than plain Python strings.
word_types = dictcc_df["WordType"].astype(pd.CategoricalDtype())
dictcc_df["WordType"] = word_types
# Last expression of the cell: list each column's dtype to confirm the conversion took effect.
dictcc_df.dtypes

GermanWord       object
EnglishWord      object
WordType       category
dtype: object

### Perform some basic sanity checking on the data

#### What is the distribution of word types in our dataframe?

In [43]:
# Tally the rows per word-type label, most frequent first. Rows whose WordType is missing
# are left out of the tally (value_counts drops NaN by default).
dictcc_df.WordType.value_counts()

noun                  759619
verb                  126806
adj                    94507
adv                    26277
adj past-p             12519
adj pres-p              4907
past-p                  2712
adj adv                 1687
prep                     976
pron                     500
conj                     350
pres-p                   308
prefix                   177
past-p adj                75
suffix                    67
pres-p adj                43
adv adj                   30
adv prep                  27
adj pron                  27
adj.                      16
adv conj                  11
adv noun                  11
adj suffix                10
adj archaic:adv           10
prep conj                 10
noun adv                   6
adv pron                   5
adj coll:adv               5
adv past-p                 4
adj noun                   4
                       ...  
adj archaic:past-p         2
adj attr.                  2
adj adv past-p             2
[none]        

#### How many nouns do we have that contain a single word?

In [67]:
# Keep only the rows whose word type is exactly "noun" (combined tags such as
# "adj noun" are not included).
is_noun = dictcc_df["WordType"].eq("noun")
nouns = dictcc_df.loc[is_noun]
# A "single word" is an English entry made up solely of \w characters from start to end:
# no spaces, hyphens, brackets or other punctuation.
single_word_pattern = r'^\w+$' # regex
# na=False treats a missing English entry as "not a single word".
is_single_word = nouns["EnglishWord"].str.contains(single_word_pattern, na=False)
# Noun rows whose English side is a single word (nothing is printed by this cell).
single_word_nouns = nouns.loc[is_single_word]

In [73]:
# Per-column summary of the single-word nouns: row count, number of unique values,
# most common value ("top") and how often it occurs ("freq").
single_word_nouns.describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,138733,138733,138733
unique,101845,67564,1
top,Verwirrung {f},depression,noun
freq,30,35,138733


- It looks like we have 67564 unique single-word English nouns in the dictionary.

#### How many nouns do we have that are fewer than 'x' letters long (here x = 9)?

In [99]:
# Exclusive upper bound on the English word length - the 'x' from the heading above.
# Change this one value to explore a different cut-off.
max_letters = 9
# True for rows whose English word has fewer than `max_letters` characters.
lt_x_letters = single_word_nouns["EnglishWord"].str.len() < max_letters
# Summarise the short nouns; the "unique" entry for EnglishWord is the usable word count.
single_word_nouns[lt_x_letters].describe()

Unnamed: 0,GermanWord,EnglishWord,WordType
count,68793,68793,68793
unique,53525,29095,1
top,Schwindel {m},boom,noun
freq,17,35,68793


- We only have 29095 unique single-word English nouns shorter than 9 letters to work with here.