In [1]:
import numpy as np
import tensorflow as tf


In [50]:
ALPHABET = "abcdefghijklmnopqrstuvwxyz" # list of character
MAX_WORD_LEN = 15 # longest word our network can parse
ONE_HOT_SIZE = len(ALPHABET) * MAX_WORD_LEN # how long our one hot array should be

# get position of a char in alphabet
def getPos(char):
    if len(char) > 1:
        raise Exception("char should be a single character, length was: {}".format(len(char)))
    char = char.lower()
    if char not in ALPHABET:
        raise Exception("char should be in the alphabet, char was: {}".format(char))
    return ALPHABET.index(char)

# create a one hot array for value, length of length
def oneHot(val, length):
    arr = [0]*length
    arr[val] = 1
    return arr

# get one hot array for single char
def charOneHot(char):
    val = getPos(char)
    return oneHot(val, len(ALPHABET))

# get one hot arrays for each char in a string
def oneHotString(string):
    strLen = len(string)
    result = [0]*strLen
    for i in range(0, strLen):
        result[i] = charOneHot(string[i])
        
    return result

# transform a word to a one hot array 
def wordToInput(word):
    if len(word) > MAX_WORD_LEN:
        raise Exception("Word is too long, must be < 15 char, actual: {}".format(len(word)))
    result = [0]*ONE_HOT_SIZE
    index = 0
    for char in word:
        result[getPos(char) + index] = 1
        index = index + len(ALPHABET)
    return result
    
# great, now we can turn words into one hot arrays
# now we have to get and label training data

# functions for data wrangling

# returns true if word only contains letters in alphabet
def isAlpha(word):
    word = word.lower()
    for char in word:
        if char not in ALPHABET:
            return False
        
    return True
    

False

In [93]:
LANGUAGES = ["English", "German"]
MIN_WORD_LEN = 3

# dictionary source: https://github.com/hermitdave/FrequencyWords

# data wrangling time, things we have to do:
# remove words shorter than 3(not required but it'll help) 
# remove words containing words outside of the english alphabet (this means no accents)
# remove words longer than 15 characters (or whatever we decide our maximum to be)
# the dataset I'm using also has frequency for each word, we don't need that

english = open("english_dict.txt", "r", encoding = "utf-8")

english_lines = english.readlines()
english_words = []
for line in english_lines:
    split = line.split(" ")
    word = split[0]
    if len(word) > MIN_WORD_LEN and len(word) < MAX_WORD_LEN and isAlpha(word):
        english_words.append(word)
        
        
german = open("german_dict.txt", "r", encoding = "utf-8")

german_lines = german.readlines()
german_words = []
for line in german_lines:
    split = line.split(" ")
    word = split[0]
    if len(word) > MIN_WORD_LEN and len(word) < MAX_WORD_LEN and isAlpha(word):
        german_words.append(word)
        

print("English words: {}".format(len(english_words)))
print("German words: {}".format(len(german_words)))

# next steps: 
# balancing dataset, should have equal samples from each
# machine learning time

English words: 44295
German words: 39692
