In [4]:
import numpy as np
import tensorflow as tf


In [159]:
ALPHABET = "abcdefghijklmnopqrstuvwxyz" # list of character
MAX_WORD_LEN = 15 # longest word our network can parse
ONE_HOT_SIZE = len(ALPHABET) * MAX_WORD_LEN # how long our one hot array should be

# get position of a char in alphabet
def getPos(char):
    if len(char) > 1:
        raise Exception("char should be a single character, length was: {}".format(len(char)))
    char = char.lower()
    if char not in ALPHABET:
        raise Exception("char should be in the alphabet, char was: {}".format(char))
    return ALPHABET.index(char)

# create a one hot array for value, length of length
def oneHot(val, length):
    arr = [0]*length
    arr[val] = 1
    return arr

# get one hot array for single char
def charOneHot(char):
    val = getPos(char)
    return oneHot(val, len(ALPHABET))

# get one hot arrays for each char in a string
def oneHotString(string):
    strLen = len(string)
    result = [0]*strLen
    for i in range(0, strLen):
        result[i] = charOneHot(string[i])
        
    return result

# transform a word to a one hot array 
def wordToInput(word):
    if len(word) > MAX_WORD_LEN:
        raise Exception("Word is too long, must be < 15 char, actual: {}".format(len(word)))
    result = [0]*ONE_HOT_SIZE
    index = 0
    for char in word:
        result[getPos(char) + index] = 1
        index = index + len(ALPHABET)
    return result
    
# great, now we can turn words into one hot arrays
# now we have to get and label training data

# functions for data wrangling

# returns true if word only contains letters in alphabet
def isAlpha(word):
    word = word.lower()
    for char in word:
        if char not in ALPHABET:
            return False
        
    return True
    

In [160]:
LANGUAGES = ["English", "German"]
MIN_WORD_LEN = 3

# dictionary source: https://github.com/hermitdave/FrequencyWords

# data wrangling time, things we have to do:
# remove words shorter than 3(not required but it'll help) 
# remove words containing words outside of the english alphabet (this means no accents)
# remove words longer than 15 characters (or whatever we decide our maximum to be)
# the dataset I'm using also has frequency for each word, we don't need that

english = open("english_dict.txt", "r", encoding = "utf-8")

english_lines = english.readlines()
english_words = []
for line in english_lines:
    split = line.split(" ")
    word = split[0]
    if len(word) > MIN_WORD_LEN and len(word) < MAX_WORD_LEN and isAlpha(word):
        english_words.append(word)
        
        
german = open("german_dict.txt", "r", encoding = "utf-8")

german_lines = german.readlines()
german_words = []
for line in german_lines:
    split = line.split(" ")
    word = split[0]
    if len(word) > MIN_WORD_LEN and len(word) < MAX_WORD_LEN and isAlpha(word):
        german_words.append(word)
        
print("Word counts before slice:")
print("English words: {}".format(len(english_words)))
print("German words: {} \n".format(len(german_words)))


#balancing
SAMPLES = 38000 # number of words for each language, should be less than smallest dictionary

english_words = english_words[0:SAMPLES]
german_words = german_words[0:SAMPLES]

print("Word counts after slice:")
print("English words: {}".format(len(english_words)))
print("German words: {}".format(len(german_words)))


Word counts before slice:
English words: 44295
German words: 39692 

Word counts after slice:
English words: 38000
German words: 38000


In [198]:
# time to label data
from sklearn.utils import shuffle


# oops I gotta actually use the one hot stuff
X_train = []
for word in english_words:
    X_train.append(wordToInput(word))

for word in german_words:
    X_train.append(wordToInput(word))

X_train = np.array(X_train)

y_train = []
y_train.extend([0]*SAMPLES)
y_train.extend([1]*SAMPLES)
y_train = tf.keras.utils.to_categorical(y_train)

y_train = np.array(y_train)

X_train, y_train = shuffle(X_train, y_train)
# data labeled in np array


In [203]:
import pickle

pickle_out = open("X.pickle", "wb")
pickle.dump(X_train, pickle_out)
pickle_out.close()

pickle_out = open("y.pickle", "wb")
pickle.dump(y_train, pickle_out)
pickle_out.close()

In [204]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D

model = Sequential()
model.add(tf.keras.layers.Dense(ONE_HOT_SIZE))
model.add(Activation("relu"))
model.add(tf.keras.layers.Dense(128))
model.add(Activation("relu"))
model.add(Dropout(0.25))
model.add(tf.keras.layers.Dense(128))
model.add(Activation("relu"))
model.add(Dropout(0.25))
model.add(tf.keras.layers.Dense(2)) # output layer
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'])

model.fit(X_train, y_train, batch_size = 32, validation_split = 0.1, epochs = 10)


Train on 68400 samples, validate on 7600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29c0775d160>

In [205]:
model.save("English-German-v2.model")

In [206]:
model = tf.keras.models.load_model("English-German-v2.model")



In [224]:
word = "hello"

word_vec = np.array(wordToInput(word)).reshape(-1, ONE_HOT_SIZE)
LANGUAGES[np.argmax(model.predict(word_vec))]
print("confidence: {}%")



'German'