In [1]:
using Flux, LinearAlgebra, Random

# Preprocessing

In [2]:
#collect all the characters appear in the dataset
chars = []
#iterator
for (root, dirs, files) in walkdir(String(@__DIR__)*"/data/names")
    for file in files
        names = readlines(joinpath(root, file))
        for name in names
            chars = [chars;collect(lowercase(name))]
        end
        chars = [chars;collect(String(read(joinpath(root, file))))]
    end
end

In [3]:
alphabet = [unique(chars)..., '_']; #unique characters form the alphabet

In [4]:
#get all the categories from the name of text files
categories = readdir(String(@__DIR__)*"/data/names")
for i=1:length(categories)
    category = categories[i][1:end-4]
    categories[i] = category
end

In [5]:
#create a dictionary with category names as keys and names in array format as values
category_lines = Dict()
for category in categories
    category_lines[category] = lowercase.(readlines(joinpath(String(@__DIR__)*"/data/names", category*".txt")))
end

In [6]:
#transform dataset into pairs with onehot encoded inputs and outputs
data = []
for category in categories
    names = category_lines[category]
    for name in names
        (x,y) = (Flux.onehotbatch(name,alphabet),Flux.onehot(category,categories))
        push!(data,(x,y))
    end
end
trainset = shuffle(data)[1:15000];
testset = shuffle(data)[15001:end];

# Build RNN

In [7]:
struct RNNmodel
    Wxh
    Wo
    bxh
    bo
end

RNNmodel(in::Integer, hidden::Integer, out::Integer) =
  RNNmodel(param(randn(hidden, in+hidden)./100), param(randn(out, in+hidden)./100), 
    param(randn(hidden)./100), param(randn(out)./100))

function (rnn::RNNmodel)(x,h)
    h_affine = rnn.Wxh * vcat(x,h) .+ rnn.bxh
    o_affine = rnn.Wo * vcat(x,h) .+ rnn.bo
    return (tanh.(h_affine), softmax(o_affine))
end

In [8]:
r_model = RNNmodel(90,128,18)
r_model(rand(90),rand(128))[2] #example output

Tracked 18-element Array{Float64,1}:
 0.05933923887513576 
 0.055724892966194275
 0.04914517767388216 
 0.05749802498883375 
 0.05156896960664484 
 0.054879248122913823
 0.04653966449042874 
 0.05478175184326987 
 0.05273678456669417 
 0.061118863263577704
 0.05811884706231737 
 0.0565279832219866  
 0.06679124091890422 
 0.05888569604275681 
 0.06256484007959731 
 0.044151697896476105
 0.052431238695839086
 0.057195839684547285

In [9]:
loss(y_est,y_real) = Flux.crossentropy(y_est,y_real) #loss function
opt = ADAM(0.001) #ADAM optimizer is defined
ps = (r_model.Wxh,r_model.Wo,r_model.bxh,r_model.bo); #model parameters

In [10]:
#online training
θ = Flux.Params([r_model.Wxh,r_model.Wo,r_model.bxh,r_model.bo])
for epoch=1:100
    instances = shuffle(collect(1:15000))[1:1000]
    #for (x,y) in dataset
    for i in instances
        (x,y) = trainset[i]
        global hidden = zeros(128)
        for s=1:size(x)[2]
            global out
            (hidden,out) = r_model(float(x[:,s]),hidden)
        end
        grads = Tracker.gradient(() -> loss(out,y), θ)
        for p in (r_model.Wxh,r_model.Wo,r_model.bxh,r_model.bo)
            Tracker.update!(opt, p, grads[p])
        end
    end
end

In [11]:
#accuracy in training set
count_true, count_false = 0.0, 0.0
for (x,y) in trainset
    global hidden = zeros(128)
    for s=1:size(x)[2]
        global out
        (hidden,out) = r_model(float(x[:,s]),hidden)
    end
    if argmax(out) == argmax(y)
        count_true += 1
    else
        count_false += 1
    end
end
println(count_true/(count_true+count_false))

0.7676666666666667


In [12]:
#accuracy in test set
count_true, count_false = 0.0, 0.0
for (x,y) in testset
    global hidden = zeros(128)
    for s=1:size(x)[2]
        global out
        (hidden,out) = r_model(float(x[:,s]),hidden)
    end
    if argmax(out) == argmax(y)
        count_true += 1
    else
        count_false += 1
    end
end
println(count_true/(count_true+count_false))

0.7658651951123374


In [13]:
function classify_name(name::String)
    name_hot = Flux.onehotbatch(lowercase(name),alphabet)
    global hidden = zeros(128)
    for s=1:size(name_hot)[2]
        global out
        (hidden,out) = r_model(float(name_hot[:,s]),hidden)
    end
    return categories[argmax(out)]
end

classify_name (generic function with 1 method)

In [36]:
println(classify_name("Lewandowski"))
println(classify_name("Müller"))
println(classify_name("Czech"))
println(classify_name("Harari"))
println(classify_name("De Boer"))
println(classify_name("Ziu"))
println(classify_name("Albertini"))
println(classify_name("Jackson"))
println(classify_name("Gonzalez"))

Russian
German
Czech
Arabic
German
Chinese
Italian
English
German
