# Chapter 12.5: Biblical Hebrew

## Preparation

Load necessary packages:

In [None]:
using CSV, DataFrames, JudiLing, RCall

Load the data:

In [None]:
# Read the verb data set from disk and materialise it as a DataFrame.
hebrew = CSV.File("../dat/hebrew_verbs.csv") |> DataFrame;

In [None]:
# Show the column names of the verb table.
names(hebrew)

In [None]:
# Show the dimensions (rows, columns) of the verb table.
size(hebrew)

In [None]:
# Peek at the first ten entries of the Word column.
hebrew.Word[1:10]

## Set up form and meaning matrices

In [None]:
# Form side: build the cue object from the Word column using 3-grams of the
# untokenized word forms. Its field C (the cue matrix) is used in the cells below.
@time cue_obj = JudiLing.make_cue_matrix(
    hebrew;
    grams=3,
    target_col="Word",
    tokenized=false,
);

In [None]:
# Meaning side: build the semantic matrix S from the lexeme column and the
# fifteen inflectional feature columns, with 1000 columns and seeded noise added.
@time S = JudiLing.make_S_matrix(
    hebrew,
    ["Lexeme"],
    [
        "Binyan", "Tense", "Person", "Number", "Gender",
        "Conjunction", "Article", "Preposition", "Status",
        "Suffix_Number", "Suffix_Person", "Suffix_Gender", "Suffix_Function",
        "Interrogative", "Mood",
    ];
    ncol=1000,
    add_noise=true,
    sd_noise=1.0,
    seed=217,
);

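Note: the exact behaviour of the random number generator can change between Julia versions, so the noise added to `S`, and hence the results below, may differ slightly across versions even with a fixed seed.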

## Endstate of Learning

### Calculate mappings using the endstate of learning

Comprehension:

In [None]:
# Comprehension mapping F: takes the cue matrix C to the semantic matrix S
# (it is applied as C * F in the next cell).
@time F = JudiLing.make_transform_matrix(cue_obj.C, S); 

In [None]:
# Predicted semantic matrix: the cue matrix multiplied by the comprehension mapping.
@time Shat = cue_obj.C * F;

Production:

In [None]:
# Production mapping G: takes the semantic matrix S to the cue matrix C
# (it is applied as S * G in the next cell).
@time G = JudiLing.make_transform_matrix(S, cue_obj.C);   

In [None]:
# Predicted cue matrix: the semantic matrix multiplied by the production mapping.
@time Chat = S * G;  

Evaluate:

In [None]:
# Evaluate comprehension: compare the predicted semantic matrix Shat with the target S.
# NOTE(review): presumably returns an accuracy score; confirm in the JudiLing docs.
@time JudiLing.eval_SC(Shat, S)

In [None]:
# Evaluate production: compare the predicted cue matrix Chat with the target C.
# NOTE(review): presumably returns an accuracy score; confirm in the JudiLing docs.
@time JudiLing.eval_SC(Chat, cue_obj.C)

Learn paths:

In [None]:
# Learn the production paths from the data, the cue object, S, the comprehension
# mapping F and the predicted cues Chat, with a threshold of 0.01 and without
# progress output.
@time prod_res = JudiLing.learn_paths(
    hebrew, cue_obj, S, F, Chat;
    threshold=0.01,
    verbose=false,
);

In [None]:
# Evaluate the learned paths in prod_res against the cue object.
# NOTE(review): presumably the production accuracy; confirm in the JudiLing docs.
JudiLing.eval_acc(prod_res, cue_obj)

### Accuracy by morphological feature

In [None]:
# Comprehension accuracy broken down by lexeme (base) and by each inflectional
# feature; the per-word results live in acc.dfr.
@time acc = JudiLing.accuracy_comprehension(
    S,
    Shat,
    hebrew;
    target_col=:Word,
    base=["Lexeme"],
    inflections=[
        "Binyan", "Tense", "Person", "Number", "Gender",
        "Conjunction", "Article", "Preposition", "Status",
        "Suffix_Number", "Suffix_Person", "Suffix_Gender", "Suffix_Function",
        "Interrogative", "Mood",
    ],
);

In [None]:
# Inspect the first ten rows of the per-word accuracy table.
first(acc.dfr, 10)

In [None]:
# Write the per-word accuracy table to disk; the R cell below reads this file back.
CSV.write("../res/acc_dfr_eol.csv", acc.dfr)

In [None]:
# R cell (via RCall): read the accuracy table written above, compute for every
# column from the fifth onward the share of "true" among the non-empty entries,
# and show the sorted shares as a lattice dotplot.
R"""
acc = read.csv("../res/acc_dfr_eol.csv", T)
f = function(v) sum(v=="true")/length(v[v!=""])
v = apply(acc[,5:ncol(acc)], 2, f)
library(lattice)
dotplot(rev(sort(v)), xlab="accuracy")
"""

## Exercise: Frequency-informed learning

### Calculating the mappings

In [None]:
# Frequency-informed comprehension mapping: same call as for the endstate of learning,
# with the Frequency column of the data passed as a third argument.
# This overwrites the F computed earlier in the notebook.
@time F = JudiLing.make_transform_matrix(cue_obj.C, S, hebrew.Frequency); 

In [None]:
# Predicted semantic matrix under the frequency-informed mapping (overwrites Shat).
@time Shat = cue_obj.C * F;

In [None]:
# Evaluate comprehension for the frequency-informed mapping: compare Shat with S.
@time JudiLing.eval_SC(Shat, S)

In [None]:
# Frequency-informed production mapping from S to C, again passing the Frequency
# column as a third argument. This overwrites the G computed earlier.
@time G = JudiLing.make_transform_matrix(S, cue_obj.C, hebrew.Frequency);   

In [None]:
# Predicted cue matrix under the frequency-informed mapping (overwrites Chat).
@time Chat = S * G;  

In [None]:
# Evaluate production for the frequency-informed mapping: compare Chat with C.
@time JudiLing.eval_SC(Chat, cue_obj.C)

In [None]:
# Learn the production paths again, now with the frequency-informed F and Chat
# that the preceding cells assigned (overwrites prod_res).
@time prod_res = JudiLing.learn_paths(
    hebrew, cue_obj, S, F, Chat;
    threshold=0.01,
    verbose=false,
);

In [None]:
# Evaluate the frequency-informed paths in prod_res against the cue object.
@time JudiLing.eval_acc(prod_res, cue_obj)

### Accuracy by morphological feature

In [None]:
# Comprehension accuracy by lexeme and inflectional feature for the frequency-informed
# mapping (Shat was recomputed above), then written to disk for the R cells below.
@time acc2 = JudiLing.accuracy_comprehension(
    S,
    Shat,
    hebrew;
    target_col=:Word,
    base=["Lexeme"],
    inflections=[
        "Binyan", "Tense", "Person", "Number", "Gender",
        "Conjunction", "Article", "Preposition", "Status",
        "Suffix_Number", "Suffix_Person", "Suffix_Gender", "Suffix_Function",
        "Interrogative", "Mood",
    ],
);
CSV.write("../res/acc2_dfr_fil.csv", acc2.dfr)

In [None]:
# R cell (via RCall): type-based accuracy per column for frequency-informed learning.
# Reads the table written by the previous cell, saves the accuracy vector to
# ../res/acc2_v.rda and shows it as a lattice dotplot.
R"""
acc2 = read.csv("../res/acc2_dfr_fil.csv", T)
# share of "true" among the non-empty entries of a column
f = function(v) sum(v=="true")/length(v[v!=""])
# Slice with acc2's own column count. The earlier version used ncol(acc) and so
# silently depended on an object left over from a previous R cell.
v = apply(acc2[,5:ncol(acc2)], 2, f)
library(lattice)
save(v, file="../res/acc2_v.rda")
dotplot(rev(sort(v)), xlab="accuracy")
"""

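The above analyses were type-based. In what follows, we carry out token-based evaluations of accuracy and make a joint dotplot that crosses weighting (tokens vs. types) with learning regime: endstate of learning (EOL) vs. frequency-informed learning (FIL).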

In [None]:
# R cell (via RCall): combine type-based and token-based accuracies for endstate of
# learning (EOL) and frequency-informed learning (FIL) into one R data frame, dfr3,
# which the following cells tabulate, save and plot.
R"""
dat = read.csv("../dat/hebrew_verbs.csv", header=TRUE, stringsAsFactors=FALSE)
# EOL type based
acc = read.csv("../res/acc_dfr_eol.csv", header=TRUE)
# share of "true" among the non-empty entries of a column
f = function(v) sum(v=="true")/length(v[v!=""])
v = apply(acc[,5:ncol(acc)], 2, f)
#
# FIL type-based
acc2 = read.csv("../res/acc2_dfr_fil.csv", header=TRUE)
v2 = apply(acc2[,5:ncol(acc2)], 2, f)
#
# combine type-based evaluations in one dataframe
dfr = data.frame(type=c(names(v), names(v2)),
                 value=c(unname(v), unname(v2)),
                 learning = c(rep("EOL", length(v)), rep("FIL", length(v2))))
dfr$type=ordered(dfr$type, names(v))
#
# token-based evaluation
# Frequencies are attached by row position, so all three tables must line up.
stopifnot(nrow(acc) == nrow(dat), nrow(acc2) == nrow(dat))
acc$Frequency = dat$Frequency
acc2$Frequency = dat$Frequency
#
# a convenience function: share of the token frequency that falls on rows marked
# "true", among the rows marked "true" or "false" in the given column
ftok = function(column, dfr=acc) {
  correct_tokens = sum(dfr[dfr[,column]=="true", "Frequency"])
  incorrect_tokens = sum(dfr[dfr[,column]=="false", "Frequency"])
  return(correct_tokens/(correct_tokens+incorrect_tokens))
}
#
# Score exactly the columns used for the type-based vectors. names(v) and names(v2)
# were taken before Frequency was appended, so no hard-coded column range is needed
# and each table is indexed by its own column names.
v1tok = sapply(names(v), ftok, acc)
v2tok = sapply(names(v2), ftok, acc2)
#
# put token-based evaluations together in a second data frame
dfr2 = data.frame(type=c(names(v1tok), names(v2tok)),
                  value=c(unname(v1tok), unname(v2tok)), 
                  learning = c(rep("EOL", length(v1tok)), rep("FIL", length(v2tok))))

# combine the two dataframes
dfr$weighting = "types"
dfr2$weighting = "tokens"
dfr3 = rbind(dfr, dfr2)
dfr3$weighting = factor(dfr3$weighting)
cat(" \n")

"""


In [None]:
# R cell: cross-tabulate learning regime by weighting in dfr3 (built in the previous
# R cell) as a check on the combined data frame.
R"""
table(dfr3$learning, dfr3$weighting)
"""

In [None]:
# R cell: save dfr3 to ../res/dfr3.rda with xz compression.
R"""
save(dfr3, file="../res/dfr3.rda", compress="xz")
"""

In [None]:
# R cell: set the repr.plot.width/height options and draw a dotplot of accuracy by
# feature, one panel per weighting, with the learning regimes as groups.
# This cell does not load lattice itself; dotplot is available only because an
# earlier R cell called library(lattice).
R"""
options(repr.plot.width=8, repr.plot.height=6)
dotplot(type~value|weighting, groups=learning, data=dfr3, xlab="accuracy", 
        pch=19, cex=c(1.5, 0.8), col=rev(c("black", "gray")))
"""