# Chapter 12.6: Navajo verbs

## Preparation

Load packages

In [None]:
using JudiLing, DataFrames, RCall

Load dataset

In [None]:
# Read the Navajo verb dataset from CSV (the trailing `;` suppresses the cell output).
navajo = JudiLing.load_dataset("../dat/navajo.csv");

In [None]:
# Dimensions of the loaded dataset.
size(navajo)

Inspect data

In [None]:
# Show rows 1-5 of the first eight columns.
navajo[1:5,1:8]

In [None]:
# Copy `navajo` into the embedded R session, then compute in R the smallest and
# largest number of rows that share a single value of `Meaning`.
@rput navajo
R"""
range(table(navajo$Meaning))
"""

In [None]:
# In R: number of distinct `Segments` values divided by the number of rows.
# Relies on `navajo` having been copied to R by the `@rput` in the previous cell.
R"""
length(unique(navajo$Segments))/nrow(navajo)
"""

In [None]:
# List the column names of the dataset.
names(navajo)

## Modeling with triphones

### Evaluation on full dataset

Create C and S matrices

In [None]:
# Build the cue object for the full dataset from the `Segments` column.
# Cues are 3-grams (grams = 3) over "-"-separated tokens; the separator is kept.
cue_obj = JudiLing.make_cue_matrix(
    navajo;
    target_col = :Segments,
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
);

In [None]:
# Number of columns of the cue matrix C; reused below as the number of columns of S.
n_features = size(cue_obj.C, 2)

In [None]:
# Simulate the semantic matrix S for the full dataset. The first list holds the
# lexical column ("Meaning"), the second the grammatical-feature columns.
# S is given as many columns as the cue matrix (ncol = n_features).
# NOTE(review): the train/test cells further down list "ObjAnimacy" where this
# call has "ObjLex" — confirm the difference is intentional.
S = JudiLing.make_S_matrix(
    navajo,
    ["Meaning"],
    [
        "Mode",
        "SubjPerson",
        "SubjNumber",
        "ObjPerson",
        "ObjNumber",
        "ObjLex",
        "Transitivity",
    ];
    ncol = n_features,
    add_noise = true,
);

In [None]:
# Alternative feature-column lists, kept for reference only (this cell executes nothing).
# NOTE(review): these column names ("Subj. person A", "Obj. lex", ...) differ from the
# names used in the make_S_matrix call above — confirm they exist in the dataset before use.
#["Mode","Subj. person A","Subj. number A","Obj. person A","Obj. number A", "Obj. lex", "Transitivity"],
#["Mode","Subj. person","Subj. number B","Obj. person A","Obj. number A", "Obj. lex", "Transitivity"],
#["Mode","Subj. person","Subj. number B","Obj. person","Obj. number", "Obj. lex", "Transitivity"],

#### Comprehension

In [None]:
# Comprehension mapping F from the cue matrix C to the semantic matrix S.
F = JudiLing.make_transform_matrix(cue_obj.C, S);

In [None]:
# Predicted semantic matrix: cues multiplied by the comprehension mapping.
Shat = cue_obj.C * F;

In [None]:
# Comprehension accuracy: score the predicted semantic matrix against the gold S.
JudiLing.eval_SC(Shat, S)

The dataset contains homophones. With lenient evaluation, we get:

In [None]:
# Same evaluation, additionally passing the data and the :Segments column
# (the lenient evaluation for homophones mentioned in the text above).
JudiLing.eval_SC(Shat, S, navajo, :Segments)

#### Production

In [None]:
# Production mapping G from the semantic matrix S to the cue matrix C.
G = JudiLing.make_transform_matrix(S, cue_obj.C);

In [None]:
# Predicted cue matrix: meanings multiplied by the production mapping.
Chat = S * G;

Evaluation of precision of the predictions for what triphones should be present:

In [None]:
# Score the predicted cue matrix against the gold C; `@show` prints expression and value.
@show JudiLing.eval_SC(Chat, cue_obj.C);

Ordering the triphones into words.

In [None]:
# The `A` field of the cue object; passed to learn_paths below.
A = cue_obj.A;

In [None]:
# Maximum timestep computed from the "-"-tokenized Segments column; used as `max_t` in learn_paths.
max_t = JudiLing.cal_max_timestep(navajo, :Segments, tokenized=true, sep_token="-")

In [None]:
# Order the predicted triphones into word forms for the full dataset
# (the same data is passed as both the training and the evaluation set).
# With check_gold_path = true the call returns two values: the results and
# `gpi_learn`. `@time` reports the cost of the call.
@time res_learn, gpi_learn = JudiLing.learn_paths(
    navajo,
    navajo,
    cue_obj.C,
    S,
    F,
    Chat,
    A,
    cue_obj.i2f,
    cue_obj.f2i;
    # gold-path checking
    check_gold_path = true,
    gold_ind = cue_obj.gold_ind,
    Shat_val = Shat,
    # search settings
    max_t = max_t,
    max_can = 10,
    threshold = 0.05,
    # n-gram / tokenization settings (must match make_cue_matrix above)
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
    target_col = :Segments,
    verbose = false,
);

In [None]:
# Production accuracy on the full dataset: compare the learned paths with the gold indices.
acc_learn = JudiLing.eval_acc(res_learn, cue_obj.gold_ind, verbose = false)

# Print the accuracy explicitly (println is the cell's last expression, so nothing else is displayed).
println("Acc for learn: $acc_learn")

### Evaluation on training and test data

In [None]:
# Split the dataset into training and validation parts (570 validation items).
# The n-gram settings mirror those used for the cue matrices.
data_train, data_val = JudiLing.loading_data_careful_split(
    "../dat/navajo.csv",   # data path
    "nav",                 # data_prefix
    "../dat/split",        # data_output_dir
    [
        "Meaning",
        "Mode",
        "SubjPerson",
        "SubjNumber",
        "ObjPerson",
        "ObjNumber",
        "ObjAnimacy",
        "Transitivity",
    ];
    val_sample_size = 570,
    grams = 3,
    n_grams_target_col = "Segments",
    n_grams_tokenized = true,
    n_grams_sep_token = "-",
    n_grams_keep_sep = true,
    verbose = false,
);

In [None]:
# Build cue objects for the training and validation data in one call,
# with the same 3-gram / "-" tokenization settings as for the full dataset.
cue_obj_train, cue_obj_val = JudiLing.make_cue_matrix(
    data_train,
    data_val;
    target_col = :Segments,
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
);

In [None]:
# S matrices for training and validation data, with as many columns as the
# training cue matrix.
n_features = size(cue_obj_train.C, 2)
S_train, S_val = JudiLing.make_S_matrix(
    data_train,
    data_val,
    ["Meaning"],
    [
        "Mode",
        "SubjPerson",
        "SubjNumber",
        "ObjPerson",
        "ObjNumber",
        "ObjAnimacy",
        "Transitivity",
    ];
    ncol = n_features,
    add_noise = true,
);

#### Comprehension

In [None]:
# Comprehension mapping estimated on the training data only
# (this rebinds `F`, which previously held the full-dataset mapping).
F = JudiLing.make_transform_matrix(cue_obj_train.C, S_train);
# Predicted semantic matrices for training and validation items, both via the training mapping.
Shat_train = cue_obj_train.C * F;
Shat_val = cue_obj_val.C * F;
# Comprehension accuracy on the training data (evaluation call that also receives the data and :Segments).
JudiLing.eval_SC(Shat_train, S_train, data_train, :Segments)

In [None]:
# Comprehension accuracy on the validation data; the training S matrix and training data are passed as well.
JudiLing.eval_SC(Shat_val, S_val, S_train, data_val, data_train, :Segments)

#### Production

In [None]:
# Production mapping estimated on the training data only (rebinds `G`).
G = JudiLing.make_transform_matrix(S_train, cue_obj_train.C);
# Predicted cue matrices for training and validation items.
Chat_train = S_train * G;
Chat_val = S_val * G;

In [None]:
# Score the predicted training cue matrix against the gold training C.
@show JudiLing.eval_SC(Chat_train, cue_obj_train.C);

In [None]:
# Score the predicted validation cue matrix against the gold validation C.
@show JudiLing.eval_SC(Chat_val, cue_obj_val.C);

Next, we put the triphones in order.

In [None]:
# Maximum timestep computed over training and validation data together (rebinds `max_t`).
max_t = JudiLing.cal_max_timestep(data_train, data_val, :Segments, tokenized=true, sep_token="-")

Sequencing accuracy on the training data:

In [None]:
# Order triphones into word forms for the training items
# (training data is passed as both the training and the evaluation set).
# No gold-path checking here, so a single result value is bound.
@time res_train = JudiLing.learn_paths(
    data_train,
    data_train,
    cue_obj_train.C,
    S_train,
    F,
    Chat_train,
    cue_obj_train.A,
    cue_obj_train.i2f,
    cue_obj_train.f2i;
    # search settings
    max_t = max_t,
    max_can = 10,
    threshold = 0.01,
    # n-gram / tokenization settings
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
    target_col = :Segments,
    issparse = :dense,
    verbose = false,
);

In [None]:
# Sequencing accuracy on the training split.
# Bound to `acc_train` (parallel to `acc_val` below) so that `acc_learn`, the
# full-dataset accuracy computed earlier, is not silently overwritten.
acc_train = JudiLing.eval_acc(res_train, cue_obj_train.gold_ind, verbose = false)

Sequencing accuracy on the test data:

In [None]:
# Order triphones into word forms for the held-out validation items.
# Mappings and cue indices (i2f / f2i) come from the training data; predictions
# (Chat_val, Shat_val) and gold indices come from the validation data.
# With check_gold_path = true the call returns two values.
@time res_val, gpi_val = JudiLing.learn_paths(
    data_train,
    data_val,
    cue_obj_train.C,
    S_val,
    F,
    Chat_val,
    cue_obj_val.A,
    cue_obj_train.i2f,
    cue_obj_train.f2i;
    # gold-path checking
    check_gold_path = true,
    gold_ind = cue_obj_val.gold_ind,
    Shat_val = Shat_val,
    # search settings
    max_t = max_t,
    max_can = 10,
    threshold = 0.01,
    # tolerant mode settings
    is_tolerant = true,
    tolerance = -0.1,
    max_tolerance = 1,
    # n-gram / tokenization settings
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
    target_col = :Segments,
    issparse = :dense,
    verbose = false,
);

In [None]:
# Sequencing accuracy on the validation split.
acc_val = JudiLing.eval_acc(res_val, cue_obj_val.gold_ind, verbose=false)

In [None]:
# Convert the validation results into a data frame of candidates.
# The cells below read its `utterance` and `iscorrect` columns.
df = JudiLing.write2df(res_val, data_val, cue_obj_train, cue_obj_val, grams=3, tokenized=true, sep_token="-",
                       output_sep_token="-", target_col=:Segments);

In [None]:
# Rank of the first correct candidate for every validation item.
# 100 is a sentinel meaning "no correct candidate"; it exceeds every top-k
# cutoff used below, so such items never count as hits.
# NOTE(review): the rank is the row position within the item's candidates in
# `df`, which presumably are ordered from best to worst — confirm.
ranks = fill(100, nrow(data_val));
for i in 1:nrow(data_val)
    candidates = df[df.utterance .== i, :]
    # `isequal(true)` treats `missing` as "not correct". A bare `findall` on the
    # column raises if it contains `missing` (presumably the case for items
    # without any candidate — confirm). `findfirst` also avoids building the
    # full index vector just to take its first element.
    first_correct = findfirst(isequal(true), candidates.iscorrect)
    if first_correct !== nothing
        ranks[i] = first_correct
    end
end

In [None]:
# Top-1 accuracy: share of validation items whose correct form is ranked first.
count(==(1), ranks) / length(ranks)

In [None]:
# Top-3 accuracy: share of validation items whose correct form is among the first three candidates.
count(<=(3), ranks) / length(ranks)

In [None]:
# Top-5 accuracy: share of validation items whose correct form is among the first five candidates.
count(<=(5), ranks) / length(ranks)

In [None]:
# Top-10 accuracy: share of validation items whose correct form is among the first ten candidates.
count(<=(10), ranks) / length(ranks)

## Exercises

Training a deep learning model:


IMPORTANT: Restart the kernel of this notebook before running the cells below!

In [None]:
using Flux
using JudiLing

Load the datasplit.

In [None]:
# Re-create the training/validation split after the restart, with the same
# arguments as in the main analysis.
data_train, data_val = JudiLing.loading_data_careful_split(
    "../dat/navajo.csv",   # data path
    "nav",                 # data_prefix
    "../dat/split",        # data_output_dir
    [
        "Meaning",
        "Mode",
        "SubjPerson",
        "SubjNumber",
        "ObjPerson",
        "ObjNumber",
        "ObjAnimacy",
        "Transitivity",
    ];
    val_sample_size = 570,
    grams = 3,
    n_grams_target_col = "Segments",
    n_grams_tokenized = true,
    n_grams_sep_token = "-",
    n_grams_keep_sep = true,
    verbose = false,
);

Create cue matrices.

In [None]:
# Cue objects for training and validation data (3-grams over "-"-separated tokens).
cue_obj_train, cue_obj_val = JudiLing.make_cue_matrix(
    data_train,
    data_val;
    target_col = :Segments,
    grams = 3,
    tokenized = true,
    sep_token = "-",
    keep_sep = true,
);

Create S matrices.

In [None]:
# S matrices for training and validation data, with as many columns as the
# training cue matrix.
n_features = size(cue_obj_train.C, 2)
S_train, S_val = JudiLing.make_S_matrix(
    data_train,
    data_val,
    ["Meaning"],
    [
        "Mode",
        "SubjPerson",
        "SubjNumber",
        "ObjPerson",
        "ObjNumber",
        "ObjAnimacy",
        "Transitivity",
    ];
    ncol = n_features,
    add_noise = true,
);

Train a DDL comprehension model:

In [None]:
# Make sure the output directory for the model file exists, in case the
# training routine does not create it. `mkpath` does nothing if it already exists.
mkpath("../res")

# Train a deep comprehension model from cues (C) to meanings (S).
# Validation matrices and both data frames are passed along with the target
# column name and the path the model is written to.
res_comp = JudiLing.get_and_train_model(
    cue_obj_train.C,
    S_train,
    cue_obj_val.C,
    S_val,
    data_train,
    data_val,
    "Segments",
    "../res/navajo_comp.bson";
    early_stopping = 20,
)

Predict training and validation semantic matrices:

In [None]:
# Predict semantic matrices for training and validation cues with the trained model.
Shat_train = JudiLing.predict_from_deep_model(res_comp.model, cue_obj_train.C)
Shat_val = JudiLing.predict_from_deep_model(res_comp.model, cue_obj_val.C)

Evaluate the predicted matrices

In [None]:
# Comprehension accuracy of the deep model on training data, then on validation data
# (same evaluation calls as used for the linear mappings).
@show JudiLing.eval_SC(Shat_train, S_train, data_train, :Segments)
@show JudiLing.eval_SC(Shat_val, S_val, S_train, data_val, data_train, :Segments)

We get a training accuracy of 99% and a validation accuracy of 95%. This is slightly higher than what we were able to obtain with LDL.