# Chapter 15.3: Speech errors

In [None]:
using DataFrames, CSV, JudiLing, RCall, Statistics

# Setting up prior knowledge

## Data

In [None]:
# Load the word-level dataset. Later cells read its :Phon column (cue
# construction) and its :Flection column (inflection labels).
words = JudiLing.load_dataset("../dat/wordinfo_fasttext_aligned.csv");
# Show the (rows, columns) of the dataset.
size(words)

In [None]:
# Load the semantic matrix `S` and its accompanying word list from a
# headerless, comma-separated file.
# NOTE(review): later cells index rows of `S` with masks built from
# `words.Flection`, so the rows of `S` are assumed to be in the same order as
# the rows of `words` -- confirm.
S, wordlist = JudiLing.load_S_matrix("../dat/fasttext_wordinfo_aligned.csv"; header = false, sep = ",");
# Show the dimensions of S.
size(S)

## Setting up the mappings

In [None]:
# Build the cue object for the training words from their :Phon forms, using
# `grams = 3`. Its `C` field is the cue matrix used by the mappings below.
cue_obj = JudiLing.make_cue_matrix(words; grams = 3, target_col = :Phon);

In [None]:
# Dimensions of the cue matrix C.
size(cue_obj.C)

In [None]:
# F is the transformation matrix from the cue matrix C to the semantic matrix S.
F = JudiLing.make_transform_matrix(cue_obj.C, S);
# Predicted semantic matrix obtained by applying F to C.
Shat = cue_obj.C * F;

In [None]:
# Dimensions of the C -> S mapping.
size(F)

In [None]:
# Evaluate the predicted semantic matrix Shat against the gold matrix S, with
# the :Phon column of `words` passed as the target column.
# NOTE(review): presumably returns comprehension accuracy -- confirm against
# the JudiLing documentation.
JudiLing.eval_SC(Shat, S, words, :Phon)

In [None]:
# Write the per-word evaluation of Shat against S to ../res/comprehension.csv.
JudiLing.write_comprehension_eval(Shat, S, words, :Phon, "../res/comprehension.csv")

In [None]:
# G is the transformation matrix in the opposite direction: from the semantic
# matrix S to the cue matrix C.
G = JudiLing.make_transform_matrix(S, cue_obj.C);
# Predicted cue matrix obtained by applying G to S.
Chat = S * G;

In [None]:
# Maximum timestep computed from the "Phon" column; `words` is passed as both
# dataset arguments. The value is reused as `max_t` in the later `learn_paths`
# calls on the speech-error data.
max_t = JudiLing.cal_max_timestep(words, words, "Phon")

In [None]:
# Run path learning (production) for the training words themselves, using the
# cue object, the gold semantics S, the C -> S mapping F and the predicted cue
# matrix Chat. Shat is supplied as `Shat_val`.
res = JudiLing.learn_paths(
    words, cue_obj, S, F, Chat;
    Shat_val = Shat,
    threshold = 0.005,
    verbose = true,
);

In [None]:
# Evaluate the path-learning results `res` against the cue object of the
# training data.
JudiLing.eval_acc(res, cue_obj)

In [None]:
# Write the production results for the training words to
# ../res/production.csv. The same cue object is passed twice because the
# evaluated data are the training data.
JudiLing.write2csv(
    res, words, cue_obj, cue_obj,
    "../res/production.csv";
    target_col = :Phon,
)

__production neighbors of *thinly*:__
*  *thickly, thick, thinned, finely, thin, sliced, crudely, trickly, poorly*

__production neighbors of *sliced*:__
*  *diced, slices, chopped, spliced, cubed, peeled, dice, thinly, sauced*

# Modeling the speech error *slicely thinned*

### Option 1: generating from the stems

v(slice)+v(ly),  v(thin)+v(ed)

Inflectional categories in the dataset, with the number of words in each category:

In [None]:
# Count the number of rows of `words` for each value of the :Flection column.
combine(groupby(words, :Flection), nrow)

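Creating a semantic vector for each inflectional category (including -ly and -ed) by averaging the semantic vectors of all words that carry that inflection: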

In [None]:
# Distinct inflection labels, in order of first occurrence in `words.Flection`.
# The position of a label in this vector is its row in `flection_matrix`.
inflections = unique(words.Flection)

In [None]:
# One row per inflection label, one column per semantic dimension of S;
# the rows are filled in by the loop in the next cell.
flection_matrix = zeros(length(inflections), size(S, 2))

In [None]:
# Fill each row of `flection_matrix` with the column-wise mean of the rows of S
# that belong to words carrying the corresponding inflection label.
for (row, label) in enumerate(inflections)
    # Boolean mask over the rows of `words` (and hence of S) with this label.
    local members = words.Flection .== label
    flection_matrix[row, :] = mean(S[members, :], dims=1)
end

In [None]:
# Display the matrix of mean semantic vectors per inflection.
flection_matrix

Setting up the semantic vectors of the speech errors.

In [None]:
# Option 1 semantic vectors, stacked as two rows:
#   row 1 ("slicely"): v(LY) + v(slice)
#   row 2 ("thinned"): v(ED) + v(thin)
Serror = vcat(
    flection_matrix[inflections .== "LY", :] .+ S[wordlist .== "slice", :],
    flection_matrix[inflections .== "ED", :] .+ S[wordlist .== "thin", :],
)
# Prepend the word labels and write a headerless file in the same layout that
# `load_S_matrix` is used to read in the next cell.
m = hcat(DataFrame(word = ["slicely", "thinned"]), DataFrame(Serror, :auto))
CSV.write("../res/Serror1.csv", m; header = false)

In [None]:
# Read the Option 1 vectors back from the file written in the previous cell,
# as a semantic matrix plus its list of word labels.
Serrors, errors = JudiLing.load_S_matrix("../res/Serror1.csv"; header = false, sep = ",");
# Show the dimensions of the error semantic matrix.
size(Serrors)

In [None]:
# Load the speech-error dataset; its :Phon column is used below to build cues.
speech_errors = JudiLing.load_dataset("../dat/speech_errors.csv");
# Display the dataset.
speech_errors

Setting up cue matrices for training and error data:

In [None]:
# Build cue objects for the training words and for the speech errors in one
# call, from the Phon column of each dataset and with `grams = 3`.
cue_obj_train, cue_obj_errors = JudiLing.make_combined_cue_matrix(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]];
    target_col = "Phon",
    grams = 3,
);

Computing the mapping:

In [None]:
# Recompute F for the combined cue matrix (cue_obj_train.C -> S). This
# overwrites the earlier F that was built from cue_obj.C.
F = JudiLing.make_transform_matrix(cue_obj_train.C, S);

In [None]:
# Recompute G for the combined cue matrix (S -> cue_obj_train.C). This
# overwrites the earlier G that was built from cue_obj.C.
G = JudiLing.make_transform_matrix(S, cue_obj_train.C);
# Predicted cue vectors for the speech-error semantic vectors.
Chat_errors = Serrors * G;

Running learn paths:

In [None]:
# Run path learning for the Option 1 speech-error vectors. The model (cue
# matrix, F, and the i2f/f2i lookups) comes from the training data; the
# semantic vectors, predicted cues and the `A` field come from the error data.
# NOTE(review): this call uses threshold = 0.01, whereas the other
# `learn_paths` calls in this notebook use 0.005 -- confirm this is intended.
prod_errors = JudiLing.learn_paths(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]],
    cue_obj_train.C,
    Serrors,
    F,
    Chat_errors,
    cue_obj_errors.A,
    cue_obj_train.i2f,
    cue_obj_train.f2i;
    grams = 3,
    target_col = "Phon",
    max_t = max_t,
    max_can = 10,
    threshold = 0.01,
    is_tolerant = true,
    tolerance = -0.1,
    max_tolerance = 1,
    verbose = false,
);

In [None]:
# Write the Option 1 production candidates to ../res/production_errors1.csv.
JudiLing.write2csv(
    prod_errors, speech_errors, cue_obj_train, cue_obj_errors,
    "../res/production_errors1.csv";
    target_col = :Phon,
)

__*slicely* is at rank 5:__ 

  *slices, slicing, sliced, slice, __slicely__, diced, pliesed, dice, plieses, thinly*
  
__*thinned* is at rank 1:__ 

  *__thinned__, thin, thins, thickly, thinly, thick, grazed, veiled, dry, clotted*


### Option 2: replacing stem vectors, revising earlier assignment 

v(slice)+v(ly)-v(thin),  v(thin)+v(ed)-v(slice)

In [None]:
# Option 2 semantic vectors, stacked as two rows:
#   row 1 ("slicely"): v(LY) + v(slice) - v(thin)
#   row 2 ("thinned"): v(ED) + v(thin)  - v(slice)
Serror = vcat(
    flection_matrix[inflections .== "LY", :] .+ S[wordlist .== "slice", :] .-
    S[wordlist .== "thin", :],
    flection_matrix[inflections .== "ED", :] .+ S[wordlist .== "thin", :] .-
    S[wordlist .== "slice", :],
)
# Prepend the word labels and write a headerless file in the same layout that
# `load_S_matrix` is used to read in the next cell.
m = hcat(DataFrame(word = ["slicely", "thinned"]), DataFrame(Serror, :auto))
CSV.write("../res/Serror2.csv", m; header = false)

In [None]:
# Read the Option 2 vectors back from the file written in the previous cell,
# replacing the `Serrors` of the previous option.
Serrors, errors = JudiLing.load_S_matrix("../res/Serror2.csv"; header = false, sep = ",");

In [None]:
# Reload the speech-error dataset (same file as in Option 1); with no trailing
# semicolon, the dataset is also displayed.
speech_errors = JudiLing.load_dataset("../dat/speech_errors.csv")

In [None]:
# Rebuild the cue objects for the training words and the speech errors from
# the Phon column of each dataset, with `grams = 3`.
cue_obj_train, cue_obj_errors = JudiLing.make_combined_cue_matrix(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]];
    target_col = "Phon",
    grams = 3,
);

In [None]:
# Recompute BOTH mappings for the combined cue matrix so that this section is
# self-contained. The `learn_paths` call below uses `F`; without this line it
# would silently use whatever `F` is left in the session. If the Option 1
# mapping cell was not run, that is the F built from `cue_obj.C`, which does
# not match the columns of `cue_obj_train.C`.
F = JudiLing.make_transform_matrix(cue_obj_train.C, S);
G = JudiLing.make_transform_matrix(S, cue_obj_train.C);
# Predicted cue vectors for the Option 2 speech-error semantic vectors.
Chat_errors = Serrors * G;

In [None]:
# Run path learning for the Option 2 speech-error vectors. The model (cue
# matrix, F, and the i2f/f2i lookups) comes from the training data; the
# semantic vectors, predicted cues and the `A` field come from the error data.
prod_errors = JudiLing.learn_paths(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]],
    cue_obj_train.C,
    Serrors,
    F,
    Chat_errors,
    cue_obj_errors.A,
    cue_obj_train.i2f,
    cue_obj_train.f2i;
    grams = 3,
    target_col = "Phon",
    max_t = max_t,
    max_can = 10,
    threshold = 0.005,
    is_tolerant = true,
    tolerance = -0.1,
    max_tolerance = 1,
    verbose = false,
);

In [None]:
# Write the Option 2 production candidates to ../res/production_errors2.csv.
JudiLing.write2csv(
    prod_errors, speech_errors, cue_obj_train, cue_obj_errors,
    "../res/production_errors2.csv";
    target_col = :Phon,
)

__*slicely* is at rank 2:__ 

  *slice, __slicely__, slices, take, morsel, slicing, chop, sliced, dice, dices*
  
__*thinned* is at rank 1:__

  *__thinned__, thin, thinning, thinned, thinneded, thins, thickly, thickling, thinneds, thick*
  (*thinned* appears twice in this list because it is produced with two different phonological forms: once as `TInd` and once as `TInIt`)

### Option 3: replacing stem vectors, backing off from closest inflectional neighbors

v(slice)+v(ly)-v(thinned),  v(thin)+v(ed)-v(slices)

In [None]:
# Option 3 semantic vectors, stacked as two rows:
#   row 1 ("slicely"): v(LY) + v(slice) - v(thinned)
#   row 2 ("thinned"): v(ED) + v(thin)  - v(slices)
Serror = vcat(
    flection_matrix[inflections .== "LY", :] .+ S[wordlist .== "slice", :] .-
    S[wordlist .== "thinned", :],
    flection_matrix[inflections .== "ED", :] .+ S[wordlist .== "thin", :] .-
    S[wordlist .== "slices", :],
)
# Prepend the word labels and write a headerless file in the same layout that
# `load_S_matrix` is used to read in the next cell.
m = hcat(DataFrame(word = ["slicely", "thinned"]), DataFrame(Serror, :auto))
CSV.write("../res/Serror3.csv", m; header = false)

In [None]:
# Read the Option 3 vectors back from the file written in the previous cell,
# replacing the `Serrors` of the previous option.
Serrors, errors = JudiLing.load_S_matrix("../res/Serror3.csv"; header = false, sep = ",");

In [None]:
# Reload the speech-error dataset (same file as in the previous options).
speech_errors = JudiLing.load_dataset("../dat/speech_errors.csv");

In [None]:
# Rebuild the cue objects for the training words and the speech errors from
# the Phon column of each dataset, with `grams = 3`.
cue_obj_train, cue_obj_errors = JudiLing.make_combined_cue_matrix(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]];
    target_col = "Phon",
    grams = 3,
);

In [None]:
# Recompute BOTH mappings for the combined cue matrix so that this section is
# self-contained. The `learn_paths` call below uses `F`; without this line it
# would silently use whatever `F` is left in the session. If the Option 1
# mapping cell was not run, that is the F built from `cue_obj.C`, which does
# not match the columns of `cue_obj_train.C`.
F = JudiLing.make_transform_matrix(cue_obj_train.C, S);
G = JudiLing.make_transform_matrix(S, cue_obj_train.C);
# Predicted cue vectors for the Option 3 speech-error semantic vectors.
Chat_errors = Serrors * G;

In [None]:
# Run path learning for the Option 3 speech-error vectors. The model (cue
# matrix, F, and the i2f/f2i lookups) comes from the training data; the
# semantic vectors, predicted cues and the `A` field come from the error data.
prod_errors = JudiLing.learn_paths(
    words[:, [:Phon]],
    speech_errors[:, [:Phon]],
    cue_obj_train.C,
    Serrors,
    F,
    Chat_errors,
    cue_obj_errors.A,
    cue_obj_train.i2f,
    cue_obj_train.f2i;
    grams = 3,
    target_col = "Phon",
    max_t = max_t,
    max_can = 10,
    threshold = 0.005,
    is_tolerant = true,
    tolerance = -0.1,
    max_tolerance = 1,
    verbose = false,
);

In [None]:
# Write the Option 3 production candidates to ../res/production_errors3.csv.
JudiLing.write2csv(
    prod_errors, speech_errors, cue_obj_train, cue_obj_errors,
    "../res/production_errors3.csv";
    target_col = :Phon,
)

__*slicely* is at rank 7:__ 

  *slice,  slices, slicing, chop, wrench, dice, slicely, slap, flip, wheel*
  
__*thinned* is at rank 1:__

  *thinned, thin, thinning, lacked, thinny, thins, damped, thick, grade*