-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathsubed-word-data.el
More file actions
820 lines (756 loc) · 35.5 KB
/
Copy pathsubed-word-data.el
File metadata and controls
820 lines (756 loc) · 35.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
;;; subed-word-data.el --- Use word-level timing data when splitting subtitles -*- lexical-binding: t; -*-
;;; License:
;;
;; Copyright (C) 2022 Sacha Chua
;; Author: Sacha Chua <sacha@sachachua.com>
;; Keywords: multimedia
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;; This file parses timing data such as the ones
;; you get from YouTube .srv2, WhisperX JSON, or
;; the Montreal Forced Aligner and tries to match
;; the timing data with the remaining text in the
;; current subtitle in order to determine the word
;; timestamp for splitting the subtitle.
;; To try to automatically load word data from a similarly-named file
;; in the buffer, add this to your configuration:
;; (with-eval-after-load 'subed
;; (add-hook 'subed-mode-hook 'subed-word-data-load-maybe))
;;
;; After loading word data, you can add word-level
;; timestamps to VTT files with
;; subed-word-data-add-word-timestamps and remove them with
;; subed-word-data-remove-word-timestamps .
;;; Code:
(require 'xml)
(require 'dom)
(defvar-local subed-word-data--cache nil
"Word-level timing in the form ((start . ms) (end . ms) (text . ms))")
(defcustom subed-word-data-score-faces '((0.8 . compilation-info)
(0.4 . compilation-warning)
(0 . compilation-error))
"Alist of score thresholds and faces to use."
:type '(alist :key-type float :value-type face))
(defface subed-word-data-face '((((class color) (background light))
:foreground "darkgreen")
(((class color) (background dark))
:foreground "lightgreen"))
"Face used for words with word data available.")
(defun subed-word-data--extract-words-from-srv2 (data)
"Extract the timing from DATA in SRV2 format.
Return a list of ((start . ?), (end . ?) (text . ?))."
(when (stringp data)
(with-temp-buffer (insert data) (setq data (xml-parse-region))))
(let* ((text-elements (reverse (dom-by-tag data 'text)))
(last-start
(and text-elements
(+ (string-to-number
(alist-get 't (xml-node-attributes (car text-elements))))
(string-to-number (alist-get 'd (xml-node-attributes (car text-elements))))))))
(reverse
(mapcar #'(lambda (element)
(let ((rec (list (cons 'start (string-to-number (alist-get 't (xml-node-attributes element))))
(cons 'end (min (+ (string-to-number (alist-get 't (xml-node-attributes element)))
(string-to-number (alist-get 'd (xml-node-attributes element))))
last-start))
(cons 'text
(replace-regexp-in-string "'" "'"
(car (xml-node-children element)))
))))
(setq last-start (alist-get 'start rec))
rec))
text-elements))))
(defun subed-word-data--extract-words-from-youtube-vtt (file &optional from-string)
"Extract the timing from FILE which is a VTT from YouTube.
Return a list of ((start . ?), (end . ?) (text . ?) (speaker . ?)).
If FROM-STRING is non-nil, treat FILE as the data itself."
(with-temp-buffer
(subed-vtt-mode)
(if from-string
(insert file)
(insert-file-contents file))
(let ((list (subed-subtitle-list))
results
s
start
stop
i)
(dolist (sub list)
(when (string-match "<c>" (elt sub 3))
(setq s (elt sub 3))
(setq i 0)
(setq start (elt sub 1))
(while (and (< i (length s))
(string-match "\\(.+?\\)<\\([0-9]+:[0-9]+:[0-9]+\\.[0-9]+\\)>" s i))
(setq stop (1- (save-match-data (subed-timestamp-to-msecs (match-string 2 s)))))
(push `((text . ,(save-match-data
(string-trim (replace-regexp-in-string "</?c>" "" (match-string 1 s)))))
(start . ,start)
(end . ,stop))
results)
(setq i (match-end 0)
start (1+ stop)))
(if (and (< i (length s))
(not (string= "" (string-trim (substring s i)))))
(push `((text . ,(string-trim
(save-match-data (replace-regexp-in-string "</?c>" "" (substring s i)))))
(start . ,start)
(end . ,(elt sub 2)))
results))))
(nreverse results))))
(defun subed-word-data--extract-words-from-whisperx-json (file &optional from-string)
"Extract the timing from FILE in WhisperX's JSON format.
Return a list of ((start . ?), (end . ?) (text . ?) (score . ?)).
If FROM-STRING is non-nil, treat FILE as the data itself."
(let* ((json-object-type 'alist)
(json-array-type 'list)
(data (if from-string
(json-read-from-string file)
(json-read-file file)))
(base (seq-mapcat
(lambda (segment)
(seq-map (lambda (info)
(let-alist info
`((start . ,(and .start (* 1000 .start)))
(end . ,(and .end (* 1000 .end)))
(text . ,(identity .word))
(speaker . ,(identity .speaker))
(score . ,(identity .score)))))
(alist-get 'words segment)))
(alist-get 'segments data)))
last-end
current)
;; numbers at the end of a sentence sometimes don't end up with times
;; so we need to fix them
(while current
(unless (alist-get 'start (car current)) ; start
(set-cdr (assoc 1 'start (car current)) (1+ last-end)))
(unless (alist-get 'end (car current)) ; start
(set-cdr (assoc 1 'end (car current)) (1- (alist-get 'start (cadr current)))))
(setq
last-end (alist-get 'end (car current))
current (cdr current)))
base))
(defun subed-word-data--extract-words-from-textgrid (filename &optional from-string)
"Parse a Praat TextGrid file and return a list of intervals.
Return a list of ((start . ?), (end . ?) (text . ?)).
If FROM-STRING is non-nil, treat FILE as the data itself."
(interactive "fFile: ")
(with-temp-buffer
(if from-string (insert filename) (insert-file-contents filename))
(let (intervals)
(goto-char (point-min))
(let ((limit (or (save-excursion (re-search-forward "name *= *\"phones\"" nil t))
(point-max))))
(while (re-search-forward
"intervals *\\[\\([0-9]+\\)\\]:[ \n\t\r]*xmin = \\([0-9.]+\\)[ \n\t\r]*xmax = \\([0-9.]+\\)[ \n\t\r]*text = \"\\([^\"]+\\)\""
limit t)
(push `((start . ,(* 1000 (string-to-number (match-string 2))))
(end . ,(* 1000 (string-to-number (match-string 3))))
(text . ,(match-string 4)))
intervals)))
(reverse intervals))))
(defun subed-word-data--load (data)
"Load word-level timing from DATA.
Supports WhisperX JSON, YouTube VTT, and Youtube SRV2 files."
(when data
(setq-local subed-word-data--cache data)
(add-hook 'subed-split-subtitle-timestamp-functions #'subed-word-data-split-at-word-timestamp -5 t)
(add-hook 'subed-region-adjusted-hook #'subed-word-data-refresh-region)
(subed-word-data-refresh-text-properties)
data))
(defun subed-word-data-parse-file (file &optional offset-ms)
"Parse FILE for word data.
Return a list of ((start . ?), (end . ?) (text . ?)).
If OFFSET-MS is provided, add it to the times."
(let ((data (pcase (file-name-extension file)
("json" (subed-word-data--extract-words-from-whisperx-json file))
("srv2" (subed-word-data--extract-words-from-srv2 (xml-parse-file file)))
("vtt" (subed-word-data--extract-words-from-youtube-vtt file))
("TextGrid" (subed-word-data--extract-words-from-textgrid file)))))
(when offset-ms (setq data (subed-word-data-adjust-times data offset-ms)))
data))
;;;###autoload
(defun subed-word-data-load-from-file (file &optional offset)
"Load word-level timing from FILE.
Supports WhisperX JSON, YouTube VTT, Youtube SRV2, and TextGrid files."
(interactive (list (read-file-name "JSON, VTT, srv2, or TextGrid: "
nil
nil
nil
nil
(lambda (f)
(or (file-directory-p f)
(string-match
"\\.\\(json\\|srv2\\|vtt\\|TextGrid\\)\\'"
f))))
(when current-prefix-arg
(read-string "Start offset: "))))
(subed-word-data--load (subed-word-data-parse-file file offset)))
(defun subed-word-data-load-from-string (string)
"Load word-level timing from STRING.
For now, only JSON or SRV2 files are supported."
(subed-word-data--load (cond
((string-match "^{" string)
(subed-word-data--extract-words-from-whisperx-json string t))
((string-match "^WEBVTT" string)
(subed-word-data--extract-words-from-youtube-vtt string t))
(t
(subed-word-data--extract-words-from-srv2 string)))))
(defvar subed-word-data-extensions '(".en.srv2" ".srv2" ".json" ".vtt") "Extensions to search for word data.")
;;;###autoload
(defun subed-word-data-load-maybe ()
"Load word data if available. Suitable for adding to `subed-mode-hook'."
(when (buffer-file-name)
(let (file)
(catch 'found
(mapc (lambda (ext)
(when (file-exists-p (concat (file-name-sans-extension (buffer-file-name)) ext))
(setq file (concat (file-name-sans-extension (buffer-file-name)) ext))
(throw 'found)))
subed-word-data-extensions))
(when (and file (subed-word-data-load-from-file file))
(message "Word data loaded.")))))
(defvar subed-word-data-normalizing-functions '(subed-word-data-normalize-word-default)
"Functions to run to normalize words before comparison.")
(defun subed-word-data-normalize-word-default (s)
"Downcase S and remove non-alphanumeric characters for comparison."
(replace-regexp-in-string "[^[:alnum:]]" "" (downcase s)))
(defun subed-word-data-normalize-word (word)
"Normalize WORD to make it easier to compare."
(mapc (lambda (func)
(setq word (funcall func word)))
subed-word-data-normalizing-functions)
word)
(defun subed-word-data-compare-normalized-string= (word1 word2)
"Compare two words and return t if they are the same after normalization."
(string= (subed-word-data-normalize-word word1)
(subed-word-data-normalize-word word2)))
(defvar subed-word-data-compare-normalized-string-distance-threshold 0.2
"Factor used for similarity comparison.")
;;;###autoload
(defun subed-word-data-compare-normalized-string-distance (word1 word2)
"Compare two words and return t if they are similar enough after normalization.
See `subed-word-data-compare-normalized-string-distance-threshold'."
(let ((w1 (subed-word-data-normalize-word word1))
(w2 (subed-word-data-normalize-word word2)))
(< (/ (string-distance w1 w2)
(* 1.0 (max (length w1) (length w2))))
subed-word-data-compare-normalized-string-distance-threshold)))
(defvar subed-word-data-compare-function 'subed-word-data-compare-normalized-string-distance
"Function to use to compare.")
(defun subed-word-data-compare (word1 word2)
"Use the `subed-word-data-compare' function to compare WORD1 and WORD2.
Return non-nil if they are the same after normalization."
(funcall subed-word-data-compare-function word1 word2))
(defun subed-word-data--look-up-word ()
"Find the word timing that matches the one at point (approximately)."
(save-excursion
(skip-syntax-backward "w")
(let* ((end (subed-subtitle-msecs-stop))
(start (subed-subtitle-msecs-start))
(remaining-words (split-string
(buffer-substring
(point)
(or (subed-jump-to-subtitle-end) (point)))))
(words (if remaining-words
(reverse (seq-filter
(lambda (o)
(and (or (not (alist-get 'end o)) (<= (alist-get 'end o) end))
(or (not (alist-get 'start o)) (>= (alist-get 'start o) start))
(not (string-match "^\n*$" (alist-get 'text o)))))
subed-word-data--cache))))
(offset 0)
(done (null remaining-words))
candidate)
(while (not done)
(setq candidate (elt words (+ (1- (length remaining-words)) offset)))
(cond
((and candidate (subed-word-data-compare
(car remaining-words)
(alist-get 'text candidate)))
(setq done t))
((> offset (length words)) (setq done t))
((> offset 0) (setq offset (- offset)))
(t (setq offset (1+ (- offset))))))
candidate)))
(defun subed-word-data-split-at-word-timestamp ()
"Return the starting timestamp if the word is found."
(cond
((get-text-property (point) 'subed-word-start)
(- time subed-subtitle-spacing))
(subed-word-data--cache
(let ((time (assoc-default 'start (subed-word-data--look-up-word))))
(when time (- time subed-subtitle-spacing))))))
(defun subed-word-data-subtitle-entries ()
"Return the entries that start and end within the current subtitle."
(let ((start (- (subed-subtitle-msecs-start) subed-word-data-fuzz-ms))
(stop (+ (subed-subtitle-msecs-stop) subed-word-data-fuzz-ms)))
(seq-filter
(lambda (o)
(and (<= (or (alist-get 'end o) most-positive-fixnum) stop)
(>= (or (alist-get 'start o) 0) start)
(not (string-match "^\n*$" (alist-get 'text o)))))
subed-word-data--cache)))
(defvar subed-word-data-threshold 5
"Number of words to consider for matching.")
(defvar subed-word-data-fuzz-ms 500
"Milliseconds to consider before or after a subtitle.")
(defun subed-word-data-refresh-text-properties-for-subtitle ()
"Refresh the text properties for the current subtitle."
(interactive)
(remove-text-properties (subed-jump-to-subtitle-text) (subed-jump-to-subtitle-end)
'(subed-word-data-start subed-word-data-end font-lock-face))
(let* ((text-start (progn (subed-jump-to-subtitle-text) (point)))
pos
(word-data (reverse (subed-word-data-subtitle-entries)))
candidate
cand-count)
(subed-jump-to-subtitle-end)
(while (> (point) text-start)
;; Work our way backwards, matching against remaining words
(setq pos (point))
(backward-word)
(let ((try-list word-data)
(current-word (buffer-substring (point) pos))
candidate)
(setq candidate (car try-list) cand-count 0)
(setq try-list (cdr try-list))
(while (and candidate
(< cand-count subed-word-data-threshold)
(not (subed-word-data-compare current-word
(alist-get 'text candidate))))
(setq candidate (car try-list) cand-count (1+ cand-count))
(when (> cand-count subed-word-data-threshold)
(setq candidate nil))
(setq try-list (cdr try-list)))
(when (and candidate (subed-word-data-compare (buffer-substring (point) pos)
(alist-get 'text candidate)))
(subed-word-data--add-word-properties (point) pos candidate)
(setq word-data try-list))))))
(defun subed-word-data-add-word-timestamps (&optional beg end)
"Add word timestamps.
It uses the text properties to determine the start of each word.
This only works for VTTs."
(interactive (if (region-active-p)
(list (region-beginning)
(region-end))
(list (point-min) (point-max))))
(save-excursion
(subed-word-data-remove-word-timestamps beg end)
(subed-for-each-subtitle beg end t
(let ((start (save-excursion (subed-jump-to-subtitle-text) (point))))
(subed-jump-to-subtitle-end)
(while (> (point) start)
(backward-word)
(when (get-text-property (point) 'subed-word-data-start)
(save-excursion
(insert (format "<%s>" (subed-msecs-to-timestamp (get-text-property (point) 'subed-word-data-start)))))))))))
(defun subed-word-data-remove-word-timestamps (&optional beg end)
"Remove all word timestamps."
(interactive (if (region-active-p)
(list (region-beginning)
(region-end))
(list (point-min) (point-max))))
(goto-char (or end (point-max)))
(while (re-search-backward "<[0-9]+:[0-9]+:[0-9]+\\.[0-9]+>" beg t)
(replace-match "")))
(defun subed-word-data-refresh-region (beg end)
"Refresh text properties in region."
(when subed-word-data--cache
(subed-for-each-subtitle beg end nil
(subed-word-data-refresh-text-properties-for-subtitle))))
(defsubst subed-word-data--candidate-face (candidate)
"Return the face to use for CANDIDATE."
(if (and (alist-get 'score candidate)
subed-word-data-score-faces)
(cdr (seq-find (lambda (threshold) (>= (alist-get 'score candidate) (car threshold)))
subed-word-data-score-faces))
'subed-word-data-face))
(defsubst subed-word-data--add-word-properties (start end candidate)
"Add properties from START to END for CANDIDATE."
(let ((face (subed-word-data--candidate-face candidate)))
(add-text-properties start end
(list 'subed-word-data-start
(assoc-default 'start candidate)
'subed-word-data-end
(assoc-default 'end candidate)
'subed-word-data-score
(assoc-default 'score candidate)
'font-lock-face face))
(add-face-text-property start end face)))
(defun subed-word-data-refresh-text-properties ()
"Add word data properties and face when available."
(interactive)
(save-excursion
(remove-text-properties (point-min) (point-max) '(subed-word-data-start subed-word-data-end font-lock-face))
(when subed-word-data--cache
(goto-char (point-min))
(unless (subed-jump-to-subtitle-id) (subed-forward-subtitle-id))
(while (not (eobp))
(let* ((text-start (progn (subed-jump-to-subtitle-text) (point)))
pos
(word-data (reverse (subed-word-data-subtitle-entries)))
candidate)
(subed-jump-to-subtitle-end)
(while (> (point) text-start)
;; Work our way backwards, matching against remaining words
(setq pos (point))
(backward-word)
(let ((try-list word-data)
candidate)
(setq candidate (car try-list))
(setq try-list (cdr try-list))
(while (and candidate
(not (subed-word-data-compare (buffer-substring (point) pos)
(alist-get 'text candidate))))
(setq candidate (car try-list))
(setq try-list (cdr try-list)))
(when (and candidate (subed-word-data-compare (buffer-substring (point) pos)
(alist-get 'text candidate)))
( subed-word-data--add-word-properties (point) pos candidate)
(setq word-data try-list)))))
(or (subed-forward-subtitle-id)
(goto-char (point-max)))))))
(defun subed-word-data-pause-msecs ()
"Return the number of milliseconds between this word and the previous word.
Requires the text properties to be set."
(let ((current (get-text-property (point) 'subed-word-data-start)))
(save-excursion
(skip-syntax-backward "w")
(backward-word)
(when (get-text-property (point) 'subed-word-data-end)
(- current (get-text-property (point) 'subed-word-data-end))))))
(defun subed-word-data-jump-to-longest-pause-in-current-subtitle ()
"Jump to the word after the longest pause in the current subtitle.
Requires the text properties to be set."
(interactive)
(let ((start (or (subed-jump-to-subtitle-text) (point)))
(end (or (subed-jump-to-subtitle-end) (point)))
pos last-start-time pause (max-pause 0) max-pos)
(backward-word)
(setq max-pos (point))
(while (> (point) start)
(setq pos (point) last-start-time (get-text-property (point) 'subed-word-data-start))
(backward-word)
(if (get-text-property (point) 'subed-word-data-end)
(progn
(setq pause (and last-start-time (- last-start-time
(get-text-property (point) 'subed-word-data-end))))
(when (and pause (> pause max-pause))
(setq max-pos pos
max-pause pause)))
(setq last-start-time nil)))
(goto-char max-pos)))
(defun subed-word-data-find-minimum-distance (from-n to-n distance-fn &optional short-circuit low-threshold)
"Return (number distance) that minimizes DISTANCE-FN.
Check the range FROM (inclusive) to TO (exclusive).
If SHORT-CIRCUIT is specified, call that function with i as the argument and stop when it returns true.
If LOW-THRESHOLD is specified, stop when the distance is less than or equal to that number."
(setq low-threshold (or low-threshold 0))
(catch 'found
(let* (min-distance
current-distance
i)
(setq i from-n)
(while (< i to-n)
(setq current-distance (funcall distance-fn i))
(when (or (null min-distance) (< current-distance (cdr min-distance)))
(setq min-distance (cons i current-distance))
(when (<= current-distance low-threshold)
(throw 'found min-distance)))
(when (and short-circuit (funcall short-circuit i))
(throw 'found min-distance))
(setq i (1+ i)))
min-distance)))
(defun subed-word-data-find-approximate-match (phrase list-of-words &optional short-circuit)
"Match PHRASE against the beginning of LIST-OF-WORDS.
LIST-OF-WORDS is a list of strings or a list of alists that have 'text.
If SHORT-CIRCUIT is non-nil, use it as a regexp that short-circuits recognition and stops there.
Return (distance . list of words) that minimizes the string distance from PHRASE.
distance is expressed as a ratio of number of edits / maximum length of phrase or words.
"
(let ((min-distance
(subed-word-data-find-minimum-distance
1
(+ (length (split-string phrase " ")) 8)
(lambda (num-words)
(let ((cand (mapconcat
(lambda (o)
(if (stringp o) o (alist-get 'text o)))
(seq-take list-of-words num-words)
" ")))
(/
(* 1.0
(string-distance phrase cand))
(max (length phrase)
(length cand)))))
(if (and short-circuit
(not (string-match short-circuit phrase)))
(lambda (num-words)
(string-match
short-circuit
(mapconcat
(lambda (o)
(if (stringp o) o (alist-get 'text o)))
(seq-take list-of-words num-words)
" ")))))))
(cons (cdr min-distance) (seq-take list-of-words (car min-distance)))))
;; (subed-word-data-find-approximate-match "Go into the room." (split-string "Go in to the room. There you will" " "))
;; (subed-word-data-find-approximate-match "The quick brown fox jumps over the lazy dog" (split-string "The quick, oops, the quick brown fox jumps over the lazy dog and goes all sorts of places" " ") "\\<oops\\>")
;; (subed-word-data-find-approximate-match "I already talk pretty quickly," (split-string "I already talk pretty quickly. Oops. I already talk pretty quickly, so I'm not going" " ") "\\<oops\\>")
;; (subed-word-data-find-approximate-match "The quick brown fox" (split-string "Well, let's get started. The quick, I mean, the quick brown fox jumps over the lazy dog." " ") "\\<oops\\>") ; hmm, the processing of this one could be improved.
;;;###autoload
(defun subed-word-data-fix-subtitle-timing (beg end)
"Sets subtitle starts and stops based on the word data.
Assumes words haven't been edited."
(interactive (list (if (region-active-p) (min (point) (mark)))
(if (region-active-p) (max (point) (mark)))))
(unless subed-word-data--cache
(call-interactively #'subed-word-data-load-from-file))
(setq beg (or beg (point-min)))
(setq end (if end (save-excursion
(goto-char end)
(subed-jump-to-subtitle-end)
(point))
(point-max)))
(goto-char beg)
(if (subed-subtitle-msecs-start)
(subed-jump-to-subtitle-text)
(subed-forward-subtitle-text))
(let* ((start-ms (save-excursion
(goto-char beg)
(or (subed-subtitle-msecs-start)
(progn
(subed-forward-subtitle-time-start)
(subed-subtitle-msecs-start)))))
(data (seq-drop-while
(lambda (o)
(< (or (alist-get 'start o) 0)
start-ms))
subed-word-data--cache))
candidate)
(while (and (not (> (point) end)) data)
(setq current-sub (replace-regexp-in-string "\n" " " (subed-subtitle-text)))
(let ((candidate (subed-word-data-find-approximate-match current-sub data)))
(subed-set-subtitle-time-start
(alist-get
'start
(seq-find (lambda (o) (alist-get 'start o))
(cdr candidate))))
(subed-set-subtitle-time-stop
(alist-get
'end
(seq-find (lambda (o) (alist-get 'end o))
(reverse (cdr candidate)))))
(subed-word-data-refresh-text-properties-for-subtitle)
(setq data (seq-drop data (length (cdr candidate)))))
(unless (subed-forward-subtitle-text)
(goto-char (point-max))))))
(defun subed-word-data-move-untimed-words-from-previous ()
"Move untimed words from previous subtitle to current one."
(interactive)
(save-excursion
(subed-backward-subtitle-end)
(text-property-search-backward 'subed-word-data-end)
(goto-char (next-single-property-change (point) 'subed-word-data-end))
(let* ((start (point))
(text (buffer-substring start (subed-jump-to-subtitle-end))))
(delete-region start (point))
(subed-forward-subtitle-text)
(insert text " "))))
(defvar subed-word-data-script-difference-threshold 0.2
"*If string difference is above this threshold, include original text as a comment.")
(defvar subed-word-data-oops-regexp "\\<oops\\>"
"*Regular expression matching the signal used after a false start.")
(defun subed-word-data-combine-script-and-transcript (phrases bag-of-words &optional oops-regexp keep-transcript-words)
"Use PHRASES to split the words in BAG-OF-WORDS.
If OOPS-REGEXP is non-nil, use that as the regular expression that signals a false start.
If KEEP-TRANSCRIPT-WORDS is non-nil, don't correct transcript words.
Return a list of subtitles and comments."
(let* ((phrase-length (length phrases))
(phrase-cursor 0)
(case-fold-search t)
lookback
min-candidate
result)
(while (and (< phrase-cursor phrase-length)
bag-of-words)
(when (and oops-regexp (string-match oops-regexp (car bag-of-words)))
;; discard that word and figure out where we're restarting
(setf (elt (cdr (car result)) 3)
(concat (elt (cdr (car result)) 3) " " (car bag-of-words)))
(setf (elt (cdr (car result)) 4)
(string-trim
(concat (or (elt (cdr (car result)) 4) "") "\n#+SKIP")))
(setq bag-of-words (cdr bag-of-words)))
(setq phrase-cursor
(- phrase-cursor
(car (subed-word-data-find-minimum-distance
0 (1+ (min phrase-cursor 4))
(lambda (i)
(if (< (- phrase-cursor i) 0)
most-positive-fixnum
(car (subed-word-data-find-approximate-match
(elt phrases (- phrase-cursor i))
bag-of-words
oops-regexp))))))))
;; mark the previous ones as oopses also
(dolist (o result)
(when (>= (car o) phrase-cursor)
(unless (string-match "#\\+SKIP" (or (elt (cdr o) 4) ""))
(setf (elt (cdr o) 4)
(string-trim
(concat
(or (elt (cdr o) 4) "")
"\n"
"#+SKIP"))))))
(setq candidate
(subed-word-data-find-approximate-match
(elt phrases phrase-cursor)
bag-of-words
oops-regexp))
(setq result
(cons
(cons
phrase-cursor
(if (and oops-regexp
(string-match oops-regexp (string-join (cdr candidate) " "))
(not (string-match oops-regexp (elt phrases phrase-cursor))))
(list nil 0 0 (string-join (cdr candidate) " ") "#+SKIP")
(setq phrase-cursor (1+ phrase-cursor))
(list nil 0 0
(if (or keep-transcript-words
(> (car candidate) subed-word-data-script-difference-threshold))
(string-join (cdr candidate) " ")
(elt phrases (1- phrase-cursor)))
(cond
((and keep-transcript-words (> (car candidate) 0))
(concat "#+SCRIPT: " (elt phrases (1- phrase-cursor)) "\n"
"#+DISTANCE: " (format "%.2f" (car candidate))))
((= (car candidate) 0) nil)
((> (car candidate) subed-word-data-script-difference-threshold)
(concat "#+SCRIPT: " (elt phrases (1- phrase-cursor)) "\n"
"#+DISTANCE: " (format "%.2f" (car candidate))))
((< (car candidate) subed-word-data-script-difference-threshold)
(concat "#+TRANSCRIPT: " (string-join (cdr candidate) " ") "\n"
"#+DISTANCE: " (format "%.2f" (car candidate))))
))))
result))
;; found a good match, move to the next phrase
(setq bag-of-words (seq-drop bag-of-words (length (cdr candidate)))))
(mapcar 'cdr (reverse result))))
(defun subed-word-data-use-script-file (script-file output-file &optional oops-regexp keep-transcript-words)
"Use the info from SCRIPT-FILE to correct the current transcript.
Write OUTPUT-FILE so that it uses the words and phrasing from SCRIPT-FILE,
but includes any extra phrases from TRANSCRIPT-FILE (such as oopses).
If OOPS-REGEXP is non-nil, use that as the regular expression that signals a false start.
If KEEP-TRANSCRIPT-WORDS is non-nil, don't correct current transcript words.
When called with `\\[universal-argument]', don't correct current transcript words."
(interactive (list
(read-file-name "Script: ")
(read-file-name "Output file: ")
subed-word-data-oops-regexp
current-prefix-arg))
(let* ((phrases
(if (string= "vtt" (file-name-extension script-file))
(mapcar (lambda (o) (elt o 3)) (subed-parse-file script-file))
(with-temp-buffer
(insert-file script-file)
(split-string (string-trim (buffer-string)) "\n"))))
(bag-of-words
(split-string
(if (derived-mode-p 'subed-mode)
(mapconcat (lambda (o) (elt o 3)) (subed-subtitle-list) " ")
(string-trim (buffer-string)))
"[ \n]+"))
(result (subed-word-data-combine-script-and-transcript phrases bag-of-words oops-regexp)))
(subed-create-file
output-file
result
t)
(find-file output-file)))
(defun subed-word-data-adjust-times (list start-msecs)
"Shifts times in LIST so that START-MSECS is now equivalent to 0.
Discard previous word data."
(when (stringp start-msecs) (setq start-msecs (subed-timestamp-to-msecs start-msecs)))
(seq-keep
(lambda (row)
(when (and (alist-get 'start row) (>= (alist-get 'start row) start-msecs))
(when (alist-get 'start row)
(setcdr (assoc 'start row) (- (alist-get 'start row) start-msecs)))
(when (alist-get 'end row)
(setcdr (assoc 'end row) (- (alist-get 'end row) start-msecs)))
row))
list))
(defvar subed-word-data-subtitle-match-threshold 0.7
"Fraction of subtitle with word data to be considered well-recognized.")
(defvar subed-align-preprocess-functions)
(defun subed-word-data-match-ratio ()
"Return the ratio of characters with good matches in subtitle cue text."
(require 'subed-align)
(let* ((text (subed-subtitle-text))
(properties (object-intervals text)))
(/
(apply '+
(seq-keep
(lambda (interval)
(if (plist-get (elt interval 2) 'subed-word-data-score)
(- (elt interval 1)
(elt interval 0))))
properties))
(* 1.0
;; run text through subed-align-preprocess-functions
(length
(elt
(car
(seq-reduce
(lambda (prev val)
(funcall val prev))
subed-align-preprocess-functions
(list
(list nil nil nil text))))
3))))))
(defun subed-word-data-next-subtitle-with-poor-matches ()
(interactive)
"Jump to the next subtitle without lots of word matches."
(catch 'found
(while (and (not (eobp))
(subed-forward-subtitle-text))
(when (< (subed-word-data-match-ratio) subed-word-data-subtitle-match-threshold)
(throw 'found (point))))))
(defun subed-word-data-next-subtitle-with-good-matches ()
(interactive)
"Jump to the next subtitle with lots of word matches."
(catch 'found
(while (and (not (eobp))
(subed-forward-subtitle-text))
(when (>= (subed-word-data-match-ratio) subed-word-data-subtitle-match-threshold)
(throw 'found (point))))))
(defun subed-word-data-previous-subtitle-with-poor-matches ()
(interactive)
"Jump to the previous subtitle without lots of word matches."
(catch 'found
(while (and (not (bobp))
(subed-backward-subtitle-text))
(when (< (subed-word-data-match-ratio) subed-word-data-subtitle-match-threshold)
(throw 'found (point))))))
(defun subed-word-data-previous-subtitle-with-good-matches ()
(interactive)
"Jump to the previous subtitle with lots of word matches."
(catch 'found
(while (and (not (bobp))
(subed-backward-subtitle-text))
(when (>= (subed-word-data-match-ratio) subed-word-data-subtitle-match-threshold)
(throw 'found (point))))))
(provide 'subed-word-data)
;;; subed-word-data.el ends here