From 1b5014bdb27365676d92e131e1b6579e5e0fa4f5 Mon Sep 17 00:00:00 2001 From: James Kolb Date: Mon, 16 May 2016 20:21:17 -0400 Subject: [PATCH] Fix unicode splitting bug. The old code finds prefixes/suffixes by rune index but splits strings at byte index instead. This can split unicode chars into invalid chars for large files and can cause panics in diffCharsToLines later on when lookup is done on the invalid runes. --- diffmatchpatch/dmp.go | 30 +++++++++++++++--------------- diffmatchpatch/dmp_test.go | 5 +++++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/diffmatchpatch/dmp.go b/diffmatchpatch/dmp.go index 5b5453f..9c41a49 100644 --- a/diffmatchpatch/dmp.go +++ b/diffmatchpatch/dmp.go @@ -1230,19 +1230,19 @@ func (dmp *DiffMatchPatch) DiffCleanupMerge(diffs []Diff) []Diff { countDelete := 0 countInsert := 0 commonlength := 0 - textDelete := "" - textInsert := "" + textDelete := []rune(nil) + textInsert := []rune(nil) for pointer < len(diffs) { switch diffs[pointer].Type { case DiffInsert: countInsert++ - textInsert += diffs[pointer].Text + textInsert = append(textInsert, []rune(diffs[pointer].Text)...) pointer++ break case DiffDelete: countDelete++ - textDelete += diffs[pointer].Text + textDelete = append(textDelete, []rune(diffs[pointer].Text)...) pointer++ break case DiffEqual: @@ -1250,24 +1250,24 @@ func (dmp *DiffMatchPatch) DiffCleanupMerge(diffs []Diff) []Diff { if countDelete+countInsert > 1 { if countDelete != 0 && countInsert != 0 { // Factor out any common prefixies. - commonlength = dmp.DiffCommonPrefix(textInsert, textDelete) + commonlength = commonPrefixLength(textInsert, textDelete) if commonlength != 0 { x := pointer - countDelete - countInsert if x > 0 && diffs[x-1].Type == DiffEqual { - diffs[x-1].Text += textInsert[:commonlength] + diffs[x-1].Text += string(textInsert[:commonlength]) } else { - diffs = append([]Diff{Diff{DiffEqual, textInsert[:commonlength]}}, diffs...) + diffs = append([]Diff{Diff{DiffEqual, string(textInsert[:commonlength])}}, diffs...) pointer++ } textInsert = textInsert[commonlength:] textDelete = textDelete[commonlength:] } // Factor out any common suffixies. - commonlength = dmp.DiffCommonSuffix(textInsert, textDelete) + commonlength = commonSuffixLength(textInsert, textDelete) if commonlength != 0 { insertIndex := len(textInsert) - commonlength deleteIndex := len(textDelete) - commonlength - diffs[pointer].Text = textInsert[insertIndex:] + diffs[pointer].Text + diffs[pointer].Text = string(textInsert[insertIndex:]) + diffs[pointer].Text textInsert = textInsert[:insertIndex] textDelete = textDelete[:deleteIndex] } @@ -1276,16 +1276,16 @@ func (dmp *DiffMatchPatch) DiffCleanupMerge(diffs []Diff) []Diff { if countDelete == 0 { diffs = splice(diffs, pointer-countInsert, countDelete+countInsert, - Diff{DiffInsert, textInsert}) + Diff{DiffInsert, string(textInsert)}) } else if countInsert == 0 { diffs = splice(diffs, pointer-countDelete, countDelete+countInsert, - Diff{DiffDelete, textDelete}) + Diff{DiffDelete, string(textDelete)}) } else { diffs = splice(diffs, pointer-countDelete-countInsert, countDelete+countInsert, - Diff{DiffDelete, textDelete}, - Diff{DiffInsert, textInsert}) + Diff{DiffDelete, string(textDelete)}, + Diff{DiffInsert, string(textInsert)}) } pointer = pointer - countDelete - countInsert + 1 @@ -1304,8 +1304,8 @@ func (dmp *DiffMatchPatch) DiffCleanupMerge(diffs []Diff) []Diff { } countInsert = 0 countDelete = 0 - textDelete = "" - textInsert = "" + textDelete = nil + textInsert = nil break } } diff --git a/diffmatchpatch/dmp_test.go b/diffmatchpatch/dmp_test.go index 403166c..7d64b6e 100644 --- a/diffmatchpatch/dmp_test.go +++ b/diffmatchpatch/dmp_test.go @@ -351,6 +351,11 @@ func Test_diffCleanupMerge(t *testing.T) { diffs = dmp.DiffCleanupMerge(diffs) assertDiffEqual(t, []Diff{Diff{DiffEqual, "xa"}, Diff{DiffDelete, "d"}, Diff{DiffInsert, "b"}, Diff{DiffEqual, "cy"}}, diffs) + // Same test as above but with unicode (\u0101 will appear in diffs with at least 257 unique lines) + diffs = []Diff{Diff{DiffEqual, "x"}, Diff{DiffDelete, "\u0101"}, Diff{DiffInsert, "\u0101bc"}, Diff{DiffDelete, "dc"}, Diff{DiffEqual, "y"}} + diffs = dmp.DiffCleanupMerge(diffs) + assertDiffEqual(t, []Diff{Diff{DiffEqual, "x\u0101"}, Diff{DiffDelete, "d"}, Diff{DiffInsert, "b"}, Diff{DiffEqual, "cy"}}, diffs) + // Slide edit left. diffs = []Diff{Diff{DiffEqual, "a"}, Diff{DiffInsert, "ba"}, Diff{DiffEqual, "c"}} diffs = dmp.DiffCleanupMerge(diffs)