-
Notifications
You must be signed in to change notification settings - Fork 2
/
minhash.js
executable file
·137 lines (120 loc) · 9.16 KB
/
minhash.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
void function() {
'use strict';
const maxShingleID = Math.pow(2, 32-1);
const nextPrime = 4294967311;
const numHashes = 256
var crc32 = require('crc-32');
var natural = require('natural');
var NGrams = natural.NGrams;
const coeffA = [1737785533,1765439879,1662630800,1084975228,341414066,232531847,1121392417,2014673063,567934308,1318726960,1456120004,3232225,1273779376,1511214936,663223541,604587718,1997407686,2131009365,585324094,780980520,1782778798,238365119,1862924389,1675708909,649782163,880863752,910202909,2106981095,919496758,865726573,559524446,1096755754,1848180334,1394188713,471023138,762475741,547922281,1187557093,1423463148,550423718,1419548119,992336339,2043435375,1229454535,2135156067,1443236097,159293955,875949484,721564628,1252074821,1159187374,1495822558,87252092,798798571,1845239353,840224805,1195059557,593936607,255486346,449041957,1361278841,1135966001,1875817230,889786791,1014518046,1026012917,1845403510,688631152,2137881672,531836417,1076407841,511603231,254599276,1088238204,2027779316,2111595941,2079932074,864133301,33686896,1880604073,956551724,1445251369,802496410,672764388,1032376429,622093382,558018547,438625521,1381205179,583406792,1970873582,2014028347,2080663807,1615715722,1741671861,886771632,1770235641,1248598467,1312134074,1675102246,1876233443,424655246,995667879,1557648672,1369814076,128743175,379229919,1662245942,1592866018,666764026,999739796,296959731,706969897,497702683,1200216611,542066700,120621388,1668551141,11970577,734078903,1404959554,1431393908,1647870589,1437574417,1328338896,1929150206,2077334295,605543580,548337226,800717999,170301705,1904789930,1789411302,1234584482,325995317,452354940,1571561494,234745038,1218554550,1021200080,438449682,1953987503,1665463972,1689030974,1896537278,624349692,375679152,1962670760,623705629,1682772857,812662955,245000824,1242102590,2025225143,1276522357,2135359287,2017882189,1274802843,2024910302,1895518274,1727870064,1140112359,651381823,917989342,1925858609,807579034,99853798,455523270,1647585431,670076286,231332546,560116949,1966118176,2124253674,2038964869,1893386701,1344027173,381415516,1394707444,714666006,1408112613,1390955418,243006020,1884615235,2106941515,1930356318,1316133156,214769224,1427649188,1841893397,1249486641,1371529665,733140803,2057612252,804354782,639678896,119299921,437419540,2086367141,1825665844,1610762356,434530539,62396173,1577972550,1337302835,229593846,1169019038,749636942,417509306,246268088,422236005,500356674,1413663705,1496369438,1244596000,10481695,666633610,409940899,931964531,698929356,1071093926,762185603,1081675461,1334737455,156978577,300224480,1631379254,509959405,694743014,1163227547,526842785,233284124,1068627537,529839940,2019579773,989129395,53440275,1820687217,122519057,1423573677,1712078511,2002917898,1807745678,2021138569,1416796608,262662836,464432495,2063966682,332972390,1183927627,782262692,2095045987,2055700375,1315033482,75848098,1902443289];
const coeffB = [446579920,1369984879,520692052,375761642,552991714,1492539265,147104259,1855451947,1719883266,1680847976,2071143428,1543508991,1432768491,2001222128,248199313,547608357,1212957839,370942237,469440159,261644595,1140129491,1371967034,1614021151,817573484,1660943622,2094119007,477176627,767291474,2000608124,1951461737,988304151,597022215,1587069762,1112027612,1198044432,846505701,1705913574,974261712,1865630544,240199346,2014337521,534670341,776519254,199753597,1211571686,2069565179,1220442370,1092200512,2035966886,389347447,1315675840,1574557334,1878308167,1682394951,80235371,1293965656,1953269147,1385515078,735568895,1055078720,334876530,847446636,1697044376,1857046578,1556172896,156822216,1550953016,914669498,807975651,477680093,997257398,730939105,221776242,1168286217,1918046839,301949797,1453842830,294295294,1875629631,1641325515,337597221,113962389,254198926,1522946944,338124029,2032489754,1640911609,2055585184,436219508,1847892869,1817525569,1075531766,908899251,322786736,623604493,1009423664,539833262,172213385,1918905932,761821729,245871257,1884512356,2003823901,952107024,328878503,1911087554,1944178087,570151771,397231267,642093968,2039398632,165236286,328708088,1772155496,1785007757,1256076460,2041668024,1445349235,765923483,493951142,607884956,914862688,1850842682,1257535832,1126633468,1295382097,193735311,677927303,1768806732,66787,1952376128,638895541,1270272652,1321572134,1068907465,1600419808,979667169,1255425264,550382461,1545938141,117626313,786831612,30166607,1407449122,416985659,1907390800,2032404847,803432298,26856618,478834019,1904060208,82248993,718922863,1987111371,19322578,604361194,2078112203,1304741335,1264911182,2037677067,1407287472,1927440691,1213931433,765471474,978281384,579943698,1282039955,547085149,426463611,1196070482,1782874947,1227880730,1046353730,1717269814,111145624,885690372,1486294999,568746159,611907318,709719305,172351482,1810543613,1559896800,1961892893,107689289,2135512141,1404026278,117773674,1312296555,273042737,137302973,1566715528,1976063959,1527888326,1358725766,1511379655,1240314672,312923973,820964491,1468931021,1878841238,265788711,1959233187,1763853831,1149297274,1589000713,1438292888,2106945449,1449575345,230288687,22370393,1553053560,1733217625,1359351113,498400329,1876071029,295902235,1762284581,983588361,1103627307,167318749,1347502606,901638613,1043603534,1987725242,1917000974,571378711,741672007,300739876,1204963743,612528024,1259910483,869198182,1342537467,1227681953,1849522494,1952815404,1715121955,10831527,184540497,601020353,1920198307,784680279,176094269,1446507221,722181233,1004200192,145657860,1272638818,1134470991,1783178467,1267571993,1431542933,983645357,549837449,196331701];
/*
// Initially generated coefficients as shown below.
var fs = require('fs');
var coeffA = pickRandomCoeffs(numHashes);
var coeffB = pickRandomCoeffs(numHashes);
fs.writeFileSync("B.json", JSON.stringify(coeffB));
fs.writeFileSync("A.json", JSON.stringify(coeffA));
*/
module.exports = {
compare: compare,
summary: summary,
shingles: shingles,
jaccardIndex: jaccardIndex,
shingleHashList: shingleHashList
}
function compare(file1, file2) {
return similarity(minhash(file1), minhash(file2));
}
function summary(file1, file2) {
var minhashval = similarity(minhash(file1), minhash(file2));
var jaccard = jaccardIndex(shingles(file1), shingles(file2));
console.log( "Minhash similarity is "+minhashval+" (%d%% similar)", Math.round(minhashval * 100) );
console.log( "Jaccard index is "+jaccard+" (%d%% similar)", Math.round(jaccard * 100) );
}
function similarity(minhash1, minhash2) {
var total = minhash1.length;
var shared = 0;
for (var i = 0; i < minhash1.length; i++) {
if (minhash1[i] === minhash2[i]) {
shared++;
}
}
return shared / total;
}
function shingleHashList(str, kshingles=2) {
var list = [];
for (var word of shingles(str, kshingles)) {
list.push(crc32.str(word) & 0xffffffff);
}
return list;
}
function shingles(original, kshingles=2) {
var shingles = new Set();
for(var wordlist of NGrams.ngrams(original, kshingles, null, '[end]')) {
shingles.add(wordlist.join(" "));
}
return shingles;
}
function minhash(str) {
var shingles = shingleHashList(str);
var signature = [];
for (var i of range(0, coeffA.length)) {
var minHashCode = nextPrime + 1;
for (var shingleID of shingles) {
var hashCode = (coeffA[i] * shingleID + coeffB[i]) % nextPrime;
if (hashCode < minHashCode) {
minHashCode = hashCode;
}
}
signature.push(minHashCode);
}
return signature;
}
function jaccardIndex(set1, set2) {
var total = set1.size + set2.size;
var intersection = 0;
for(var shingle of set1 ) {
if(set2.has(shingle)) {
intersection++;
}
}
var union = total - intersection;
return intersection / union;
}
function createBinaryString (nMask) {
for (var nFlag = 0, nShifted = nMask, sMask = ""; nFlag < 32;
nFlag++, sMask += String(nShifted >>> 31), nShifted <<= 1);
return sMask;
}
function pickRandomCoeffs(k) {
var randList = [];
while (k > 0 ) {
var randIndex = getRandomIntInclusive(0, maxShingleID);
while (randList.indexOf(randIndex) != -1) {
randIndex = getRandomIntInclusive(0, maxShingleID);
}
randList.push(randIndex);
k = k - 1;
}
return randList;
}
function getRandomIntInclusive(min, max) {
return Math.floor(Math.random() * (max - min + 1)) + min;
}
function* range(start, stop, step){
if (typeof stop === 'undefined'){
stop = start;
start = 0;
}
if (typeof step === 'undefined'){
step = 1;
}
if ((step > 0 && start >= stop) || (step < 0 && start <= stop)){
return;
}
for (var i = start; step > 0 ? i < stop : i > stop; i += step){
yield(i);
}
}
}.call(this);