This repository has been archived by the owner on Jul 26, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
util.js
381 lines (353 loc) · 13.7 KB
/
util.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
import { default as ml, } from 'ml';
import { default as range, } from 'lodash.range';
import { default as rangeRight, } from 'lodash.rangeright';
// Statistical primitives shared by the helpers below.
// `ml.ArrayStat` supplies mean / sum / standardDeviation; the rest are local.
const avg = ml.ArrayStat.mean;
const mean = avg;
const sum = ml.ArrayStat.sum;
// Centers `a` on its mean and divides by spread `d` (e.g. sd or range).
// The mean is hoisted so it is computed once, not once per element.
const scale = (a, d) => {
  const center = avg(a);
  return a.map(x => (x - center) / d);
};
// Largest / smallest element of `a`. The previous implementation sorted with a
// boolean comparator ((x, y) => x < y), which violates the sort-comparator
// contract (it must return a number) and can pick the wrong element; a single
// reduce pass is correct and O(n). Empty arrays yield `undefined`, matching
// the previous behavior.
const max = a => (a.length ? a.reduce((m, x) => (x > m ? x : m)) : undefined);
const min = a => (a.length ? a.reduce((m, x) => (x < m ? x : m)) : undefined);
const sd = ml.ArrayStat.standardDeviation;
/**
 * Returns the element-wise squared difference of two equal-length arrays.
 * @memberOf util
 * @param {Number[]} left
 * @param {Number[]} right
 * @returns {Number[]} (right[i] - left[i])^2 for each index i
 */
function squaredDifference(left, right) {
  return left.map((leftValue, i) => Math.pow(right[ i ] - leftValue, 2));
}
/**
 * The standard error of the estimate measures the accuracy of predictions made
 * with a regression line, comparing estimates to actual values.
 * @memberOf util
 * @see {@link http://onlinestatbook.com/2/regression/accuracy.html}
 * @example
const actuals = [ 2, 4, 5, 4, 5, ];
const estimates = [ 2.8, 3.4, 4, 4.6, 5.2, ];
const SE = ms.util.standardError(actuals, estimates);
SE.toFixed(2) // => 0.89
 * @param {Number[]} actuals - numerical samples
 * @param {Number[]} estimates - estimates values
 * @throws {RangeError} when the two arrays differ in length
 * @returns {Number} Standard Error of the Estimate
 */
function standardError(actuals = [], estimates = []) {
  if (actuals.length !== estimates.length) throw new RangeError('arrays must have the same length');
  // sqrt( SSE / (n - 2) ) — n-2 degrees of freedom for a simple regression
  const sumOfSquaredErrors = sum(squaredDifference(actuals, estimates));
  return Math.sqrt(sumOfSquaredErrors / (actuals.length - 2));
}
/**
 * Calculates the z score of each value in the sample, relative to the sample
 * mean and standard deviation.
 * @memberOf util
 * @see {@link https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.mstats.zscore.html}
 * @param {Number[]} observations - An array like object containing the sample data.
 * @returns {Number[]} The z-scores, standardized by mean and standard deviation of input array
 */
function standardScore(observations = []) {
  const center = avg(observations);
  const spread = sd(observations);
  return observations.map(observation => (observation - center) / spread);
}
/**
 * In statistics, the coefficient of determination (R², "R squared") is the
 * proportion of the variance in the dependent variable that is predictable
 * from the independent variable(s). It compares the distance of estimated
 * values to the mean: the ratio of explained variation to total variation.
 * As r² approaches 1, the Y data values lie close to the regression line;
 * as it approaches 0, they lie further from it.
 * @example
const actuals = [ 2, 4, 5, 4, 5, ];
const estimates = [ 2.8, 3.4, 4, 4.6, 5.2, ];
const r2 = ms.util.coefficientOfDetermination(actuals, estimates);
r2.toFixed(1) // => 0.6
 * @memberOf util
 * @see {@link https://en.wikipedia.org/wiki/Coefficient_of_determination} {@link http://statisticsbyjim.com/regression/standard-error-regression-vs-r-squared/}
 * @param {Number[]} actuals - numerical samples
 * @param {Number[]} estimates - estimates values
 * @throws {RangeError} when the two arrays differ in length
 * @returns {Number} r^2
 */
function coefficientOfDetermination(actuals = [], estimates = []) {
  if (actuals.length !== estimates.length) throw new RangeError('arrays must have the same length');
  const actualsMean = mean(actuals);
  // SStot: total variation of the actuals around their mean
  const totalVariation = sum(actuals.map(actual => Math.pow(actual - actualsMean, 2)));
  // SSres: variation the estimates fail to explain
  const unexplainedVariation = sum(actuals.map((actual, i) => Math.pow(actual - estimates[ i ], 2)));
  // r² = explained variation / total variation
  return (totalVariation - unexplainedVariation) / totalVariation;
}
/**
 * You can use the adjusted coefficient of determination to determine how well a multiple regression equation “fits” the sample data. The adjusted coefficient of determination is closely related to the coefficient of determination (also known as R2) that you use to test the results of a simple regression equation.
 * @example
const adjr2 = ms.util.adjustedCoefficentOfDetermination({
  rSquared: 0.944346527,
  sampleSize: 8,
  independentVariables: 2,
});
adjr2.toFixed(3) // => 0.922
 * @memberOf util
 * @see {@link http://www.dummies.com/education/math/business-statistics/how-to-calculate-the-adjusted-coefficient-of-determination/}
 * @param {Object} [options={}]
 * @param {Number[]} [options.actuals] - numerical samples (used only when rSquared is not supplied)
 * @param {Number[]} [options.estimates] - estimate values (used only when rSquared is not supplied)
 * @param {Number} [options.rSquared] - coefficent of determination
 * @param {Number} [options.sampleSize] - the sample size (defaults to actuals.length)
 * @param {Number} options.independentVariables - the number of independent variables in the regression equation
 * @returns {Number} adjusted r^2 for multiple linear regression
 */
function adjustedCoefficentOfDetermination(options = {}) {
  const { actuals, estimates, rSquared, independentVariables, sampleSize, } = options;
  // Explicit presence checks instead of `||` so a legitimate value of 0
  // (rSquared can be 0 for a model with no explanatory power) is not
  // mistaken for "missing" and silently recomputed.
  const r2 = (typeof rSquared === 'number') ? rSquared : coefficientOfDetermination(actuals, estimates);
  const n = (typeof sampleSize === 'number') ? sampleSize : actuals.length;
  const k = independentVariables;
  // adjusted r² = 1 - (1 - r²) * (n - 1) / (n - k - 1)
  return (1 - (1 - r2) * ((n - 1) / (n - (k + 1))));
}
/**
 * The coefficent of Correlation (R) decides how well the given data fits a
 * line or a curve, computed with the Pearson product-moment formula.
 * @example
const actuals = [ 39, 42, 67, 76, ];
const estimates = [ 44, 40, 60, 84, ];
const R = ms.util.coefficientOfCorrelation(actuals, estimates);
R.toFixed(4) // => 0.9408
 * @memberOf util
 * @see {@link https://calculator.tutorvista.com/r-squared-calculator.html}
 * @param {Number[]} actuals - numerical samples
 * @param {Number[]} estimates - estimates values
 * @throws {RangeError} when the two arrays differ in length
 * @returns {Number} R
 */
function coefficientOfCorrelation(actuals = [], estimates = []) {
  if (actuals.length !== estimates.length) throw new RangeError('arrays must have the same length');
  const N = actuals.length;
  const sumX = sum(actuals);
  const sumY = sum(estimates);
  // Accumulate ΣXY, ΣX² and ΣY² in a single pass.
  let sumProdXY = 0;
  let sumXSquared = 0;
  let sumYSquared = 0;
  for (let i = 0; i < N; i++) {
    sumProdXY += actuals[ i ] * estimates[ i ];
    sumXSquared += actuals[ i ] * actuals[ i ];
    sumYSquared += estimates[ i ] * estimates[ i ];
  }
  // Pearson: (NΣXY - ΣXΣY) / sqrt((NΣX² - (ΣX)²)(NΣY² - (ΣY)²))
  const numerator = N * sumProdXY - sumX * sumY;
  const denominator = Math.sqrt(
    (N * sumXSquared - Math.pow(sumX, 2)) * (N * sumYSquared - Math.pow(sumY, 2))
  );
  return numerator / denominator;
}
/**
 * The coefficent of determination (r^2) decides how well the given data fits
 * a line or a curve; here it is derived as the square of the correlation
 * coefficient R.
 * @param {Number[]} [actuals=[]]
 * @param {Number[]} [estimates=[]]
 * @returns {Number} r^2
 */
function rSquared(actuals = [], estimates = []) {
  const R = coefficientOfCorrelation(actuals, estimates);
  return Math.pow(R, 2);
}
/**
 * Returns an array of vectors as an array of arrays (column-wise transpose).
 * Column count is fixed by the first vector; subsequent vectors are expected
 * to be at most as long as the first.
 * @example
const vectors = [ [1,2,3], [1,2,3], [3,3,4], [3,3,3] ];
const arrays = pivotVector(vectors); // => [ [1,1,3,3], [2,2,3,3], [3,3,4,3] ];
 * @memberOf util
 * @param {Array[]} vectors
 * @returns {Array[]}
 */
function pivotVector(vectors = []) {
  const columns = [];
  vectors.forEach((vector, rowIndex) => {
    vector.forEach((value, columnIndex) => {
      if (rowIndex === 0) {
        // the first vector establishes one column per element
        columns.push([ value, ]);
      } else {
        columns[ columnIndex ].push(value);
      }
    });
  });
  return columns;
}
/**
 * Returns a matrix of values by combining arrays into a matrix: each input
 * array becomes a column, and row count is fixed by the first array.
 * @memberOf util
 * @example
const arrays = [
  [ 1, 1, 3, 3 ],
  [ 2, 2, 3, 3 ],
  [ 3, 3, 4, 3 ],
];
pivotArrays(arrays); //=>
// [
//   [1, 2, 3,],
//   [1, 2, 3,],
//   [3, 3, 4,],
//   [3, 3, 3,],
// ];
 * @param {Array} [arrays=[]] - array of column arrays to merge into a matrix
 * @returns {Array} a matrix of column values
 */
function pivotArrays(arrays = []) {
  if (!arrays.length) return arrays; // nothing to pivot
  // row i of the output collects element i from every column array
  return arrays[ 0 ].map((firstColumnValue, rowIndex) =>
    arrays.map(columnArray => columnArray[ rowIndex ]));
}
/**
 * Standardize features by removing the mean and scaling to unit variance.
 Centering and scaling happen on the feature vector by computing the relevant statistics on the samples. Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).
 * @memberOf util
 * @param {number[]} z - array of integers or floats
 * @returns {number[]}
 */
const StandardScaler = (values) => scale(values, sd(values));
/** Returns reusable standard-scaling transforms for a vector: a `scale`
 * function for new inputs, a `descale` inverse for outputs, the scaled
 * `values`, and the fitted statistics in `components`.
 * @param {Number[]} vector - array of numbers
 * @returns {Object} - {components: Object, scale: Function, descale: Function, values: Number[]}
 */
function StandardScalerTransforms(vector = []) {
  const average = avg(vector);
  const standard_dev = sd(vector);
  const maximum = max(vector);
  const minimum = min(vector);
  // z-score transform and its inverse, closed over the fitted statistics
  const scaleValue = (z) => (z - average) / standard_dev;
  const descaleValue = (scaledZ) => (scaledZ * standard_dev) + average;
  return {
    components: {
      average,
      standard_dev,
      maximum,
      minimum,
    },
    scale: scaleValue,
    descale: descaleValue,
    values: vector.map(scaleValue),
  };
}
/**
 * Transforms features by scaling each feature to a given range,
 scaling and translating each feature individually by the range of the sample.
 * NOTE(review): `scale` centers on the mean (not the minimum), so the output
 * is mean-centered and range-normalized rather than strictly within [0, 1].
 * @memberOf util
 * @param {number[]} z - array of integers or floats
 * @returns {number[]}
 */
const MinMaxScaler = (values) => scale(values, max(values) - min(values));
/** Returns reusable min-max-scaling transforms for a vector: a `scale`
 * function for new inputs, a `descale` inverse for outputs, the scaled
 * `values`, and the fitted statistics in `components`.
 * NOTE(review): like MinMaxScaler, this centers on the mean (not the minimum)
 * before dividing by the range.
 * @param {Number[]} vector - array of numbers
 * @returns {Object} - {components: Object, scale: Function, descale: Function, values: Number[]}
 */
function MinMaxScalerTransforms(vector = []) {
  const average = avg(vector);
  const standard_dev = sd(vector);
  const maximum = max(vector);
  const minimum = min(vector);
  // range-normalized transform and its inverse, closed over the fitted statistics
  const scaleValue = (z) => (z - average) / (maximum - minimum);
  const descaleValue = (scaledZ) => (scaledZ * (maximum - minimum)) + average;
  return {
    components: {
      average,
      standard_dev,
      maximum,
      minimum,
    },
    scale: scaleValue,
    descale: descaleValue,
    values: vector.map(scaleValue),
  };
}
/**
 * Converts a z-score into a probability via a Taylor-series approximation of
 * the standard normal CDF.
 * @memberOf util
 * @see {@link https://stackoverflow.com/questions/36575743/how-do-i-convert-probability-into-z-score}
 * @param {number} z - Number of standard deviations from the mean.
 * @param {boolean} [alpha=true] - when true, returns the upper-tail probability
 * (1 - CDF(z)); when false, returns the CDF itself, P(Z < z).
 * @returns {number} p - p-value
 */
function approximateZPercentile(z, alpha = true) {
  // Beyond ±6.5 standard deviations the CDF is 0 or 1 to within a reasonable
  // number of significant digits, so short-circuit. The early returns must
  // respect `alpha` as well — previously they ignored it, so e.g.
  // approximateZPercentile(7) jumped from ≈0 (at z=6.4) to 1.0.
  if (z < -6.5)
    return (alpha) ? 1.0 : 0.0; // CDF ≈ 0
  if (z > 6.5)
    return (alpha) ? 0.0 : 1.0; // CDF ≈ 1
  // Taylor-series expansion of the normal CDF around 0:
  // CDF(z) = 0.5 + phi(0) * Σ (-1)^k z^(2k+1) / ((2k+1) 2^k k!)
  let factK = 1;
  let sum = 0;
  let term = 1;
  let k = 0;
  const loopStop = Math.exp(-23); // stop once terms are negligibly small
  while (Math.abs(term) > loopStop) {
    term = 0.3989422804 * Math.pow(-1, k) * Math.pow(z, k) / (2 * k + 1) /
      Math.pow(2, k) * Math.pow(z, k + 1) / factK;
    sum += term;
    k++;
    factK *= k;
  }
  sum += 0.5;
  return (alpha) ? 1 - sum : sum;
}
/**
 * Public surface of the util module: statistical measures, feature scalers,
 * matrix reshaping helpers, and lodash range re-exports. Several keys are
 * aliases that point at the same function (mean/avg, r, zScore, rBarSquared).
 * @namespace
 */
export const util = {
  // lodash re-exports
  range,
  rangeRight,
  // basic array statistics (avg/sum/sd are backed by ml.ArrayStat)
  scale,
  avg,
  mean: avg, // alias of avg
  sum,
  max,
  min,
  sd,
  // feature scalers and their reusable transform factories
  StandardScaler,
  StandardScalerTransforms,
  MinMaxScaler,
  MinMaxScalerTransforms,
  LogScaler: (z) => z.map(Math.log),
  ExpScaler: (z) => z.map(Math.exp),
  // regression accuracy measures
  squaredDifference,
  standardError,
  coefficientOfDetermination,
  coefficientOfCorrelation,
  r: coefficientOfCorrelation, // alias of coefficientOfCorrelation
  rSquared,
  adjustedCoefficentOfDetermination,
  rBarSquared: adjustedCoefficentOfDetermination, // alias
  adjustedRSquared: adjustedCoefficentOfDetermination, // alias
  // matrix reshaping
  pivotVector,
  pivotArrays,
  // z-scores and probabilities
  standardScore,
  zScore: standardScore, // alias of standardScore
  approximateZPercentile,
  // approximatePercentileZ,
};