Skip to content
Permalink
Browse files

[processing] Use QgsStatisticalSummary for statistic calculations

For BasicStatisticsNumbers and StatisticsByCategories algorithms.
Adds extra stats (minority, majority, 1st/3rd quartile and IQR) while
avoiding custom statistics calculation code.
  • Loading branch information
nyalldawson committed Nov 16, 2015
1 parent 526b8fc commit e15962eea97d8aea241c1e9560eb1dcada0ee1a0
@@ -43,7 +43,7 @@ class QgsStatisticalSummary
/** Constructor for QgsStatisticalSummary
* @param stats flags for statistics to calculate
*/
QgsStatisticalSummary( const QgsStatisticalSummary::Statistics& stats = QgsStatisticalSummary::Statistics( 0 ) );
QgsStatisticalSummary( const QgsStatisticalSummary::Statistics& stats = QgsStatisticalSummary::All );

virtual ~QgsStatisticalSummary();

@@ -27,6 +27,7 @@

import math

from qgis.core import QgsStatisticalSummary
from processing.core.GeoAlgorithm import GeoAlgorithm
from processing.core.parameters import ParameterVector
from processing.core.parameters import ParameterTableField
@@ -51,6 +52,11 @@ class BasicStatisticsNumbers(GeoAlgorithm):
RANGE = 'RANGE'
MEDIAN = 'MEDIAN'
UNIQUE = 'UNIQUE'
MINORITY = 'MINORITY'
MAJORITY = 'MAJORITY'
FIRSTQUARTILE = 'FIRSTQUARTILE'
THIRDQUARTILE = 'THIRDQUARTILE'
IQR = 'IQR'

def defineCharacteristics(self):
self.name, self.i18n_name = self.trAlgorithm('Basic statistics for numeric fields')
@@ -70,11 +76,16 @@ def defineCharacteristics(self):
self.addOutput(OutputNumber(self.MAX, self.tr('Maximum value')))
self.addOutput(OutputNumber(self.SUM, self.tr('Sum')))
self.addOutput(OutputNumber(self.MEAN, self.tr('Mean value')))
self.addOutput(OutputNumber(self.STD_DEV, self.tr('Standard deviation')))
self.addOutput(OutputNumber(self.COUNT, self.tr('Count')))
self.addOutput(OutputNumber(self.RANGE, self.tr('Range')))
self.addOutput(OutputNumber(self.MEDIAN, self.tr('Median')))
self.addOutput(OutputNumber(self.UNIQUE, self.tr('Number of unique values')))
self.addOutput(OutputNumber(self.STD_DEV, self.tr('Standard deviation')))
self.addOutput(OutputNumber(self.MINORITY, self.tr('Minority (rarest occurring value)')))
self.addOutput(OutputNumber(self.MAJORITY, self.tr('Majority (most frequently occurring value)')))
self.addOutput(OutputNumber(self.FIRSTQUARTILE, self.tr('First quartile')))
self.addOutput(OutputNumber(self.THIRDQUARTILE, self.tr('Third quartile')))
self.addOutput(OutputNumber(self.IQR, self.tr('Interquartile Range (IQR)')))

def processAlgorithm(self, progress):
layer = dataobjects.getObjectFromUri(
@@ -92,6 +103,11 @@ def processAlgorithm(self, progress):
meanValue = 0
medianValue = 0
stdDevValue = 0
minority = 0
majority = 0
firstQuartile = 0
thirdQuartile = 0
iqr = 0

isFirst = True
values = []
@@ -102,43 +118,30 @@ def processAlgorithm(self, progress):
current = 0
for ft in features:
if ft.attributes()[index]:
value = float(ft.attributes()[index])
if isFirst:
minValue = value
maxValue = value
isFirst = False
else:
if value < minValue:
minValue = value
if value > maxValue:
maxValue = value

values.append(value)
sumValue += value
values.append(float(ft.attributes()[index]))

current += 1
progress.setPercentage(int(current * total))

# Calculate additional values
rValue = maxValue - minValue
uniqueValue = vector.getUniqueValuesCount(layer, index)

if count > 0:
meanValue = sumValue / count
if meanValue != 0.00:
for v in values:
stdDevValue += (v - meanValue) * (v - meanValue)
stdDevValue = math.sqrt(stdDevValue / count)
cvValue = stdDevValue / meanValue

if count > 1:
tmp = sorted(values)

# Calculate median
if count % 2 == 0:
medianValue = 0.5 * (tmp[(count - 1) / 2] + tmp[count / 2])
else:
medianValue = tmp[(count + 1) / 2 - 1]
stat = QgsStatisticalSummary()
stat.calculate(values)

count = stat.count()
uniqueValue = stat.variety()
minValue = stat.min()
maxValue = stat.max()
rValue = stat.range()
sumValue = stat.sum()
meanValue = stat.mean()
medianValue = stat.median()
stdDevValue = stat.stDev()
if meanValue != 0.00:
cvValue = stdDevValue / meanValue
minority = stat.minority()
majority = stat.majority()
firstQuartile = stat.firstQuartile()
thirdQuartile = stat.thirdQuartile()
iqr = stat.interQuartileRange()

data = []
data.append('Count: ' + unicode(count))
@@ -151,6 +154,11 @@ def processAlgorithm(self, progress):
data.append('Median value: ' + unicode(medianValue))
data.append('Standard deviation: ' + unicode(stdDevValue))
data.append('Coefficient of Variation: ' + unicode(cvValue))
data.append('Minority (rarest occurring value): ' + unicode(minority))
data.append('Majority (most frequently occurring value): ' + unicode(majority))
data.append('First quartile: ' + unicode(firstQuartile))
data.append('Third quartile: ' + unicode(thirdQuartile))
data.append('Interquartile Range (IQR): ' + unicode(iqr))

self.createHTML(outputFile, data)

@@ -163,7 +171,11 @@ def processAlgorithm(self, progress):
self.setOutputValue(self.MEAN, meanValue)
self.setOutputValue(self.MEDIAN, medianValue)
self.setOutputValue(self.STD_DEV, stdDevValue)
self.setOutputValue(self.CV, cvValue)
self.setOutputValue(self.MINORITY, minority)
self.setOutputValue(self.MAJORITY, majority)
self.setOutputValue(self.FIRSTQUARTILE, firstQuartile)
self.setOutputValue(self.THIRDQUARTILE, thirdQuartile)
self.setOutputValue(self.IQR, iqr)

def createHTML(self, outputFile, algData):
f = open(outputFile, 'w')
@@ -25,7 +25,7 @@

__revision__ = '$Format:%H$'

import math
from qgis.core import QgsStatisticalSummary
from processing.core.outputs import OutputTable
from processing.core.GeoAlgorithm import GeoAlgorithm
from processing.tools import dataobjects, vector
@@ -83,36 +83,11 @@ def processAlgorithm(self, progress):

fields = ['category', 'min', 'max', 'mean', 'stddev', 'sum', 'count']
writer = output.getTableWriter(fields)
stat = QgsStatisticalSummary(QgsStatisticalSummary.Min | QgsStatisticalSummary.Max |
QgsStatisticalSummary.Mean | QgsStatisticalSummary.StDevSample |
QgsStatisticalSummary.Sum | QgsStatisticalSummary.Count)

for (cat, v) in values.items():
(min, max, mean, stddev, sum) = calculateStats(v)
record = [cat, min, max, mean, stddev, sum, len(v)]
stat.calculate(v)
record = [cat, stat.min(), stat.max(), stat.mean(), stat.sampleStDev(), stat.sum(), stat.count()]
writer.addRecord(record)


def calculateStats(values):
    """Compute summary statistics for a sequence of numbers in one pass.

    The mean and the sum of squared deviations from the mean (``M2``) are
    updated incrementally for every value, so ``values`` is traversed only
    once and does not need to be indexable.

    :param values: iterable of numeric values.
    :returns: tuple ``(min, max, mean, stddev, sum)``. ``stddev`` is the
        sample standard deviation (``n - 1`` denominator) and is 0 when
        fewer than two values are supplied. For empty input ``min`` and
        ``max`` are ``None`` and the remaining entries are 0.
    """
    n = 0
    total = 0
    mean = 0
    M2 = 0
    minvalue = None
    maxvalue = None

    for v in values:
        total += v
        n += 1
        delta = v - mean
        # float() keeps integer input from being truncated by Python 2's
        # integer division, which would corrupt the running mean.
        mean += delta / float(n)
        M2 += delta * (v - mean)
        if minvalue is None:
            minvalue = maxvalue = v
        else:
            minvalue = min(v, minvalue)
            maxvalue = max(v, maxvalue)

    # The sample variance is undefined for fewer than two values; report 0.
    variance = M2 / (n - 1) if n > 1 else 0
    stddev = math.sqrt(variance)
    return (minvalue, maxvalue, mean, stddev, total)
@@ -65,7 +65,7 @@ class CORE_EXPORT QgsStatisticalSummary
/** Constructor for QgsStatisticalSummary
* @param stats flags for statistics to calculate
*/
QgsStatisticalSummary( const QgsStatisticalSummary::Statistics& stats = Statistics( 0 ) );
QgsStatisticalSummary( const QgsStatisticalSummary::Statistics& stats = All );

virtual ~QgsStatisticalSummary();

@@ -156,7 +156,9 @@ void TestQgsStatisticSummary::individualStatCalculations()
QgsStatisticalSummary::Statistic stat = ( QgsStatisticalSummary::Statistic ) statInt;
QFETCH( double, expected );

QgsStatisticalSummary s;
//start with a summary which calculates NO statistics
QgsStatisticalSummary s( QgsStatisticalSummary::Statistics( 0 ) );
//set it to calculate just a single statistic
s.setStatistics( stat );
QCOMPARE( s.statistics(), stat );

0 comments on commit e15962e

Please sign in to comment.
You can’t perform that action at this time.