Skip to content
Permalink
Browse files
[processing] Use QgsStringStatisticalSummary in basic stats for strings
And also further optimise the algorithm
  • Loading branch information
nyalldawson committed Nov 30, 2016
1 parent e272bb3 commit ab29f2de282fc3a98a66923ef7f23b2d4311cc10
@@ -31,6 +31,9 @@

from qgis.PyQt.QtGui import QIcon

from qgis.core import (QgsStringStatisticalSummary,
QgsFeatureRequest)

from processing.core.GeoAlgorithm import GeoAlgorithm
from processing.core.parameters import ParameterVector
from processing.core.parameters import ParameterTableField
@@ -54,6 +57,8 @@ class BasicStatisticsStrings(GeoAlgorithm):
# Names of algorithm outputs: defineCharacteristics() registers an output
# under each of these keys and processAlgorithm() fills them in through
# setOutputValue().
EMPTY = 'EMPTY'
FILLED = 'FILLED'
UNIQUE = 'UNIQUE'
MIN_VALUE = 'MIN_VALUE'
MAX_VALUE = 'MAX_VALUE'

def getIcon(self):
    """Return a QIcon built from the bundled 'basic_statistics.png' image."""
    # The image path is resolved relative to the plugin directory.
    icon_path = os.path.join(pluginPath, 'images', 'ftools', 'basic_statistics.png')
    return QIcon(icon_path)
@@ -78,6 +83,8 @@ def defineCharacteristics(self):
self.addOutput(OutputNumber(self.EMPTY, self.tr('Number of empty values')))
self.addOutput(OutputNumber(self.FILLED, self.tr('Number of non-empty values')))
self.addOutput(OutputNumber(self.UNIQUE, self.tr('Number of unique values')))
self.addOutput(OutputNumber(self.MIN_VALUE, self.tr('Minimum string value')))
self.addOutput(OutputNumber(self.MAX_VALUE, self.tr('Maximum string value')))

def processAlgorithm(self, progress):
layer = dataobjects.getObjectFromUri(
@@ -86,77 +93,47 @@ def processAlgorithm(self, progress):

outputFile = self.getOutputValue(self.OUTPUT_HTML_FILE)

index = layer.fields().lookupField(fieldName)

sumValue = 0
minValue = 0
maxValue = 0
meanValue = 0
nullValues = 0
filledValues = 0

isFirst = True
values = []

features = vector.features(layer)
request = QgsFeatureRequest().setFlags(QgsFeatureRequest.NoGeometry).setSubsetOfAttributes([fieldName],
layer.fields())
stat = QgsStringStatisticalSummary()
features = vector.features(layer, request)
count = len(features)
total = 100.0 / count
total = 100.0 / float(count)
for current, ft in enumerate(features):
value = ft[fieldName]
if value:
length = float(len(value))
filledValues += 1
else:
nullValues += 1
progress.setPercentage(int(current * total))
continue

if isFirst:
minValue = length
maxValue = length
isFirst = False
else:
if length < minValue:
minValue = length
if length > maxValue:
maxValue = length

values.append(length)
sumValue += length

stat.addValue(ft[fieldName])
progress.setPercentage(int(current * total))

n = float(len(values))
if n > 0:
meanValue = sumValue / n

uniqueValues = vector.getUniqueValuesCount(layer, index)
stat.finalize()

data = []
data.append(self.tr('Analyzed layer: {}').format(layer.name()))
data.append(self.tr('Analyzed field: {}').format(fieldName))
data.append(self.tr('Minimum length: {}').format(minValue))
data.append(self.tr('Maximum length: {}').format(maxValue))
data.append(self.tr('Mean length: {}').format(meanValue))
data.append(self.tr('Filled values: {}').format(filledValues))
data.append(self.tr('NULL (missing) values: {}').format(nullValues))
data.append(self.tr('Count: {}').format(count))
data.append(self.tr('Unique: {}').format(uniqueValues))
data.append(self.tr('Minimum length: {}').format(stat.minLength()))
data.append(self.tr('Maximum length: {}').format(stat.maxLength()))
data.append(self.tr('Mean length: {}').format(stat.meanLength()))
data.append(self.tr('Filled values: {}').format(stat.count() - stat.countMissing()))
data.append(self.tr('NULL (missing) values: {}').format(stat.countMissing()))
data.append(self.tr('Count: {}').format(stat.count()))
data.append(self.tr('Unique: {}').format(stat.countDistinct()))
data.append(self.tr('Minimum string value: {}').format(stat.min()))
data.append(self.tr('Maximum string value: {}').format(stat.max()))

self.createHTML(outputFile, data)

self.setOutputValue(self.MIN_LEN, minValue)
self.setOutputValue(self.MAX_LEN, maxValue)
self.setOutputValue(self.MEAN_LEN, meanValue)
self.setOutputValue(self.FILLED, filledValues)
self.setOutputValue(self.EMPTY, nullValues)
self.setOutputValue(self.COUNT, count)
self.setOutputValue(self.UNIQUE, uniqueValues)
self.setOutputValue(self.MIN_LEN, stat.minLength())
self.setOutputValue(self.MAX_LEN, stat.maxLength())
self.setOutputValue(self.MEAN_LEN, stat.meanLength())
self.setOutputValue(self.FILLED, stat.count() - stat.countMissing())
self.setOutputValue(self.EMPTY, stat.countMissing())
self.setOutputValue(self.COUNT, stat.count())
self.setOutputValue(self.UNIQUE, stat.countDistinct())
self.setOutputValue(self.MIN_VALUE, stat.min())
self.setOutputValue(self.MAX_VALUE, stat.max())

def createHTML(self, outputFile, algData):
    """Write the collected statistics to a minimal UTF-8 HTML report.

    :param outputFile: path of the HTML file to create; it is opened in
        'w' mode, so existing content is replaced.
    :param algData: iterable of report lines; each item is converted with
        ``str()`` and written as its own ``<p>`` paragraph.
    """
    with codecs.open(outputFile, 'w', encoding='utf-8') as f:
        f.write('<html><head>\n')
        f.write('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>\n')
        for s in algData:
            f.write('<p>' + str(s) + '</p>\n')
        # Close the document exactly once, newline-terminated. Writing the
        # closing tags twice would produce malformed HTML.
        f.write('</body></html>\n')
@@ -2,11 +2,13 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
<p>Analyzed layer: multipolys.gml</p>
<p>Analyzed field: Bname</p>
<p>Minimum length: 4.0</p>
<p>Maximum length: 4.0</p>
<p>Mean length: 4.0</p>
<p>Minimum length: 0</p>
<p>Maximum length: 4</p>
<p>Mean length: 3.0</p>
<p>Filled values: 3</p>
<p>NULL (missing) values: 1</p>
<p>Count: 4</p>
<p>Unique: 2</p>
</body></html>
<p>Minimum string value: Test</p>
<p>Maximum string value: Test</p>
</body></html>

0 comments on commit ab29f2d

Please sign in to comment.