-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
gr.go
76 lines (63 loc) · 2.34 KB
/
gr.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package trees
import (
"github.com/sjwhitworth/golearn/base"
"math"
)
//
// Information gain ratio generator
//
// InformationGainRatioRuleGenerator generates DecisionTreeRules which
// maximise the information gain ratio at each node.
type InformationGainRatioRuleGenerator struct {
}
// GenerateSplitRule returns a DecisionTreeRule which maximises information
// gain ratio considering every available Attribute.
//
// IMPORTANT: passing a base.Instances with no Attributes other than the class
// variable will panic()
func (r *InformationGainRatioRuleGenerator) GenerateSplitRule(f base.FixedDataGrid) *DecisionTreeRule {
	// Every Attribute except the class Attribute(s) is a split candidate.
	nonClass := base.AttributeDifferenceReferences(f.AllAttributes(), f.AllClassAttributes())
	return r.GetSplitRuleFromSelection(nonClass, f)
}
// GetSplitRuleFromSelection returns the DecisionRule which maximizes information gain,
// considering only a subset of Attributes.
//
// IMPORTANT: passing a zero-length consideredAttributes parameter will panic()
func (r *InformationGainRatioRuleGenerator) GetSplitRuleFromSelection(consideredAttributes []base.Attribute, f base.FixedDataGrid) *DecisionTreeRule {
	var selectedAttribute base.Attribute
	var selectedVal float64
	// Parameter check
	if len(consideredAttributes) == 0 {
		panic("More Attributes should be considered")
	}
	// Track the best ratio seen so far; -Inf ensures the first finite
	// candidate is always accepted.
	maxRatio := math.Inf(-1)
	// Entropy of the class distribution before any split.
	classDist := base.GetClassDistribution(f)
	baseEntropy := getBaseEntropy(classDist)
	// Evaluate splitting on each candidate Attribute in turn.
	for _, s := range consideredAttributes {
		var localEntropy float64
		var splitVal float64
		if fAttr, ok := s.(*base.FloatAttribute); ok {
			// Numeric attributes also yield the threshold value to split on.
			localEntropy, splitVal = getNumericAttributeEntropy(f, fAttr)
		} else {
			// Categorical attributes split on each distinct value;
			// splitVal stays 0 for these.
			proposedClassDist := base.GetClassDistributionAfterSplit(f, s)
			localEntropy = getSplitEntropy(proposedClassDist)
		}
		informationGain := baseEntropy - localEntropy
		// NOTE(review): this "ratio" divides by the post-split entropy,
		// not by C4.5's split information (intrinsic value) — confirm
		// this is intentional.
		//
		// Guard the division: the bare informationGain/localEntropy is
		// NaN (0/0) when both entropies are zero, which makes every
		// comparison below fail and returns a rule with a nil Attribute.
		var informationGainRatio float64
		switch {
		case localEntropy > 0:
			informationGainRatio = informationGain / localEntropy
		case informationGain > 0:
			// Perfect split with positive gain: matches the original
			// x/0 == +Inf behaviour.
			informationGainRatio = math.Inf(1)
		default:
			// Degenerate node (no entropy, no gain): treat as neutral
			// rather than NaN so an Attribute is still selected.
			informationGainRatio = 0
		}
		if informationGainRatio > maxRatio {
			maxRatio = informationGainRatio
			selectedAttribute = s
			selectedVal = splitVal
		}
	}
	// Return the rule for the Attribute maximising the gain ratio.
	return &DecisionTreeRule{selectedAttribute, selectedVal}
}