## 9.2 连续与离散型特征的树的构建

In [1]:
from numpy import *
class treeNode():
    """Plain record describing one binary split of a tree.

    Attributes:
        featureToSplitOn: value passed as ``feat`` (the feature being split on).
        valueOfSplit: value passed as ``val`` (the split threshold).
        rightBranch: value passed as ``right``.
        leftBranch: value passed as ``left``.
    """
    def __init__(self,feat,val,right,left):
        self.featureToSplitOn=feat
        self.valueOfSplit=val
        # Both branches must be bound to ``self``; as bare local names they
        # were discarded as soon as __init__ returned.
        self.rightBranch=right
        self.leftBranch=left

def loadDataSet(fileName):
    """Read a tab-separated text file of numbers into a list of rows.

    Args:
        fileName: path of the file to read; every non-blank line holds
            fields separated by tab characters, each parsable by float().

    Returns:
        A list containing one list of floats per non-blank line, in file
        order.

    Raises:
        ValueError: if a field cannot be converted by float().
    """
    dataMat=[]
    with open(fileName) as fr:
        for line in fr:
            stripped=line.strip()
            if not stripped:
                # A blank line carries no sample; skip it instead of
                # failing on float('').
                continue
            # list(...) materialises the row: on Python 3 a bare map() is a
            # lazy iterator rather than a list of floats.
            dataMat.append(list(map(float,stripped.split('\t'))))
    return dataMat

def binSplitDataSet(dataSet,feature,value):
    """Partition the rows of ``dataSet`` by comparing one column to a threshold.

    Args:
        dataSet: 2-D numpy matrix/array with one sample per row.
        feature: index of the column to compare.
        value: threshold the column is compared against.

    Returns:
        A pair ``(above, rest)``: the rows whose ``feature`` column is
        strictly greater than ``value``, and the rows where it is less than
        or equal to ``value``.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] yields the row indices where the comparison holds.
    aboveRows = nonzero(column > value)[0]
    restRows = nonzero(column <= value)[0]
    return dataSet[aboveRows, :], dataSet[restRows, :]

def regLeaf(dataSet):
    """Create a regression leaf: the mean of the last (target) column."""
    targets = dataSet[:, -1]
    return mean(targets)

def regErr(dataSet):
    """Return the squared error of the last (target) column.

    Computed as the variance of the column multiplied by the number of rows.
    """
    rowCount = shape(dataSet)[0]
    targets = dataSet[:, -1]
    return var(targets) * rowCount

def chooseBestSplit(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
    """Search every feature/value pair for the split with the lowest error.

    Args:
        dataSet: numpy matrix (``.A1`` is used) whose last column is the target.
        leafType: callable building a leaf value from a data set.
        errType: callable returning the error of a data set.
        ops: pair ``(tolS, tolN)`` - the minimum error reduction a split must
            achieve, and the minimum number of rows allowed on each side.

    Returns:
        ``(None, leaf, None, None)`` when no split should be made (all targets
        equal, error reduction below ``tolS``, or a side smaller than
        ``tolN``); otherwise ``(featureIndex, splitValue, mat0, mat1)`` where
        ``mat0`` holds the rows above the split value and ``mat1`` the rest.
    """
    minGain, minRows = ops[0], ops[1]

    # Every target identical: nothing to gain from splitting.
    targetValues = dataSet[:, -1].A1.tolist()
    if len(set(targetValues)) == 1:
        return None, leafType(dataSet), None, None

    featureCount = shape(dataSet)[1] - 1  # last column is the target
    baseErr = errType(dataSet)
    lowestErr, chosenFeat, chosenVal = inf, 0, 0

    for col in range(featureCount):
        candidates = set(dataSet[:, col].A1.tolist())
        for threshold in candidates:
            above, rest = binSplitDataSet(dataSet, col, threshold)
            if shape(above)[0] < minRows or shape(rest)[0] < minRows:
                continue  # one side is too small to be a valid split
            splitErr = errType(above) + errType(rest)
            if splitErr < lowestErr:
                lowestErr, chosenFeat, chosenVal = splitErr, col, threshold

    # Improvement too small (this also covers "no valid split found",
    # where lowestErr is still inf).
    if baseErr - lowestErr < minGain:
        return None, leafType(dataSet), None, None

    above, rest = binSplitDataSet(dataSet, chosenFeat, chosenVal)
    if shape(above)[0] < minRows or shape(rest)[0] < minRows:
        return None, leafType(dataSet), None, None
    return chosenFeat, chosenVal, above, rest

def createTree(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
    """Recursively build a tree as nested dicts.

    Args:
        dataSet: numpy matrix whose last column is the target.
        leafType: callable building a leaf value from a data set.
        errType: callable returning the error of a data set.
        ops: ``(tolS, tolN)`` stopping parameters forwarded to
            chooseBestSplit.

    Returns:
        A leaf value when chooseBestSplit decides not to split; otherwise a
        dict with keys ``'spInd'``, ``'spVal'``, ``'left'``, ``'right'`` and,
        when ``leafType`` is named ``regLeaf``, ``'mean'`` (mean of the
        target column of the rows reaching this node; read by prune()).
    """
    feat,val,lSet,rSet=chooseBestSplit(dataSet,leafType,errType,ops)
    # Identity test: feature index 0 is a valid split and must not be
    # confused with "no split".
    if feat is None:
        return val
    retTree={}
    retTree['spInd']=feat
    retTree['spVal']=val
    retTree['left']=createTree(lSet,leafType,errType,ops)
    retTree['right']=createTree(rSet,leafType,errType,ops)
    # Default of None keeps callables without a __name__ (e.g.
    # functools.partial objects) from raising AttributeError here.
    if getattr(leafType,'__name__',None)=='regLeaf':
        retTree['mean']=mean(dataSet[:,-1])
    return retTree

In [2]:
# Load the tab-separated samples of ex0.txt and wrap them in a numpy matrix.
dataSet=mat(loadDataSet('ex0.txt'))

In [3]:
# Build a regression tree with tolS=10 and tolN=4; the returned nested dict
# is what the notebook displays below.
createTree(dataSet,ops=(10,4))

{'left': {'left': {'left': 3.9871631999999999,
   'mean': 3.4672557710843379,
   'right': 2.9836209534883724,
   'spInd': 1,
   'spVal': 0.797583},
  'mean': 2.9675496160000003,
  'right': 1.980035071428571,
  'spInd': 1,
  'spVal': 0.582002},
 'mean': 2.0036986799999998,
 'right': {'left': 1.0289583666666666,
  'mean': 0.39728045333333334,
  'right': -0.023838155555555553,
  'spInd': 1,
  'spVal': 0.197834},
 'spInd': 1,
 'spVal': 0.39435}

## 9.4.2 后剪枝

In [4]:
def isTree(obj):
    """Return True when ``obj`` is an internal (non-leaf) node, i.e. a dict.

    Leaves are stored as plain values, so anything that is not a dict counts
    as a leaf.
    """
    # isinstance also accepts dict subclasses and avoids comparing type
    # names as strings.
    return isinstance(obj,dict)

def getMean(tree):
    """Collapse a subtree into one number.

    Each child that is itself a tree is first replaced, in place, by its own
    collapsed value; the result is the plain average of the two children.

    Args:
        tree: dict node with ``'left'`` and ``'right'`` entries.

    Returns:
        ``(left + right) / 2.0`` after both children have been collapsed.
    """
    for side in ('left', 'right'):
        child = tree[side]
        if isTree(child):
            tree[side] = getMean(child)
    return (tree['left'] + tree['right']) / 2.0

def prune(tree,testData):
    """Post-prune a regression tree against held-out data.

    Subtrees are pruned first; then, if both children of a node are leaves,
    the node is replaced by its stored ``'mean'`` whenever that lowers the
    squared error on ``testData``.

    Args:
        tree: dict node as produced by createTree (modified in place). The
            ``'mean'`` key is read when a merge is evaluated, so the tree is
            expected to have been built with the regLeaf leaf type.
        testData: numpy matrix of test rows whose last column is the target.

    Returns:
        The (possibly modified) ``tree`` dict, or a single leaf value when
        the node was collapsed.
    """
    # No test rows reach this node: collapse the whole subtree.
    if shape(testData)[0]==0:
        return getMean(tree)

    ltestData, rtestData=binSplitDataSet(testData,tree['spInd'],tree['spVal'])

    # Prune each child that is still a subtree. The right child must be
    # looked up in the tree: testing the literal string 'right' is always
    # False and left right-hand subtrees unpruned whenever the left child
    # was a leaf.
    if isTree(tree['left']):
        tree['left']=prune(tree['left'],ltestData)
    if isTree(tree['right']):
        tree['right']=prune(tree['right'],rtestData)

    # Merging is only considered when both children are leaves.
    if isTree(tree['left']) or isTree(tree['right']):
        return tree

    leftN=shape(ltestData)[0]
    rightN=shape(rtestData)[0]

    # With test rows on only one side, keep the leaf of the populated side.
    if leftN==0:
        return tree['right']
    if rightN==0:
        return tree['left']

    errorNoMerge=sum(power(ltestData[:,-1]-tree['left'],2))+sum(power(rtestData[:,-1] - tree['right'],2))
    errorMerge=sum(power(testData[:,-1]-tree['mean'],2))
    if errorMerge<errorNoMerge:
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("merging")
        return tree['mean']
    return tree

In [5]:
# Load ex2.txt and grow a regression tree with tolS=0.5 and tolN=20.
myMat2=mat(loadDataSet('ex2.txt'))
myTree=createTree(myMat2,ops=(0.5,20))

In [6]:
# Post-prune myTree against the rows of ex2test.txt. prune() modifies the
# tree in place, prints "merging" for each collapsed node, and its return
# value is what the notebook displays below.
myDataTest=mat(loadDataSet('ex2test.txt'))
prune(myTree,myDataTest)

merging


{'left': {'left': {'left': 99.627333307692297,
   'mean': 96.316547237288134,
   'right': 89.860514400000014,
   'spInd': 0,
   'spVal': 0.833026},
  'mean': 101.35815937735848,
  'right': 107.68699163829788,
  'spInd': 0,
  'spVal': 0.729397},
 'mean': 52.480096384999996,
 'right': {'left': 2.9213988695652184,
  'mean': -2.6377193297872341,
  'right': {'left': -10.621031625000001,
   'mean': -4.438560436619718,
   'right': {'left': -3.2337600000000002,
    'mean': -1.2815538723404256,
    'right': 1.3539243999999997,
    'spInd': 0,
    'spVal': 0.131833},
   'spInd': 0,
   'spVal': 0.284794},
  'spInd': 0,
  'spVal': 0.382037},
 'spInd': 0,
 'spVal': 0.499171}

## 9.5 模型树

In [7]:
def linearSolve(dataSet):
    """Fit ordinary least squares to a data set via the normal equations.

    Args:
        dataSet: numpy matrix with n rows and m columns; the first m-1
            columns are features and the last column is the target.

    Returns:
        ``(ws, X, Y)``: the weight column (intercept first), the n x m design
        matrix whose first column is all ones, and the target column.

    Raises:
        NameError: if the determinant of ``X.T*X`` is exactly 0.0 (the
            exception type is kept for callers that already catch it).
    """
    n,m=shape(dataSet)
    # asmatrix is the function the old ``mat`` alias pointed to; the alias
    # itself is no longer exported by NumPy 2.x.
    X=asmatrix(ones((n,m)))
    X[:,1:m]=dataSet[:,0:m-1]  # column 0 stays all ones (constant term)
    Y=dataSet[:,-1]
    xTx=X.T*X
    if linalg.det(xTx)==0.0:
        raise NameError('This matrix is singular, cannot do inverse,try increasing the second value of ops')
    ws=xTx.I*(X.T*Y)
    return ws,X,Y

def modelLeaf(dataSet):
    """Create a model-tree leaf: the linear weights fitted to ``dataSet``."""
    weights, _design, _targets = linearSolve(dataSet)
    return weights

def modelErr(dataSet):
    """Return the squared error of the linear fit on ``dataSet``.

    The data set is fitted with linearSolve and the squared differences
    between the targets and the fitted values are summed.
    """
    weights, design, targets = linearSolve(dataSet)
    residuals = targets - design * weights
    return sum(power(residuals, 2))

In [8]:
# Reload ex2.txt and grow a model tree (linear-model leaves, residual error)
# with tolS=10000 and tolN=10; the returned dict is displayed below.
myMat2=mat(loadDataSet('ex2.txt'))
createTree(myMat2,modelLeaf,modelErr,(10000,10))

{'left': matrix([[ 119.62969999],
         [ -24.09733657]]), 'right': matrix([[-2.39368033],
         [-0.89717985]]), 'spInd': 0, 'spVal': 0.499171}

## 9.6 示例：树回归与标准回归

In [9]:
def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf.

    The leaf value itself is the prediction; ``inDat`` is accepted only so
    the signature matches modelTreeEval and is not used.
    """
    prediction = float(model)
    return prediction

def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf on one sample.

    Args:
        model: weight column of a linear model, intercept first (as returned
            by modelLeaf).
        inDat: 1 x m matrix holding the sample's m feature values.

    Returns:
        The prediction ``[1, inDat] * model`` as a Python float.
    """
    m=shape(inDat)[1]
    # asmatrix is the function the old ``mat`` alias pointed to; the alias
    # itself is no longer exported by NumPy 2.x.
    X=asmatrix(ones((1,m+1)))
    X[:,1:m+1]=inDat  # column 0 stays 1 for the intercept
    # Take the single element explicitly: float() on a 1x1 matrix is
    # deprecated in recent NumPy releases.
    return float((X*model)[0,0])

def treeForeCast(tree,inData,modelEval=regTreeEval):
    """Predict the target of one sample by walking the tree.

    Args:
        tree: dict node as produced by createTree, or a leaf value.
        inData: the sample's feature values, either a 1 x m matrix (as
            passed by createForeCast) or a flat sequence.
        modelEval: callable ``(leaf, inData)`` turning a leaf into a
            prediction.

    Returns:
        Whatever ``modelEval`` returns for the leaf the sample falls into.
    """
    if not isTree(tree):
        return modelEval(tree,inData)

    # Flatten before indexing: on a 1 x m matrix ``inData[k]`` selects row k,
    # not feature k, which fails for k >= 1 and yields a whole row for k == 0.
    featValue=asarray(inData).ravel()[tree['spInd']]

    # Values above the split go left, matching binSplitDataSet/createTree.
    if featValue > tree['spVal']:
        branch=tree['left']
    else:
        branch=tree['right']
    # The recursive call evaluates the branch directly when it is a leaf.
    return treeForeCast(branch,inData,modelEval)
        
def createForeCast(tree,testData,modelEval=regTreeEval):
    """Predict the target for every row of ``testData``.

    Args:
        tree: tree (or leaf) to evaluate, as produced by createTree.
        testData: feature rows; ``len(testData)`` predictions are made.
        modelEval: leaf evaluator forwarded to treeForeCast.

    Returns:
        An n x 1 matrix holding one prediction per row.
    """
    n=len(testData)
    # asmatrix is the function the old ``mat`` alias pointed to; the alias
    # itself is no longer exported by NumPy 2.x.
    yHat=asmatrix(zeros((n,1)))
    for i in range(n):
        yHat[i,0]=treeForeCast(tree,asmatrix(testData[i]),modelEval)
    return yHat

In [10]:
# Fit a regression tree (tolS=1, tolN=20) on the training file, forecast from
# column 0 of the test file and correlate the forecasts with column 1.
trainMat=mat(loadDataSet('bikeSpeedVsIq_train.txt'))
testMat=mat(loadDataSet('bikeSpeedVsIq_test.txt'))
myTree=createTree(trainMat,ops=(1,20))
yHat=createForeCast(myTree,testMat[:,0])
corrcoef(yHat,testMat[:,1],rowvar=0)

array([[ 1.        ,  0.96408523],
       [ 0.96408523,  1.        ]])

In [11]:
# Same evaluation with a model tree: linear-model leaves are evaluated by
# modelTreeEval, and the forecasts are again correlated with column 1.
myTree=createTree(trainMat,modelLeaf,modelErr,ops=(1,20))
yHat=createForeCast(myTree,testMat[:,0],modelTreeEval)
corrcoef(yHat,testMat[:,1],rowvar=0)

array([[ 1.        ,  0.97604122],
       [ 0.97604122,  1.        ]])