In [None]:
my_data=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]

In [46]:
class decisionnode:
    def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
        self.col = col
        self.value = value
        self.results = results
        self.tb = tb
        self.fb = fb

In [47]:
# Divides a set on a specific column. Can handle numeric
# or nominal values
def divideset(rows,column,value):
    # Make a function that tells us if a row is in 
    # the first group (true) or the second group (false)
    split_function = None
    if isinstance(value,int) or isinstance(value,float):
        split_function = lambda row:row[column] >= value
    else:
        split_function = lambda row:row[column] == value

    # Divide the rows into two sets and return them
    set1 = [row for row in rows if split_function(row)]
    set2 = [row for row in rows if not split_function(row)]
    return (set1,set2)


In [None]:
divideset(my_data,2,'yes')

In [61]:
# Create counts of possible results (the last column of 
# each row is the result)
def uniquecounts(rows):
    results = {}
    for row in rows:
        # The result is the last column
        r = row[len(row)-1]
        if r not in results: results[r] = 0
        results[r] += 1
    return results

# 基尼不纯度

In [48]:
# Probability that a randomly placed item will
# be in the wrong category
def giniimpurity(rows):
    total = len(rows)
    counts = uniquecounts(rows)
    imp = 0
    for k1 in counts:
        p1 = float(counts[k1])/total
        for k2 in counts:
            if k1 == k2: continue
            p2 = float(counts[k2])/total
            imp += p1 * p2
    return imp

# 熵

In [49]:
# Entropy is the sum of p(x)log(p(x)) across all 
# the different possible results
def entropy(rows):
    from math import log
    log2 = lambda x:log(x)/log(2)  
    results = uniquecounts(rows)
    # Now calculate the entropy
    ent = 0.0
    for r in results.keys():
        p = float(results[r])/len(rows)
        ent = ent - p * log2(p)
    return ent

In [None]:
giniimpurity(my_data)

In [None]:
entropy(my_data)

In [None]:
set1,set2 = divideset(my_data,2,'yes')

In [None]:
giniimpurity(set1)

In [None]:
giniimpurity(set2)

In [None]:
entropy(set1)

In [None]:
entropy(set2)

In [50]:
def buildtree(rows,scoref=entropy):
    if len(rows) == 0: return decisionnode()
    current_score = scoref(rows)

    # Set up some variables to track the best criteria
    best_gain = 0.0
    best_criteria = None
    best_sets = None
  
    column_count = len(rows[0])-1
    for col in range(0,column_count):
        # Generate the list of different values in
        # this column
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
            
        # Now try dividing the rows up for each value
        # in this column
        for value in column_values.keys():
            (set1,set2) = divideset(rows,col,value)

            # Information gain
            p = float(len(set1))/len(rows)
            gain = current_score - p*scoref(set1) - (1-p)*scoref(set2)
            if gain > best_gain and len(set1)>0 and len(set2)>0:
                best_gain = gain
                best_criteria = (col,value)
                best_sets = (set1,set2)
    # Create the sub branches   
    if best_gain > 0:
        trueBranch = buildtree(best_sets[0])
        falseBranch = buildtree(best_sets[1])
        return decisionnode(col=best_criteria[0],value=best_criteria[1],tb=trueBranch,fb=falseBranch)
    else:
        return decisionnode(results=uniquecounts(rows))

In [None]:
tree = buildtree(my_data)

In [51]:
def printtree(tree,indent=''):
    # Is this a leaf node?
    if tree.results != None:
        print str(tree.results)
    else:
        # Print the criteria
        print str(tree.col) + ':' + str(tree.value) + '? '

        # Print the branches
        print indent + 'T->',
        printtree(tree.tb,indent + '  ')
        print indent + 'F->',
        printtree(tree.fb,indent + '  ')

In [None]:
printtree(tree)

# 绘制图形

In [52]:
from PIL import Image,ImageDraw

In [53]:
def getwidth(tree):
    if tree.tb == None and tree.fb == None: return 1
    return getwidth(tree.tb) + getwidth(tree.fb)

def getdepth(tree):
    if tree.tb == None and tree.fb == None: return 0
    return max(getdepth(tree.tb),getdepth(tree.fb)) + 1

In [54]:
def drawtree(tree,jpeg='tree.jpg'):
    w = getwidth(tree)*100
    h = getdepth(tree)*100 + 120

    img = Image.new('RGB',(w,h),(255,255,255))
    draw = ImageDraw.Draw(img)

    drawnode(draw,tree,w/2,20)
    img.save(jpeg,'JPEG')

In [55]:
def drawnode(draw,tree,x,y):
    if tree.results == None:
        # Get the width of each branch
        w1 = getwidth(tree.fb)*100
        w2 = getwidth(tree.tb)*100

        # Determine the total space required by this node
        left = x-(w1+w2)/2
        right = x+(w1+w2)/2

        # Draw the condition string
        draw.text((x-20,y-10),str(tree.col) + ':' + str(tree.value),(0,0,0))

        # Draw links to the branches
        draw.line((x,y,left + w1/2,y + 100),fill=(255,0,0))
        draw.line((x,y,right - w2/2,y + 100),fill=(255,0,0))

        # Draw the branch nodes
        drawnode(draw,tree.fb,left + w1/2,y+100)
        drawnode(draw,tree.tb,right - w2/2,y+100)
    else:
        txt=' \n'.join(['%s:%d' %v for v in tree.results.items()])
        draw.text((x - 20,y),txt,(0,0,0))

In [None]:
drawtree(tree)

In [56]:
def classify(observation,tree):
    if tree.results != None:
        return tree.results
    else:
        v = observation[tree.col]
        branch = None
        if isinstance(v,int) or isinstance(v,float):
            if v >= tree.value: branch = tree.tb
            else: branch = tree.fb
        else:
            if v == tree.value: branch = tree.tb
            else: branch = tree.fb
    return classify(observation,branch)

In [None]:
classify(['(direct)','USA','yes',51],tree)

In [57]:
def prune(tree,mingain):
    # If the branches aren't leaves, then prune them
    if tree.tb.results == None:
        prune(tree.tb,mingain)
    if tree.fb.results == None:
        prune(tree.fb,mingain)
    
    # If both the subbranches are now leaves, see if they
    # should merged
    if tree.tb.results !=None and tree.fb.results != None:
        # Build a combined dataset
        tb,fb = [],[]
        for v,c in tree.tb.results.items():
            tb += [[v]] * c
        for v,c in tree.fb.results.items():
            fb += [[v]] * c
    
        # Test the reduction in entropy
        delta = entropy(tb + fb) - (entropy(tb) + entropy(fb)/2)

        if delta < mingain:
            # Merge the branches
            tree.tb,tree.fb = None,None
            tree.results = uniquecounts(tb + fb)

In [None]:
a = [1,2]
b = [3,4]
c = a+ b
c

In [None]:
d = a * 3
d

In [None]:
e = [[1],[2]]
f = []
f += e * 3
f

In [None]:
f += e * 3
f

In [None]:
prune(tree,0.1)

In [None]:
printtree(tree)

In [None]:
prune(tree,1.0)

In [None]:
printtree(tree)

In [None]:
prune(tree,10.0)
printtree(tree)

In [58]:
def mdclassify(observation,tree):
    if tree.results != None:
        return tree.results
    else:
        v = observation[tree.col]
        if v == None:
            tr,fr = mdclassify(observation,tree.tb),mdclassify(observation,tree.fb)
            tcount = sum(tr.values())
            fcount = sum(fr.values())
            tw = float(tcount)/(tcount+fcount)
            fw = float(fcount)/(tcount+fcount)
            result = {}
            for k,v in tr.items(): result[k] = v*tw
            for k,v in fr.items(): result[k] = v*fw      
            return result
        else:
            if isinstance(v,int) or isinstance(v,float):
                if v >= tree.value: branch = tree.tb
                else: branch = tree.fb
            else:
                if v == tree.value: branch = tree.tb
                else: branch = tree.fb
            return mdclassify(observation,branch)


In [None]:
mdclassify(['google',None,'yes',None],tree)

In [None]:
mdclassify(['google','France',None,None],tree)

In [59]:
def variance(rows):
    if len(rows) == 0: return 0
    data = [float(row[len(row)-1]) for row in rows]
    mean = sum(data)/len(data)
    variance = sum([(d-mean)**2 for d in data])/len(data)
    return variance

# Zillow

In [78]:
import xml.dom.minidom
import urllib2

In [79]:
zwskey="X1-ZWz1f7enoxuu4r_3o1g1"

In [91]:
def getaddressdata(address,city):
    escad=address.replace(' ','+')
    url = 'http://www.zillow.com/webservice/GetDeepSearchResults.htm?'
    url += 'zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city)
    doc = xml.dom.minidom.parseString(urllib2.urlopen(url).read())
    code = doc.getElementsByTagName('code')[0].firstChild.data
    if code != '0': 
        print 'code:',code
        return None

    try:
        zipcode = doc.getElementsByTagName('zipcode')[0].firstChild.data
        use = doc.getElementsByTagName('useCode')[0].firstChild.data
        year = doc.getElementsByTagName('yearBuilt')[0].firstChild.data
        sqft = doc.getElementsByTagName('finishedSqFt')[0].firstChild.data
        bath = doc.getElementsByTagName('bathrooms')[0].firstChild.data
        bed = doc.getElementsByTagName('bedrooms')[0].firstChild.data
        rooms = doc.getElementsByTagName('totalRooms')[0].firstChild.data
        price = doc.getElementsByTagName('amount')[0].firstChild.data
    except Exception,e:
        print 'e:',e
        print 'doc:',doc
        return None
       
    return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price)

In [81]:
def getpricelist():
    l1=[]
    for line in file('addresslist.txt'):
        data = getaddressdata(line.strip(),'Cambridge,MA')
        l1.append(data)
    return l1

In [82]:
housedata = getpricelist()

e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x06BC33A0>
e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x06BC9D78>
e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x069B0DC8>
e: 'NoneType' object has no attribute 'data'
doc: <xml.dom.minidom.Document instance at 0x069B0A80>
code: 508
code: 508
code: 508
e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x069AC440>
code: 508
e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x06472B70>
code: 508
e: list index out of range
doc: <xml.dom.minidom.Document instance at 0x03259328>
code: 508
code: 508


In [87]:
housedata

[(u'02139', u'Condominium', 1895, 1.5, 3, 6, u'580768'),
 (u'02138', u'SingleFamily', 1847, 1.5, 2, 5, u'841413'),
 (u'02139', u'Triplex', 1884, 3.5, 5, 11, u'1456470'),
 (u'02138', u'MultiFamily2To4', 1920, 1.0, 2, 5, u'667892'),
 None,
 None,
 (u'02138', u'Condominium', 1914, 1.0, 2, 4, u'666479'),
 (u'02138', u'Triplex', 1916, 3.5, 9, 21, u'1772885'),
 None,
 (u'02140', u'Duplex', 1894, 3.5, 4, 9, u'1005138'),
 None,
 (u'02138', u'SingleFamily', 1925, 3.0, 3, 7, u'3671989'),
 (u'02140', u'SingleFamily', 1894, 2.5, 4, 12, u'2241054'),
 (u'02140', u'SingleFamily', 1894, 2.5, 4, 8, u'2051552'),
 (u'02138', u'SingleFamily', 1956, 3.0, 4, 10, u'1521888'),
 (u'02140', u'SingleFamily', 1899, 1.5, 3, 7, u'769738'),
 (u'02138', u'MultiFamily2To4', 1927, 1.0, 2, 10, u'788439'),
 (u'02141', u'Condominium', 1903, 1.0, 3, 5, u'545643'),
 None,
 (u'02140', u'Condominium', 1920, 1.0, 2, 5, u'576397'),
 (u'02138', u'MultiFamily2To4', 1900, 1.0, 2, 4, u'620466'),
 (u'02139', u'Condominium', 1987, 2.

In [88]:
fo = open("zillowdetail.txt", "a+")

In [89]:
for line in housedata:
    if str(line) == str('None'):continue
    #
    for field in line:
        fo.write(str(field) + "\t" )
    #
    fo.write("\n" )

In [90]:
fo.close()

In [102]:
fo = open("zillowdetail.txt", "r")

In [103]:
hdata = [ (line.strip().split('\t')) for line in fo.readlines()]

In [104]:
fo.close()

In [105]:
hsdata = []
for row in hdata:
    #(code,address,year,num1,bathroomnum,roomnum,price) = 
    hsdata.append((str(row[0]),str(row[1]),str(row[2]),float(row[3]),int(row[4]),int(row[5]),int(row[6])))

In [106]:
hsdata

[('02139', 'Condominium', '1895', 1.5, 3, 6, 580768),
 ('02138', 'SingleFamily', '1847', 1.5, 2, 5, 841413),
 ('02139', 'Triplex', '1884', 3.5, 5, 11, 1456470),
 ('02138', 'MultiFamily2To4', '1920', 1.0, 2, 5, 667892),
 ('02138', 'Condominium', '1914', 1.0, 2, 4, 666479),
 ('02138', 'Triplex', '1916', 3.5, 9, 21, 1772885),
 ('02140', 'Duplex', '1894', 3.5, 4, 9, 1005138),
 ('02138', 'SingleFamily', '1925', 3.0, 3, 7, 3671989),
 ('02140', 'SingleFamily', '1894', 2.5, 4, 12, 2241054),
 ('02140', 'SingleFamily', '1894', 2.5, 4, 8, 2051552),
 ('02138', 'SingleFamily', '1956', 3.0, 4, 10, 1521888),
 ('02140', 'SingleFamily', '1899', 1.5, 3, 7, 769738),
 ('02138', 'MultiFamily2To4', '1927', 1.0, 2, 10, 788439),
 ('02141', 'Condominium', '1903', 1.0, 3, 5, 545643),
 ('02140', 'Condominium', '1920', 1.0, 2, 5, 576397),
 ('02138', 'MultiFamily2To4', '1900', 1.0, 2, 4, 620466),
 ('02139', 'Condominium', '1987', 2.5, 3, 5, 865799),
 ('02139', 'Condominium', '1820', 2.5, 3, 6, 867467),
 ('02139', 

In [107]:
housetree = buildtree(hsdata,scoref=variance)

In [108]:
drawtree(housetree,'housetree.jpg')

# Hot or Not

In [109]:
import urllib2
import xml.dom.minidom

In [114]:
api_key='479NUNJHETN'

In [115]:
def getrandomratings(c):
    # Construct URL for getRandomProfile
    url = "http://services.hotornot.com/rest/?app_key=%s" % api_key
    url += "&method=Rate.getRandomProfile&retrieve_num=%d" % c
    url += "&get_rate_info=true&meet_users_only=true"
  
    f1 = urllib2.urlopen(url).read()

    doc = xml.dom.minidom.parseString(f1)

    emids = doc.getElementsByTagName('emid')
    ratings = doc.getElementsByTagName('rating')

    # Combine the emids and ratings together into a list
    result = []
    for e,r in zip(emids,ratings):
        if r.firstChild != None:
            result.append((e.firstChild.data,r.firstChild.data))
    return result

In [116]:
stateregions={'New England':['ct','mn','ma','nh','ri','vt'],
              'Mid Atlantic':['de','md','nj','ny','pa'],
              'South':['al','ak','fl','ga','ky','la','ms','mo',
                       'nc','sc','tn','va','wv'],
              'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'],
              'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']}

In [117]:
def getpeopledata(ratings):
    result = []
    for emid,rating in ratings:
        # URL for the MeetMe.getProfile method
        url = "http://services.hotornot.com/rest/?app_key=%s" % api_key
        url += "&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid

        # Get all the info about this person
        try:
            rating = int(float(rating)+0.5)
            doc2 = xml.dom.minidom.parseString(urllib2.urlopen(url).read())
            gender = doc2.getElementsByTagName('gender')[0].firstChild.data
            age = doc2.getElementsByTagName('age')[0].firstChild.data
            loc = doc2.getElementsByTagName('location')[0].firstChild.data[0:2]

            # Convert state to region
            for r,s in stateregions.items():
                if loc in s: region = r

            if region != None:
                result.append((gender,int(age),region,rating))
        except:
            pass
    return result

In [118]:
ll = getrandomratings(5)

HTTPError: HTTP Error 404: Not Found