In [1]:
from random import random,randint
import math

In [2]:
def wineprice(rating,age):
    peak_age = rating-50

    # Calculate price based on rating
    price = rating/2
    if age > peak_age:
        # Past its peak, goes bad in 10 years
        price = price*(5-(age-peak_age)/2)
    else:
        # Increases to 5x original value as it
        # approaches its peak
        price=price*(5*((age+1)/peak_age))
    if price < 0: price = 0
    
    #
    return price

In [3]:
def wineset1():
    rows = []
    for i in range(300):
        # Create a random age and rating
        rating = random()*50+50
        age = random()*50

        # Get reference price
        price = wineprice(rating,age)

        # Add some noise
        price *= (random()*0.4 + 0.8)

        # Add to the dataset
        rows.append({'input':(rating,age),'result':price})
    return rows

In [None]:
wineprice(95,45)

In [None]:
wineprice(95,50)

In [None]:
data = wineset1()

In [None]:
data[0]

In [None]:
data[1]

In [4]:
def euclidean(v1,v2):
    d = 0.0
    for i in range(len(v1)):
        d += (v1[i]-v2[i])**2
    return math.sqrt(d)

In [None]:
data[0]['input']

In [None]:
data[1]['input']

In [None]:
euclidean(data[0]['input'],data[1]['input'])

In [5]:
def getdistances(data,vec1):
    distancelist = []

    # Loop over every item in the dataset
    for i in range(len(data)):
        vec2 = data[i]['input']
        # Add the distance and the index
        distancelist.append((euclidean(vec1,vec2),i))
  
    # Sort by distance
    distancelist.sort()
    return distancelist

In [6]:
def knnestimate(data,vec1,k=5):
    # Get sorted distances
    dlist = getdistances(data,vec1)
    avg = 0.0

    # Take the average of the top k results
    for i in range(k):
        idx = dlist[i][1]
        avg += data[idx]['result']
    #
    avg = avg/k
    return avg

In [None]:
knnestimate(data,(95,3))

In [None]:
knnestimate(data,(95,45))

In [None]:
wineprice(95,45)

In [None]:
knnestimate(data,(95,45),k=3)

In [7]:
def inverseweight(dist,num=1.0,const=0.1):
    return num/(dist+const)

In [8]:
def subtractweight(dist,const=1.0):
    if dist > const: 
        return 0
    else: 
        return const-dist

In [9]:
def gaussian(dist,sigma=5.0):
    return math.e**(-dist**2/(2*sigma**2))

In [None]:
subtractweight(0.1)

In [None]:
inverseweight(0.1)

In [None]:
gaussian(0.1)

In [None]:
gaussian(1)

In [None]:
inverseweight(1)

In [None]:
subtractweight(1)

In [10]:
def weightedknn(data,vec1,k=5,weightf=gaussian):
    # Get distances
    dlist = getdistances(data,vec1)
    avg = 0.0
    totalweight = 0.0
  
    # Get weighted average
    for i in range(k):
        dist = dlist[i][0]
        idx = dlist[i][1]
        weight = weightf(dist)
        avg += weight * data[idx]['result']
        totalweight += weight
        
    if totalweight==0: return 0
    avg = avg/totalweight
    return avg

In [None]:
weightedknn(data,(95,45))

In [None]:
weightedknn(data,(95,3))

In [None]:
weightedknn(data,(95,50))

In [11]:
def dividedata(data,test=0.05):
    trainset = []
    testset = []
    for row in data:
        if random() < test:
            testset.append(row)
        else:
            trainset.append(row)
    #
    return trainset,testset

In [12]:
def testalgorithm(algf,trainset,testset):
    error = 0.0
    for row in testset:
        guess = algf(trainset,row['input'])
        error += (row['result']-guess)**2
        #print row['result'],guess
        #print error/len(testset)
        
    return error/len(testset)

In [13]:
def crossvalidate(algf,data,trials=100,test=0.1):
    error = 0.0
    for i in range(trials):
        trainset,testset = dividedata(data,test)
        error += testalgorithm(algf,trainset,testset)
    return error/trials

In [None]:
crossvalidate(knnestimate,data)

In [14]:
def knn3(d,v):
    return knnestimate(d,v,k=3)

In [None]:
crossvalidate(knn3,data)

In [15]:
def knn1(d,v):
    return knnestimate(d,v,k=1)

In [None]:
crossvalidate(knn1,data)

In [None]:
crossvalidate(weightedknn,data)

In [16]:
def knninverse(d,v):
    return  weightedknn(d,v,k=5,weightf=inverseweight)

In [None]:
crossvalidate(knninverse,data)

# WINE SET 2

In [17]:
def wineset2():
    rows = []
    for i in range(300):
        rating = random()*50+50
        age = random()*50
        aisle = float(randint(1,20))
        bottlesize = [375.0,750.0,1500.0][randint(0,2)]
        price = wineprice(rating,age)
        price *= (bottlesize/750)
        price *= (random()*0.2+0.9)
        rows.append({'input':(rating,age,aisle,bottlesize),'result':price})
    return rows

In [None]:
randint(0,2)

In [None]:
data = wineset2()

In [None]:
crossvalidate(knn3,data)

In [None]:
crossvalidate(weightedknn,data)

In [18]:
def rescale(data,scale):
    scaleddata = []
    for row in data:
        scaled = [scale[i] * row['input'][i] for i in range(len(scale))]
        scaleddata.append({'input':scaled,'result':row['result']})
    return scaleddata

In [None]:
sdata = rescale(data,[10,10,0,0.5])

In [None]:
crossvalidate(weightedknn,sdata)

In [None]:
crossvalidate(knn3,sdata)

In [19]:
def createcostfunction(algf,data):
    def costf(scale):
        sdata = rescale(data,scale)
        return crossvalidate(algf,sdata,trials=20)
    return costf

In [None]:
weightdomain = [(0,20)] * 4

In [20]:
import optimization

In [None]:
costf = createcostfunction(knnestimate,sdata)

In [None]:
optimization.annealingoptimize(weightdomain,costf,step=2)

In [21]:
import optimization_qcg

In [None]:
optimization_qcg.geneticoptimize(weightdomain,costf,popsize=50,step=1,mutprob=0.2,elite=0.2,maxiter=100)

# WINE SET 3

In [22]:
def wineset3():
    rows = wineset1()
    for row in rows:
        if random()<0.5:
            # Wine was bought at a discount store
            row['result'] *= 0.5
    return rows

In [24]:
data = wineset3()

In [25]:
wineprice(90,45)

135

In [26]:
weightedknn(data,[90,45])

90.39812428530564

In [23]:
def probguess(data,vec1,low,high,k=5,weightf=gaussian):
    dlist=getdistances(data,vec1)
    nweight=0.0
    tweight=0.0
  
    for i in range(k):
        dist=dlist[i][0]
        idx=dlist[i][1]
        weight=weightf(dist)
        v=data[idx]['result']
    
        # Is this point in the range?
        if v>=low and v<=high:
            nweight+=weight
        tweight+=weight
        if tweight==0: return 0

        # The probability is the weights in the range
        # divided by all the weights
    return nweight/tweight

In [None]:
a = [3,2,1,4,10]
a.sort()
a

In [27]:
probguess(data,[90,45],0,40)

0.0

In [28]:
probguess(data,[90,45],40,80)

0.20823544841181368

In [29]:
probguess(data,[90,45],80,120)

0.5845451559970536

In [31]:
probguess(data,[90,45],120,160)

0.20721939559113273

In [32]:
probguess(data,[90,45],160,1000)

0.0

In [34]:
from pylab import *

In [35]:
a = array([1,2,3,4])
b = array([4,3,2,1])
plot(a,b)
show()

In [36]:
t1 = arange(0.0,10.0,0.1)
t1

array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,
        1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,
        2.2,  2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,
        3.3,  3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,
        4.4,  4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,
        5.5,  5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,
        6.6,  6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,
        7.7,  7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,
        8.8,  8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,
        9.9])

In [37]:
plot(t1,sin(t1))

[<matplotlib.lines.Line2D at 0x983c890>]

In [38]:
show()

In [40]:
def cumulativegraph(data,vec1,high,k=5,weightf=gaussian):
    t1 = arange(0.0,high,0.1)
    cprob = array([probguess(data,vec1,0,v,k,weightf) for v in t1])
    plot(t1,cprob)
    show()

In [47]:
cumulativegraph(data,(90,45),160)

In [44]:
def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0):
    # Make a range for the prices
    t1 = arange(0.0,high,0.1)

    # Get the probabilities for the entire range
    probs = [probguess(data,vec1,v,v+0.1,k,weightf) for v in t1]

    # Smooth them by adding the gaussian of the nearby probabilites
    smoothed = []
    for i in range(len(probs)):
        sv = 0.0
        for j in range(0,len(probs)):
            dist = abs(i-j)*0.1
            weight = gaussian(dist,sigma=ss)
            sv += weight*probs[j]
        smoothed.append(sv)
    smoothed = array(smoothed)

    plot(t1,smoothed)
    show()

In [45]:
probabilitygraph(data,(90,45),160)

# Ebay

In [48]:
import httplib
from xml.dom.minidom import parse, parseString, Node

In [49]:
devKey = 'YOUR DEV KEY'
appKey = 'YOUR APP KEY'
certKey = 'YOUR CERT KEY'
serverUrl = 'api.ebay.com'
userToken = 'YOUR TOKEN'

In [50]:
def getHeaders(apicall,siteID="0",compatabilityLevel = "433"):
    headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel,
                "X-EBAY-API-DEV-NAME": devKey,
                "X-EBAY-API-APP-NAME": appKey,
                "X-EBAY-API-CERT-NAME": certKey,
                "X-EBAY-API-CALL-NAME": apicall,
                "X-EBAY-API-SITEID": siteID,
                "Content-Type": "text/xml"}
    return headers

In [51]:
def sendRequest(apicall,xmlparameters):
    connection = httplib.HTTPSConnection(serverUrl)
    connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall))
    response = connection.getresponse()
    if response.status != 200:
        print "Error sending request:" + response.reason
    else: 
        data = response.read()
        connection.close()
    return data

In [52]:
def getSingleValue(node,tag):
    nl = node.getElementsByTagName(tag)
    if len(nl)>0:
        tagNode = nl[0]
        if tagNode.hasChildNodes():
            return tagNode.firstChild.nodeValue
    return '-1'

In [53]:
def doSearch(query,categoryID=None,page=1):
    xml = "<?xml version='1.0' encoding='utf-8'?>" + \
            "<GetSearchResultsRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">" + \
            "<RequesterCredentials><eBayAuthToken>"  + \
            userToken  + \
            "</eBayAuthToken></RequesterCredentials>" + \
            "<Pagination>"+\
            "<EntriesPerPage>200</EntriesPerPage>" + \
            "<PageNumber>"+str(page)+"</PageNumber>" + \
            "</Pagination>" + \
            "<Query>" + query + "</Query>"
    if categoryID != None:
        xml += "<CategoryID>" + str(categoryID) + "</CategoryID>"
    xml += "</GetSearchResultsRequest>"

    data = sendRequest('GetSearchResults',xml)
    response = parseString(data)
    itemNodes = response.getElementsByTagName('Item');
    results = []
    for item in itemNodes:
        itemId = getSingleValue(item,'ItemID')
        itemTitle = getSingleValue(item,'Title')
        itemPrice = getSingleValue(item,'CurrentPrice')
        itemEnds = getSingleValue(item,'EndTime')
        results.append((itemId,itemTitle,itemPrice,itemEnds))
    return results

In [54]:
def getCategory(query='',parentID=None,siteID='0'):
    lquery = query.lower()
    xml = "<?xml version='1.0' encoding='utf-8'?>" + \
            "<GetCategoriesRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">" + \
            "<RequesterCredentials><eBayAuthToken>" + \
            userToken  + \
            "</eBayAuthToken></RequesterCredentials>" + \
            "<DetailLevel>ReturnAll</DetailLevel>" + \
            "<ViewAllNodes>true</ViewAllNodes>" + \
            "<CategorySiteID>" + siteID + "</CategorySiteID>"
    if parentID == None:
        xml += "<LevelLimit>1</LevelLimit>"
    else:
        xml += "<CategoryParent>" + str(parentID) + "</CategoryParent>"
    xml += "</GetCategoriesRequest>"
    data = sendRequest('GetCategories',xml)
    categoryList = parseString(data)
    catNodes = categoryList.getElementsByTagName('Category')
    for node in catNodes:
        catid=getSingleValue(node,'CategoryID')
        name=getSingleValue(node,'CategoryName')
        if name.lower().find(lquery) != -1:
            print catid,name

In [55]:
def getItem(itemID):
    xml = "<?xml version='1.0' encoding='utf-8'?>" + \
            "<GetItemRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">" + \
            "<RequesterCredentials><eBayAuthToken>" + \
            userToken + \
            "</eBayAuthToken></RequesterCredentials>" + \
            "<ItemID>" + str(itemID) + "</ItemID>" + \
            "<DetailLevel>ItemReturnAttributes</DetailLevel>" + \
            "</GetItemRequest>"
    data = sendRequest('GetItem',xml)
    result = {}
    response = parseString(data)
    result['title'] = getSingleValue(response,'Title')
    sellingStatusNode = response.getElementsByTagName('SellingStatus')[0];
    result['price'] = getSingleValue(sellingStatusNode,'CurrentPrice')
    result['bids'] = getSingleValue(sellingStatusNode,'BidCount')
    seller = response.getElementsByTagName('Seller')
    result['feedback'] = getSingleValue(seller[0],'FeedbackScore')

    attributeSet = response.getElementsByTagName('Attribute');
    attributes = {}
    for att in attributeSet:
        attID = att.attributes.getNamedItem('attributeID').nodeValue
        attValue = getSingleValue(att,'ValueLiteral')
        attributes[attID] = attValue
    result['attributes'] = attributes
    return result


In [56]:
def makeLaptopDataset():
    searchResults = doSearch('laptop',categoryID=51148)
    result = []
    for r in searchResults:
        item = getItem(r[0])
        att = item['attributes']
        try:
            data = (float(att['12']),float(att['26444']),
                    float(att['26446']),float(att['25710']),
                    float(item['feedback'])
               )
            entry = {'input':data,'result':float(item['price'])}
            result.append(entry)
        except:
            print item['title'] + ' failed'
    return result