In [1]:
"""
Rolling a Die
"""

import random

def rollDie():
    """Simulate a single roll of a six-sided die.

    Returns an int picked uniformly at random from 1 through 6.
    """
    faces = (1, 2, 3, 4, 5, 6)
    return random.choice(faces)

def rollN(n):
    """Roll the die n times and return the outcomes as one string.

    Each roll contributes a single digit character, in roll order, so
    the result has length n (and is the empty string when n <= 0).
    """
    return ''.join(str(rollDie()) for _ in range(n))

# Demo: show one random sequence of five die rolls as a 5-character digit string.
print rollN(5)

def getTarget(goal):
    """Count how many attempts it takes to roll the sequence goal.

    goal is a string of die faces (characters '1' through '6').  Each
    attempt rolls len(goal) dice with rollN and compares the resulting
    string with goal.

    Returns the number of attempts made, counting the successful one.
    Raises ValueError if goal contains a character that is not a die
    face.
    """
    # rollN only ever yields the characters '1'..'6', so any other
    # character makes the goal unreachable and the loop below would
    # never terminate.  Fail fast instead of hanging.
    validFaces = set('123456')
    for ch in goal:
        if ch not in validFaces:
            raise ValueError("goal may only contain the digits 1-6, got %r"
                             % (goal,))

    numRolls = len(goal)
    numTries = 1
    while rollN(numRolls) != goal:
        numTries += 1
    return numTries

def runSim(goal, numTrials):
    total = 0
    for i in range(numTrials):
        total += getTarget(goal)
    print 'Average number of tries =', total/float(numTrials)

# Each 5-roll attempt matches a given 5-digit goal with probability 1/6**5,
# so both averages should be near 6**5 = 7776 whichever goal is chosen;
# with only 100 trials the estimates are noisy.
runSim('11111', 100)
runSim('54324', 100)

43313
Average number of tries = 7762.61
Average number of tries = 6245.71


In [2]:
def atLeastOneOne(numRolls, numTrials):
    """Estimate the probability of seeing at least one 1 in numRolls rolls.

    Runs numTrials independent trials, each rolling the die numRolls
    times via rollN, and prints the fraction of trials in which a '1'
    appeared.  Nothing is returned.

    Raises ValueError if numTrials is not positive, since the fraction
    is undefined without at least one trial.
    """
    if numTrials <= 0:
        raise ValueError('numTrials must be a positive integer')

    numSuccess = 0
    for _ in range(numTrials):
        if '1' in rollN(numRolls):
            numSuccess += 1

    # Computed once after all trials; only the final count matters.
    fracSuccess = numSuccess/float(numTrials)
    # Parenthesised single-argument form prints the same text under
    # Python 2 and also parses under Python 3.
    print(fracSuccess)

# With a fair die the exact probability is 1 - (5/6)**10, about 0.8385,
# so the simulated fraction should land close to that.
atLeastOneOne(10, 1000)

0.836


In [3]:
"""
Introduction to Hashing
"""

import random

def strToInt(s):
    """Map a string to an integer by concatenating its character codes.

    The decimal value of ord(c) for every character c of s is written
    out in order, and the resulting digit string is read back as one
    (possibly very large) integer.
    """
    digits = ''.join(str(ord(ch)) for ch in s)
    return int(digits)

# A single character gives a small index, but the index grows by two or
# three decimal digits per character, so real strings yield huge numbers.
print 'Index =', strToInt('a')
print 'Index =', strToInt('John is a cool dude')


Index = 97
Index = 74111104110321051153297329911111110832100117100101


In [4]:
def hashStr(s, tableSize = 101):
    """Hash the string s to a bucket index in range(tableSize).

    The character codes of s are concatenated into one integer, which
    is then reduced modulo tableSize.
    """
    digits = ''.join(str(ord(ch)) for ch in s)
    return int(digits) % tableSize

##print hashStr('a')
##print hashStr('John is a cool dude')


# Hash four names into a 7-slot table; 'Chris' and 'Jill' both map to
# slot 3, which illustrates a collision.
print hashStr('Eric', 7)
print hashStr('Chris', 7)
print hashStr('Sarina', 7)

print hashStr('Jill', 7)

2
3
5
3


In [5]:
"""
Using Hashing to Look Up Information
"""

import random

class intDict(object):
    """A dictionary with integer keys.

    Entries are stored in self.buckets, a list of numBuckets lists.  A
    key lives in bucket number key % numBuckets, and each bucket holds
    (key, value) tuples for the keys that hash to it.
    """

    def __init__(self, numBuckets):
        """Create an empty dictionary with numBuckets buckets."""
        self.numBuckets = numBuckets
        # One independent list per bucket.
        self.buckets = [[] for _ in range(numBuckets)]

    def addEntry(self, dictKey, dictVal):
        """Assumes dictKey an int.  Adds an entry.

        If dictKey is already present its value is replaced in place;
        otherwise the new pair is appended to the key's bucket.
        """
        hashBucket = self.buckets[dictKey%self.numBuckets]
        for i, entry in enumerate(hashBucket):
            if entry[0] == dictKey:
                hashBucket[i] = (dictKey, dictVal)
                return
        hashBucket.append((dictKey, dictVal))

    def getValue(self, dictKey):
        """Assumes dictKey an int.  Returns entry associated
           with the key dictKey, or None if the key is absent."""
        hashBucket = self.buckets[dictKey%self.numBuckets]
        for key, val in hashBucket:
            if key == dictKey:
                return val
        return None

    def __str__(self):
        """Return the entries as '{k1:v1,k2:v2,...}' in bucket order.

        An empty dictionary is rendered as '{}'.
        """
        # Joining avoids having to strip a trailing comma, which would
        # otherwise eat the opening brace when there are no entries.
        entries = []
        for bucket in self.buckets:
            for key, val in bucket:
                entries.append(str(key) + ':' + str(val))
        return '{' + ','.join(entries) + '}'

# Build a 29-bucket dictionary and insert 20 random keys, using the
# insertion order (0..19) as each key's value.
D = intDict(29)
for i in range(20):
    # choose a random int in range(10**5); randrange draws it directly
    # without first building range(10**5), which is a 100000-element
    # list under Python 2
    key = random.randrange(10**5)
    D.addEntry(key, i)

# Dump the raw bucket lists to show how the inserted keys were spread
# over the buckets; a bucket holding more than one tuple is a collision.
print '\n', 'The buckets are:'
for hashBucket in D.buckets: #violates abstraction barrier
    print '  ', hashBucket


The buckets are:
   [(15689, 18)]
   []
   [(43734, 4)]
   [(46258, 17)]
   [(18158, 14)]
   [(25757, 9)]
   []
   [(95243, 5)]
   []
   [(7955, 11), (27936, 16)]
   []
   []
   []
   [(42643, 2), (44847, 10)]
   []
   []
   [(24811, 13), (6976, 19)]
   []
   []
   []
   []
   [(48103, 12)]
   []
   [(82325, 0), (61967, 3), (99290, 15)]
   [(83022, 1)]
   []
   [(73135, 6)]
   [(26910, 8)]
   [(65336, 7)]


In [14]:
"""
L3 PROBLEM 5 INTRODUCTION

Recall from the last problem that when creating hash tables, we try to optimize both the size of the table (as small as possible)
and lookup time for elements (as short as possible). It turns out that by making the hash table large enough, we can reduce the 
number of collisions sufficiently to allow us to treat the complexity of lookup as almost O(1). I.e. we can trade space for 
time. But what is the tradeoff?

First, let's get a formulation of the problem. Assume:

- The range of the hash function is range(n)
- The number of insertions is K
- The hash function produces a perfectly uniform distribution of the keys used in insertions. This means that for all keys, k, 
  and integers, i, in range(n), the probability that hash(k) = i is \frac{1}{n}.

So, what is the probability that at least one collision occurs?

The question is equivalent to asking "given K randomly generated integers in range(n), what is the probability that at least 
two of them are equal?" As is often the case, it is easiest to start by answering the inverse question, "given K randomly 
generated integers in range(n), what is the probability that none of them are equal?"

When we insert the first element, the probability of not having a collision is clearly 1 (since the table is empty!). How about
the second insertion? Since there are n-1 hash results left that are not equal to the result of the first hash, n-1 out of n 
choices will not yield a collision. So, the probability of not getting a collision on the second insertion is \frac{n-1}{n}, and
the probability of not getting a collision on either of the first two insertions is: 

1 \times \frac{n-1}{n}

We can multiply these probabilities - for each insertion the value produced by the hash function is independent of anything that 
has preceded it. Thus the probability of not having a collision after three insertions is:

1 \times \frac{n-1}{n} \times \frac{n-2}{n}

And after K insertions,

1 \times \frac{n-1}{n} \times \frac{n-2}{n} \times \cdots \times \frac{n-(K-1)}{n}

To get the probability of having at least one collision, we merely subtract this value from 1:

1 - \left[\frac{n-1}{n} \times \frac{n-2}{n} \times \cdots \times \frac{n-(K-1)}{n}\right]

Given the size of the hash table and the expected number of insertions, we can use this formula to calculate the probability of at
least one collision. If K is reasonably large, say 10,000, it would be a bit tedious to compute the probability with pencil and 
paper. That leaves two choices, mathematics and programming. Mathematicians have used some fairly advanced mathematics to find a 
way to approximate the value of this series. But unless K is very large, it is easier to run some code to compute the exact 
value of the series. In this problem we'll look at some simulations that examine hashing probabilities.

L3 Problem 5

For this problem, download intDictTests.py, a file that contains some simulations that will help us examine the properties of 
hashing. Read through the extra functions to understand what they do; call the appropriate function with the right parameters 
to answer the following questions.
"""

# intDictTests.py
def collision_prob(numBuckets, numInsertions):
    '''
    Exact probability that numInsertions uniformly hashed items produce
    at least one collision in a table of numBuckets buckets.

    The chance of no collision is the product, for i = 1 .. numInsertions-1,
    of (numBuckets - i) / numBuckets; the result is one minus that product.
    '''
    noCollision = 1.0
    for occupied in range(1, numInsertions):
        # Fraction of buckets still free when `occupied` are already taken.
        freeFraction = (numBuckets - occupied) / float(numBuckets)
        noCollision *= freeFraction
    return 1 - noCollision

# With 250 insertions into 365 buckets the no-collision product is positive
# but far smaller than floating-point resolution near 1, so this prints 1.0.
print collision_prob(365, 250)


1.0


In [12]:
def sim_insertions(numBuckets, numInsertions):
    '''
    Run a simulation for numInsertions insertions into the hash table.

    Each insertion picks a bucket uniformly at random from
    range(numBuckets).  Returns False as soon as an insertion lands in
    a bucket that is already occupied (a collision), and True if all
    numInsertions insertions land in distinct buckets.
    '''
    choices = range(numBuckets)
    # A set keeps each "already used?" check constant-time instead of
    # rescanning a growing list on every insertion.
    used = set()
    for _ in range(numInsertions):
        hashVal = random.choice(choices)
        if hashVal in used:
            return False
        used.add(hashVal)
    return True

def observe_prob(numBuckets, numInsertions, numTrials):
    '''
    Estimate the collision probability by simulation.

    Calls sim_insertions(numBuckets, numInsertions) numTrials times and
    returns one minus the fraction of runs that finished without a
    collision.
    '''
    collisionFree = sum(sim_insertions(numBuckets, numInsertions)
                        for _ in range(numTrials))
    return 1 - collisionFree/float(numTrials)


# Simulated estimate for 30 insertions into 365 buckets over 1000 trials;
# compare with the exact value collision_prob(365, 30), roughly 0.706.
print observe_prob(365, 30, 1000)


0.676


In [16]:
# Starting from a full table's worth of insertions, step downwards until
# the collision probability first drops below 0.99, then report that
# number of insertions.
numBuckets = 365
numInsertions = 365
col = collision_prob(numBuckets, numInsertions)
while col >= 0.99:
    numInsertions -= 1
    col = collision_prob(numBuckets, numInsertions)
print(numInsertions)

56
