![Image of Yaktocat](http://i.stack.imgur.com/aogj9.gif)
# Part1: Crawler, Store Server, Repository

In [1]:
# crawl webpages

import urllib2
import urlparse

def simplifyUrl(url):
    return urlparse.urlsplit(url).geturl()

def crawlWebpage(url):
    connection = urllib2.urlopen(url)
    page = connection.read()
    connection.close
    return page
testPageUrl = 'https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library'
testPageId = 1234
testPage = crawlWebpage(testPageUrl)
print simplifyUrl(testPageUrl)

https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library


In [2]:
# store pages in repository

import zlib
import urlparse
import struct

packetFormat = 'IxHI'
packetByteSize = 12

def checksum(url):
    import binascii
    return binascii.crc32(url) & 0xffffffff #this keeps the results consistent across all platforms

#packs a given webpage information into bytes
def packWebpage(webpage, url, docID):
    url = simplifyUrl(url)
    packet = struct.pack(packetFormat, docID, len(url), len(webpage)) + url + webpage
    return packet

#unpacks a given packet into webpage, url, and docID
def unpackWebpage(packet):
    info = struct.unpack(packetFormat, packet[:packetByteSize])#because 3 arguments
    docID, urlLength, docLength = info[:]
    url = packet[packetByteSize:packetByteSize + urlLength]
    webpage = packet[packetByteSize + urlLength:]
    return webpage, url, docID

#packs a webpage and saves it by docID. save name is subject to change in the future.
def storePage(webpage, url, docID):
    url = simplifyUrl(url)
    print("Struct Size: %i" %struct.calcsize(packetFormat))
    packet = packWebpage(webpage, url, docID)
    compressedPacket = zlib.compress(packet)
    output = open('poople\\page_repository\\' + urlparse.urlparse(url)[1] + '%i' % docID, 'w')
    output.write(compressedPacket)
    output.close()

packet = packWebpage(testPage, testPageUrl, testPageId)
storePage(testPage, testPageUrl, testPageId)

Struct Size: 12


# Indexer(Parsing part) and Lexicon

In [3]:
# parse and simplify page

from bs4 import BeautifulSoup
import time
import re
import string

def simplifyPage(page):
    before = time.clock()
    soup = BeautifulSoup(testPage, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    text = re.sub(r'\W+', ' ', text)
    after = time.clock()
    print("Simplify Page Time: %f" %(after-before))
    return text

def parseWordsInPage(simplifiedPage):
    before = time.clock()
    #words = re.split(r'\W+', simplifiedPage)
    words = simplifiedPage.split(' ')
    after = time.clock()
    print("Parse Time: %f" %(after-before))
    for word in words:
        if word == "":
            words.remove(word)
    return words


# test page simplification
simplifiedPage = simplifyPage(testPage)
wordList = parseWordsInPage(simplifiedPage)
for word in wordList:
    print word

Simplify Page Time: 0.135830
Parse Time: 0.000163
Lightweight
Java
Game
Library
Wikipedia
the
free
encyclopedia
Lightweight
Java
Game
Library
From
Wikipedia
the
free
encyclopedia
Jump
to
navigation
search
This
article
has
multiple
issues
Please
help
improve
it
or
discuss
these
issues
on
the
talk
page
Learn
how
and
when
to
remove
these
template
messages
The
topic
of
this
article
may
not
meet
Wikipedia
s
general
notability
guideline
Please
help
to
establish
notability
by
citing
reliable
secondary
sources
that
are
independent
of
the
topic
and
provide
significant
coverage
of
it
beyond
its
mere
trivial
mention
If
notability
cannot
be
established
the
article
is
likely
to
be
merged
redirected
or
deleted
Find
sources
Lightweight
Java
Game
Library
news
newspapers
books
scholar
JSTOR
free
images
March
2016
Learn
how
and
when
to
remove
this
template
message
This
article
relies
too
much
on
references
to
primary
sources
Please
improve
this
by
adding
secondary
or
tertiary
sources
August
2015
Learn
h

In [4]:
# TO DELETE

import sys
import time
import binascii

#The hash we are using. Can be changed without affecting the program
def wordHash(word):
    word = word.lower()
    return binascii.crc32(word) & 0xffffffff

#adds the word to the hashtable
#saves the word to the file so next time it will be loaded like normal
def addWordToLexicon(word, hashtable):
    word = word.lower()
    hashtable = addWordToHashtable(word, hashtable)
    saveNewWord(word)
    return hashtable

#adds word to given hashtable. Does not save to file. Hashes word
def addWordToHashtable(word, hashtable):
    word = word.lower()
    hashtable[wordHash(word)] = len(hashtable)
    return hashtable

#gets the wordID from the hashtable
def getWordId(word, hashtable):
    try:
        lowerWord = word.lower()
        return hashtable[wordHash(lowerWord)]
    except KeyError:
        print "Word not in lexicon: " + word
        return -1;

#saves the word to a file
def saveNewWord(word):
    word = word.lower()
    word = word + ","
    output = open('poople\\lexicon', 'a')
    output.write(word)
    output.close()
    
#loads the lexicon from file. creates the hashtable from lexicon
def loadLexicon():
    lexiconList = []
    stream = open('poople\\lexicon', 'r')
    lexicon = stream.read()
    stream.close()
    lexiconList = lexicon.split(",")
    hashtable = {}
    for word in lexiconList:
        if word != '':
            hashtable = addWordToHashtable(word, hashtable)
    return hashtable
    
wordHashtable = loadLexicon()
print len(wordHashtable)
addWordToHashtable("hello", wordHashtable)
for word in simplifiedPage.split(" "):
    addWordToHashtable(word, wordHashtable)
print getWordId("hello", wordHashtable)
print len(wordHashtable)
print wordHashtable

1
1
698
{0L: 698, 1788938242L: 572, 3187900075L: 539, 3230498821L: 488, 3747223559L: 284, 835352588L: 429, 2902841359L: 362, 2260480018L: 517, 820668437L: 648, 1718319126L: 681, 1154021400L: 584, 1724430365L: 546, 1423829025L: 580, 64952355L: 0, 1843675174L: 282, 3358384175L: 671, 1964050481L: 257, 2568837171L: 558, 1762699316L: 258, 1574674497L: 612, 1060745282L: 394, 4052537413L: 544, 689295432L: 404, 666529867L: 639, 1280163916L: 420, 2838417488L: 360, 3110191185L: 230, 2252471652L: 375, 460212315L: 338, 146714717L: 541, 2897602656L: 459, 711391330L: 570, 1630611555L: 162, 395401319L: 653, 306016363L: 464, 1972041839L: 384, 3218944360L: 627, 1673277559L: 78, 1917560954L: 71, 2508312701L: 92, 3065852031L: 628, 1437575297L: 557, 2796017799L: 480, 3265925258L: 505, 2297403532L: 400, 1458215053L: 669, 1997877400L: 255, 376615066L: 46, 4097009823L: 662, 2458417313L: 354, 1877817509L: 356, 4038908070L: 588, 2854008999L: 483, 2223210669L: 587, 432980143L: 494, 1638783154L: 306, 2892185779L

# Anchors, URL Resolver, Links, Doc Index, URL Server

# Indexer(rest) Barrels, Sorter, PageRank

In [5]:
#handle (en/de)coding hits

def encodePlainHit(cap, imp, pos):
    result = 0
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += imp << 12
    result += pos & 0b0000111111111111
    return result
def encodeFancyHit(cap, ftype, pos):
    result = 0
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += 0b0111000000000000 #fancy hits are always 7
    result += ftype << 8
    return result
def encodeAnchorHit(cap, ahash, pos):
    result = 0b0111000000000000
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += pos
    result += ahash << 4
    return result

def decodeHit(hit):
    result = []
    result.append(hit >> 15)
    imp = (hit >> 12) & 0b111
    result.append(imp)
    if(imp != 7):
        result.append(hit & 0b0000111111111111)
    else: #fancy hit
        fancyType = (hit >> 8) & 0b1111
        result.append(fancyType)
        if(fancyType == 0): #anchor hit
            result.append(hit >> 4 & 0b1111)
            result.append(hit & 0b1111)
        else:
            result.append(hit & 0b11111111)
    return result

hit = encodePlainHit(False, 4, 0b001100000111)
fhit = encodeFancyHit(True, 0b0111, 0b10000000)
ahit = encodeAnchorHit(True, 0b1000, 0b1000)
print(decodeHit(hit))
print(decodeHit(fhit))
print(decodeHit(ahit))

[0, 4, 775]
[1, 7, 7, 0]
[1, 7, 0, 8, 8]


In [166]:
# start to put together forward indices

def parseTitleHits(page):
    hits = []
    soup = BeautifulSoup(page, "lxml")
    title = soup.title.string
    simplifiedTitle = re.sub(r'\W+', ' ', title).split(" ")
    result = ""
    for word in simplifiedTitle:
        capital = not word.islower()
        fontSize = 6
        result += encodeFancyHit(capital, 0, 0)
    return result

def encodeHitlists(docId, hits): #untested for accuracy
    wordHits = {}
    for hit in hits:
        if(hit[0] in wordHits):
            wordHits[hit[0]] += struct.pack("L", hit[1])
        else:
            wordHits[hit[0]] = struct.pack("L", hit[1])
    result = struct.pack("L", docId)
    for key in wordHits:
        wordId = getWordId(key, wordHashtable)
        if(wordId == -1):
            continue
        result += struct.pack("L", (0b11111111 & (len(wordHits[key])/2)) +
                              (0b11111111111111111111111100000000 & (wordId << 8)))
        result += wordHits[key]
    result += struct.pack("L", 0) # add null wordID to indicate end of hitlist
    return result

In [263]:
# backwards indices

def pront(pref, text):
    print(pref + ":".join("{:02x}".format(ord(c)) for c in text))

def saveBackwardsIndex(wordId, docId, hits):
    hitIndices = [i for i,x in enumerate(hits) if wordHash(x[0]) == wordId] # finds all instances of wordId in hitlist
    if(len(hitIndices) == 0):
        return # meaning the doc does not contain wordId
    else:
        # generate the entry for the document
        docHits = ""+struct.pack("L", (docId + (len(hitIndices) << 27)))
        for hitId in hitIndices:
            docHits += struct.pack("H", hits[hitId][1])
        # write it to the file
        try: # read the file
            readFile = open("poople\\inv\\%i" % wordId, "r")
            content = readFile.read()
            pront("Content: ", content)
            readFile.close()
        except IOError: # if the file doesn't exist and open() throws exceptions
            writeFile = open("poople\\inv\\%i" % wordId, "w")
            writeFile.write(struct.pack("H", 1) + docHits)
            os.fsync(writeFile.fileno())
            writeFile.close()
            return
        numDocs = struct.unpack("H", content[0:2])[0]
        print(numDocs)
        currentDocId = -1
        currentSeekIndex = 2
        while True:
            temp = content[currentSeekIndex:currentSeekIndex+4]
            pront("MASKED: ", temp)
            if(len(temp) == 0): # if the end of the file is reached, append docHits to the end
                writeFile = open("poople\\inv\\%i" % wordId, "w")
                writeFile.write(struct.pack("H", numDocs+1) +
                                content[2:] + docHits)
                os.fsync(writeFile.fileno())
                writeFile.close()
            temp = struct.unpack("L", temp)[0]
            currentDocId = temp & 0b111111111111111111111111111 #2^27
            numHits = temp >> 27
            if(currentDocId == docId): # replace old entry under docId with an updated copy
                writeFile = open("poople\\inv\\%i" % wordId, "w")
                writeFile.write(content[0:currentSeekIndex] + docHits + content[currentSeekIndex+4+numHits*2:])
                os.fsync(writeFile.fileno())
                writeFile.close()
                break
            if(currentDocId > docId): # insert new entry between next lowest and next highest docIds
                writeFile = open("poople\\inv\\%i" % wordId, "w")
                writeFile.write(struct.pack("H", numDocs+1) +
                                content[2:currentSeekIndex] + docHits + content[currentSeekIndex:])
                os.fsync(writeFile.fileno())
                writeFile.close()
                break
            currentSeekIndex += 4+numHits*2

def loadBackwardsIndex(wordId):
    
    # read file
    try:
        readFile = open("poople\\inv\\%i" % wordId, "r")
        content = readFile.read()
        readFile.close()
    except IOError:
        print("WORD NOT IN BACKWARDS INDEXES")
        return {}
    # load hit lists
    hits = {}
    numDocs = struct.unpack("H", content[0:2])[0]
    print("Number of Documents: %i" % numDocs)
    currentSeekIndex = 2
    for i in xrange(numDocs):
        temp = struct.unpack("L", content[currentSeekIndex:currentSeekIndex+4])[0]
        docId = temp & 0b111111111111111111111111111 #2^27
        numHits = temp >> 27
        print("For DocID '%i" % docId + "', hits: %i" % numHits)
        hits[docId] = content[currentSeekIndex+4:currentSeekIndex+4+2*numHits]
        currentSeekIndex += 4+numHits*2
    return hits

testHits = [("hello", encodePlainHit(1, 0b100, 0b111111111111)),
            ("hello", encodePlainHit(1, 0b111, 0b111111111111)),
            ("hello", encodePlainHit(1, 0b111, 0b111111111111)),
            ("hello", encodePlainHit(1, 0b111, 0b111111111111)),
            ("hello", encodePlainHit(1, 0b111, 0b111111111111)),
            ("the", encodePlainHit(1, 0b0, 0b11111)),
           ("the", encodePlainHit(1, 0b0, 0b11111)),
           ("the", encodePlainHit(1, 0b0, 0b11111)),
           ("the", encodePlainHit(1, 0b0, 0b11111))]
saveBackwardsIndex(wordHash("the"), 0b10, testHits)
results = loadBackwardsIndex(wordHash("the"))
unpacked = struct.unpack("H", results[results.keys()[0]][2:4])[0]
print(unpacked)
print(decodeHit(unpacked))

Content: 02:00:00:00:00:20:1f:80:1f:80:1f:80:1f:80:02:00:00:20:1f:80:1f:80:1f:80:1f:80
2
MASKED: 00:00:00:20
MASKED: 02:00:00:20
MASKED: 


error: unpack requires a string argument of length 4

# Ranking System, Searcher