![Image of Yaktocat](http://i.stack.imgur.com/aogj9.gif)
# Part 1: Crawler, Store Server, Repository

In [1]:
import urllib2
import urlparse

def simplifyUrl(url):
    """Return url after splitting it into components and joining them again.

    The round trip goes through urlparse.urlsplit / urlparse.urlunsplit, so
    the result is whatever canonical spelling those functions produce for
    the given URL.
    """
    components = urlparse.urlsplit(url)
    return urlparse.urlunsplit(components)

def crawlWebpage(url):
    """Download url and return the raw response body.

    The connection is always closed, even when reading fails. Errors raised
    by urllib2.urlopen or by read() propagate to the caller.
    """
    connection = urllib2.urlopen(url)
    # try/finally rather than `with`: the object returned by urllib2.urlopen
    # is not a context manager on Python 2.
    try:
        return connection.read()
    finally:
        connection.close()
# Sample page that the later cells pack, store and parse.
testPageUrl = 'https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library'
testPageId = 1234
testPage = crawlWebpage(testPageUrl)
# Parenthesised single-argument print behaves the same on Python 2 and matches
# the print(...) calls used elsewhere in this file.
print(simplifyUrl(testPageUrl))

https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library


In [2]:
import zlib
import urlparse
import struct

# Header layout of a stored packet, in struct notation with native alignment:
# docID (unsigned int), one pad byte, URL length (unsigned short),
# page length (unsigned int).
packetFormat = 'IxHI'
# Derived from the format instead of hard-coded, so the header size always
# matches what struct.pack actually emits on the current platform.
packetByteSize = struct.calcsize(packetFormat)

def checksum(url):
    """Return the CRC-32 of url as an unsigned 32-bit integer."""
    from binascii import crc32
    # Masking to 32 bits keeps the result consistent across all platforms.
    unsigned = crc32(url) & 0xffffffff
    return unsigned

def packWebpage(webpage, url, docID):
    """Pack a page, its URL and its document ID into one byte string.

    The result is a fixed-size header (docID, URL length, page length, laid
    out according to packetFormat) followed by the simplified URL and then
    the page itself.
    """
    cleanUrl = simplifyUrl(url)
    header = struct.pack(packetFormat, docID, len(cleanUrl), len(webpage))
    return header + cleanUrl + webpage

def unpackWebpage(packet):
    """Split a packet into (webpage, url, docID).

    The first packetByteSize bytes are decoded with packetFormat. The URL is
    the next urlLength bytes and the page is everything after it; the page
    length stored in the header is read but not used.
    """
    header = packet[:packetByteSize]
    payload = packet[packetByteSize:]
    docID, urlLength, _docLength = struct.unpack(packetFormat, header)
    return payload[urlLength:], payload[:urlLength], docID

def storePage(webpage, url, docID):
    """Pack, compress and write a page to the page repository.

    The file name is the URL's network location followed by the docID; the
    naming scheme is subject to change in the future. Errors from opening or
    writing the file propagate to the caller.
    """
    url = simplifyUrl(url)
    print("Struct Size: %i" %struct.calcsize(packetFormat))
    compressedPacket = zlib.compress(packWebpage(webpage, url, docID))
    path = '\\poople\\page_repository\\' + urlparse.urlparse(url)[1] + '%i' % docID
    # Binary mode is required: zlib output is arbitrary bytes, and text mode
    # on Windows would rewrite every "\n" byte as "\r\n" and corrupt the data.
    with open(path, 'wb') as output:
        output.write(compressedPacket)

# Pack the test page in memory (kept in `packet`), then pack, compress and
# write the same page to disk through storePage.
packet = packWebpage(testPage, testPageUrl, testPageId)
storePage(testPage, testPageUrl, testPageId)

Struct Size: 12


# Indexer (Parsing Part) and Lexicon

In [3]:
from bs4 import BeautifulSoup
import time
import re
import string

class Hit(object):
    """One occurrence of a word, with the attributes recorded for it.

    Stores the three constructor arguments unchanged as `word`, `capital`
    and `fontSize`.
    """

    def __init__(self, word, capital, fontSize):
        self.word = word
        self.capital = capital
        self.fontSize = fontSize

    def __repr__(self):
        return "Hit(word=%r, capital=%r, fontSize=%r)" % (
            self.word, self.capital, self.fontSize)

def parseTitleHits(page):
    """Return a list of Hit objects, one per word in the page's <title>.

    Each hit carries the lower-cased word, a flag that is True when the
    original word was not entirely lower case, and a fixed font size of 6.
    Returns an empty list when the page has no usable title text.
    """
    soup = BeautifulSoup(page, "lxml")
    titleTag = soup.title
    # soup.title is None when there is no <title>, and .string is None when
    # the title holds more than a single text node; nothing to index then.
    if titleTag is None or titleTag.string is None:
        return []
    titleFontSize = 6
    # split() without an argument drops the empty strings that split(" ")
    # produced when the title started or ended with non-word characters.
    words = re.sub(r'\W+', ' ', titleTag.string).split()
    return [Hit(word.lower(), not word.islower(), titleFontSize)
            for word in words]

#for hit in parseTitleHits(testPage):
#    print "w: " + hit.word + "\tcap:", hit.capital, "\tfont: ", hit.fontSize


def simplifyPage(page):
    """Return the visible text of an HTML page as space-separated words.

    <script> and <style> elements are removed, the remaining text is
    extracted, and every run of non-word characters is collapsed to a single
    space. The elapsed time is printed.
    """
    # time.clock no longer exists on Python 3.8+; prefer perf_counter when
    # the running interpreter has it and fall back to clock otherwise.
    timer = getattr(time, 'perf_counter', None) or time.clock
    before = timer()
    # Parse the page that was passed in, not the module-level test page.
    soup = BeautifulSoup(page, "lxml")
    for tag in soup(["script", "style"]):
        tag.extract()
    text = re.sub(r'\W+', ' ', soup.get_text())
    after = timer()
    print("Simplify Page Time: %f" %(after-before))
    return text

def parseWordsInPage(simplifiedPage):
    """Split simplified page text on single spaces and return the words.

    Empty strings (from leading, trailing or repeated spaces) are dropped.
    The time taken by the split is printed.
    """
    # time.clock no longer exists on Python 3.8+; prefer perf_counter when
    # the running interpreter has it and fall back to clock otherwise.
    timer = getattr(time, 'perf_counter', None) or time.clock
    before = timer()
    words = simplifiedPage.split(' ')
    after = timer()
    print("Parse Time: %f" %(after-before))
    # Build a new list instead of calling remove() while iterating: removing
    # during iteration skips the element after each removal, so consecutive
    # empty strings were left behind.
    return [word for word in words if word != ""]

# Run the parsing pipeline on the test page and print one word per line.
simplifiedPage = simplifyPage(testPage)
wordList = parseWordsInPage(simplifiedPage)
for word in wordList:
    # Parenthesised single-argument print behaves the same on Python 2 and
    # matches the print(...) calls used elsewhere in this file.
    print(word)
    


Simplify Page Time: 0.080851
Parse Time: 0.000082
Lightweight
Java
Game
Library
Wikipedia
the
free
encyclopedia
Lightweight
Java
Game
Library
From
Wikipedia
the
free
encyclopedia
Jump
to
navigation
search
This
article
has
multiple
issues
Please
help
improve
it
or
discuss
these
issues
on
the
talk
page
Learn
how
and
when
to
remove
these
template
messages
The
topic
of
this
article
may
not
meet
Wikipedia
s
general
notability
guideline
Please
help
to
establish
notability
by
citing
reliable
secondary
sources
that
are
independent
of
the
topic
and
provide
significant
coverage
of
it
beyond
its
mere
trivial
mention
If
notability
cannot
be
established
the
article
is
likely
to
be
merged
redirected
or
deleted
Find
sources
Lightweight
Java
Game
Library
news
newspapers
books
scholar
JSTOR
free
images
March
2016
Learn
how
and
when
to
remove
this
template
message
This
article
relies
too
much
on
references
to
primary
sources
Please
improve
this
by
adding
secondary
or
tertiary
sources
August
2015
Learn
h

In [29]:
import sys
import time
import binascii

#The hash we are using. Can be changed without affecting the program
def wordHash(word):
    """Return the unsigned 32-bit CRC-32 of the lower-cased word."""
    return binascii.crc32(word.lower()) & 0xffffffff

def addWordToLexicon(word, hashtable):
    """Add word to hashtable and append it to the lexicon file.

    The word is lower-cased first. Saving it to the file means it is loaded
    like any other word the next time the lexicon is read. A word whose hash
    is already in hashtable keeps its existing ID and is not written to the
    file a second time.

    Returns the hashtable.
    """
    word = word.lower()
    if wordHash(word) in hashtable:
        return hashtable
    hashtable = addWordToHashtable(word, hashtable)
    saveNewWord(word)
    return hashtable

def addWordToHashtable(word, hashtable):
    """Give word the next free ID in hashtable; does not save to file.

    The key is wordHash of the lower-cased word and the ID is the table size
    at the time of insertion. A key that is already present keeps its ID.

    Returns the hashtable, which is modified in place.
    """
    key = wordHash(word.lower())
    # setdefault leaves an existing entry alone. Plain assignment would move
    # a known word to ID len(hashtable) without growing the table, and the
    # next new word would then be given that same ID.
    hashtable.setdefault(key, len(hashtable))
    return hashtable

def getWordId(word, hashtable):
    """Return the ID stored for word in hashtable, or -1 if it is absent.

    The lookup key is wordHash of the lower-cased word. When the word is
    missing, a message naming the word as passed in is printed.
    """
    key = wordHash(word.lower())
    # Only the dictionary lookup is guarded, so a KeyError raised anywhere
    # else is not mistaken for a missing word.
    try:
        return hashtable[key]
    except KeyError:
        print("Word not in lexicon: " + word)
        return -1

def saveNewWord(word):
    """Append the lower-cased word, followed by a comma, to the lexicon file.

    Errors from opening or writing the file propagate to the caller.
    """
    # `with` closes the file even if the write raises.
    with open('\\poople\\lexicon', 'a') as output:
        output.write(word.lower() + ",")
    
def loadLexicon():
    """Read the lexicon file and build the word hashtable from it.

    The file is a comma-separated list of words. Each non-empty entry is
    added with addWordToHashtable in file order. Errors from opening or
    reading the file propagate to the caller.

    Returns the new hashtable (a dict).
    """
    # `with` closes the file even if the read raises.
    with open('\\poople\\lexicon', 'r') as stream:
        lexicon = stream.read()
    hashtable = {}
    for word in lexicon.split(","):
        # The file ends with a trailing comma, which yields one empty entry.
        if word != '':
            hashtable = addWordToHashtable(word, hashtable)
    return hashtable
    
# Load the lexicon, add a sample word in memory only, and show the effect.
# Parenthesised single-argument print behaves the same on Python 2 and matches
# the print(...) calls used elsewhere in this file.
wordHashtable = loadLexicon()
print(len(wordHashtable))
addWordToHashtable("hello", wordHashtable)
print(getWordId("hello", wordHashtable))
print(len(wordHashtable))
print(wordHashtable)

698
698
700
{1788938242: 572, 3187900075: 539, 3230498821: 488, 3747223559: 286, 835352588: 429, 2902841359: 364, 2260480018: 517, 820668437: 648, 1718319126: 680, 1154021400: 584, 1724430365: 546, 1423829025: 580, 1114099747: 223, 1843675174: 284, 1964050481: 258, 2568837171: 558, 1762699316: 259, 1574674497: 612, 1060745282: 395, 4052537413: 544, 689295432: 404, 666529867: 639, 1280163916: 420, 2838417488: 362, 3110191185: 233, 2252471652: 376, 460212315: 340, 146714717: 541, 2897602656: 459, 711391330: 570, 1630611555: 165, 395401319: 653, 306016363: 464, 1972041839: 385, 3218944360: 627, 1673277559: 75, 1917560954: 68, 2508312701: 89, 3065852031: 628, 1437575297: 557, 2796017799: 480, 3265925258: 505, 2297403532: 400, 1458215053: 669, 1997877400: 256, 376615066: 43, 4097009823: 662, 2458417313: 356, 1877817509: 358, 4038908070: 588, 2854008999: 483, 2223210669: 587, 432980143: 494, 1638783154: 308, 2892185779: 309, 1465923764: 347, 3207319737: 108, 3407792314: 529, 4283914359: 645,

# Anchors, URL Resolver, Links, Doc Index, URL Server

# Indexer (Rest), Barrels, Sorter, PageRank

# Ranking System, Searcher