![Image of Yaktocat](http://i.stack.imgur.com/aogj9.gif)
# Part1: Crawler, Store Server, Repository

In [1]:
# crawl webpages

import urllib2
import urlparse

def simplifyUrl(url):
    return urlparse.urlsplit(url).geturl()

def crawlWebpage(url):
    connection = urllib2.urlopen(url)
    page = connection.read()
    connection.close
    return page
testPageUrl = 'https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library'
testPageId = 1234
testPage = crawlWebpage(testPageUrl)
print simplifyUrl(testPageUrl)

https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library


In [2]:
# store pages in repository

import zlib
import urlparse
import struct

packetFormat = 'IxHI'
packetByteSize = 12

def checksum(url):
    import binascii
    return binascii.crc32(url) & 0xffffffff #this keeps the results consistent across all platforms

#packs a given webpage information into bytes
def packWebpage(webpage, url, docID):
    url = simplifyUrl(url)
    packet = struct.pack(packetFormat, docID, len(url), len(webpage)) + url + webpage
    return packet

#unpacks a given packet into webpage, url, and docID
def unpackWebpage(packet):
    info = struct.unpack(packetFormat, packet[:packetByteSize])#because 3 arguments
    docID, urlLength, docLength = info[:]
    url = packet[packetByteSize:packetByteSize + urlLength]
    webpage = packet[packetByteSize + urlLength:]
    return webpage, url, docID

#packs a webpage and saves it by docID. save name is subject to change in the future.
def storePage(webpage, url, docID):
    url = simplifyUrl(url)
    print("Struct Size: %i" %struct.calcsize(packetFormat))
    packet = packWebpage(webpage, url, docID)
    compressedPacket = zlib.compress(packet)
    output = open('poople\\page_repository\\' + urlparse.urlparse(url)[1] + '%i' % docID, 'w')
    output.write(compressedPacket)
    output.close()

packet = packWebpage(testPage, testPageUrl, testPageId)
storePage(testPage, testPageUrl, testPageId)

Struct Size: 12


# Indexer(Parsing part) and Lexicon

In [3]:
# parse and simplify page

from bs4 import BeautifulSoup
import time
import re
import string

class Hit:
    def __init__(self, word, capital, fontSize):
        self.word = word
        self.capital = capital
        self.fontSize = fontSize

def parseTitleHits(page):
    hits = []
    soup = BeautifulSoup(page, "lxml")
    title = soup.title.string
    simplifiedTitle = re.sub(r'\W+', ' ', title).split(" ")
    for word in simplifiedTitle:
        capital = not word.islower()
        fontSize = 6
        hits.append(Hit(word.lower(), capital, fontSize))
    return hits

#for hit in parseTitleHits(testPage):
#    print "w: " + hit.word + "\tcap:", hit.capital, "\tfont: ", hit.fontSize


def simplifyPage(page):
    before = time.clock()
    soup = BeautifulSoup(testPage, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    text = re.sub(r'\W+', ' ', text)
    after = time.clock()
    print("Simplify Page Time: %f" %(after-before))
    return text

def parseWordsInPage(simplifiedPage):
    before = time.clock()
    #words = re.split(r'\W+', simplifiedPage)
    words = simplifiedPage.split(' ')
    after = time.clock()
    print("Parse Time: %f" %(after-before))
    for word in words:
        if word == "":
            words.remove(word)
    return words

simplifiedPage = simplifyPage(testPage)
wordList = parseWordsInPage(simplifiedPage)
for word in wordList:
    print word
    


Simplify Page Time: 0.296359
Parse Time: 0.000365
Lightweight
Java
Game
Library
Wikipedia
the
free
encyclopedia
Lightweight
Java
Game
Library
From
Wikipedia
the
free
encyclopedia
Jump
to
navigation
search
This
article
has
multiple
issues
Please
help
improve
it
or
discuss
these
issues
on
the
talk
page
Learn
how
and
when
to
remove
these
template
messages
The
topic
of
this
article
may
not
meet
Wikipedia
s
general
notability
guideline
Please
help
to
establish
notability
by
citing
reliable
secondary
sources
that
are
independent
of
the
topic
and
provide
significant
coverage
of
it
beyond
its
mere
trivial
mention
If
notability
cannot
be
established
the
article
is
likely
to
be
merged
redirected
or
deleted
Find
sources
Lightweight
Java
Game
Library
news
newspapers
books
scholar
JSTOR
free
images
March
2016
Learn
how
and
when
to
remove
this
template
message
This
article
relies
too
much
on
references
to
primary
sources
Please
improve
this
by
adding
secondary
or
tertiary
sources
August
2015
Learn
h

In [4]:
# TO DELETE

import sys
import time
import binascii

def wordHash(word):
    word = word.lower()
    return binascii.crc32(word) & 0xffffffff

def addWordToLexicon(word, wordIndex, lexiconList, wordIds):    
    word = word.lower()
    wordIds[wordHash(word)] = wordIndex
    wordIndex += 1
    lexiconList.append(word)
    return wordIds, lexiconList, wordIndex
    
def saveLexicon(lexiconList):
    lexicon = ""
    for word in lexiconList:
        lexicon += word + ","
    output = open('poople\\lexicon', 'w')
    output.write(lexicon)
    output.close()
    
def loadLexicon():
    lexiconList = []
    stream = open('poople\\lexicon', 'r')
    lexicon = stream.read()
    stream.close()
    lexiconList = lexicon.split(",")
    newLexiconList = []
    wordIndex = 0
    wordIds = {}
    for word in lexiconList:
        wordIds, newLexiconList, wordIndex = addWordToLexicon(word, wordIndex, newLexiconList, wordIds)
    return wordIds, lexiconList, wordIndex
    
wordIds, lexiconList, wordIndex = loadLexicon()
print wordIds
print sys.getsizeof(lexiconList)

{64952355L: 0}
160


# Anchors, URL Resolver, Links, Doc Index, URL Server

# Indexer(rest) Barrels, Sorter, PageRank

In [5]:
#handle (en/de)coding hits

def encodePlainHit(cap, imp, pos):
    result = 0
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += imp << 12
    result += pos & 0b0000111111111111
    return result
def encodeFancyHit(cap, ftype, pos):
    result = 0
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += 0b0111000000000000 #fancy hits are always 7
    result += ftype << 8
    return result
def encodeAnchorHit(cap, ahash, pos):
    result = 0b0111000000000000
    if(cap):
        result +=   0b1000000000000000 #2^16
    result += pos
    result += ahash << 4
    return result

def decodeHit(hit):
    result = []
    result.append(hit >> 15)
    imp = (hit >> 12) & 0b111
    result.append(imp)
    if(imp != 7):
        result.append(hit & 0b0000111111111111)
    else: #fancy hit
        fancyType = (hit >> 8) & 0b1111
        result.append(fancyType)
        if(fancyType == 0): #anchor hit
            result.append(hit >> 4 & 0b1111)
            result.append(hit & 0b1111)
        else:
            result.append(hit & 0b11111111)
    return result

hit = encodePlainHit(False, 4, 0b001100000111)
fhit = encodeFancyHit(True, 0b0111, 0b10000000)
ahit = encodeAnchorHit(True, 0b1000, 0b1000)
print(decodeHit(hit))
print(decodeHit(fhit))
print(decodeHit(ahit))

[0, 4, 775]
[1, 7, 7, 0]
[1, 7, 0, 8, 8]


In [6]:
# convert list of words into a hit list

def convertToHits(docID, words):
    result = ""
    for word in words:
        result += struct.pack("H", )
        result += struct.pack("H", encodePlainHit(word))
    
    
print(docID, convertToHits(wordList))

TypeError: convertToHits() takes exactly 2 arguments (1 given)

# Ranking System, Searcher