# Part1: Crawler, Store Server, Repository

In [1]:
import urllib2
import urlparse

def simplifyUrl(url):
    return urlparse.urlsplit(url).geturl()

def crawlWebpage(url):
    connection = urllib2.urlopen(url)
    page = connection.read()
    connection.close
    return page
testPageUrl = 'https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library'
testPageId = 1234
testPage = crawlWebpage(testPageUrl)
print simplifyUrl(testPageUrl)

https://en.wikipedia.org/wiki/Lightweight_Java_Game_Library


In [2]:
import zlib
import urlparse
import struct

packetFormat = 'IxHI'
packetByteSize = 12

def checksum(url):
    import binascii
    return binascii.crc32(url) & 0xffffffff #this keeps the results consistent across all platforms

#packs a given webpage information into bytes
def packWebpage(webpage, url, docID):
    url = simplifyUrl(url)
    packet = struct.pack(packetFormat, docID, len(url), len(webpage)) + url + webpage
    return packet

#unpacks a given packet into webpage, url, and docID
def unpackWebpage(packet):
    info = struct.unpack(packetFormat, packet[:packetByteSize])#because 3 arguments
    docID, urlLength, docLength = info[:]
    url = packet[packetByteSize:packetByteSize + urlLength]
    webpage = packet[packetByteSize + urlLength:]
    return webpage, url, docID

#packs a webpage and saves it by docID. save name is subject to change in the future.
def storePage(webpage, url, docID):
    url = simplifyUrl(url)
    print("Struct Size: %i" %struct.calcsize(packetFormat))
    packet = packWebpage(webpage, url, docID)
    compressedPacket = zlib.compress(packet)
    output = open('\\poople\\page_repository\\' + urlparse.urlparse(url)[1] + '%i' % docID, 'w')
    output.write(compressedPacket)
    output.close()

packet = packWebpage(testPage, testPageUrl, testPageId)
storePage(testPage, testPageUrl, testPageId)

Struct Size: 12


# Indexer(Parsing part) and Lexicon

In [11]:
from bs4 import BeautifulSoup
import time
import re
import string

class Hit:
    def __init__(self, word, capital, fontSize):
        self.word = word
        self.capital = capital
        self.fontSize = fontSize

def parseTitleHits(page):
    hits = []
    soup = BeautifulSoup(page, "lxml")
    title = soup.title.string
    simplifiedTitle = re.sub(r'\W+', ' ', title).split(" ")
    for word in simplifiedTitle:
        capital = not word.islower()
        fontSize = 6
        hits.append(Hit(word.lower(), capital, fontSize))
    return hits

#for hit in parseTitleHits(testPage):
#    print "w: " + hit.word + "\tcap:", hit.capital, "\tfont: ", hit.fontSize


def simplifyPage(page):
    before = time.clock()
    soup = BeautifulSoup(testPage, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    text = re.sub(r'\W+', ' ', text)
    after = time.clock()
    print("Simplify Page Time: %f" %(after-before))
    return text

def parseWordsInPage(simplifiedPage):
    before = time.clock()
    #words = re.split(r'\W+', simplifiedPage)
    words = simplifiedPage.split(' ')
    after = time.clock()
    print("Parse Time: %f" %(after-before))
    for word in words:
        if word == "":
            words.remove(word)
    return words

simplifiedPage = simplifyPage(testPage)
wordList = parseWordsInPage(simplifiedPage)
for word in wordList:
    print word
    


Simplify Page Time: 0.083336
Parse Time: 0.000164
Lightweight
Java
Game
Library
Wikipedia
the
free
encyclopedia
Lightweight
Java
Game
Library
From
Wikipedia
the
free
encyclopedia
Jump
to
navigation
search
This
article
has
multiple
issues
Please
help
improve
it
or
discuss
these
issues
on
the
talk
page
Learn
how
and
when
to
remove
these
template
messages
The
topic
of
this
article
may
not
meet
Wikipedia
s
general
notability
guideline
Please
help
to
establish
notability
by
citing
reliable
secondary
sources
that
are
independent
of
the
topic
and
provide
significant
coverage
of
it
beyond
its
mere
trivial
mention
If
notability
cannot
be
established
the
article
is
likely
to
be
merged
redirected
or
deleted
Find
sources
Lightweight
Java
Game
Library
news
newspapers
books
scholar
JSTOR
free
images
March
2016
Learn
how
and
when
to
remove
this
template
message
This
article
relies
too
much
on
references
to
primary
sources
Please
improve
this
by
adding
secondary
or
tertiary
sources
August
2015
Learn
h

In [5]:
import sys
import time

def wordHash(word):
    word = word.lower()
    return binascii.crc32(word) & 0xffffffff

def addWordToLexicon(word, wordIndex, lexiconList, wordIds):
    word = word.lower()
    wordIds[wordHash(word)] = wordIndex
    wordIndex += 1
    lexiconList.append(word)
    return wordIds, lexiconList, wordIndex

def getWordId(word):
    try:
        lowerWord = word.lower()
        return wordIds[wordHash(lowerWord)]
    except KeyError:
        print "Word not in lexicon: " + word
        return -1;
    
def saveLexicon(lexiconList):
    lexicon = ""
    for word in lexiconList:
        lexicon += word + ","
    output = open('\\poople\\lexicon', 'w')
    output.write(lexicon)
    output.close()
    
def loadLexicon():
    lexiconList = []
    stream = open('\\poople\\lexicon', 'r')
    lexicon = stream.read()
    stream.close()
    lexiconList = lexicon.split(",")
    newLexiconList = []
    wordIndex = 0
    wordIds = {}
    for word in lexiconList:
        wordIds, newLexiconList, wordIndex = addWordToLexicon(word, wordIndex, newLexiconList, wordIds)
    return wordIds, lexiconList, wordIndex
    
wordIds, lexiconList, wordIndex = loadLexicon()
print wordIds

{0: 1400, 1788938242: 1159, 3187900075: 1113, 3230498821: 1044, 3747223559: 565, 835352588: 959, 2902841359: 731, 2260480018: 1087, 820668437: 1302, 1718319126: 1363, 1154021400: 1178, 1724430365: 1123, 1423829025: 1169, 1114099747: 417, 1843675174: 561, 1964050481: 500, 2568837171: 1137, 1762699316: 503, 1574674497: 1222, 1060745282: 886, 4052537413: 1120, 689295432: 918, 666529867: 1288, 1280163916: 949, 2838417488: 728, 3110191185: 433, 2252471652: 784, 460212315: 679, 146714717: 1116, 2897602656: 1002, 711391330: 1157, 1630611555: 292, 395401319: 1310, 306016363: 1009, 1972041839: 818, 3218944360: 1263, 1673277559: 113, 1917560954: 100, 2508312701: 158, 3065852031: 1269, 1437575297: 1135, 2796017799: 1035, 3265925258: 1068, 2297403532: 912, 1458215053: 1341, 1997877400: 494, 376615066: 60, 4097009823: 1323, 2458417313: 716, 1877817509: 720, 4038908070: 1183, 2854008999: 1039, 2223210669: 1182, 432980143: 1054, 1638783154: 609, 2892185779: 610, 1465923764: 699, 3207319737: 196, 3407

# Anchors, URL Resolver, Links, Doc Index, URL Server

# Indexer(rest) Barrels, Sorter, PageRank

# Ranking System, Searcher