diff --git a/.DS_Store b/.DS_Store index b96aa7e..85257df 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/backend/backendInterface.pyc b/backend/backendInterface.pyc index 735b78e..1deb5a3 100644 Binary files a/backend/backendInterface.pyc and b/backend/backendInterface.pyc differ diff --git a/backend/crawler1.py b/backend/crawler1.py index 4ef1812..cf9c050 100644 --- a/backend/crawler1.py +++ b/backend/crawler1.py @@ -95,7 +95,7 @@ def crawl(self, depth = 2): pages = newpages if __name__=="__main__": - #crawler().crawl() + crawler().crawl() print "CRAWLER FINISHED" print "BEGINNING PAGERANK ALGORITHM" diff --git a/backend/cursorhelpers.py b/backend/cursorhelpers.py index 7b90698..4e15318 100644 --- a/backend/cursorhelpers.py +++ b/backend/cursorhelpers.py @@ -7,7 +7,6 @@ """In general, Inserts return true or false. Searches return a value or None""" class DataBase(object): - @classmethod def drop_tables(cls): try: @@ -22,12 +21,15 @@ def drop_tables(cls): @classmethod def create_tables(cls): - connection.cursor().execute('CREATE TABLE lexicon ( word_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, word VARCHAR(100) UNIQUE NOT NULL)') - connection.cursor().execute('CREATE TABLE document (url_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, url VARCHAR(255) UNIQUE NOT NULL)') - connection.cursor().execute('CREATE TABLE link ( from_doc_id INTEGER NOT NULL REFERENCES document(url_id), to_doc_id INTEGER NOT NULL REFERENCES document(url_id), freq UNSIGNED INTEGER, PRIMARY KEY(from_doc_id, to_doc_id))') - connection.cursor().execute('CREATE TABLE doc_word_index ( doc_id INTEGER REFERENCES document(url_id), word_id INTEGER REFERENCES lexicon(word_id), freq UNSIGNED INTEGER, PRIMARY KEY(doc_id, word_id))') - connection.cursor().execute('CREATE TABLE page_rank(doc_id INTEGER REFERENCES document(url_id), page_rank INTEGER, PRIMARY KEY(doc_id))') - connection.commit() + try: + connection.cursor().execute('CREATE TABLE lexicon ( word_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, word VARCHAR(100) UNIQUE NOT NULL)') + connection.cursor().execute('CREATE TABLE document (url_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, url VARCHAR(255) UNIQUE NOT NULL)') + connection.cursor().execute('CREATE TABLE link ( from_doc_id INTEGER NOT NULL REFERENCES document(url_id), to_doc_id INTEGER NOT NULL REFERENCES document(url_id), freq UNSIGNED INTEGER, PRIMARY KEY(from_doc_id, to_doc_id))') + connection.cursor().execute('CREATE TABLE doc_word_index ( doc_id INTEGER REFERENCES document(url_id), word_id INTEGER REFERENCES lexicon(word_id), freq UNSIGNED INTEGER, PRIMARY KEY(doc_id, word_id))') + connection.cursor().execute('CREATE TABLE page_rank(doc_id INTEGER REFERENCES document(url_id), page_rank INTEGER, PRIMARY KEY(doc_id))') + connection.commit() + except: + print "Error in trying to create tables!\n" class DocLexBaseDB(object): @@ -98,6 +100,9 @@ class Document(DocLexBaseDB): MAXWORDLEN = 255 + + + class LinkWordIndexBaseDB(object): """The doc_word_index and link databases are very similar This class fulfills the functionalities of both. Variable names diff --git a/db/repo.db b/db/repo.db index c8dcb6e..3c371b7 100755 Binary files a/db/repo.db and b/db/repo.db differ diff --git a/frontend/__init__.pyc b/frontend/__init__.pyc index cc9b3e6..60d5fd9 100644 Binary files a/frontend/__init__.pyc and b/frontend/__init__.pyc differ diff --git a/frontend/core.py b/frontend/core.py index c0a35a0..a0b7d3d 100644 --- a/frontend/core.py +++ b/frontend/core.py @@ -18,7 +18,7 @@ def index(): @view('results') def results(): keyword = request.GET.get('keyword') - print search_query(keyword) - return dict(keyword=keyword, val=6) + results = search_query(keyword) + return dict(keyword=keyword, results=results) run(host='localhost', port=8080) \ No newline at end of file diff --git a/frontend/views/index.tpl b/frontend/views/index.tpl index c644c45..8c5b6d0 100644 --- a/frontend/views/index.tpl +++ b/frontend/views/index.tpl @@ -24,7 +24,7 @@
FooBar *
* should pop to 'p', not 'b'. +
Foo
* | * should pop to 'tr', not the first 'td'
+ """
+
+ nestingResetTriggers = self.NESTABLE_TAGS.get(name)
+ isNestable = nestingResetTriggers != None
+ isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ popTo = None
+ inclusive = True
+ for i in range(len(self.tagStack)-1, 0, -1):
+ p = self.tagStack[i]
+ if (not p or p.name == name) and not isNestable:
+ #Non-nestable tags get popped to the top or to their
+ #last occurance.
+ popTo = name
+ break
+ if (nestingResetTriggers is not None
+ and p.name in nestingResetTriggers) \
+ or (nestingResetTriggers is None and isResetNesting
+ and self.RESET_NESTING_TAGS.has_key(p.name)):
+
+ #If we encounter one of the nesting reset triggers
+ #peculiar to this tag, or we encounter another tag
+ #that causes nesting to reset, pop up to but not
+ #including that tag.
+ popTo = p.name
+ inclusive = False
+ break
+ p = p.parent
+ if popTo:
+ self._popToTag(popTo, inclusive)
+
+ def unknown_starttag(self, name, attrs, selfClosing=0):
+ #print "Start tag %s: %s" % (name, attrs)
+ if self.quoteStack:
+ #This is not a real tag.
+ #print "<%s> is not real!" % name
+ attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
+ self.handle_data('<%s%s>' % (name, attrs))
+ return
+ self.endData()
+
+ if not self.isSelfClosingTag(name) and not selfClosing:
+ self._smartPop(name)
+
+ if self.parseOnlyThese and len(self.tagStack) <= 1 \
+ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ return
+
+ tag = Tag(self, name, attrs, self.currentTag, self.previous)
+ if self.previous:
+ self.previous.next = tag
+ self.previous = tag
+ self.pushTag(tag)
+ if selfClosing or self.isSelfClosingTag(name):
+ self.popTag()
+ if name in self.QUOTE_TAGS:
+ #print "Beginning quote (%s)" % name
+ self.quoteStack.append(name)
+ self.literal = 1
+ return tag
+
+ def unknown_endtag(self, name):
+ #print "End tag %s" % name
+ if self.quoteStack and self.quoteStack[-1] != name:
+ #This is not a real end tag.
+ #print "%s> is not real!" % name
+ self.handle_data('%s>' % name)
+ return
+ self.endData()
+ self._popToTag(name)
+ if self.quoteStack and self.quoteStack[-1] == name:
+ self.quoteStack.pop()
+ self.literal = (len(self.quoteStack) > 0)
+
+ def handle_data(self, data):
+ self.currentData.append(data)
+
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.endData()
+ self.handle_data(text)
+ self.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.convertXMLEntities:
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.convertHTMLEntities and \
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = SGMLParser.parse_declaration(self, i)
+ except SGMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+ """This parser knows the following facts about HTML:
+
+ * Some tags have no closing tag and should be interpreted as being
+ closed as soon as they are encountered.
+
+ * The text inside some tags (ie. 'script') may contain tags which
+ are not really part of the document and which should be parsed
+ as text, not tags. If you want to parse the text as tags, you can
+ always fetch it and parse it explicitly.
+
+ * Tag nesting rules:
+
+ Most tags can't be nested at all. For instance, the occurance of
+ a tag should implicitly close the previous tag. + + Para1 Para2 + should be transformed into: + Para1 Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a tag should _not_ implicitly close the previous +tag. + + Alice said:Bob said:Blah + should NOT be transformed into: + Alice said:Bob said:Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a |