Skip to content

Commit

Permalink
Backend done
Browse files Browse the repository at this point in the history
  • Loading branch information
alexlitoiu committed Dec 6, 2011
1 parent 1e4784e commit 85bec20
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 19 deletions.
Binary file modified backend/.crawler1.py.swp
Binary file not shown.
Binary file modified backend/.pagerank.py.swp
Binary file not shown.
14 changes: 10 additions & 4 deletions backend/PageRank.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self,doc_id,url,outgoing_links=None, score =0):

def add_outgoing_link(self, outnode):
    """Record an outgoing edge from this node to *outnode*.

    Keeps ``num_outgoing`` equal to the length of ``outgoing_links`` so the
    cached count can never drift out of sync with the list itself.
    """
    links = self.outgoing_links
    links.append(outnode)
    self.num_outgoing = len(links)

def load_graph():
#tuples of (id,url)
Expand All @@ -35,6 +35,9 @@ def load_graph():

#tuples of (from, to, frequency)
raw_edges = db.Link.get_all_edges()

num_from = set([edge[0] for edge in raw_edges])
num_to = set([edge[1] for edge in raw_edges])
for edge in raw_edges:
node1 = nodes[edge[0]]
node2 = nodes[edge[1]]
Expand Down Expand Up @@ -65,17 +68,20 @@ def get_seed_nodes_and_set_seeds(nodes):
return node_seed_list

def populate_page_rank():
    """Run the PageRank computation and persist every node's score.

    Each computed node is written to the page_rank table via
    ``db.PageRank.insert_page_rank``; nothing is returned.
    """
    for node in compute_page_rank().values():
        db.PageRank.insert_page_rank(node.doc_id, node.score)

def compute_page_rank():
    """Compute PageRank scores over the link graph.

    Loads the graph from the database, seeds the initial scores, then
    propagates rank outward from each seed node.

    Returns:
        The node mapping produced by ``load_graph`` with updated scores.
    """
    graph = load_graph()
    seeds = get_seed_nodes_and_set_seeds(graph)
    for seed in seeds:
        recursively_propagate_rank(seed)
    return graph

Expand Down
25 changes: 20 additions & 5 deletions backend/backendInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,24 @@
'''

import Crawler
import URLSequencer
import cursorhelpers as db

def searchQuery(word):
    """Return pages containing *word*, ordered by descending page rank.

    Args:
        word: the search term (a single word).

    Returns:
        ((doc_id, page_rank, frequency of word, url), ...)
        or (), if the query returns nothing.
    """
    word_id = db.Lexicon.get_word_id(word)

    # `is None` rather than `== None`: identity test is the correct (and
    # PEP 8 mandated) way to detect "word not in lexicon".
    if word_id is None:
        # No matches possible for an unindexed word.
        return ()

    return db.Join.get_page_rank_urls_by_word(word_id)

if __name__=="__main__":
print searchQuery("alex")
8 changes: 4 additions & 4 deletions backend/crawler1.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def crawl(self, depth = 2):
url = urljoin(page, link['href'])
if url.find("'") != -1: continue
url = url.split('#')[0] # remove location portion
if url[0:4] == 'http' and url in self.is_indexed:
if url[0:4] == 'http' and url not in self.is_indexed:
newpages[url] = 1
linktext = self._get_text_only(link)
self._add_link(page, url, linktext)
Expand All @@ -95,11 +95,11 @@ def crawl(self, depth = 2):
pages = newpages

if __name__=="__main__":
conn = sqlite3.connect('../db/repo.db')
db.connection = conn #set the connection variable in cursorshelper

#crawler().crawl()

print "CRAWLER FINISHED"
print "BEGINNING PAGERANK ALGORITHM"

#index
import pagerank
pagerank.connection = conn
Expand Down
37 changes: 31 additions & 6 deletions backend/cursorhelpers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sqlite3

#global cursors which all classes in this module use
connection = None
connection = sqlite3.connect("../db/repo.db")


"""In general, Inserts return true or false. Searches return a value or None"""
Expand All @@ -10,18 +10,23 @@ class DataBase(object):

@classmethod
def drop_tables(cls):
    """Drop every application table.

    Uses ``DROP TABLE IF EXISTS`` so that a missing table (e.g. on a
    first run before ``create_tables``) is not an error, while genuine
    failures — a locked database, a broken connection — still raise
    instead of being silently swallowed by a bare ``except``.
    """
    cursor = connection.cursor()
    # Table names are a fixed in-code tuple, so string concatenation
    # here is not an injection risk.
    for table in ('lexicon', 'document', 'link',
                  'doc_word_index', 'page_rank'):
        cursor.execute('DROP TABLE IF EXISTS ' + table)
    connection.commit()

@classmethod
def create_tables(cls):
    """Create the lexicon, document, link, word-index and page-rank tables."""
    schema = (
        'CREATE TABLE lexicon ( word_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, word VARCHAR(100) UNIQUE NOT NULL)',
        'CREATE TABLE document (url_id INTEGER PRIMARY KEY ASC AUTOINCREMENT, url VARCHAR(255) UNIQUE NOT NULL)',
        'CREATE TABLE link ( from_doc_id INTEGER NOT NULL REFERENCES document(url_id), to_doc_id INTEGER NOT NULL REFERENCES document(url_id), freq UNSIGNED INTEGER, PRIMARY KEY(from_doc_id, to_doc_id))',
        'CREATE TABLE doc_word_index ( doc_id INTEGER REFERENCES document(url_id), word_id INTEGER REFERENCES lexicon(word_id), freq UNSIGNED INTEGER, PRIMARY KEY(doc_id, word_id))',
        'CREATE TABLE page_rank(doc_id INTEGER REFERENCES document(url_id), page_rank INTEGER, PRIMARY KEY(doc_id))',
    )
    for statement in schema:
        connection.cursor().execute(statement)
    connection.commit()


Expand Down Expand Up @@ -141,3 +146,23 @@ class DocWordIndex(LinkWordIndexBaseDB):
FIELD1 = 'doc_id'
FIELD2 = 'word_id'

class PageRank(object):
    """Persistence helper for computed page-rank scores."""

    @classmethod
    def insert_page_rank(cls, doc_id, page_rank):
        """Store *page_rank* for *doc_id* in the page_rank table and commit."""
        cur = connection.cursor()
        cur.execute('insert into page_rank values (?,?)', (doc_id, page_rank))
        connection.commit()

class Join(object):
    """A class that performs queries on joins of tables """

    @classmethod
    def get_page_rank_urls_by_word(cls, word_id):
        """Returns pages that contain the word corresponding to word_id by
        descending page_rank.

        Return:
            ((doc_id, page_rank, frequency of word, url),...)
        """
        # One three-way join: documents -> their page rank -> the word's
        # per-document frequency, restricted to word_id.
        query = ('select document.url_id, page_rank.page_rank, doc_word_index.freq, document.url '
                 'from document,page_rank,doc_word_index '
                 'where document.url_id=page_rank.doc_id '
                 'and page_rank.doc_id=doc_word_index.doc_id '
                 'and doc_word_index.word_id=? '
                 'order by page_rank desc')
        cursor = connection.cursor()
        cursor.execute(query, (word_id,))
        return list(cursor)
Binary file modified backend/cursorhelpers.pyc
Binary file not shown.
Binary file modified backend/pagerank.pyc
Binary file not shown.

0 comments on commit 85bec20

Please sign in to comment.