In [101]:
import codecs
import itertools
import math
import operator
import os
import re
from collections import Counter
from collections import defaultdict
from os import path

from nltk.stem.snowball import SnowballStemmer


In [102]:
"""
  Read files in the directory to a list of strings. 
  Input: directory name
  Output: Text files listed in the directory as String array
"""
def read_documents(dirname):
    """Read every regular file in ``dirname`` and return the contents as strings.

    Files are visited in natural (human) order of their names, so ``job2.txt``
    comes before ``job10.txt``. ``os.listdir`` returns entries in arbitrary
    order, and callers use the position in the returned list as a document id,
    so the order has to be deterministic across runs and platforms.

    Entries that are not regular files (for example the ``.ipynb_checkpoints``
    directory Jupyter creates) are skipped instead of crashing ``open``.

    Input: directory name
    Output: list of strings, one per file
    """
    def natural_key(name):
        # re.split with a capturing group alternates text and digit runs, so
        # the odd positions are always the numeric pieces; comparing them as
        # ints gives 2 < 10 instead of "10" < "2".
        return [int(piece) if pos % 2 else piece
                for pos, piece in enumerate(re.split(r'(\d+)', name))]

    documents = []
    for name in sorted(os.listdir(dirname), key=natural_key):
        filename = os.path.join(dirname, name)
        if not os.path.isfile(filename):
            continue
        # The context manager closes the handle even if read() fails.
        with open(filename) as txt:
            documents.append(txt.read())
    return documents

In [103]:
""" 
   Convert a string representing one document into a list of words and convert to lowercase.
   Input: document - text
   Output: List of words
"""
def tokenize(document):
    """Split ``document`` into lowercase word tokens.

    A token is a run of word characters, optionally continued by ``-`` or
    ``'`` followed by more word characters (so ``mid-90`` and ``world's``
    stay in one piece).

    Input: document - text
    Output: list of lowercase tokens, in order of appearance
    """
    lowered = []
    for match in re.finditer(r"\w+(?:[-']\w+)*", document):
        lowered.append(match.group(0).lower())
    return lowered


In [104]:
""" 
   Given a list of tokens, stem the english words using Snowball Stemmer
   Input: List of words
   Output: Stem words using nltk SnowballStemmer 
"""
def stem(tokens):
    """Stem each token with nltk's English Snowball stemmer.

    Input: list of words
    Output: list with the stem of each word, in the same order
    """
    to_stem = SnowballStemmer("english").stem
    stemmed = []
    for token in tokens:
        stemmed.append(to_stem(token))
    return stemmed

In [105]:
""" 
    Given documents, return dict mapping terms to document frequency.
    Input: Documents
    Output: Document frequency dictionary
"""
def count_doc_frequencies(docs):
    """Count, for every term, the number of documents that contain it.

    Input: docs - list of token lists, one per document
    Output: defaultdict mapping term -> document frequency (0 for unseen terms)
    """
    frequencies = defaultdict(lambda: 0)
    for tokens in docs:
        # A set collapses repeats so each document counts a term at most once.
        for term in set(tokens):
            frequencies[term] += 1
    return frequencies

In [106]:
"""
    Creates an index in which each postings list contains a list of [doc_id, tf-idf weight] pairs.
    Input: Documents & Document frequency of each term
    Output: tf-idf weight for each term
"""
def create_tfidf_index(docs, doc_freqs):
    """Build an inverted index of tf-idf weights.

    For every term of every document the weight is

        (1 + log10(count / len(doc))) * log10(N / df)

    where ``count`` is the number of occurrences of the term in the document,
    ``N`` is the number of documents and ``df`` is ``doc_freqs[term]``.

    Both ratios are computed with explicit float division. With the ``/``
    operator on two ints, Python 2 truncates: the relative term frequency
    collapses to 0 for almost every term and the idf ratio is rounded down to
    an integer, so the weights silently depend on the interpreter version.

    NOTE(review): because the term frequency is relative to document length,
    ``1 + log10(...)`` is negative whenever a term makes up less than 10% of
    the document; the usual log-weighting uses the raw count. Confirm which
    one is intended.

    Input: docs - list of token lists; doc_freqs - mapping term -> document
           frequency (a zero or missing frequency makes the division fail)
    Output: defaultdict(list) mapping term -> list of [doc_id, weight] pairs,
            where doc_id is the position of the document in ``docs``
    """
    index = defaultdict(list)
    total_docs = float(len(docs))
    for doc_id, tokens in enumerate(docs):
        doc_size = float(len(tokens))
        # Counter only yields terms with count >= 1, so the ratio below is
        # always positive and log10 is always defined.
        for term, count in Counter(tokens).items():
            relative_tf = count / doc_size
            idf = math.log10(total_docs / doc_freqs[term])
            index[term].append([doc_id, (1 + math.log10(relative_tf)) * idf])
    return index


In [107]:
"""
    Return a dict mapping doc_id to length.
    Input: tf-idf index weight list
    Output: Dictionary mapping with doc length
"""
def compute_doc_lengths(index):
    """Compute the Euclidean length of every document vector in ``index``.

    Input: index - mapping term -> list of [doc_id, weight] pairs
    Output: defaultdict mapping doc_id -> sqrt of the sum of squared weights
            (0 for doc ids that never appear in the index)
    """
    lengths = defaultdict(lambda: 0)
    # First pass: accumulate the squared weight of every posting per document.
    for postings in index.values():
        for posting in postings:
            doc_id, weight = posting[0], posting[1]
            lengths[doc_id] += math.pow(weight, 2)
    # Second pass: turn each sum of squares into a vector length.
    for doc_id in list(lengths):
        lengths[doc_id] = math.sqrt(lengths[doc_id])
    return lengths

In [108]:
""" 
    Given query terms, converts to dictionary with frequency
    Input: Query terms
    Output: Frequency of terms
"""
def query_to_vector(query_terms):
    """Turn a sequence of query terms into a term -> frequency dictionary.

    Input: query terms
    Output: plain dict mapping each term to the number of times it occurs
    """
    term_counts = Counter()
    term_counts.update(query_terms)
    return dict(term_counts)

In [131]:
"""
    Given query vector, index and doc length, returns a sorted list of doc_id, score pairs, where the score is the 
    cosine similarity between the query_vector and the document. 
    Input: query_vector, document index, doc length
    Output: doc_id and cosine score
"""
def search_by_cosine(query_vector, index, doc_lengths):
    """Rank documents by length-normalised dot product with the query.

    The score of a document is the sum of ``query_weight * doc_weight`` over
    the query terms it contains, divided by the document's vector length.

    The result is sorted best match first, so slicing the head of the list
    yields the top results. An ascending sort would put the worst matches at
    the front.

    Query terms missing from ``index`` are ignored without touching the index:
    subscripting a defaultdict index would insert an empty postings list for
    every unknown term, and a plain dict would raise KeyError.

    A document whose length is zero or missing gets score 0.0 instead of
    raising ZeroDivisionError.

    Input: query_vector - mapping term -> weight
           index - mapping term -> list of [doc_id, weight] pairs
           doc_lengths - mapping doc_id -> vector length
    Output: list of (doc_id, score) tuples, highest score first
    """
    scores = defaultdict(float)
    for query_term, query_weight in query_vector.items():
        for doc_id, doc_weight in index.get(query_term, ()):
            scores[doc_id] += query_weight * doc_weight

    ranked = []
    for doc_id, dot_product in scores.items():
        length = doc_lengths.get(doc_id, 0)
        ranked.append((doc_id, dot_product / length if length else 0.0))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


In [136]:
""" 
    Print the first three lines of the job file for each of the first 5 entries of doc_ids
    Input: doc_ids
"""
def print_job_info(doc_ids):
    """Print the first lines of the job files for the first five results.

    For each of the first five ``(doc_id, score)`` pairs in ``doc_ids`` the
    file ``jobs/job<doc_id>.txt`` is opened and its first three lines are
    printed as a list.

    Input: doc_ids - list of (doc_id, score) pairs
    """
    _print_result_headers(doc_ids, "jobs/job")


def print_user_info(doc_ids):
    """Print the first lines of the user files for the first five results.

    For each of the first five ``(doc_id, score)`` pairs in ``doc_ids`` the
    file ``users/user<doc_id>.txt`` is opened and its first three lines are
    printed as a list.

    Input: doc_ids - list of (doc_id, score) pairs
    """
    _print_result_headers(doc_ids, "users/user")


def _print_result_headers(doc_ids, path_prefix, limit=5, header_lines=3):
    """Print the leading lines of ``<path_prefix><doc_id>.txt`` for the first ``limit`` results."""
    for doc_id, _score in doc_ids[:limit]:
        file_name = path_prefix + str(doc_id) + ".txt"
        with open(file_name) as result_file:
            # islice stops quietly when the file has fewer lines than asked
            # for; repeated next() calls would raise StopIteration instead.
            header = list(itertools.islice(result_file, header_lines))
        # Parenthesised form prints the same text under Python 2 and 3.
        print(header)


In [151]:
# Build the job index: load every file under jobs/, tokenize it, then derive
# document frequencies, tf-idf postings and per-document vector lengths.
documents = read_documents('jobs')
# NOTE(review): despite the name, only tokenize() is applied here; stem() is
# never called in this cell -- confirm whether stemming was intended.
stemmed_docs = [tokenize(d) for d in documents]
doc_freqs = count_doc_frequencies(stemmed_docs)
index = create_tfidf_index(stemmed_docs, doc_freqs)
doc_lengths = compute_doc_lengths(index)
# Use the first user profile returned by read_documents as the query.
user_documents = read_documents('users')
# Echoes the entire profile text into the cell output.
print user_documents[0]
# Same caveat as above: the query is tokenized but not stemmed.
user_stemmed_docs = tokenize(user_documents[0])
# Query weights are raw term counts.
user_query_vector = query_to_vector(user_stemmed_docs)
search_job_results = search_by_cosine(user_query_vector,index,doc_lengths)
# NOTE(review): doc ids are positions in `documents`, while print_job_info
# opens jobs/job<id>.txt -- this only shows the matching job if list position
# and file number coincide; verify against the directory contents.
print_job_info(search_job_results)

Hans Brough
Likes to build useful stuff
San Francisco Bay Area
Hans started out thinking he wanted to be a Landscape Architect. After attending CAL POLY San Luis Obispo and attaining his degree in the mid- 90's he was wooed to the dark side by the promise of the internet.

Luckily many of the problem solving skills they teach in design school translate well to planning and implementing web projects. The tool set is a little different...

His career to date has been spent learning and putting to practical use these new tool sets while always trying to keep the end user in mind. He now enjoys participating in web projects using a variety of tools. 

Specialties: Speed, Object Based JavaScript, Design Patterns, Server Side Javascript, DOM Scripting, YUI, jQuery, Mootools, Dust, HandleBars, MV* Frameworks, PHP, JSP, MySQL, DHTML, CSS, HTML, XHTML, XML, XSLT, CVS, Bug Tracking Systems, Grunt, UI testing with Jasmine, XFN, SEO, XSS, Grunt, Hapi, Node
Senior Web Developer
Brightsky Labs
Web D