In [10]:
from collections import defaultdict
from collections import Counter
import codecs
import math
import re
import operator
import os
from os import path
from nltk.stem.snowball import SnowballStemmer

In [11]:
def read_documents(dirname, encoding=None):
    """Read every regular file in ``dirname`` and return the file contents.

    Args:
        dirname: Path of the directory to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one string per file. Files are ordered by name using a
        natural sort (``job2.txt`` before ``job10.txt``), so the position of
        a document in the list is the same on every run.

    Raises:
        OSError: If ``dirname`` cannot be listed or a file cannot be read
            (``FileNotFoundError`` when the directory does not exist).
    """
    def natural_key(name):
        # re.split with a capturing group alternates text/digit chunks, so
        # keys always compare str-with-str and int-with-int.
        return [int(chunk) if chunk.isdigit() else chunk
                for chunk in re.split(r'(\d+)', name)]

    documents = []
    # os.listdir returns entries in arbitrary order; sort them so that list
    # positions (used as document ids downstream) are reproducible.
    for name in sorted(os.listdir(dirname), key=natural_key):
        filename = os.path.join(dirname, name)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() cannot read them.
        if not os.path.isfile(filename):
            continue
        # The context manager closes the handle even if read() raises.
        with open(filename, encoding=encoding) as txt:
            documents.append(txt.read())
    return documents

In [12]:
def tokenize(document):
    """Split ``document`` into lower-cased word tokens.

    A token is a run of word characters, optionally extended by further runs
    joined with a hyphen or an apostrophe, so "world-wide" and "it's" each
    stay a single token.
    """
    tokens = []
    for match in re.finditer(r"\w+(?:[-']\w+)*", document):
        tokens.append(match.group(0).lower())
    return tokens

In [14]:


def stem(tokens):
    """Return the English Snowball stem of every token, in input order."""
    to_stem = SnowballStemmer("english").stem
    return list(map(to_stem, tokens))

In [15]:
def count_doc_frequencies(docs):
    """Count, for every term, the number of documents that contain it.

    Args:
        docs: A sequence of documents, each a list of tokens.

    Returns:
        A ``defaultdict`` mapping term -> document frequency; terms that were
        never seen read as 0.
    """
    doc_freqs = defaultdict(lambda: 0)
    for tokens in docs:
        # A set collapses repeats so each document counts a term only once.
        for term in set(tokens):
            doc_freqs[term] += 1
    return doc_freqs

In [16]:
def create_tfidf_index(docs, doc_freqs):
    """Build an inverted index of tf-idf weights.

    Each term occurrence is weighted as
    ``(1 + log10(count)) * log10(N / df)``, where ``count`` is the raw number
    of times the term occurs in the document, ``N`` is the number of
    documents and ``df`` is the term's document frequency.

    The previous version took the logarithm of ``count / len(doc)``, a value
    that is at most 1, so the tf factor dropped to zero or below for any term
    making up 10% or less of a document. Its fallback branch was unreachable
    because ``count`` is always at least 1.

    Args:
        docs: A sequence of documents, each a list of tokens.
        doc_freqs: Mapping term -> number of documents containing the term.

    Returns:
        A ``defaultdict(list)`` mapping term -> list of ``[doc_id, weight]``
        pairs, where ``doc_id`` is the document's position in ``docs``.

    Raises:
        ZeroDivisionError: If ``doc_freqs`` reports 0 for a term in ``docs``.
    """
    index = defaultdict(list)
    total_docs = len(docs)
    for doc_id, tokens in enumerate(docs):
        for term, count in Counter(tokens).items():
            idf = math.log10(total_docs / doc_freqs[term])
            # count >= 1, so the log-scaled term frequency is always >= 1.
            tf = 1 + math.log10(count)
            index[term].append([doc_id, tf * idf])
    return index

In [17]:
def compute_doc_lengths(index):
    """Compute the Euclidean length of every document's weight vector.

    Args:
        index: Mapping term -> list of ``[doc_id, weight]`` entries.

    Returns:
        A ``defaultdict`` mapping doc_id -> square root of the sum of squared
        weights for that document; unseen ids read as 0.
    """
    lengths_by_doc = defaultdict(lambda: 0)
    # First pass: accumulate the squared weights per document.
    for postings in index.values():
        for entry in postings:
            doc_id, weight = entry[0], entry[1]
            lengths_by_doc[doc_id] += math.pow(weight, 2)
    # Second pass: turn each sum of squares into a vector length.
    for doc_id in list(lengths_by_doc):
        lengths_by_doc[doc_id] = math.sqrt(lengths_by_doc[doc_id])
    return lengths_by_doc

In [18]:
def query_to_vector(query_terms):
    """Turn a list of query terms into a plain dict of term -> occurrence count."""
    term_counts = Counter(query_terms)
    return {term: count for term, count in term_counts.items()}

In [19]:
def search_by_cosine(query_vector, index, doc_lengths):
    """Rank documents by length-normalised dot product with the query.

    Args:
        query_vector: Mapping term -> query weight.
        index: Mapping term -> list of ``[doc_id, weight]`` entries.
        doc_lengths: Mapping doc_id -> length of the document's weight vector.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted from the highest score to
        the lowest, so slicing the front of the list yields the best matches.
        Only documents sharing at least one term with the query appear.
    """
    scores = defaultdict(lambda: 0)
    for query_term, query_weight in query_vector.items():
        # .get() keeps unknown query terms from being inserted into a
        # defaultdict index and from raising KeyError on a plain dict.
        for doc_id, doc_weight in index.get(query_term, ()):
            scores[doc_id] += query_weight * doc_weight
    for doc_id in scores:
        length = doc_lengths[doc_id]
        # A zero-length vector has no direction; score it 0 instead of
        # dividing by zero.
        scores[doc_id] = scores[doc_id] / length if length else 0.0
    # Best match first: the original ascending sort put the worst documents
    # at the front of the list.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [22]:
def print_job_info(doc_ids):
    """Print the leading lines of the job files for the first five results.

    Args:
        doc_ids: A list of ``(doc_id, score)`` pairs. Only the first five are
            used, and each ``doc_id`` is looked up as ``jobs/job<doc_id>.txt``.

    For each result, prints a list holding up to the first three lines of the
    file (fewer when the file is shorter).
    """
    for doc_id, _score in doc_ids[:5]:
        fileName = "jobs/job" + str(doc_id) + ".txt"
        with open(fileName) as resultFile:
            # zip() stops at the shorter input, so short files no longer raise
            # StopIteration; range() replaces the Python-2-only xrange().
            job_info = [line for _, line in zip(range(3), resultFile)]
        print(job_info)


def print_user_info(doc_ids):
    """Print the leading lines of the user files for the first five results.

    Args:
        doc_ids: A list of ``(doc_id, score)`` pairs. Only the first five are
            used, and each ``doc_id`` is looked up as
            ``users/user<doc_id>.txt``.

    For each result, prints a list holding up to the first three lines of the
    file (fewer when the file is shorter).
    """
    for doc_id, _score in doc_ids[:5]:
        fileName = "users/user" + str(doc_id) + ".txt"
        with open(fileName) as resultFile:
            # zip() stops at the shorter input, so short files no longer raise
            # StopIteration; range() replaces the Python-2-only xrange().
            user_info = [line for _, line in zip(range(3), resultFile)]
        print(user_info)

In [23]:
# Build the tf-idf index over the job postings. Tokens are stemmed so the
# *_stemmed_* names below describe what the variables actually hold.
documents = read_documents('jobs')
stemmed_docs = [stem(tokenize(d)) for d in documents]
doc_freqs = count_doc_frequencies(stemmed_docs)
index = create_tfidf_index(stemmed_docs, doc_freqs)
doc_lengths = compute_doc_lengths(index)

# Use the first user document as the query. It goes through the same
# tokenize + stem steps as the indexed documents so its terms can match.
user_documents = read_documents('users')
print(user_documents[0])
user_stemmed_docs = stem(tokenize(user_documents[0]))
user_query_vector = query_to_vector(user_stemmed_docs)
search_job_results = search_by_cosine(user_query_vector,index,doc_lengths)
print_job_info(search_job_results)

FileNotFoundError: [Errno 2] No such file or directory: 'jobs'