In [1]:
###############  This cell is a solution and for your reference.
###              When grading, delete this cell; only the tests below are used.

import os
def getIDsAndReviews(dataDir):
    """Load every regular file in a directory into a dictionary.

    Input  -- dataDir: path of the directory holding one document per file.
    Output -- a dictionary mapping each document ID (as computed by getDocID
              from the file name) to that file's full text (as returned by
              readFile).

    Entries of dataDir that are not regular files are skipped.
    """
    idToReview = {}
    for filename in os.listdir(dataDir):
        # os.listdir also returns sub-directories (for example a
        # .ipynb_checkpoints folder); trying to read one as a file would
        # raise, so skip anything that is not a regular file.
        if not os.path.isfile(os.path.join(dataDir, filename)):
            continue
        idToReview[getDocID(filename)] = readFile(dataDir, filename)
    return idToReview
        
def getDocID(filename):
    """Return the document ID for a file: its base name minus the last extension."""
    baseName = os.path.basename(filename)
    stem, _extension = os.path.splitext(baseName)
    return stem

def readFile(dataDir, filename, encoding=None):
    """Return the entire text content of dataDir/filename.

    Input  -- dataDir:  directory containing the file.
              filename: name of the file inside dataDir.
              encoding: optional text encoding passed to open(); the default
                        (None) keeps open()'s platform-default behaviour.
    Output -- the file's content as a single string.
    """
    path = os.path.join(dataDir, filename)
    # The with-block guarantees the handle is closed even if read() raises;
    # a bare open(...).read() leaves closing to the garbage collector.
    with open(path, encoding=encoding) as handle:
        return handle.read()

from nltk.tokenize import word_tokenize

#  This should return a list of strings; every string should be all lower-case letters
def cleanReview(review):
    """Tokenize a review with word_tokenize and keep only alphabetic tokens.

    Each token is lower-cased first; tokens for which str.isalpha() is false
    (numbers, punctuation, mixed tokens) are then dropped.
    """
    loweredTokens = (token.lower() for token in word_tokenize(review))
    return [token for token in loweredTokens if token.isalpha()]

def cleanReviews(idToReviews):
    """Replace, in place, every review in idToReviews with its cleaned token list.

    The dictionary passed in is modified; nothing is returned.
    """
    # Snapshot the keys so the loop does not iterate the dict it is updating.
    for docID in list(idToReviews):
        idToReviews[docID] = cleanReview(idToReviews[docID])

import re
def tokenizeWord(word):
    """Drop every character that is not an ASCII letter, then lower-case the rest.

    May return the empty string when the word contains no letters at all.
    """
    lettersOnly = re.sub('[^a-zA-Z]', '', word)
    return lettersOnly.lower()

#   Tokenize a single review (string)
#  Input -- a review (string)
#  Output -- the cleaned review -- a list of words (strings), each word is cleaned
def tokenizeDocument(document):
    """Split a document on whitespace and clean each piece with tokenizeWord.

    Pieces that clean down to the empty string are left out of the result.
    Returns the list of cleaned words in document order.
    """
    cleanedWords = []
    for rawWord in document.split():
        token = tokenizeWord(rawWord)
        if token:
            cleanedWords.append(token)
    return cleanedWords

# Input -- a dictionary as produced by getIDsAndReviews
# Output -- a dictionary with the same keys, but each value is the cleaned review (a list of strings)
def tokenizeDocuments(idToDocuments):
    """Return a new dictionary with the same keys whose values are tokenized.

    Each document string is passed through tokenizeDocument; the input
    dictionary itself is not modified.
    """
    return {
        docID: tokenizeDocument(document)
        for docID, document in idToDocuments.items()
    }

def buildTermToDocDictionaries(docsToTerms):
    """Build one single-document index per entry of docsToTerms.

    Input  -- a dictionary mapping document ID -> list of terms.
    Output -- a list with one dictionary per document, each mapping every
              distinct term of that document to the one-element set {docID}.
    """
    return [
        {term: {docID} for term in terms}
        for docID, terms in docsToTerms.items()
    ]

def mergeTwo(d1, d2):
    """Merge two term -> set-of-docIDs dictionaries into a new dictionary.

    A term present in either input appears in the result; its value is the
    union of the sets it has in d1 and d2. Neither input is modified.
    """
    noDocs = set()  # stand-in for a missing term; `|` never mutates it
    return {
        term: d1.get(term, noDocs) | d2.get(term, noDocs)
        for term in {*d1, *d2}
    }

def mergeIndexes(indexes):
    """Merge a sequence of term -> set-of-docIDs dictionaries into one index.

    Input  -- indexes: an iterable of dictionaries, each mapping a term to a
              set of document IDs.
    Output -- a new dictionary mapping every term seen in any input to the
              union of its document-ID sets. The inputs are not modified and
              the result shares no set objects with them.
    """
    merged = {}
    for index in indexes:
        for term, docIDs in index.items():
            # Accumulate into a set owned by `merged` instead of rebuilding
            # the whole accumulated index once per input dictionary.
            merged.setdefault(term, set()).update(docIDs)
    return merged

def indexReviews(dataDir):
    """Build the term -> set-of-docIDs index for every file in dataDir.

    Pipeline: read the files, tokenize each document, build one index per
    document, then merge those indexes into a single dictionary.
    """
    idToText = getIDsAndReviews(dataDir)
    idToTerms = tokenizeDocuments(idToText)
    perDocumentIndexes = buildTermToDocDictionaries(idToTerms)
    return mergeIndexes(perDocumentIndexes)

#######################################

class QueryProcessor:
    """Boolean (AND / OR) queries over an inverted index of documents.

    The index is built once, at construction time, by indexReviews(dataDir)
    and is stored in self.index as a dictionary mapping term -> set of
    document IDs.
    """

    def __init__(self, dataDir):
        """Build the index from the files found in dataDir."""
        self.index = indexReviews(dataDir)

    def query(self, term):
        """Return the set of document IDs indexed under term.

        An unknown term yields a fresh empty set. For a known term the set
        returned is the one stored in the index, so treat it as read-only.
        """
        return self.index.get(term, set())

    def _resolve(self, operand):
        """Turn a query operand into a set of document IDs.

        An operand that is already a set (for example the result of a nested
        andQuery / orQuery) is used as is; anything else is looked up as a
        term. isinstance is used so that frozensets and set subclasses are
        accepted as results too, rather than being looked up as a term.
        """
        if isinstance(operand, (set, frozenset)):
            return operand
        return self.query(operand)

    def andQuery(self, term1, term2):
        """Return the document IDs matching both operands (set intersection).

        Each operand is either a term or a set of document IDs.
        """
        return self._resolve(term1) & self._resolve(term2)

    def orQuery(self, term1, term2):
        """Return the document IDs matching either operand (set union).

        Each operand is either a term or a set of document IDs.
        """
        return self._resolve(term1) | self._resolve(term2)
    

In [2]:
##################
##  For testing / grading.  Copy this cell to the end of the student's solution,
##  then restart the kernel and run all cells.

def testEqual(msg, expected, actual):
    res = None
    if expected == actual:
        res = "Succeeded"
    else:
        res = f"Failed, expected {expected} got {actual}"
    print(f"{msg}: {res}")

def testOriginal():
    """Run the seven grading queries against the index built from 'data'.

    Each case is (label, expected result, thunk producing the actual result);
    the thunks are evaluated one at a time, immediately before their report
    line is printed by testEqual.
    """
    qp = QueryProcessor('data')
    cases = [
        ("q1", ['Dog1', 'Dog2', 'Dog3', 'Dog4', 'Joe1', 'Joe2'],
         lambda: sorted(qp.query('movie'))),
        ("q2", ['Dog1', 'Dog2'],
         lambda: sorted(qp.andQuery('yellow', 'lab'))),
        ("q3", ['Salome5'],
         lambda: sorted(qp.andQuery(qp.andQuery('salome', qp.orQuery('good', 'excellent')), 'opera'))),
        ("q4", ['Dog4', 'DooWop1', 'DooWop2', 'Joe1', 'Joe2', 'Salome5'],
         lambda: sorted(qp.orQuery(qp.orQuery('good', 'excellent'), qp.orQuery('liked', 'loved')))),
        ("q5", [],
         lambda: list(qp.query("xxx"))),
        ("q6", ['Dog1', 'Dog2', 'Dog3', 'Dog4', 'Joe1', 'Joe2'],
         lambda: sorted(qp.orQuery("xxx", 'movie'))),
        ("q7", [],
         lambda: list(qp.andQuery("xxx", 'movie'))),
    ]
    for label, expected, runQuery in cases:
        testEqual(label, expected, runQuery())
    
def testBooks():
    """Run the seven grading queries against the index built from DATA_DIR.

    Results are sorted before comparison so the report does not depend on
    set iteration order. The expected lists are tied to the corpus files the
    grader places in DATA_DIR.
    """
    DATA_DIR = 'textcorpora'
    # Use the constant rather than repeating the literal, so the corpus
    # location is defined in exactly one place.
    qp = QueryProcessor(DATA_DIR)
    q1 = sorted(qp.query('king'))
    testEqual("q1", ['bible-kjv', 'melville-moby_dick', 'shakespeare-macbeth'], q1)
    q2 = sorted(qp.andQuery('happy', 'death'))
    testEqual("q2", ['bible-kjv', 'melville-moby_dick', 'shakespeare-macbeth'], q2)
    q3 = sorted(qp.andQuery(qp.andQuery('whale', qp.orQuery('good', 'excellent')), 'ocean'))
    testEqual("q3", ['melville-moby_dick'], q3)
    q4 = sorted(qp.orQuery(qp.orQuery('good', 'excellent'), qp.orQuery('liked', 'loved')))
    testEqual("q4", ['bible-kjv', 'melville-moby_dick', 'shakespeare-macbeth'], q4)
    q5 = sorted(qp.query("xxx"))
    testEqual("q5", [], q5)
    q6 = sorted(qp.orQuery("xxx", 'mother'))
    testEqual("q6", ['bible-kjv', 'melville-moby_dick', 'shakespeare-macbeth'], q6)
    q7 = sorted(qp.andQuery("xxx", 'movie'))
    testEqual("q7", [], q7)
    
def doTests():
    """Run testOriginal, print a separator line, then run testBooks."""
    separator = "\n==========================================\n"
    testOriginal()
    print(separator)
    testBooks()
    
# Run both test suites when this cell is executed.
doTests()

q1: Succeeded
q2: Succeeded
q3: Succeeded
q4: Succeeded
q5: Succeeded
q6: Succeeded
q7: Succeeded


q1: Succeeded
q2: Succeeded
q3: Succeeded
q4: Succeeded
q5: Succeeded
q6: Succeeded
q7: Succeeded
