In [10]:
import os
import re
from os import listdir
from os.path import isfile, join

corpusDirectory = "sequence/corpus"
def getAllDirectoriesInLocation(loc):
    listOfSubDir = [loc+"/"+f for f in os.listdir(loc)]
    return listOfSubDir



        
#give it the name of directory and get back list of files present in directory
def getAllFilesInDirectory(directory):
    return [join(directory, f) for f in listdir(directory) if isfile(join(directory, f))]




def getRelationFromStr(s):
    l = s.split("\t")
    output = []
    prev = l[0]
    for index in range(1, len(l)):
        output.append((prev, l[index]))
        prev = l[index]
    return output


def getFirstLineInPage(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    return getRelationFromStr(content[0])

def getFirstLineAsBinaryRelation(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    l = content[0].split("\t")
    return (l[0], l[len(l)-1])
#     return getRelationFromStr(content[0])





def getBinaryRelationFromRelation(l):
    return (l[0], l[len(l)-1])


def getPageLocationAndSeed(d):
    output = []
    for pageDir in d:
        pageLocation = pageDir + "/page.html"
        seedLocation = pageDir + "/seed"
        seed = getFirstLineInPage(seedLocation)
        output.append((pageLocation, seed))
    return output


def getPageLocationAndBinarySeed(d):
    output = []
    for pageDir in d:
        pageLocation = pageDir + "/page.html"
        seedLocation = pageDir + "/seed"
        seed = getFirstLineAsBinaryRelation(seedLocation)
        output.append((pageLocation, seed))
    return output

#read the page from pageLocation
def readPage(pageLocation):
    htmlPageContent = ""
    with open(pageLocation, 'r') as myfile:
        htmlPageContent = myfile.read().strip()
    return htmlPageContent

#document processing logic at this stage is about removing multiple whitespaces into single one 
def preprocessDocument(document):
    return ' '.join(document.split())



#get all seds location with range of seom 2000 characters
def getEachSeedLocationsInPage(document, (lSeed, rSeed), kvDiff=2000):
    lSeedLocations = [(i, i+len(lSeed)) for i in range(len(document)) if document.startswith(lSeed, i)]
    docLength = len(document)
    output = []
    for (ls, le) in lSeedLocations:
        subStr = document[le: min(le+2000, docLength)]
#         print("Seed is " + str(rSeed))
        loc = subStr.find(rSeed)
        if loc==-1:
            continue
        rs = loc
        re = loc + len(rSeed)
        output.append((ls, le, rs, re))
#     print("returning " + str(output))
    return output


def getBinaryOutput(l):
    output = []
    for(pageLocation, relation) in l:
        output.append((pageLocation, getBinaryRelationFromRelation(relation)))
    return output



def getInformationAboutPageRelation(pageContent, allPositions):
    leftContexts = []
    rightContexts = []
    middleStrings = []
    totalPageLength = len(pageContent)
    for (ls, le, roffsetS, roffsetE) in allPositions:
        lc           = pageContent[max(0, ls-100):ls]
        lc = lc[::-1]
        rc           = pageContent[roffsetE:min(roffsetE+100, totalPageLength)]
        middleString = pageContent[le:le+roffsetS]
        leftContexts.append(lc)
        rightContexts.append(rc)
        middleStrings.append(middleString)
    return (leftContexts, rightContexts, middleStrings)




#reverse strings in list
def reverseListElements(l):
    output = []
    for item in l:
        output.append(item[::-1])
    return output




def getLeftAndRightContexts(pageContent, seed):
    b = getEachSeedLocationsInPage(pageContent, seed)
#     print("b is " + str(b))
    output = []
    totalLength = len(pageContent)
    leftContexts = []
    rightContexts = []
#     print("Seed is " + str(seed))
    for (ls, le, roffsetS, roffsetE) in b:
        leftContext = pageContent[max(0, ls-100):ls]
        leftContext = leftContext[::-1]
        rightContext = pageContent[le+roffsetE:min(totalLength, le+roffsetS+100)]
#         print("Right context isn " + str(rightContext))
        leftContexts.append(leftContext)
        rightContexts.append(rightContext)
    return (leftContexts, rightContexts)

def getCommonPrefix(s1, s2):
    l = [s1, s2]
    return os.path.commonprefix(l)



def insertCommonPrefix(results, commonPrefix):
    if commonPrefix in results or len(commonPrefix)<=0:
        return results
    r = list(results)
    for item in r:
        if len(item)<len(commonPrefix) and commonPrefix.startswith(item):
            results.remove(item)
    r=list(results)
    for item in r:
        if len(item)>len(commonPrefix) and item.startswith(commonPrefix):
            return results
    results.add(commonPrefix)
    return results

def doPrefixIntersection(list1, list2):
    results = set()
    for item1 in list1:
        commonPrefix = ""
        for item2 in list2:
            c = getCommonPrefix(item1, item2)
            if len(c)>len(commonPrefix):
                commonPrefix = c
        results = insertCommonPrefix(results, commonPrefix)
    return results


def doIntersection(patterns):
    if len(patterns)<=0:
        return patterns
    result = patterns[0]
    for index in range(1, len(patterns)):
        result = doPrefixIntersection(result, patterns[index])
    return result



def getLeftPatterns(leftContexts):
    lp = doIntersection(leftContexts)
    results = []
    for item in lp:
        results.append(item[::-1])
    return results


def getRightPatterns(rightContexts):
    return list(doIntersection(rightContexts))
    
    
    
    
def allPossiblePairs(leftPatterns, rightPatterns):
    patterns = []
    for lp in leftPatterns:
        for rp in rightPatterns:
            patterns.append((lp, rp))
    return patterns



def filterPatterns(pageLocationAndSeeds, patterns):
    seedsSet = set()
    for (pageLoc, s) in pageLocationAndSeeds:
        seedsSet.add(s)
    seeds = list(seedsSet)
    patternSuccess = []
    for index in range(0, len(patterns)):
        patternSuccess.append(0)
    for index in range(0, len(patterns)):
        for (pageLocation, seed) in pageLocationAndSeeds:
            pageContent = preprocessDocument(readPage(pageLocation))
            resultsFound = extractSet([patterns[index]], pageContent)
            if seed in resultsFound:
                patternSuccess[index]+=1
                
    output = []
    for index in range(0, len(patterns)):
        if patternSuccess[index]>0:
            output.append(patterns[index])
    return output

def doProcessingWithoutClass(s):
    i = "class[\s]*=[\s]*"
    o       = "class="
    s = re.sub(i, o, s)
    i = "([^class])=\"[^\"]*\""
    o = "\\1=\"\""
    return re.sub(i, o, s)


def getAllPatternsForBinaryRelation(corpusLocation):
    directories                = getAllDirectoriesInLocation(corpusLocation)
#     pageLocationsAndSeed       = getPageLocationAndSeed(directories)
    pageLocationAndBinarySeeds = getPageLocationAndBinarySeed(directories)
    leftContexts  = []
    rightContexts = []
    for (pageLocation, seed) in pageLocationAndBinarySeeds:
#         pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)))
        pageContent = preprocessDocument(readPage(pageLocation))    
        (leftContextsPerPage, rightContextsPerPage) = getLeftAndRightContexts(pageContent, seed)
        leftContexts.append(leftContextsPerPage)
        rightContexts.append(rightContextsPerPage)
#     print("co")
    print("Left contexts are " + str(leftContexts))
    print("Right contexts are " + str(rightContexts))
    printContextInformation(leftContexts, pageLocationAndBinarySeeds, True)
    print("Printed left patterns")
    printContextInformation(rightContexts, pageLocationAndBinarySeeds)
    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    
    print("Left patterns are " + str(leftPatterns))
    print("Right patterns are " + str(rightPatterns))
    patterns = allPossiblePairs(leftPatterns, rightPatterns)
    print("All possible patterns: ")
    print(patterns)
#     patterns = filterPatterns(pageLocationAndBinarySeeds, patterns)
    #TODO if number of patterns are empty go for empty class thing
    print("Final Patterns ")
    print(patterns)
    return patterns

#Pattern is (l, r) and match them to htmlPageContent
def findEntitySetwrtPattern(htmlPageContent, (l, r)):
    #for each start location of pattern find its end
    #for each end page find the pattern right
    #extract everything till that point
    #after extraction move one point above that pattern string
    results = []
    for m in re.finditer(re.escape(l), htmlPageContent):
        start = m.start()
        end = m.end()
        rightPage = htmlPageContent[end:]
        rightLoc  = rightPage.find(r)
        if rightLoc==-1:
            break
        element = rightPage[:rightLoc]
        if len(element)>1 and len(element)<2000:
            results.append(element)
    return set(results)



#input: all the pattern found and new html page
#output: extract set of elements from the page that are in our set(whose definition we know)
def getPassage(patterns, htmlPageContent):
    output = []
    for pattern in patterns:
        output.extend(findEntitySetwrtPattern(htmlPageContent, pattern))
    return output



def eachSeedPresent(passage, seed):
    for (a, b) in seed:
        if not a in passage or not b in passage:
            return False
    return True

def filterPassage(passage, seed):
    (a, b)=seed[0]
    (c, d)=seed[len(seed)-1]
    output = []
    for p in passage:
        if p.startswith(a) and p.endswith(d) and eachSeedPresent(p, seed):
            output.append(doProcessingWithoutClass(p))
    return output

def getBetweenPatterns(seed, passage):
    output = []
    for p in passage:
        for (a, b) in seed:
            loc1 = p.find(a)
            loc2 = p.find(b)
            if loc1==-1 or loc2==-1 or loc1>loc2:
                break
            output.append(p[loc1+len(a):loc2])
            p = p[loc2:]
    return output
        
    
def applyMiddlePatterns(middlePatterns, l):
    if len(middlePatterns)==0:
        return l
    pattern = middlePatterns[0]
    results = []
    for item in l:
        results.extend(item.split(pattern))
    if len(middlePatterns)==1:
        return results
    return applyMiddlePatterns(middlePatterns[1:], results)
    
s = "Kindle Store </a> </span></li> <li class=\"a-breadcrumb-divider\"><span class=\"a-list-item a-color-tertiary\"> \xe2\x80\xba </span></li> <li><span class=\"a-list-item\"> <a class=\"a-link-normal a-color-tertiary\" href=\"\"> Kindle eBooks </a> </span></li> <li class=\"a-breadcrumb-divider\"><span class=\"a-list-item a-color-tertiary\"> \xe2\x80\xba </span></li> <li><span class=\"a-list-item a-color-tertiary\"> The Intelligent Investor" 

def getAllPatternsForNaryRelation(corpusLocation):
    patterns = getAllPatternsForBinaryRelation(corpusLocation)
    directories                = getAllDirectoriesInLocation(corpusLocation)
    pageLocationsAndSeed       = getPageLocationAndSeed(directories)
    output = []
    print("\n\n\n\n\n\n")
    for (pageLocation, seed) in pageLocationsAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))    
        passage = getPassage(patterns, pageContent)
        passage = filterPassage(passage, seed)
        print("seed is " + str(seed))
        print("passage is " + str(passage))
        print("\n\n\n")
        output.append((seed, passage))
    middlePatterns = []
    for (seed, passage) in output:
        middlePatterns.extend(getBetweenPatterns(seed, passage))
    middlePatterns = list(set(middlePatterns))
#     createAllPatterns(patterns, middlePatterns)
    print("Middle patterns are ")
    print(middlePatterns)
    print("\n\n\n")
    print("output is ")
    print(output)
    return (patterns, middlePatterns)


#remove elements which were actually tag
def isTag(s):
    if s.find("<")!=-1 and s.find(">")!=-1:
        return True
    return False


#preprocess results before writing it to file
def preprocessResults(output):
    result = []
    for o in output:
        if isTag(o):
            continue
        result.append(o)
    return result

def writeListToFile(loc, l):
    thefile = open(loc, 'w')
    for item in l:
        print>>thefile, item

def ifAnyMiddlePatternMatch(passage, patterns):
    for p in patterns:
        if passage.find(p)!=-1:
            return True
    return False

def printContextInformation(contexts, pageAndSeed, rev=False):
    for index in range(0, len(pageAndSeed)):
        (pageLocation, seed) = pageAndSeed[index]
        pageContent = preprocessDocument(readPage(pageLocation))
        contextPerPage = contexts[index]
        print("For seed " + str(seed))
        print("Context is following: " + pageLocation + " number " + str(contextPerPage))
        print("--------------------------------")
        for item in contextPerPage:
            if rev==True:
                item = item[::-1]
            print(item)
            print("---------------------")


(patterns, middlePatterns) = getAllPatternsForNaryRelation(corpusDirectory)
print("\n\n\n\n")
print("Applying middle patterns: ")
print(applyMiddlePatterns(middlePatterns, [s]))
unseenCorpusLocation = "sequence/unseenCorpus"
unseenCorpusFiles     = getAllFilesInDirectory(unseenCorpusLocation)
unseenResults = []
for productPage in unseenCorpusFiles:
    pageContent = preprocessDocument(readPage(productPage))
    passage = getPassage(patterns, pageContent)
    resultsPerPage = []
    for p in passage:
        p = doProcessingWithoutClass(p)
        ifAnyMiddlePatternMatch(p, middlePatterns):
            resultsPerPassage = preprocessResults(applyMiddlePatterns(middlePatterns, [p]))
            resultsPerPage.extend(resultsPerPassage)
#         if len(resultsPerPassage)==1 and resultsPerPassage[0]==p:
#             continue
#         print("Passage is ")
#         print(p)
#         print("Reuslts per passage are ")
#         print(resultsPerPassage)
        
#         resultsPerPage.extend(preprocessResults(list(extractSet(middlePatterns, p))))
    if len(resultsPerPage)>0:
        unseenResults.extend(resultsPerPage)
        print("passage is " + str(passage))
        print("Reuslt pre page: ")
        print(resultsPerPage)
    
unseenResults = list(set(unseenResults))
print("Total number of unseen results: " + str(len(unseenResults)))
writeListToFile("unseenSeeds", unseenResults)

    
    
    
    
    
    
        
# def doSomeIntersection():
    
    
# getAllPatterns(corpusDirectory)
getAllPatternsForNaryRelation(corpusDirectory)



SyntaxError: invalid syntax (<ipython-input-10-78fe9007c7fa>, line 436)