In [53]:
import os
# Root directory of the corpus; it is passed to getAllPatterns() at the bottom
# of this file. Each entry inside it is expected to hold a "page.html" file and
# a "seed" file (see getPageLocationAndSeed).
corpusDirectory = "corpus"
def getAllDirectoriesInLocation(loc):
    """Return the paths of the sub-directories directly inside ``loc``.

    Only directories are returned, as the function name promises; plain files
    lying next to them are skipped. Each path is built as
    ``loc + "/" + name``. The result is sorted by entry name because
    ``os.listdir`` gives no ordering guarantee.

    Raises whatever ``os.listdir`` raises when ``loc`` cannot be listed.
    """
    candidates = [loc + "/" + name for name in sorted(os.listdir(loc))]
    return [path for path in candidates if os.path.isdir(path)]



def getFirstLineInPage(fileLocation):
    """Return the first line of the file at ``fileLocation``, stripped.

    Only the first line is read; the rest of the file is never loaded.
    A first line that is blank yields the empty string.

    Raises:
        IndexError: if the file contains no lines at all. The exception type
            is the one the previous ``content[0]`` lookup produced, now with
            a message naming the file.
    """
    with open(fileLocation) as f:
        firstLine = f.readline()
    # readline() returns "" only at end-of-file, i.e. for an empty file;
    # a blank first line still comes back as "\n".
    if not firstLine:
        raise IndexError("file has no lines: " + str(fileLocation))
    return firstLine.strip()






def getPageLocationAndSeed(d):
    """Pair every page directory in ``d`` with its seed string.

    For each directory path the result holds a tuple
    ``(<dir>/page.html, seed)``, where the seed is the first line of
    ``<dir>/seed`` as returned by getFirstLineInPage. The page file itself is
    not opened here.
    """
    return [
        (pageDir + "/page.html", getFirstLineInPage(pageDir + "/seed"))
        for pageDir in d
    ]
    
        
def getAllStartEndPairs(document, key):
    """Find every occurrence of ``key`` in ``document``.

    Returns a list of ``(start, end)`` index pairs in ascending order, with
    ``document[start:end] == key``. Overlapping occurrences are all reported.

    An empty ``key`` returns an empty list: a zero-width match at every
    offset carries no information and would otherwise produce one pair per
    character of the document.
    """
    if not key:
        return []
    keyLength = len(key)
    pairs = []
    start = document.find(key)
    while start != -1:
        pairs.append((start, start + keyLength))
        # Resume one character later (not after the match) so that
        # overlapping occurrences are still found.
        start = document.find(key, start + 1)
    return pairs
 
    
def getEachSeedLocationsInPage(htmlPageContent, seed):
    """Locate one seed inside a page.

    At this stage this only delegates to getAllStartEndPairs() and returns
    its result unchanged.
    """
    seedSpans = getAllStartEndPairs(htmlPageContent, seed)
    return seedSpans

        



# Read the page stored at pageLocation and return its text, stripped.
def readPage(pageLocation):
    """Return the whole content of the file at ``pageLocation``.

    Leading and trailing whitespace is stripped from the returned text.
    """
    with open(pageLocation, 'r') as pageFile:
        return pageFile.read().strip()



def preprocessDocument(document):
    """Normalise whitespace in ``document``.

    At this stage preprocessing only collapses every run of whitespace into
    a single space; leading and trailing whitespace disappears as well.
    """
    words = document.split()
    return ' '.join(words)


def getCommonPrefix(s1, s2):
    """Return the longest string that both ``s1`` and ``s2`` start with.

    The comparison is character by character (``os.path.commonprefix``), so
    the result may be empty.
    """
    return os.path.commonprefix([s1, s2])



def insertCommonPrefix(results, commonPrefix):
    """Merge ``commonPrefix`` into the set ``results`` and return that set.

    ``results`` is modified in place. The rules, in order:

    * an empty prefix, or one already present, changes nothing;
    * every stored entry that is a shorter prefix of ``commonPrefix`` is
      removed;
    * if a stored entry is longer than ``commonPrefix`` and starts with it,
      ``commonPrefix`` is not added;
    * otherwise ``commonPrefix`` is added.
    """
    if not commonPrefix or commonPrefix in results:
        return results

    prefixLength = len(commonPrefix)

    # Collect first, remove afterwards: the set must not change while it is
    # being iterated.
    shorterPrefixes = [
        entry for entry in results
        if len(entry) < prefixLength and commonPrefix.startswith(entry)
    ]
    for entry in shorterPrefixes:
        results.remove(entry)

    hasLongerExtension = any(
        len(entry) > prefixLength and entry.startswith(commonPrefix)
        for entry in results
    )
    if not hasLongerExtension:
        results.add(commonPrefix)
    return results

def doPrefixIntersection(list1, list2):
    """Combine two collections of strings into a set of shared prefixes.

    For every string in ``list1`` the longest prefix it shares with any
    string of ``list2`` is determined (via getCommonPrefix) and merged into
    the result set with insertCommonPrefix. Returns that set.
    """
    merged = set()
    for left in list1:
        # The leading "" is the fallback when list2 is empty or nothing
        # matches; max() keeps the first of equally long candidates.
        candidates = [""] + [getCommonPrefix(left, right) for right in list2]
        longest = max(candidates, key=len)
        merged = insertCommonPrefix(merged, longest)
    return merged


def doIntersection(patterns):
    """Fold doPrefixIntersection over a sequence of string collections.

    An empty ``patterns`` is returned as is. With a single collection that
    collection itself is returned unchanged; otherwise the collections are
    combined left to right and the final doPrefixIntersection result is
    returned.
    """
    if len(patterns) == 0:
        return patterns
    combined = patterns[0]
    for nextCollection in patterns[1:]:
        combined = doPrefixIntersection(combined, nextCollection)
    return combined



def getLeftPatterns(leftContexts):
    """Derive the left-hand patterns from the per-page left contexts.

    The contexts are intersected with doIntersection and every resulting
    string is reversed before being returned in a list.
    """
    return [pattern[::-1] for pattern in doIntersection(leftContexts)]


def getRightPatterns(rightContexts):
    """Derive the right-hand patterns from the per-page right contexts.

    This is the doIntersection result, returned unchanged.
    """
    rightPatterns = doIntersection(rightContexts)
    return rightPatterns
    
    
    
    
    
def allPossiblePairs(leftPatterns, rightPatterns):
    """Return every ``(left, right)`` combination of the given patterns.

    The result is a list ordered by left pattern first, then by right
    pattern; it is empty when either input is empty.
    """
    # The list was previously built but never returned, so callers got None.
    return [(lp, rp) for lp in leftPatterns for rp in rightPatterns]
            
def filterPatterns(pageLocationAndSeeds, patterns):
    """Return ``patterns`` unchanged.

    Placeholder: no filtering is implemented yet and
    ``pageLocationAndSeeds`` is currently unused.
    """
    return patterns

def getLeftAndRightContexts(pageContent, seed, contextLength=100):
    """Collect the text surrounding every occurrence of ``seed``.

    Args:
        pageContent: the page text to search.
        seed: the string whose occurrences are located with
            getEachSeedLocationsInPage.
        contextLength: maximum number of characters taken on each side of an
            occurrence (default 100, the previously hard-coded value).

    Returns:
        A tuple ``(leftContexts, rightContexts)`` of two lists with one entry
        per occurrence. Each left context is the text before the occurrence,
        reversed so that it reads outwards from the seed; each right context
        is the text after it. Contexts are shorter near the page boundaries.
    """
    leftContexts = []
    rightContexts = []
    for (s, e) in getEachSeedLocationsInPage(pageContent, seed):
        # max() keeps the slice start from going negative, which would
        # otherwise wrap around to the end of the page.
        leftContexts.append(pageContent[max(0, s - contextLength):s][::-1])
        # Slicing already clamps an end index beyond the page length.
        rightContexts.append(pageContent[e:e + contextLength])
    return (leftContexts, rightContexts)
    

    
    
def printContextInformation(contexts, pageAndSeed):
    """Print the contexts collected for each page, grouped by seed.

    Args:
        contexts: one list of context strings per page, in the same order as
            ``pageAndSeed``.
        pageAndSeed: list of ``(pageLocation, seed)`` tuples.

    Raises:
        IndexError: if ``contexts`` has fewer entries than ``pageAndSeed``.
    """
    # Every page used to be re-read and preprocessed here although the
    # content was never used; only the seed and the contexts are needed.
    for index, (_, seed) in enumerate(pageAndSeed):
        contextPerPage = contexts[index]
        print("For seed " + str(seed))
        print("Context is following:")
        for item in contextPerPage:
            print(item)
            print("---------------------")


def getAllPatterns(corpusLocation):
    """Run the whole pattern-extraction pipeline over a corpus directory.

    Steps: list the entries of ``corpusLocation``, read each page and its
    seed, collect the left/right contexts of every seed occurrence, print
    them for inspection, intersect the contexts into left and right
    patterns, pair those up and pass the pairs through filterPatterns.

    Returns whatever filterPatterns returns for the candidate pairs.
    """
    pagesWithSeeds = getPageLocationAndSeed(
        getAllDirectoriesInLocation(corpusLocation))

    leftContexts = []
    rightContexts = []
    for (pageLocation, seed) in pagesWithSeeds:
        normalizedPage = preprocessDocument(readPage(pageLocation))
        leftsOfPage, rightsOfPage = getLeftAndRightContexts(normalizedPage, seed)
        leftContexts.append(leftsOfPage)
        rightContexts.append(rightsOfPage)

    # Diagnostic output: raw lists first, then a per-seed listing.
    print("Left contexts are " + str(leftContexts))
    print("Right contexts are " + str(rightContexts))
    printContextInformation(leftContexts, pagesWithSeeds)
    printContextInformation(rightContexts, pagesWithSeeds)

    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    candidatePairs = allPossiblePairs(leftPatterns, rightPatterns)
    return filterPatterns(pagesWithSeeds, candidatePairs)

# Parenthesised form prints the same value on Python 2 and is valid on
# Python 3, matching the print(...) calls used elsewhere in this file.
print(getAllPatterns(corpusDirectory))

Left contexts are [['":"eman","92%MAR+BG+482%+92%BG+46+C2%kcalB82%+1A+iM_sbmurCdaerb_tcudorp=rekcarto?genbbhztrsnxemti/p/', '>"861"=ditcaer-atad "uwkmoB 74dHK1_"=ssalc vid<>"761"=ditcaer-atad "0vvEH1_"=ssalc vid<>vid/<>gvs/<>'], ['":"eman","92%MAR+BG+482%+92%BG+46+C2%dloG82%+sulP+3F+OPPO_sbmurCdaerb_tcudorp=rekcarto?hgtwm5my8dyue', '>"671"=ditcaer-atad "uwkmoB 74dHK1_"=ssalc vid<>"571"=ditcaer-atad "0vvEH1_"=ssalc vid<>vid/<>gvs/<>'], ['":"eman","92%MAR+BG+482%+92%BG+46+C2%dloG82%+4+etoN+imdeR_sbmurCdaerb_tcudorp=rekcarto?8qdkzyjf68gqe', '>"961"=ditcaer-atad "uwkmoB 74dHK1_"=ssalc vid<>"861"=ditcaer-atad "0vvEH1_"=ssalc vid<>vid/<>gvs/<>'], ['":"eman","92%MAR+BG+382%+92%BG+23+C2%kcalB+moneV82%+sulP+8K+ovoneL_sbmurCdaerb_tcudorp=rekcarto?czpg', '>"671"=ditcaer-atad "uwkmoB 74dHK1_"=ssalc vid<>"571"=ditcaer-atad "0vvEH1_"=ssalc vid<>vid/<>gvs/<>']]
Right contexts are [['"}}]}] </script> <script nonce="14999099344180227478"> if (requestAnimationFrame && performance && p', '</div><svg widt

ValueError: too many values to unpack