In [90]:
import os
import re
from os import listdir
from os.path import isfile, join

corpusDirectory = "sequence/corpus"
def getAllDirectoriesInLocation(loc):
    listOfSubDir = [loc+"/"+f for f in os.listdir(loc)]
    return listOfSubDir



def getRelationFromStr(s):
    l = s.split("\t")
    output = []
    prev = l[0]
    for index in range(1, len(l)):
        output.append((prev, l[index]))
        prev = l[index]
    return output


def getFirstLineInPage(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    return getRelationFromStr(content[0])


def getPageLocationAndSeed(d):
    output = []
    for pageDir in d:
        pageLocation = pageDir + "/page.html"
        seedLocation = pageDir + "/seed"
        seed = getFirstLineInPage(seedLocation)
        output.append((pageLocation, seed))
    return output

#read the page from pageLocation
def readPage(pageLocation):
    htmlPageContent = ""
    with open(pageLocation, 'r') as myfile:
        htmlPageContent = myfile.read().strip()
    return htmlPageContent

#document processing logic at this stage is about removing multiple whitespaces into single one 
def preprocessDocument(document):
    return ' '.join(document.split())


def getEachSeedLocationsInPage(document, (lSeed, rSeed), kvDiff=400):
    lSeedLocations = [(i, i+len(lSeed)) for i in range(len(document)) if document.startswith(lSeed, i)]
    docLength = len(document)
    output = []
    for (ls, le) in lSeedLocations:
        subStr = document[le: min(le+400, docLength)]
        loc = subStr.find(rSeed)
        if loc==-1:
            continue
        rs = loc
        re = loc + len(rSeed)
        output.append((ls, le, rs, re))
    return output



def attach(item, l):
    result = []
    for i in l:
        value = [item]
        value.extend(i)
        result.append(value)
    return result

def getSequencesFromRelation(relations, startLoc):
    if len(relations)==0:
        return []
    sequences = []
    relation = relations[0]
    if len(relations)==1:
        for (ls, le, roffsetStart, roffsetEnd) in relation:
            if ls>=startLoc:
                sequences.append([(ls, le, roffsetStart, roffsetEnd)])
        return sequences
    else:
        other = relations[1:]
        for (ls, le, roffsetStart, roffsetEnd) in relations:
            if ls<startLoc:
                continue
            else:
                startLoc = ls + roffsetStart
                recAns = getSequencesFromRelation(other, startLoc)
                sequences.extend(attach((ls, le, roffsetStart, roffsetEnd), recAns))
        return sequences



def convertRelationIntoSequence(relations):
#     print("Called with " +str(relations))
    sequences = []
    if len(relations)==0:
        return sequences
    relation = relations[0]
    if len(relations)==1:
        for r in relations:
            sequences.append([r])
        return sequences
    else:
        other = relations[1:]
        for (ls, le, roffsetStart, roffsetEnd) in relation:
            startLoc = le+roffsetStart
#             print("Calling from " + str(startLoc))
            recAns = getSequencesFromRelation(other, startLoc)
#             print("Answer received is " + str(recAns))
            sequences.extend(attach((ls, le, roffsetStart, roffsetEnd), recAns))
    output = []
    totalRelations = len(relations)
    for item in sequences:
        if len(item)!=totalRelations:
            continue
        output.append(item)
    return output
        
    

    
def getPatternsForCluster(cluster, pageContent):
    if len(cluster)<=1:
        return []
    (ls, le, rStartOffset, rEndOffset) = cluster[0]
    (lastS, lastE, lastROffsetS, lastROffsetE) = cluster[len(cluster)-1]
    lastEnd = lastE+lastROffsetE
    leftPattern = pageContent[max(0, ls-100):ls]
    rightPattern = pageContent[lastEnd:min(len(pageContent), lastEnd+100)]
    middlePatterns = set()
    for (ls, le, roffsetS, roffsetE) in cluster:
        startPosition=le
        endPosition = le+roffsetS
        pattern = pageContent[startPosition:endPosition]
#         print("pattern is " + pattern)
        if not pattern in middlePatterns:
            middlePatterns.add(pattern)
    middlePatterns = list(middlePatterns)
#     print("left pattern is " + str(leftPattern))
#     print("right pattern is " + str(rightPattern))
#     print("Middle pattern is " + str(middlePatterns))
#     print("Middle pattern length is " + str(len(middlePatterns)))
    output = [leftPattern]
    output.extend(middlePatterns)
    output.append(rightPattern)
#     print("output is " + str(output))
    return output

    
def getRelationClusterPattern(pageContent, relations):
    relationPositions = []
    for relation in relations:
        positions = getEachSeedLocationsInPage(pageContent, relation)
        relationPositions.append(positions)
#     print("Relation positions are ")
#     print(relationPositions)
    relationPositions = convertRelationIntoSequence(relationPositions)
    output = []
    for relationPosition in relationPositions:
        output.append(getPatternsForCluster(relationPosition, pageContent))
    return output


    
def getBetweenStringOfParameter(q):
    p = q[1:len(q)-1]
    output = ""
    for item in p:
        output+=item+"\t"
    print(output)
    return output.strip()


def getBetweenStringOfCluster(cluster):
    output = []
    for item in cluster:
#         print("item is " + str(item))
        output.append(getBetweenStringOfParameter(item))
    return output

        
        
def clubPatternsAccordingToBetweenString(relationCluster):
    betweenOfClusters = []
    for cluster in relationCluster:
        betweenStringCluster = getBetweenStringOfCluster(cluster)
        betweenOfClusters.append(betweenStringCluster)
    return doBetweenStringIntersection(betweenOfClusters)

    
def doBetweenStringIntersection(betweenStringOfClusters):
    d = {}
    for betweenStringCluster in betweenStringOfClusters:
        betweenStringCluster = list(set(betweenStringCluster))
        for s in betweenStringCluster:
            if s in d:
                d[s]+=1
            else:
                d[s]=1
    output = []
    for key,val in d.items():
        print("Between strings\n===========\n" +str(val))
        print(key)
#         if val>1:
#             output.append(key)
    return output
    
def getPatterns(corpusDirectory):
    allDirs = getAllDirectoriesInLocation(corpusDirectory)
    output  = getPageLocationAndSeed(allDirs)
    relationClusters = []
    for (pageLocation, relations) in output:
        pageContent             = preprocessDocument(readPage(pageLocation))
        print("Page is " + str(pageLocation))
        print("Relation is " + str(relations))
        relationClusterPattern  = getRelationClusterPattern(pageContent, relations)
#         print("Relation cluster patterns is " + str(relationClusterPattern))
#         print(relationClusterPattern)
        relationClusters.append(relationClusterPattern)
#     print("Clubbing patterns")
#     print("type is "  +str(type(relationClusters[0][0])))
    print(clubPatternsAccordingToBetweenString(relationClusters))
    print(output)
    
    
    
        
        
        
# def doSomeIntersection():
    
    
getPatterns(corpusDirectory)


        



Page is sequence/corpus/page1/page.html
Relation is [('Electronics', 'Redmi 4'), ('Redmi 4', 'Xiaomi Redmi 4')]
Page is sequence/corpus/page3/page.html
Relation is [('Musical Instruments', 'Accessories'), ('Accessories', 'Blue Microphones Yeti USB Microphone')]
Page is sequence/corpus/page2/page.html
Relation is [('Electronics', 'Exclusive in Electronics'), ('Exclusive in Electronics', 'OnePlus 3T')]
Page is sequence/corpus/page4/page.html
Relation is [('Kindle Store', 'Kindle eBooks'), ('Kindle eBooks', 'The Intelligent Investor')]
 </a> </span></li> <li class="a-breadcrumb-divider"><span class="a-list-item a-color-tertiary"> › </span></li> <li><span class="a-list-item"> <a class="a-link-normal a-color-tertiary" href="https://www.amazon.in/redmi-4/b/ref=dp_bc_2?ie=UTF8&amp;node=13713623031"> 	 </a> </span></li> <li class="a-breadcrumb-divider"><span class="a-list-item a-color-tertiary"> › </span></li> <li><span class="a-list-item a-color-tertiary"> 	
 </a> </span></li> <li class="a-br