In [38]:
import os
import re
from os import listdir
from os.path import isfile, join

# Root folder of the corpus; it is passed to getPatterns() at the bottom of
# this file, which reads "<entry>/page.html" and "<entry>/seed" for each entry.
corpusDirectory = "sequence/corpus"
def getAllDirectoriesInLocation(loc):
    """Return the paths of the sub-directories found directly inside *loc*.

    Each returned path is built as ``loc + "/" + name``. Plain files are
    skipped so that callers which expect a directory per entry do not fail
    on stray files. Entries are returned in sorted order because
    ``os.listdir`` makes no ordering guarantee.
    """
    return [
        loc + "/" + name
        for name in sorted(os.listdir(loc))
        if os.path.isdir(os.path.join(loc, name))
    ]



def getRelationFromStr(s):
    """Split *s* on tabs and pair every field with the field that follows it.

    "a<TAB>b<TAB>c" yields [("a", "b"), ("b", "c")]; a string with fewer than
    two tab-separated fields yields an empty list.
    """
    fields = s.split("\t")
    # Zipping the list with itself shifted by one produces the adjacent pairs.
    return list(zip(fields, fields[1:]))


def getFirstLineInPage(fileLocation):
    """Return the relations encoded on the first line of *fileLocation*.

    Only the first line is read; it is stripped of surrounding whitespace and
    handed to getRelationFromStr. An empty file is treated as an empty first
    line instead of raising IndexError.
    """
    with open(fileLocation) as f:
        # readline() returns "" at EOF, so an empty file needs no special case.
        firstLine = f.readline()
    return getRelationFromStr(firstLine.strip())


def getPageLocationAndSeed(d):
    """Build one (pageLocation, seed) tuple for every directory in *d*.

    pageLocation is "<dir>/page.html" (the file itself is not opened here);
    seed is whatever getFirstLineInPage returns for "<dir>/seed".
    """
    return [
        (pageDir + "/page.html", getFirstLineInPage(pageDir + "/seed"))
        for pageDir in d
    ]

# Load the page stored at pageLocation.
def readPage(pageLocation):
    """Return the full text of *pageLocation* with surrounding whitespace stripped."""
    with open(pageLocation, 'r') as pageFile:
        rawText = pageFile.read()
    return rawText.strip()

# Pre-processing at this stage only collapses every run of whitespace into a single space.
def preprocessDocument(document):
    """Return *document* with whitespace runs collapsed and the ends trimmed."""
    tokens = document.split()
    return ' '.join(tokens)


def getEachSeedLocationsInPage(document, seedPair, kvDiff=400):
    """Locate every occurrence of a (left seed, right seed) pair in *document*.

    Parameters:
        document: the text to search.
        seedPair: a 2-item sequence ``(lSeed, rSeed)``. It is unpacked in the
            body because tuple parameters in the signature are not valid
            Python 3 syntax; positional callers are unaffected.
        kvDiff:   size of the window, in characters, after the end of the
            left seed in which the right seed is searched (previously ignored
            in favour of a hard-coded 400).

    Returns:
        A list of ``(ls, le, rs, re)`` tuples. ``ls``/``le`` are the absolute
        start/end indices of the left seed in *document*; ``rs``/``re`` are the
        start/end of the first right-seed match, expressed as offsets from
        ``le`` (i.e. relative to the start of the search window).
    """
    lSeed, rSeed = seedPair
    docLength = len(document)
    lSeedLength = len(lSeed)
    rSeedLength = len(rSeed)
    window = max(kvDiff, 0)  # a negative window would flip slice semantics
    output = []

    leftStart = document.find(lSeed)
    # The bound check keeps the empty-seed case identical to scanning
    # range(len(document)): find("") also matches at len(document).
    while leftStart != -1 and leftStart < docLength:
        leftEnd = leftStart + lSeedLength
        rightOffset = document[leftEnd:leftEnd + window].find(rSeed)
        if rightOffset != -1:
            output.append(
                (leftStart, leftEnd, rightOffset, rightOffset + rSeedLength)
            )
        # Advance by one character so overlapping left-seed matches are kept.
        leftStart = document.find(lSeed, leftStart + 1)
    return output



def attach(item, l):
    """Return a new list in which *item* is prepended to every sequence in *l*.

    Each element of *l* is copied into a fresh list, so the inputs are left
    unmodified.
    """
    return [[item] + list(tail) for tail in l]

def getSequencesFromRelation(relations, startLoc):
    """Enumerate ordered chains that take one hit from each relation.

    Parameters:
        relations: list with one entry per relation; each entry is a list of
            ``(ls, le, roffsetStart, roffsetEnd)`` hits where the two offsets
            are relative to ``le``.
        startLoc:  a hit of the first relation is accepted only when its
            ``ls`` is strictly greater than this position.

    Returns:
        A list of sequences; every sequence is a list holding exactly one hit
        per relation, in relation order. Empty when *relations* is empty or no
        complete chain exists.

    Fixes over the previous version: the loop now walks the hits of the first
    relation (it used to walk the list of relations itself), ``startLoc`` is
    no longer overwritten while iterating, the next lower bound is computed
    as ``le + roffsetEnd`` (the offsets are relative to ``le``), and both the
    last and the intermediate relations use the same ``ls > startLoc`` test.
    """
    if not relations:
        return []
    relation = relations[0]
    other = relations[1:]
    sequences = []
    for hit in relation:
        ls, le, roffsetStart, roffsetEnd = hit
        if ls <= startLoc:
            continue
        if not other:
            # Last relation in the chain: the hit is a complete tail by itself.
            sequences.append([hit])
            continue
        # Hits of the following relations must start after the end of this
        # hit's right seed.
        nextStart = le + roffsetEnd
        for tail in getSequencesFromRelation(other, nextStart):
            sequences.append([hit] + tail)
    return sequences



def convertRelationIntoSequence(relations):
    """Turn per-relation hit lists into complete ordered sequences.

    Parameters:
        relations: list with one entry per relation; each entry is a list of
            ``(ls, le, roffsetStart, roffsetEnd)`` hits.

    Returns:
        A list of sequences, each containing one hit per relation. With a
        single relation every hit becomes its own one-element sequence (the
        previous version iterated the outer list here and returned the whole
        hit list wrapped once). With several relations, each hit of the first
        relation is combined with the chains getSequencesFromRelation finds
        after the end of that hit's right seed.
    """
    if not relations:
        return []
    relation = relations[0]
    if len(relations) == 1:
        return [[hit] for hit in relation]

    other = relations[1:]
    sequences = []
    for (ls, le, roffsetStart, roffsetEnd) in relation:
        # Right-seed offsets are relative to le, so this is the absolute end
        # of the right seed.
        startLoc = le + roffsetEnd
        recAns = getSequencesFromRelation(other, startLoc)
        sequences.extend(attach((ls, le, roffsetStart, roffsetEnd), recAns))

    # Keep only chains that include a hit for every relation.
    totalRelations = len(relations)
    return [item for item in sequences if len(item) == totalRelations]
        
    
def getRelationClusterPattern(pageContent, relations):
    """Find the seed hits of every relation in *pageContent* and chain them.

    getEachSeedLocationsInPage is run once per relation; the resulting list
    of hit lists is passed to convertRelationIntoSequence, whose result is
    returned unchanged.
    """
    hitsPerRelation = [
        getEachSeedLocationsInPage(pageContent, seedPair)
        for seedPair in relations
    ]
    return convertRelationIntoSequence(hitsPerRelation)



def getPatterns(corpusDirectory):
    """Print the relation cluster patterns found for every page of the corpus.

    For each entry under *corpusDirectory* the page is read and whitespace-
    normalised, then its location, its seed relations and the computed
    patterns are printed (the patterns twice: once labelled, once raw).
    Finally the list of (pageLocation, relations) tuples is printed.
    Nothing is returned.
    """
    pagesWithSeeds = getPageLocationAndSeed(
        getAllDirectoriesInLocation(corpusDirectory)
    )
    for entry in pagesWithSeeds:
        pageLocation, relations = entry
        # The page is loaded before anything is printed for it.
        normalisedPage = preprocessDocument(readPage(pageLocation))
        print("Page is " + str(pageLocation))
        print("Relation is " + str(relations))
        patterns = getRelationClusterPattern(normalisedPage, relations)
        print("Relation cluster patterns is " + str(patterns))
        print(patterns)
    print(pagesWithSeeds)
    
    
# Run the whole extraction over the configured corpus directory; the results
# are printed, not returned.
getPatterns(corpusDirectory)


Page is sequence/corpus/page1/page.html
Relation is [('Electronics', 'Redmi 4'), ('Redmi 4', 'Xiaomi Redmi 4')]
Relation cluster patterns is [[(204222, 204233, 267, 274), (376374, 376381, 145, 159)], [(204222, 204233, 267, 274), (376456, 376463, 63, 77)], [(204222, 204233, 267, 274), (383349, 383356, 205, 219)], [(204222, 204233, 267, 274), (383534, 383541, 20, 34)]]
[[(204222, 204233, 267, 274), (376374, 376381, 145, 159)], [(204222, 204233, 267, 274), (376456, 376463, 63, 77)], [(204222, 204233, 267, 274), (383349, 383356, 205, 219)], [(204222, 204233, 267, 274), (383534, 383541, 20, 34)]]
Page is sequence/corpus/page3/page.html
Relation is [('Musical Instruments', 'Accessories'), ('Accessories', 'Blue Microphones Yeti USB Microphone')]
Relation cluster patterns is []
[]
Page is sequence/corpus/page2/page.html
Relation is [('Electronics', 'Exclusive in Electronics'), ('Exclusive in Electronics', 'OnePlus 3T')]
Relation cluster patterns is []
[]
Page is sequence/corpus/page4/page.html