In [1]:
import os
import re
from os import listdir
from os.path import isfile, join

# Folder handed to getAllPatterns() at the bottom of this cell. Every entry in
# it is treated as a page folder holding a "page.html" and a "seed" file.
corpusDirectory = "corpus"
def getAllDirectoriesInLocation(loc):
    """Return the sub-directories found directly inside ``loc``.

    Args:
        loc: path of the folder to scan.

    Returns:
        A list of ``loc + "/" + name`` strings, one per sub-directory,
        sorted by name so repeated runs visit the pages in the same order.

    Plain files are skipped: callers append "/page.html" and "/seed" to every
    returned path, which would fail for a stray file sitting in ``loc``.
    """
    subDirectories = []
    for name in sorted(os.listdir(loc)):
        candidate = loc + "/" + name
        if os.path.isdir(candidate):
            subDirectories.append(candidate)
    return subDirectories



def getFirstLineInPage(fileLocation):
    """Return the first line of a file with surrounding whitespace removed.

    Args:
        fileLocation: path of the file to read.

    Returns:
        The stripped first line (an empty string if that line is blank).

    Raises:
        IndexError: if the file is completely empty.
    """
    with open(fileLocation) as f:
        # Only the first line is needed, so do not load the whole file.
        firstLine = f.readline()
    if firstLine == "":
        # readline() yields "" only at end-of-file, i.e. the file has no lines.
        raise IndexError("no lines in file: " + str(fileLocation))
    return firstLine.strip()






def getPageLocationAndSeed(d):
    """Pair each page folder's HTML file path with the seed stored beside it.

    Args:
        d: iterable of page folder paths.

    Returns:
        A list of ``(pageLocation, seed)`` tuples where ``pageLocation`` is
        ``<folder>/page.html`` and ``seed`` is the first line of
        ``<folder>/seed``.
    """
    return [
        (pageDir + "/page.html", getFirstLineInPage(pageDir + "/seed"))
        for pageDir in d
    ]
    
        
# Locate every occurrence (overlapping ones included) of key in document.
def getAllStartEndPairs(document, key):
    """Return ``(start, end)`` index pairs for each position where key occurs.

    ``end`` is exclusive, so ``document[start:end] == key`` for every pair.
    """
    span = len(key)
    pairs = []
    for start in range(len(document)):
        if document.startswith(key, start):
            pairs.append((start, start + span))
    return pairs
 
    
# Seed lookup for a single seed; for now this only delegates to getAllStartEndPairs().
def getEachSeedLocationsInPage(htmlPageContent, seed):
    """Return the ``(start, end)`` spans at which ``seed`` occurs in the page text."""
    seedSpans = getAllStartEndPairs(htmlPageContent, seed)
    return seedSpans

        



    #read the page from pageLocation
def readPage(pageLocation):
    """Return the full text of the file at ``pageLocation``, stripped of
    leading and trailing whitespace."""
    with open(pageLocation, 'r') as pageFile:
        rawText = pageFile.read()
    return rawText.strip()



# Document normalisation: at this stage it only collapses whitespace runs.
def preprocessDocument(document):
    """Return ``document`` with every run of whitespace replaced by a single
    space and no leading or trailing whitespace."""
    tokens = document.split()
    return ' '.join(tokens)


def getCommonPrefix(s1, s2):
    """Return the longest string that both ``s1`` and ``s2`` start with."""
    return os.path.commonprefix([s1, s2])



def insertCommonPrefix(results, commonPrefix):
    """Merge ``commonPrefix`` into ``results``, keeping only the longest strings.

    ``results`` is modified in place and also returned.

    * An empty prefix, or one already present, leaves ``results`` untouched.
    * Members that are strictly shorter prefixes of ``commonPrefix`` are removed.
    * ``commonPrefix`` is added unless a strictly longer member already starts
      with it.
    """
    if not commonPrefix or commonPrefix in results:
        return results

    prefixLength = len(commonPrefix)

    # Collect first, then remove, so the set is not mutated while iterating.
    superseded = [
        member for member in results
        if len(member) < prefixLength and commonPrefix.startswith(member)
    ]
    for member in superseded:
        results.remove(member)

    coveredByLonger = any(
        len(member) > prefixLength and member.startswith(commonPrefix)
        for member in results
    )
    if not coveredByLonger:
        results.add(commonPrefix)
    return results

def doPrefixIntersection(list1, list2):
    """For each string in ``list1`` find the longest prefix it shares with any
    string in ``list2`` and merge those prefixes into one set.

    Returns:
        The set built by feeding each best prefix through insertCommonPrefix().
    """
    merged = set()
    for candidate in list1:
        # The leading "" is the fallback when list2 is empty or nothing matches;
        # max() keeps the first entry among equally long ones.
        sharedPrefixes = [""] + [getCommonPrefix(candidate, other) for other in list2]
        longest = max(sharedPrefixes, key=len)
        merged = insertCommonPrefix(merged, longest)
    return merged


def doIntersection(patterns):
    """Fold doPrefixIntersection() over a list of string collections.

    Returns ``patterns`` itself when it is empty, the first collection
    unchanged when it is the only one, and otherwise the result of
    intersecting the collections from left to right.
    """
    if not len(patterns) > 0:
        return patterns
    merged = patterns[0]
    for nextGroup in patterns[1:]:
        merged = doPrefixIntersection(merged, nextGroup)
    return merged



def getLeftPatterns(leftContexts):
    """Intersect the left contexts and return each resulting string reversed.

    The contexts passed to doIntersection() are used as given; every string it
    returns is flipped with ``[::-1]`` before being put in the result list.
    """
    return [shared[::-1] for shared in doIntersection(leftContexts)]


def getRightPatterns(rightContexts):
    """Intersect the right contexts and return the outcome as a list."""
    shared = doIntersection(rightContexts)
    return list(shared)
    
    
    
    
    
def allPossiblePairs(leftPatterns, rightPatterns):
    """Return every ``(left, right)`` combination, left patterns varying slowest."""
    return [(left, right) for left in leftPatterns for right in rightPatterns]
            
            
            
            
            
#Pattern is (l, r) and match them to htmlPageContent
def findEntitySetwrtPattern(htmlPageContent, pattern):
    """Extract the strings enclosed by a (left, right) delimiter pair.

    Args:
        htmlPageContent: text to search.
        pattern: two-item ``(l, r)`` sequence; ``l`` is the literal text that
            must precede an element and ``r`` the literal text that follows it.

    Returns:
        A set of the extracted elements. Elements of length 0 or 1, or of
        length 300 and above, are discarded.
    """
    # Unpack inside the body: tuple parameters in the signature are a syntax
    # error on Python 3, while callers still pass the pair positionally.
    (l, r) = pattern
    results = set()
    #for each occurrence of the left delimiter, look for the nearest right
    #delimiter after it and keep everything in between
    for m in re.finditer(re.escape(l), htmlPageContent):
        end = m.end()
        # Search from `end` directly instead of copying the rest of the page.
        rightLoc = htmlPageContent.find(r, end)
        if rightLoc == -1:
            # No right delimiter after this point, so none after later matches.
            break
        element = htmlPageContent[end:rightLoc]
        if len(element) > 1 and len(element) < 300:
            results.add(element)
    return results



# Input:  the patterns found so far and the text of an HTML page.
# Output: every element those patterns pull out of the page.
def extractSet(patterns, htmlPageContent):
    """Return a flat list of the elements each pattern extracts from the page.

    Results are concatenated pattern by pattern, so an element matched by
    several patterns appears more than once.
    """
    return [
        element
        for pattern in patterns
        for element in findEntitySetwrtPattern(htmlPageContent, pattern)
    ]


def filterPatterns(pageLocationAndSeeds, patterns):
    """Keep only the patterns that recover a page's own seed.

    Args:
        pageLocationAndSeeds: list of ``(pageLocation, seed)`` tuples.
        patterns: list of ``(left, right)`` delimiter pairs.

    Returns:
        The patterns, in their original order, for which extractSet() on at
        least one page yields that page's seed.
    """
    if not patterns:
        # Nothing to validate, so do not touch the page files at all.
        return []

    # Read and normalise every page once instead of once per pattern.
    pagesWithSeed = [
        (preprocessDocument(readPage(pageLocation)), seed)
        for (pageLocation, seed) in pageLocationAndSeeds
    ]

    output = []
    for pattern in patterns:
        for (pageContent, seed) in pagesWithSeed:
            if seed in extractSet([pattern], pageContent):
                # One successful page is enough to keep the pattern.
                output.append(pattern)
                break
    return output



def getLeftAndRightContexts(pageContent, seed, contextLength=100):
    """Collect the text surrounding every occurrence of ``seed`` in a page.

    Args:
        pageContent: text of the page.
        seed: string to look for.
        contextLength: maximum number of characters taken on each side of an
            occurrence (defaults to 100).

    Returns:
        A ``(leftContexts, rightContexts)`` tuple of equally long lists. Each
        left context is stored reversed, so it reads outward from the seed;
        each right context is stored as-is.
    """
    leftContexts = []
    rightContexts = []
    for (s, e) in getEachSeedLocationsInPage(pageContent, seed):
        # max() stops the start index from going negative, which would
        # otherwise count from the end of the page.
        leftContext = pageContent[max(0, s - contextLength):s]
        leftContexts.append(leftContext[::-1])
        # Slicing already stops at the end of the page.
        rightContexts.append(pageContent[e:e + contextLength])
    return (leftContexts, rightContexts)
    

    
    
def printContextInformation(contexts, pageAndSeed, rev=False):
    """Print the contexts gathered for each ``(pageLocation, seed)`` pair.

    Args:
        contexts: list whose i-th entry holds the context strings of the i-th
            entry of ``pageAndSeed``.
        pageAndSeed: list of ``(pageLocation, seed)`` tuples.
        rev: when true, each context is reversed before it is printed.

    Only the arguments are used; the page files themselves are not read.
    """
    for index, (pageLocation, seed) in enumerate(pageAndSeed):
        contextPerPage = contexts[index]
        print("For seed " + str(seed))
        print("Context is following: " + pageLocation + " number " + str(contextPerPage))
        for item in contextPerPage:
            if rev:
                item = item[::-1]
            print(item)
            print("---------------------")


            
# Given a directory name, return the paths of the files it directly contains.
def getAllFilesInDirectory(directory):
    """Return ``join(directory, name)`` for every entry that is a regular file.

    Sub-directories are left out; the order follows ``listdir``.
    """
    filePaths = []
    for entry in listdir(directory):
        candidate = join(directory, entry)
        if isfile(candidate):
            filePaths.append(candidate)
    return filePaths

            
def writeListToFile(loc, l):
    """Write every item of ``l`` to the file at ``loc``, one item per line.

    Args:
        loc: path of the file to create or overwrite.
        l: iterable of items; each is written via ``%s`` formatting followed
            by a newline.
    """
    # The context manager guarantees the file is flushed and closed, and
    # write() behaves the same on Python 2 and 3 (unlike ``print >>file``).
    with open(loc, 'w') as thefile:
        for item in l:
            thefile.write("%s\n" % (item,))
        
        

# Used to drop extracted elements that still contain tag markup.
def isTag(s):
    """Return True when ``s`` contains both a "<" and a ">" character."""
    hasOpening = s.find("<") != -1
    hasClosing = s.find(">") != -1
    return hasOpening and hasClosing


# Clean up the extracted results before they are written to a file.
def preprocessResults(output):
    """Return the items of ``output``, in order, for which isTag() is false."""
    return [candidate for candidate in output if not isTag(candidate)]
        
def getAllPatterns(corpusLocation):
    """Derive (left, right) extraction patterns from the pages under ``corpusLocation``.

    Steps:
      1. pair every page folder's page.html with its seed;
      2. gather the text to the left and right of each seed occurrence;
      3. intersect those contexts into left and right patterns;
      4. combine them into all (left, right) pairs and keep only the pairs
         accepted by filterPatterns().

    Intermediate results are printed along the way.

    Returns:
        The list of patterns returned by filterPatterns().
    """
    pageLocationsAndSeed = getPageLocationAndSeed(
        getAllDirectoriesInLocation(corpusLocation)
    )

    # One (leftContexts, rightContexts) tuple per page, in page order.
    contextsPerPage = [
        getLeftAndRightContexts(preprocessDocument(readPage(pageLocation)), seed)
        for (pageLocation, seed) in pageLocationsAndSeed
    ]
    leftContexts = [pagePair[0] for pagePair in contextsPerPage]
    rightContexts = [pagePair[1] for pagePair in contextsPerPage]

    # Left contexts are stored reversed, so ask for them to be flipped back.
    printContextInformation(leftContexts, pageLocationsAndSeed, True)
    printContextInformation(rightContexts, pageLocationsAndSeed)

    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    print("Left patterns are " + str(leftPatterns))
    print("Right patterns are " + str(rightPatterns))

    candidatePatterns = allPossiblePairs(leftPatterns, rightPatterns)
    print("All possible patterns: ")
    print(candidatePatterns)

    #TODO if number of patterns are empty go for empty class thing
    acceptedPatterns = filterPatterns(pageLocationsAndSeed, candidatePatterns)
    print("Final Patterns ")
    print(acceptedPatterns)
    return acceptedPatterns

# Learn the patterns from the seeded corpus, apply them to every file of the
# unseen corpus and write the distinct extracted elements to "unseenSeeds".
patterns = getAllPatterns(corpusDirectory)
unseenCorpusLocation = "unseenCorpus"
unseenCorpusFiles     = getAllFilesInDirectory(unseenCorpusLocation)
uniqueUnseenResults = set()
for productPage in unseenCorpusFiles:
    pageContent = preprocessDocument(readPage(productPage))
    resultsPerPage = preprocessResults(extractSet(patterns, pageContent))
    uniqueUnseenResults.update(resultsPerPage)
# Sort so the written file is identical from run to run; plain set iteration
# order is not stable.
unseenResults = sorted(uniqueUnseenResults)
print("Total number of unseen results: " + str(len(unseenResults)))
writeListToFile("unseenSeeds", unseenResults)


For seed Swipe Elite Pro (32GB, 3GB RAM)
Context is following: corpus/page1/page.html number ['>eltit< >"on=enohpelet"=tnetnoc "noitceted-tamrof"=eman atem< >"8c187044f3c02b8b"=tnetnoc "yek_y"=em', ' yuB"=tnetnoc "noitpircsed"=eman atem< >"pma/336602612166/bg23-orp-etile-epiws/tcudorp/moc.laedpans.', ' rof enilno pohS .aidnI ni secirp tseb ta enilno senohP eliboM )MAR BG3 ,BG23( orP etilE epiwS yuB"=', ' ,senohP eliboM ,stelbaT ;pma& seliboM"=tnetnoc "sdrowyek"=eman atem< >" .aidnI ssorca snoitpo DoC ;', '"=tnetnoc "eltit:go"=ytreporp "eltit_go"=eman atem< >""=tnetnoc "noitpircsed:go"=ytreporp atem< >"me', '"=tnetnoc "eltit:rettiwt"=eman atem< >"laedpans@"=tnetnoc "etis:rettiwt"=eman atem< >"egami_egral_yr', ' yuB"=tnetnoc "noitpircsed:rettiwt"=eman atem< >" aidnI laedpanS | secirP woL ta enilnO senohP elibo', ' rof enilno pohS .aidnI ni secirp tseb ta enilno senohP eliboM )MAR BG3 ,BG23( orP etilE epiwS yuB"=', '"=eulav "neddih"=epyt tupni<>vid/< >vid/<>vid/< >vid/< >lu/< >"] ,};93#&s

Final Patterns 
[('"> </div> <div class="tile-desc"> <p class="product-title">', '</p> <p class="product-offer-price"> Rs. '), ('" class="pdp-e-i-head"> ', '</h1> </div> <div class="col-xs-2"> <div class="shortlist-icon-wrpr comp-animated-icon "> <span clas'), ('nk itemprop="availability" href="http://schema.org/InStock"> <div itemprop="name" class="disp-none">', '</div> <div id="buyPriceBox" class="elecPriceTile buyNowBlock row"> <div class="col-xs-21 reset-padd')]
Total number of unseen results: 108


In [None]:
def getHtmlStart():
    """Return the opening markup of the report: an empty head and an open body."""
    return "<html><head></head><body>"
def getHtmlEnd():
    """Return the closing markup that matches getHtmlStart().

    getHtmlStart() leaves ``<body>`` and ``<html>`` open (its ``<head>`` is
    already closed), so those two are the tags closed here.
    """
    footer = "</body></html>"
    return footer
def getHtmlSection(title, items):
    """Build the HTML for one titled bullet list.

    Args:
        title: text placed in the ``<h4>`` heading.
        items: iterable of strings, one per ``<li>`` entry.

    Returns:
        The section markup as a single string.

    NOTE(review): title and items are inserted without HTML escaping, and the
    ``</br>`` / ``</hr>`` markers are kept exactly as originally written;
    confirm both are intended.
    """
    listItems = "".join("<li>" + item + "</li>" + "</br>" for item in items)
    output = "<h4>" + title + "</h4></br><ul>" + listItems
    output += "</ul></br></hr>"
    return output
    
    
