In [5]:
import os
import re
from os import listdir
from os.path import isfile, join
#corpus directory for flipkart
# productNameData/flipkart/corpus
print("Enter the corpus directory out of which we'll learn patterns: ")
# corpusDirectory = raw_input()
corpusDirectory = "categoryData/flipkart/corpus"
print("Corpus directory is " + corpusDirectory)
directories = getAllDirectoriesInLocation(corpusDirectory)
(trainDirs, testDirs) = trainTestSplit(directories)
print("Train Files are " + str(trainDirs))
print("Test files are " + str(testDirs))

Enter the corpus directory out of which we'll learn patterns: 
Corpus directory is categoryData/flipkart/corpus
Train Files are ['categoryData/flipkart/corpus/page1', 'categoryData/flipkart/corpus/page6', 'categoryData/flipkart/corpus/page7', 'categoryData/flipkart/corpus/page5', 'categoryData/flipkart/corpus/page8', 'categoryData/flipkart/corpus/page3']
Test files are ['categoryData/flipkart/corpus/page9', 'categoryData/flipkart/corpus/page10', 'categoryData/flipkart/corpus/page2', 'categoryData/flipkart/corpus/page4']


In [35]:
import os
import re
from os import listdir
from os.path import isfile, join
#do train/test split
def trainTestSplit(files):
    sixtyPercent = int(len(files)*(0.6))
    if sixtyPercent<4:
        print("We won't be able to report accuracy of patterns")
        return (files, [])
    else:
        return (files[:sixtyPercent], files[sixtyPercent:])

def getAllDirectoriesInLocation(loc):
    listOfSubDir = [loc+"/"+f for f in os.listdir(loc)]
    return listOfSubDir

def removeMultipleWhiteSpaces(s):
    return ' '.join(s.split())

def readAllSeedsAsSet(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [removeMultipleWhiteSpaces(x.strip()) for x in content]
    return set(content)

def getCategorySeeds(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [removeMultipleWhiteSpaces(x.strip()) for x in content] 
#     print("content is " + str(content))
    return content[1:len(content)-1]


def getPageLocationAndSeed(d):
    output = []
    for pageDir in d:
        pageLocation = pageDir + "/page.html"
        seedLocation = pageDir + "/seed"
        seed = getCategorySeeds(seedLocation)
        output.append((pageLocation, seed))
    return output
#read the page from pageLocation
def readPage(pageLocation):
    htmlPageContent = ""
    with open(pageLocation, 'r') as myfile:
        htmlPageContent = myfile.read().strip()
    return htmlPageContent

def removeTerminatedAnd(s):
    return re.sub("&amp;", "&", s)



#document processing logic at this stage is about removing multiple whitespaces into single one 
def preprocessDocument(document):
    return removeTerminatedAnd(' '.join(document.split()))




def getLeftAndRightContexts(pageContent, seed):
    b = getEachSeedLocationsInPage(pageContent, seed)
    output = []
    totalLength = len(pageContent)
    leftContexts = []
    rightContexts = []
    for (s, e) in b:
        leftContext = pageContent[max(0, s-100):s]
        leftContext = leftContext[::-1]
        rightContext = pageContent[e:min(totalLength, e+100)]
        leftContexts.append(leftContext)
        rightContexts.append(rightContext)
    return (leftContexts, rightContexts)
    
    
#find all start, end pairs of particular key
def getAllStartEndPairs(document, key):
    keyLength = len(key)
    return [(i, i+keyLength) for i in range(len(document)) if document.startswith(key, i)]
 
    
#find seeds for one particular seed(at this stage call only getAllStartEndPairs())
def getEachSeedLocationsInPage(htmlPageContent, seed):
    return getAllStartEndPairs(htmlPageContent, seed)

def printContextInformation(contexts, pageAndSeed, rev=False):
    for index in range(0, len(pageAndSeed)):
        print("\n\n\n\n")
        (pageLocation, seed) = pageAndSeed[index]
        pageContent = preprocessDocument(readPage(pageLocation))
        contextPerPage = contexts[index]
        print("For seed " + str(seed))
        print("Context is following: " + pageLocation)# + " number " + str(contextPerPage))
        for item in contextPerPage:
            if rev==True:
                item = item[::-1]
            print(item)
            print("---------------------")

def getCommonPrefix(s1, s2):
    l = [s1, s2]
    return os.path.commonprefix(l)



def insertCommonPrefix(results, commonPrefix):
    if commonPrefix in results or len(commonPrefix)<=0:
        return results
    r = list(results)
    for item in r:
        if len(item)<len(commonPrefix) and commonPrefix.startswith(item):
            results.remove(item)
    r=list(results)
    for item in r:
        if len(item)>len(commonPrefix) and item.startswith(commonPrefix):
            return results
    results.add(commonPrefix)
    return results

def doPrefixIntersection(list1, list2):
    results = set()
    for item1 in list1:
        commonPrefix = ""
        for item2 in list2:
            c = getCommonPrefix(item1, item2)
            if len(c)>len(commonPrefix):
                commonPrefix = c
        results = insertCommonPrefix(results, commonPrefix)
    return results


def doIntersection(patterns):
    if len(patterns)<=0:
        return patterns
    result = patterns[0]
    for index in range(1, len(patterns)):
        result = doPrefixIntersection(result, patterns[index])
    return result


def getLeftPatterns(leftContexts):
    lp = doIntersection(leftContexts)
    results = []
    for item in lp:
        results.append(item[::-1]) 
    return results


def getRightPatterns(rightContexts):
    return list(doIntersection(rightContexts))


def allPossiblePairs(leftPatterns, rightPatterns):
    patterns = []
    for lp in leftPatterns:
        for rp in rightPatterns:
            patterns.append((lp, rp))
    return patterns

#Pattern is (l, r) and match them to htmlPageContent
def findEntitySetwrtPattern(htmlPageContent, (l, r)):
    #for each start location of pattern find its end
    #for each end page find the pattern right
    #extract everything till that point
    #after extraction move one point above that pattern string
    results = []
    for m in re.finditer(re.escape(l), htmlPageContent):
        start = m.start()
        end = m.end()
        rightPage = htmlPageContent[end:]
        rightLoc  = rightPage.find(r)
        if rightLoc==-1:
            break
        element = rightPage[:rightLoc]
        if len(element)>1 and len(element)<500:
            results.append(element)
    return set(results)


def extractSet(patterns, htmlPageContent):
    output = []
    for pattern in patterns:
        output.extend(findEntitySetwrtPattern(htmlPageContent, pattern))
    return set(output)

def filterPatterns(pageLocationAndSeeds, patterns, applyClass=False):
    seedsSet = set()
    for (pageLoc, seed) in pageLocationAndSeeds:
        for s in seed:
            seedsSet.add(s)
    seeds = list(seedsSet)
    patternSuccess = []
    for index in range(0, len(patterns)):
        patternSuccess.append(0)
    for index in range(0, len(patterns)):
        for (pageLocation, seed) in pageLocationAndSeeds:
            pageContent = preprocessDocument(readPage(pageLocation))
            if applyClass == True:
                pageContent = doProcessingWithoutClass(pageContent)
            resultsFound = extractSet([patterns[index]], pageContent)
            flag = True
            for s in seed:
                if not s in resultsFound:
                    flag=False
            if flag==True:
                patternSuccess[index]+=1
                
    output = []
    for index in range(0, len(patterns)):
        if patternSuccess[index]>0:
            output.append(patterns[index])
    return output


#remove elements which were actually tag
def isTag(s):
    if s.find("<")!=-1 and s.find(">")!=-1:
        return True
    return False


#preprocess results before writing it to file
def preprocessResults(output):
    result = []
    for o in output:
        if isTag(o):
            continue
        result.append(o)
    return result



def doProcessingWithoutClass(s, words):
    i = "class[\s]*=[\s]*"
    o       = "class="
    s = re.sub(i, o, s)
    i = "([^class])=\"[^\"]*\""
    o = "\\1=\"\""
    return re.sub(i, o, s)

def getContextsForAllSeeds(pageContent, seed):
    lout = []
    rout = []
    for s in seed:
        (lcPerS, rcPerS) = getLeftAndRightContexts(pageContent, s)
        lout.append(lcPerS)
        rout.append(rcPerS)
    return (lout, rout)

def getAllPatternsByPlainStringMatch(corpusLocation):
    directories             = getAllDirectoriesInLocation(corpusLocation)
    (trainDirs, testDirs)   = trainTestSplit(directories)
#     print(trainDirs)
    pageLocationsAndSeed    = getPageLocationAndSeed(trainDirs)
    testPageLocationAndSeed = getPageLocationAndSeed(testDirs)
    leftContexts  = []
    rightContexts = []
    for (pageLocation, seed) in pageLocationsAndSeed:
#         print("pageLocation is " + pageLocation)
#         print("seed is ")
#         print(seed)
        pageContent = preprocessDocument(readPage(pageLocation))
        (leftContextsPerPage, rightContextsPerPage) = getContextsForAllSeeds(pageContent, seed)
        if [] in leftContextsPerPage:
            print(pageLocation + " has some empty seed")
#             error
#         (leftContextsPerPage, rightContextsPerPage) = getLeftAndRightContexts(pageContent, seed)
        leftContexts.extend(leftContextsPerPage)
        rightContexts.extend(rightContextsPerPage)
#     printContextInformation(leftContexts, pageLocationsAndSeed, True)
#     printContextInformation(rightContexts, pageLocationsAndSeed)
#     print("Total left contexts ")
#     print(leftContexts)
    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
#     print(leftPatterns)
#     print(rightPatterns)
    
#     print("Left patterns are " + str(leftPatterns))
#     print("Right patterns are " + str(rightPatterns))
    patterns = allPossiblePairs(leftPatterns, rightPatterns)
#     print("All possible patterns: ")
#     print(patterns)
    patterns = filterPatterns(pageLocationsAndSeed, patterns)
    #TODO if number of patterns are empty go for empty class thing
#     print("Final Patterns ")
#     print(patterns)
    recall = 0
    extraResults = 0
    for (pageLocation, seed) in testPageLocationAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))
        resultsPerPage = preprocessResults(list(extractSet(patterns, pageContent)))
#         resultsPerPage = list(set(resultsPerPage))
        flag=True
        for s in seed:
            if not s in resultsPerPage:
                flag=False
        if flag==True:
            recall+=1
#         if seed in resultsPerPage:
#             recall += 1
        extraElements = len(resultsPerPage) - len(seed)
        if extraElements>1:
            seedsLocation = os.path.dirname(pageLocation) + "/seed"
            allSeeds = readAllSeedsAsSet(seedsLocation)
            totalCount = len(resultsPerPage)
            allSeedsCount = len(list(allSeeds.intersection(set(resultsPerPage))))
            extracount = totalCount - allSeedsCount
            extraResults += extracount
#         print("Results per page " + str(resultsPerPage))
    seedsMissed = len(testPageLocationAndSeed)-recall
#     print("Seeds missed: " + str(seedsMissed))
#     print("Extra junk: "+ str(extraResults))
    return (patterns, seedsMissed, extraResults, len(testDirs))
    
# getAllPatternsByPlainStringMatch(corpusDirectory)

def getPatternStore(eCommerceDataSetLocation):
    allWebsites = getAllDirectoriesInLocation(eCommerceDataSetLocation)
#     print("All websites are " + str(allWebsites))
    patternsStore = []
    for website in allWebsites:
        corpusLocation = website + "/corpus"
        websiteName = os.path.basename(website)
        (plainPatterns, plainMissed, plainJunkCount, plainTotal) = getAllPatternsByPlainStringMatch(corpusLocation)
        (noValPatterns, noValMissed, noValJunkCount, noValTotal) = getAllPatternsByRetainingOnlyClassValue(corpusLocation)
        if plainMissed < noValMissed:
            patterns = (websiteName, "plainPattern", plainPatterns, plainMissed, plainJunkCount, plainTotal)
#             print(plainPatterns)
        else:
            patterns = (websiteName, "noValuePattern", noValPatterns, noValMissed, noValJunkCount, noValTotal)
        patternsStore.append(patterns)
#     for p in patternsStore:
#         print(p)
    return patternsStore


def getLastWordWithoutEqualSign(leftContext):
    words = leftContext.strip().split(" ")
    totalWords = len(words)
    if len(words)<2:
        return ""
    else:
        if words[totalWords-1]=="=":
            return words[totalWords-2]
        else:
            return ""


def getSpecialWords(document, seedsSet):
    totalLength = len(document)
    specialWords = []
    for s in seedsSet:
        allSeedPositions = getAllStartEndPairs(document, s)
        for (start, end) in allSeedPositions:
            prevLoc = start-1
            nextLoc = end
            if prevLoc>=0 and nextLoc<totalLength:
                prevQuot = document[prevLoc]
                nextQuot  = document[nextLoc]
                print(prevQuot + " " + nextQuot)
                if prevQuot=="\"" and nextQuot=="\"":
                    leftContext = document[max(0, prevLoc-100):prevLoc]
                    lastWord = getLastWordWithoutEqualSign(leftContext)
                    if len(lastWord)>0:
                        specialWords.append(lastWord)
    return specialWords


def getAllSpecialWords(pageLocationAndSeed):
    specialWords = []
    specialWords.append("class")
    for (pageLocation, seeds) in pageLocationAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))
        specialWords.extend(getSpecialWords(pageContent, seeds))
    specialWords = list(set(specialWords))
    return specialWords

def getAllPatternsByRetainingOnlyClassValue(corpusLocation):
    directories             = getAllDirectoriesInLocation(corpusLocation)
    (trainDirs, testDirs)   = trainTestSplit(directories)
    pageLocationsAndSeed    = getPageLocationAndSeed(trainDirs)
    testPageLocationAndSeed = getPageLocationAndSeed(testDirs)
    specialWords = getAllSpecialWords(trainDirs)
#     print(trainDirs)
#     print("\n===========================\n")
#     for (a, b) in testPageLocationAndSeed:
#         print("Page location is " + str(a))
#         print("Seeds are " + str(b))
#     print("\n==========================\n")
    leftContexts  = []
    rightContexts = []
    for (pageLocation, seed) in pageLocationsAndSeed:
#         print("pageLocation is " + pageLocation)
#         print("seed is ")
#         print(seed)
        pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)), specialWords)
        (leftContextsPerPage, rightContextsPerPage) = getContextsForAllSeeds(pageContent, seed)
#         (leftContextsPerPage, rightContextsPerPage) = getLeftAndRightContexts(pageContent, seed)
        leftContexts.extend(leftContextsPerPage)
        rightContexts.extend(rightContextsPerPage)
#     printContextInformation(leftContexts, pageLocationsAndSeed, True)
#     printContextInformation(rightContexts, pageLocationsAndSeed)
    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    
#     print("Left patterns are " + str(leftPatterns))
#     print("Right patterns are " + str(rightPatterns))
    patterns = allPossiblePairs(leftPatterns, rightPatterns)
#     print("All possible patterns: ")
#     print(patterns)
    patterns = filterPatterns(pageLocationsAndSeed, patterns, True)
    #TODO if number of patterns are empty go for empty class thing
#     print("Final Patterns ")
#     print(patterns)
    recall = 0
    extraResults = 0
    for (pageLocation, seed) in testPageLocationAndSeed:
        pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)))
        resultsPerPage = preprocessResults(list(extractSet(patterns, pageContent)))
#         resultsPerPage = list(set(resultsPerPage))
        flag=True
#         print("Seed wassss " + str(seed))
#         print("Results per page: " + str(resultsPerPage))
        for s in seed:
            if not s in resultsPerPage:
#                 print("Page location " + str(pageLocation))
#                 print("\n\nSeed was " + str(s))
                flag=False
        if flag==True:
            recall+=1
#         if seed in resultsPerPage:
#             recall += 1
        extraElements = len(resultsPerPage) - len(seed)
        if extraElements>1:
            seedsLocation = os.path.dirname(pageLocation) + "/seed"
            allSeeds = readAllSeedsAsSet(seedsLocation)
#             print("============================")
#             print("page is " + str(pageLocation))
#             print("All seeds are")
#             print(allSeeds)
#             print("Results per page are: ")
#             print(resultsPerPage)
#             print("============================")
            totalCount = len(resultsPerPage)
            allSeedsCount = len(list(allSeeds.intersection(set(resultsPerPage))))
            extracount = totalCount - allSeedsCount
            extraResults += extracount
#         print("Results per page " + str(resultsPerPage))
    seedsMissed = len(testPageLocationAndSeed)-recall
#     print("Seeds missed: " + str(seedsMissed))
#     print("Extra junk: "+ str(extraResults))
    return (patterns, seedsMissed, extraResults, len(testDirs))
    
# getAllPatternsByRetainingOnlyClassValue(corpusDirectory)


def writeListToFile(loc, l):
    with open(loc, 'w') as f:
        for item in l:
            f.write(item+"\n")

            
def escapeAllDoubleQuote(s):
    return re.sub("\"", "\\\"", s)
            
def writeSpecificationPatternsToFile(fileLocation, productPatterns):
    header = "GroupId\tWebsiteName\tPatternType\tMissed\tJunk\tOutof\tLeftPattern\tRightPattern"
    groupId = 1
    output = []
    output.append(header)
    for (website, pType, patterns, missed, junk, total) in productPatterns:
        rowPrefix = str(groupId) + "\t" + website + "\t" + pType
        rowPrefix+="\t" + str(missed) + "\t" + str(junk) + "\t" + str(total)
        for (l, r) in patterns:
            row = rowPrefix + "\t" + l + "\t" + r
            output.append(row)
        groupId+=1
#     print("output is ")
#     for item in output:
#         print(item)
    writeListToFile(fileLocation, output)
    print("Output written at location: " + fileLocation)

In [43]:
print("Enter e-commerce data location: ")
eCommerceDataLocation = raw_input()
print("Enter outputLocation")
outputLocation = raw_input()
outputLocation+="/categoryPatterns.tsv"
print("Patterns will be found at " + str(outputLocation))
patternsStore = getPatternStore(eCommerceDataLocation)
print("Pattern store is ")
print(patternsStore)
writeSpecificationPatternsToFile(outputLocation, patternsStore)
#categoryData

Enter e-commerce data location: 
categoryData
Enter outputLocation
patternsLearnt
Patterns will be found at patternsLearnt/categoryPatterns.tsv
['categoryData/flipkart/corpus/page1', 'categoryData/flipkart/corpus/page6', 'categoryData/flipkart/corpus/page7', 'categoryData/flipkart/corpus/page5', 'categoryData/flipkart/corpus/page8', 'categoryData/flipkart/corpus/page3']
['categoryData/flipkart/corpus/page1', 'categoryData/flipkart/corpus/page6', 'categoryData/flipkart/corpus/page7', 'categoryData/flipkart/corpus/page5', 'categoryData/flipkart/corpus/page8', 'categoryData/flipkart/corpus/page3']
['categoryData/amazon/corpus/page1', 'categoryData/amazon/corpus/page6', 'categoryData/amazon/corpus/page7', 'categoryData/amazon/corpus/page5', 'categoryData/amazon/corpus/page8', 'categoryData/amazon/corpus/page3']
['categoryData/amazon/corpus/page1', 'categoryData/amazon/corpus/page6', 'categoryData/amazon/corpus/page7', 'categoryData/amazon/corpus/page5', 'categoryData/amazon/corpus/page8', 

In [42]:
corpusDirectory = "corpuss"
print(getAllPatternsByPlainStringMatch(corpusDirectory))
print(getAllPatternsByRetainingOnlyClassValue(corpusDirectory))

['corpuss/page1', 'corpuss/page6', 'corpuss/page7', 'corpuss/page5', 'corpuss/page3']
([('">', '</a><svg width="16" height="27" viewBox="0 0 16 27" xmlns="http://www.w3.org/2000/svg" class="_2XP0B')], 0, 0, 4)
['corpuss/page1', 'corpuss/page6', 'corpuss/page7', 'corpuss/page5', 'corpuss/page3']
([('d="" fill="#fff" class="_24NaUy"></path></svg></div><div class="_1HEvv0"><a class="_1KHd47" href="">', '</a><svg width="" height="" viewBox="" xmlns="http://www.w3.org/2000/svg" class="_2XP0B_"><path d=""')], 1, 0, 4)


In [34]:
corpusDirectory = "corpuss"
print(getAllPatternsByPlainStringMatch(corpusDirectory))
print(getAllPatternsByRetainingOnlyClassValue(corpusDirectory))

([('">', '</')], 0, 681, 4)
([('">', '</')], 0, 681, 4)
