In [7]:
import os
import re
from os import listdir
from os.path import isfile, join
#corpus directory for flipkart
# productNameData/flipkart/corpus
print("Enter the corpus directory out of which we'll learn patterns: ")
# corpusDirectory = raw_input()
corpusDirectory = "productNameData/amazon/corpus"
print("Corpus directory is " + corpusDirectory)
directories = getAllDirectoriesInLocation(corpusDirectory)
(trainDirs, testDirs) = trainTestSplit(directories)
print("Train Files are " + str(trainDirs))
print("Test files are " + str(testDirs))

Enter the corpus directory out of which we'll learn patterns: 
Corpus directory is productNameData/amazon/corpus
Train Files are ['productNameData/amazon/corpus/page1', 'productNameData/amazon/corpus/page6', 'productNameData/amazon/corpus/page7', 'productNameData/amazon/corpus/page5', 'productNameData/amazon/corpus/page8', 'productNameData/amazon/corpus/page3']
Test files are ['productNameData/amazon/corpus/page9', 'productNameData/amazon/corpus/page10', 'productNameData/amazon/corpus/page2', 'productNameData/amazon/corpus/page4']


In [1]:
import os
import re
from os import listdir
from os.path import isfile, join
#do train/test split
def trainTestSplit(files):
    sixtyPercent = int(len(files)*(0.6))
    if sixtyPercent<4:
        print("We won't be able to report accuracy of patterns")
        return (files, [])
    else:
        return (files[:sixtyPercent], files[sixtyPercent:])

def getAllDirectoriesInLocation(loc):
    listOfSubDir = [loc+"/"+f for f in os.listdir(loc)]
    return listOfSubDir


def removeMultipleWhiteSpaces(s):
    return ' '.join(s.split())


def getProductNameSeed(fileLocation):
    with open(fileLocation) as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    return removeMultipleWhiteSpaces(content[0])






def getPageLocationAndSeed(d):
    output = []
    for pageDir in d:
        pageLocation = pageDir + "/page.html"
        seedLocation = pageDir + "/seed"
        seed = getProductNameSeed(seedLocation)
        output.append((pageLocation, seed))
    return output


#read the page from pageLocation
def readPage(pageLocation):
    htmlPageContent = ""
    with open(pageLocation, 'r') as myfile:
        htmlPageContent = myfile.read().strip()
    return htmlPageContent

def removeTerminatedAnd(s):
    return re.sub("&amp;", "&", s)



#document processing logic at this stage is about removing multiple whitespaces into single one 
def preprocessDocument(document):
    return removeTerminatedAnd(' '.join(document.split()))




def getLeftAndRightContexts(pageContent, seed):
    b = getEachSeedLocationsInPage(pageContent, seed)
    output = []
    totalLength = len(pageContent)
    leftContexts = []
    rightContexts = []
    for (s, e) in b:
        leftContext = pageContent[max(0, s-100):s]
        leftContext = leftContext[::-1]
        rightContext = pageContent[e:min(totalLength, e+100)]
        leftContexts.append(leftContext)
        rightContexts.append(rightContext)
    return (leftContexts, rightContexts)
    
    
#find all start, end pairs of particular key
def getAllStartEndPairs(document, key):
    keyLength = len(key)
    return [(i, i+keyLength) for i in range(len(document)) if document.startswith(key, i)]
 
    
#find seeds for one particular seed(at this stage call only getAllStartEndPairs())
def getEachSeedLocationsInPage(htmlPageContent, seed):
    return getAllStartEndPairs(htmlPageContent, seed)

def printContextInformation(contexts, pageAndSeed, rev=False):
    
    for index in range(0, len(pageAndSeed)):
        (pageLocation, seed) = pageAndSeed[index]
        pageContent = preprocessDocument(readPage(pageLocation))
        contextPerPage = contexts[index]
        print("For seed " + str(seed))
        print("Context is following: " + pageLocation )#+ " number " + str(contextPerPage))
        for item in contextPerPage:
            if rev==True:
                item = item[::-1]
            print(item)
            print("---------------------")

def getCommonPrefix(s1, s2):
    l = [s1, s2]
    return os.path.commonprefix(l)



def insertCommonPrefix(results, commonPrefix):
    if commonPrefix in results or len(commonPrefix)<=0:
        return results
    r = list(results)
    for item in r:
        if len(item)<len(commonPrefix) and commonPrefix.startswith(item):
            results.remove(item)
    r=list(results)
    for item in r:
        if len(item)>len(commonPrefix) and item.startswith(commonPrefix):
            return results
    results.add(commonPrefix)
    return results

def doPrefixIntersection(list1, list2):
    results = set()
    for item1 in list1:
        commonPrefix = ""
        for item2 in list2:
            c = getCommonPrefix(item1, item2)
            if len(c)>len(commonPrefix):
                commonPrefix = c
        results = insertCommonPrefix(results, commonPrefix)
    return results


def doIntersection(patterns):
    if len(patterns)<=0:
        return patterns
    result = patterns[0]
    for index in range(1, len(patterns)):
        result = doPrefixIntersection(result, patterns[index])
    return result


def getLeftPatterns(leftContexts):
    lp = doIntersection(leftContexts)
    results = []
    for item in lp:
        results.append(item[::-1])
    return results


def getRightPatterns(rightContexts):
    return list(doIntersection(rightContexts))


def allPossiblePairs(leftPatterns, rightPatterns):
    patterns = []
    for lp in leftPatterns:
        for rp in rightPatterns:
            patterns.append((lp, rp))
    return patterns

#Pattern is (l, r) and match them to htmlPageContent
def findEntitySetwrtPattern(htmlPageContent, (l, r)):
    #for each start location of pattern find its end
    #for each end page find the pattern right
    #extract everything till that point
    #after extraction move one point above that pattern string
    results = []
    for m in re.finditer(re.escape(l), htmlPageContent):
        start = m.start()
        end = m.end()
        rightPage = htmlPageContent[end:]
        rightLoc  = rightPage.find(r)
        if rightLoc==-1:
            break
        element = rightPage[:rightLoc]
        if len(element)>1 and len(element)<300:
            results.append(element)
    return set(results)


def extractSet(patterns, htmlPageContent):
    output = []
    for pattern in patterns:
        output.extend(findEntitySetwrtPattern(htmlPageContent, pattern))
    return set(output)


#remove elements which were actually tag
def isTag(s):
    if s.find("<")!=-1 and s.find(">")!=-1:
        return True
    return False


#preprocess results before writing it to file
def preprocessResults(output):
    result = []
    for o in output:
        if isTag(o):
            continue
        result.append(o)
    return result

def filterPatterns(pageLocationAndSeeds, patterns, applyClass=False, regExpIn="", regExpOut=""):
    seedsSet = set()
    for (pageLoc, s) in pageLocationAndSeeds:
        seedsSet.add(s)
    seeds = list(seedsSet)
    patternSuccess = []
    for index in range(0, len(patterns)):
        patternSuccess.append(0)
    for index in range(0, len(patterns)):
        for (pageLocation, seed) in pageLocationAndSeeds:
            pageContent = preprocessDocument(readPage(pageLocation))
            if applyClass == True:
                pageContent = doProcessingWithoutClass(pageLocation, regExpIn, regExpOut)
            resultsFound = extractSet([patterns[index]], pageContent)
            if seed in resultsFound:
                patternSuccess[index]+=1
                
    output = []
    for index in range(0, len(patterns)):
        if patternSuccess[index]>0:
            output.append(patterns[index])
    return output


# def filterPatterns(pageLocationAndSeeds, patterns, applyClass=False, regExpIn="", regExpOut=""):
#     seedsSet = set()
#     for (pageLoc, seed) in pageLocationAndSeeds:
#         for s in seed:
#             seedsSet.add(s)
#     seeds = list(seedsSet)
#     patternSuccess = []
#     for index in range(0, len(patterns)):
#         patternSuccess.append(0)
#     for index in range(0, len(patterns)):
#         for (pageLocation, seed) in pageLocationAndSeeds:
#             pageContent = preprocessDocument(readPage(pageLocation))
#             if applyClass == True:
#                 pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)), regExpIn, regExpOut)
#             resultsFound = extractSet([patterns[index]], pageContent)
#             flag = True
#             for s in seed:
#                 if not s in resultsFound:
#                     flag=False
#             if flag==True:
#                 patternSuccess[index]+=1
                
#     output = []
#     for index in range(0, len(patterns)):
#         if patternSuccess[index]>0:
#             output.append(patterns[index])
#     return output


def doProcessingWithoutClass(s, regExpIn, regExpOut):
    allManaged =  re.sub(regExpIn, regExpOut, s)
    removeQuoteIn = r'"[^"]*"'
    removeQuoteOut = r'""'
    return re.sub(removeQuoteIn, removeQuoteOut, allManaged)
#     i = "class[\s]*=[\s]*"
#     o       = "class="
#     s = re.sub(i, o, s)
#     i = "([^class])=\"[^\"]*\""
#     o = "\\1=\"\""
#     return re.sub(i, o, s)



def getLastWordWithoutEqualSign(leftContext):
    words = leftContext.strip().split(" ")
    totalWords = len(words)
    if len(words)<2:
        return ""
    else:
        if words[totalWords-1]=="=":
            return words[totalWords-2]
        else:
            return ""


def getSpecialWords(document, seedsSet):
    totalLength = len(document)
    specialWords = []
    for s in seedsSet:
        allSeedPositions = getAllStartEndPairs(document, s)
        for (start, end) in allSeedPositions:
            prevLoc = start-1
            nextLoc = end
            if prevLoc>=0 and nextLoc<totalLength:
                prevQuot = document[prevLoc]
                nextQuot  = document[nextLoc]
#                 print(prevQuot + " " + nextQuot)
                if prevQuot=="\"" and nextQuot=="\"":
                    leftContext = document[max(0, prevLoc-100):prevLoc]
                    lastWord = getLastWordWithoutEqualSign(leftContext)
                    if len(lastWord)>0:
                        specialWords.append(lastWord)
    return specialWords



def createRegExpFromWordStr(wordsStr):
#     wordsStr = getWordsStr(words)
#     print("wordsstr is " + wordsStr)
    return ("(" + wordsStr +")[\s]*=[\s]*\"([^\"]*)\"", "\\1=\\2")



def getWordsStr(words):
    out = words[0]
    if len(words)==1:
        return out
    else:
        for index in range(1, len(words)):
            out+="|"+words[index]
    print(out)
    return out


def getAllSpecialWordsAsString(pageLocationAndSeed):
    specialWords = []
    specialWords.append("class")
    for (pageLocation, seeds) in pageLocationAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))
        specialWords.extend(getSpecialWords(pageContent, seeds))
    specialWords = list(set(specialWords))
    return getWordsStr(specialWords)












def getAllPatternsByPlainStringMatch(corpusLocation):
    directories             = getAllDirectoriesInLocation(corpusLocation)
    (trainDirs, testDirs)   = trainTestSplit(directories)
    pageLocationsAndSeed    = getPageLocationAndSeed(trainDirs)
    testPageLocationAndSeed = getPageLocationAndSeed(testDirs)
    leftContexts  = []
    rightContexts = []
    for (pageLocation, seed) in pageLocationsAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))
        (leftContextsPerPage, rightContextsPerPage) = getLeftAndRightContexts(pageContent, seed)
        leftContexts.append(leftContextsPerPage)
        rightContexts.append(rightContextsPerPage)
#     printContextInformation(leftContexts, pageLocationsAndSeed, True)
#     printContextInformation(rightContexts, pageLocationsAndSeed)
    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    
#     print("Left patterns are " + str(leftPatterns))
#     print("Right patterns are " + str(rightPatterns))
    patterns = allPossiblePairs(leftPatterns, rightPatterns)
#     print("All possible patterns: ")
#     print(patterns)
    patterns = filterPatterns(pageLocationsAndSeed, patterns)
    #TODO if number of patterns are empty go for empty class thing
#     print("Final Patterns ")
#     print(patterns)
    recall = 0
    extraResults = 0
    for (pageLocation, seed) in testPageLocationAndSeed:
        pageContent = preprocessDocument(readPage(pageLocation))
        resultsPerPage = preprocessResults(list(extractSet(patterns, pageContent)))
#         resultsPerPage = list(set(resultsPerPage))
        if seed in resultsPerPage:
            recall += 1
        if len(resultsPerPage)>1:
            extraResults += len(resultsPerPage)-1
#         print("Results per page " + str(resultsPerPage))
    seedsMissed = len(testPageLocationAndSeed)-recall
#     print("Seeds missed: " + str(seedsMissed))
#     print("Extra junk: "+ str(extraResults))
    return (patterns, seedsMissed, extraResults, len(testDirs))
    
# getAllPatternsByPlainStringMatch(corpusDirectory)

def getAllPatternsByRetainingOnlyClassValue(corpusLocation):
    directories             = getAllDirectoriesInLocation(corpusLocation)
    (trainDirs, testDirs)   = trainTestSplit(directories)
    pageLocationsAndSeed    = getPageLocationAndSeed(trainDirs)
    testPageLocationAndSeed = getPageLocationAndSeed(testDirs)
    specialWordsString      = getAllSpecialWordsAsString(pageLocationsAndSeed)
    (regExpIn, regExpOut)   = createRegExpFromWordStr(specialWordsString)
#     print("Regular expression is ")
#     print(regExpIn)
    leftContexts  = []
    rightContexts = []
    for (pageLocation, seed) in pageLocationsAndSeed:
        pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)), regExpIn, regExpOut)
        (leftContextsPerPage, rightContextsPerPage) = getLeftAndRightContexts(pageContent, seed)
        leftContexts.append(leftContextsPerPage)
        rightContexts.append(rightContextsPerPage)
#     printContextInformation(leftContexts, pageLocationsAndSeed, True)
#     printContextInformation(rightContexts, pageLocationsAndSeed)
#     print("Printed context infromation")
    leftPatterns = getLeftPatterns(leftContexts)
    rightPatterns = getRightPatterns(rightContexts)
    
#     print("Left patterns are " + str(leftPatterns))
#     print("Right patterns are " + str(rightPatterns))
    patterns = allPossiblePairs(leftPatterns, rightPatterns)
#     print("All possible patterns: ")
#     print(patterns)
#     patterns = filterPatterns(pageLocationsAndSeed, patterns, True, regExpIn, regExpOut)
    #TODO if number of patterns are empty go for empty class thing
#     print("Final Patterns ")
#     print(patterns)
    recall = 0
    extraResults = 0
    for (pageLocation, seed) in testPageLocationAndSeed:
        pageContent = doProcessingWithoutClass(preprocessDocument(readPage(pageLocation)), regExpIn, regExpOut)
        resultsPerPage = preprocessResults(list(extractSet(patterns, pageContent)))
        if seed in resultsPerPage:
            recall += 1
#         else:
#             print("Page location: " + pageLocation)
#             print("Seed is " + seed)
#             print(resultsPerPage)
        if len(resultsPerPage)>1:
            extraResults += len(resultsPerPage)-1
#     print("number of pages" + str(len(testPageLocationAndSeed)))
    seedsMissed = len(testPageLocationAndSeed)-recall
#     print("Seeds missed: " + str(seedsMissed))
#     print("Extra junk: "+ str(extraResults))
    return (patterns, seedsMissed, extraResults, len(testDirs), specialWordsString)
    
# getAllPatternsByRetainingOnlyClassValue(corpusDirectory)

def writeListToFile(loc, l):
    with open(loc, 'w') as f:
        for item in l:
            f.write(item+"\n")

            
def escapeAllDoubleQuote(s):
    return re.sub("\"", "\\\"", s)
            
def writeProductPatternsToFile(fileLocation, productPatterns):
    header = "GroupId\tWebsiteName\tPatternType\tMissed\tJunk\tOutof\tLeftPattern\tRightPattern"
    groupId = 1
    output = []
    output.append(header)
    for (website, pType, patterns, missed, junk, total) in productPatterns:
        rowPrefix = str(groupId) + "\t" + website + "\t" + pType
        rowPrefix+="\t" + str(missed) + "\t" + str(junk) + "\t" + str(total)
        for (l, r) in patterns:
            row = rowPrefix + "\t" + l + "\t" + r
            output.append(row)
        groupId+=1
#     print("output is ")
#     for item in output:
#         print(item)
    writeListToFile(fileLocation, output)
    print("Output written at location: " + fileLocation)
    
    
def getPatternStore(eCommerceDataSetLocation):
    allWebsites = getAllDirectoriesInLocation(eCommerceDataSetLocation)
#     print("All websites are " + str(allWebsites))
    patternsStore = []
    for website in allWebsites:
        corpusLocation = website + "/corpus"
        websiteName = os.path.basename(website)
        (plainPatterns, plainMissed, plainJunkCount, plainTotal) = getAllPatternsByPlainStringMatch(corpusLocation)
        (noValPatterns, noValMissed, noValJunkCount, noValTotal, words) = getAllPatternsByRetainingOnlyClassValue(corpusLocation)
        print(websiteName)
        print(plainMissed)
        print(noValMissed)
        if plainMissed <= noValMissed:
            patterns = (websiteName, "plainPattern", plainPatterns, plainMissed, plainJunkCount, plainTotal)
            print(plainPatterns)
        else:
            patterns = (websiteName, "noValuePattern_"+words, noValPatterns, noValMissed, noValJunkCount, noValTotal)
        patternsStore.append(patterns)
    for p in patternsStore:
        print(p)
    return patternsStore



                  

In [2]:
print("Enter e-commerce data location: ")
eCommerceDataLocation = raw_input()
print("Enter outputLocation")
outputLocation = raw_input()
outputLocation+="/ProductPatterns.tsv"
print("Patterns will be found at " + str(outputLocation))
patternsStore = getPatternStore(eCommerceDataLocation)
writeProductPatternsToFile(outputLocation, patternsStore)
#productNameData

Enter e-commerce data location: 
productNameData
Enter outputLocation
newPatternsLearnt
Patterns will be found at newPatternsLearnt/ProductPatterns.tsv
flipkart
0
0
[('">', '</div><svg width="16" height="27" viewBox="0 0 16 27" xmlns="http://www.w3.org/2000/svg" class="_2XP')]
pubId|class|minValue
snapdeal
0
0
[('tyle: outside none none; position: absolute; width: 553px; z-index: 0; display: none;"> <img title="', '" slidenum="1" class="cloudzoom" bigsrc="https://n'), ('tyle: outside none none; position: absolute; width: 553px; z-index: 0; display: none;"> <img title="', '" slidenum="2" class="cloudzoom" bigsrc="https://n'), ('" class="tileImg lazy-load" title="', '" alt="'), ('emap\', \'k4\': \'See All Categories\'}, ]" type="hidden"> </ul> </div> </div></div> </div><input value="', '" id="productNamePDP" type="hidden"> <input id="superCategoryLabel" name="superCategoryLabel" value='), ('nk itemprop="availability" href="http://schema.org/InStock"> <div itemprop="name" class="disp-none

In [12]:
pageLocationsWithSeed = getPageLocationAndSeed(trainDirs)
print(pageLocationsWithSeed)
testPageLocationWithSeed = getPageLocationAndSeed(testDirs)
print(testPageLocationWithSeed)

[('productNameData/flipkart/corpus/page1/page.html', 'Mi A1 (Black, 64 GB) (4 GB RAM)'), ('productNameData/flipkart/corpus/page6/page.html', 'Apple iPhone SE (Space Grey, 32 GB)'), ('productNameData/flipkart/corpus/page7/page.html', 'Samsung Galaxy On5 (Gold, 8 GB) (1.5 GB RAM)'), ('productNameData/flipkart/corpus/page5/page.html', 'Puma Men Puma Black-High Risk Red Sports Sandals')]


NameError: name 'testPageLocatioWithSeed' is not defined

In [88]:
eCommerceDataSetLocation = "productNameData"
allWebsites = getAllDirectoriesInLocation(eCommerceDataSetLocation)
print("All websites are " + str(allWebsites))
patternsStore = []
for website in allWebsites:
    corpusLocation = website + "/corpus"
    websiteName = os.path.basename(website)
    (plainPatterns, plainMissed, plainJunkCount, plainTotal) = getAllPatternsByPlainStringMatch(corpusLocation)
    (noValPatterns, noValMissed, noValJunkCount, noValTotal) = getAllPatternsByRetainingOnlyClassValue(corpusLocation)
#     print("Patterns are ")
#     print(plainPatterns)
    if plainMissed < noValMissed:
#         print("yes it is")
        patterns = (websiteName, "plainPattern", plainPatterns, plainMissed, plainJunkCount, plainTotal)
    else:
        patterns = (websiteName, "noValuePattern", noValPatterns, noValMissed, noValJunkCount, noValTotal)
    patternsStore.append(patterns)
print("Pattern store is " + str(patternsStore))
        

All websites are ['productNameData/flipkart', 'productNameData/.~lock.ProductPatterns.tsv#', 'productNameData/ProductPatterns.tsv', 'productNameData/amazon']


OSError: [Errno 20] Not a directory: 'productNameData/.~lock.ProductPatterns.tsv#/corpus'

In [162]:
def removeTerminatedAnd(s):
    return re.sub("&amp;", "&", s)
removeTerminatedAnd("sanjeev&amp;kumar")

'sanjeev&kumar'

In [12]:
re.sub("sanjeev", "$", "sanju sanjeev kumar")

'sanju $ kumar'