In [99]:
import re
import os
#read the relation seeds from seedsLocation
def readRelationSeeds(seedsLocation):
    """Load (left, right) relation seed pairs from a tab-separated file.

    Every line is split on runs of tab characters. Lines that do not yield
    exactly two fields are ignored; both fields of the remaining lines are
    stripped of surrounding whitespace.

    Returns:
        A list of (left, right) string tuples in file order.
    """
    pairs = []
    with open(seedsLocation) as seedFile:
        for rawLine in seedFile:
            fields = re.split(r'\t+', rawLine)
            if len(fields) == 2:
                left, right = fields
                pairs.append((left.strip(), right.strip()))
    return pairs

#read the page from pageLocation
def readPage(pageLocation):
    """Return the text of the file at pageLocation with leading and trailing whitespace removed."""
    with open(pageLocation, 'r') as pageFile:
        return pageFile.read().strip()

#document preprocessing at this stage collapses every run of whitespace into a single space
def preprocessDocument(document):
    """Collapse each run of whitespace in document to one space and trim both ends."""
    words = document.split()
    return ' '.join(words)


def isRelationPresent(document, seed, kvDiff=200):
    """Tell whether the seed's right value occurs shortly after its left key.

    Args:
        document: text to search.
        seed: (lSeed, rSeed) pair of strings.
        kvDiff: number of characters after the end of an lSeed occurrence
            within which rSeed must be entirely contained.

    Returns:
        True if at least one occurrence of lSeed (overlapping occurrences
        included) is followed by rSeed inside that window, else False.
    """
    # Unpacked here instead of in the signature: tuple parameters were removed
    # in Python 3, and this form works on Python 2 as well.
    lSeed, rSeed = seed
    docLength = len(document)
    start = document.find(lSeed)
    # The "start < docLength" guard only matters for an empty lSeed, which
    # str.find also reports at the very end of the document.
    while start != -1 and start < docLength:
        windowStart = start + len(lSeed)
        if rSeed in document[windowStart:windowStart + kvDiff]:
            return True
        # Advance by one character so overlapping occurrences are still seen.
        start = document.find(lSeed, start + 1)
    return False



def getPresentSeeds(pageContent, seedsSet):
    """Return, in their original order, the seeds for which isRelationPresent holds on pageContent."""
    return [seed for seed in seedsSet if isRelationPresent(pageContent, seed)]
        
    
def getEachSeedLocationsInPage(document, seed, kvDiff=200):
    """Locate every occurrence of a seed pair in document.

    Args:
        document: text to search.
        seed: (lSeed, rSeed) pair of strings.
        kvDiff: number of characters after the end of an lSeed occurrence
            that are searched for rSeed.

    Returns:
        A list of (ls, le, rs, rEnd) tuples, one per lSeed occurrence that is
        followed by rSeed inside the window. ls and le are the absolute start
        and end of lSeed; rs and rEnd are the start and end of the first
        rSeed match measured from le (offsets, not absolute indices).
    """
    # Unpacked here instead of in the signature: tuple parameters were removed
    # in Python 3, and this form works on Python 2 as well.
    lSeed, rSeed = seed
    docLength = len(document)
    positions = []
    ls = document.find(lSeed)
    # The "ls < docLength" guard only matters for an empty lSeed, which
    # str.find also reports at the very end of the document.
    while ls != -1 and ls < docLength:
        le = ls + len(lSeed)
        offset = document[le:le + kvDiff].find(rSeed)
        if offset != -1:
            positions.append((ls, le, offset, offset + len(rSeed)))
        # Advance by one character so overlapping occurrences are still seen.
        ls = document.find(lSeed, ls + 1)
    return positions
    

def getSeedsPositions(pageContent, presentSeeds):
    """Return one list of occurrence tuples per seed, in the order of presentSeeds."""
    return [getEachSeedLocationsInPage(pageContent, seed) for seed in presentSeeds]



def getStringFromTuple(tList, document):
    """Print the (left, right) text for every recorded seed occurrence.

    tList is a list of lists of (ls, le, rs, rEnd) tuples, where rs and rEnd
    are offsets measured from le. One tuple is printed per occurrence;
    nothing is returned.
    """
    for occurrences in tList:
        for (ls, le, rs, rEnd) in occurrences:
            leftText = document[ls:le]
            rightText = document[le:][rs:rEnd]
            print((leftText, rightText))
        
    
    


def isPageReady(pageContent, seedsSet):
    """Decide whether a page is usable for pattern extraction.

    The page qualifies when at least two seeds are present on it and
    numberOfClusters reports a positive count for their positions.
    """
    found = getPresentSeeds(pageContent, seedsSet)
    if len(found) >= 2:
        positions = getSeedsPositions(pageContent, found)
        return numberOfClusters(positions) > 0
    return False


#check if one seed is close to other
def closeTo(position, valueList, maxGap=3000):
    """Tell whether a seed occurrence lies near any occurrence in valueList.

    Args:
        position: (l1s, l1e, r1s, r1e) occurrence tuple; the r* fields are
            offsets measured from l1e, so l1e + r1e is the absolute end of
            the right value.
        valueList: iterable of occurrence tuples of the same shape.
        maxGap: largest distance, in characters, between the end of one
            occurrence and the start of the other that still counts as close.

    Returns:
        True if some occurrence in valueList is within maxGap of position in
        either direction, else False (also for an empty valueList).
    """
    # Unpacked here instead of in the signature: tuple parameters were removed
    # in Python 3, and this form works on Python 2 as well.
    l1s, l1e, r1s, r1e = position
    end1 = l1e + r1e
    for (l2s, l2e, r2s, r2e) in valueList:
        if abs(end1 - l2s) <= maxGap or abs(l2e + r2e - l1s) <= maxGap:
            return True
    return False





def numberOfClusters(seedsPositionOnPage):
    """Count seed pairs (i before j) where some occurrence of seed i is closeTo seed j's occurrences.

    seedsPositionOnPage holds one list of occurrence tuples per seed. Each
    pair contributes at most one to the total.
    """
    pairCount = 0
    for i, current in enumerate(seedsPositionOnPage):
        for later in seedsPositionOnPage[i + 1:]:
            if any(closeTo(occurrence, later) for occurrence in current):
                pairCount += 1
    return pairCount




def addSeedLocationToClusters(clusters, seedsPositionOnPage):
    """Merge one seed's occurrences into clusters, in place, and return clusters.

    With no clusters yet, every occurrence starts its own cluster. Otherwise
    each occurrence is appended to every cluster it is closeTo; occurrences
    close to no cluster are dropped.
    """
    if not clusters:
        clusters.extend([occurrence] for occurrence in seedsPositionOnPage)
        return clusters
    for occurrence in seedsPositionOnPage:
        for members in clusters:
            if closeTo(occurrence, members):
                members.append(occurrence)
    return clusters










#find string representation of clusters present in htmlPageContent    
def getStrOfCluster(clusters, htmlPageContent):
    """Translate clusters of occurrence tuples into clusters of (left, right) strings.

    Each occurrence is (lstart, lend, rstart, rend), with rstart and rend
    measured from lend. Both extracted strings are stripped.
    """
    def seedText(occurrence):
        keyStart, keyEnd, valueOffset, valueEndOffset = occurrence
        key = htmlPageContent[keyStart:keyEnd]
        value = htmlPageContent[keyEnd + valueOffset:keyEnd + valueEndOffset]
        return (key.strip(), value.strip())

    return [[seedText(occurrence) for occurrence in cluster] for cluster in clusters]


def addSeedLocationToClusters2(clusters, seedsPositionOnPage):
    """Merge one seed's occurrences into clusters, forking clusters when needed.

    clusters is mutated in place and also returned. With no clusters yet,
    every occurrence starts its own cluster. Otherwise each occurrence is
    compared (via closeTo) with the clusters that existed when its pass
    began:
      * close to a cluster not yet touched in this call: appended to it;
      * close to a cluster already touched in this call: a new cluster is
        created from that cluster without its last element, plus the
        occurrence;
      * close to no cluster: a new single-element cluster is created.
    """
    # Indices of clusters that received (or were created for) an occurrence
    # during this call; the last element of such a cluster is always an
    # occurrence added by this call.
    recorded = set()
    if len(clusters) == 0:
        for seedPosition in seedsPositionOnPage:
            clusters.append([seedPosition])
        return clusters
    for seedPosition in seedsPositionOnPage:
        isCloseToSomething = False
        # Alias, not a copy: every change below is made to `clusters` itself.
        clusters2 = clusters
        # Snapshot of the cluster count so clusters appended during this pass
        # are not visited for the current occurrence.
        totalClusters = len(clusters)
        for index in range(0, totalClusters):
            cluster = clusters2[index]
            if closeTo(seedPosition, cluster):
                if index in recorded:
                    # Fork: copy the cluster minus the occurrence added
                    # earlier in this call and put the current one in its place.
                    elt = cluster[0:len(cluster)-1]
                    elt.append(seedPosition)
                    clusters2.append(elt)
                    recorded.add(len(clusters2)-1)
                else:
                    cluster.append(seedPosition)
                    recorded.add(index)
                isCloseToSomething = True
        if isCloseToSomething==False:
            # Not near any existing cluster: start a new one.
            clusters2.append([seedPosition])
            recorded.add(len(clusters2)-1)
    return clusters



def getClusters(htmlPageContent, seedsSet):
    """Build occurrence clusters for the seeds present on the page.

    The seeds found on the page are located, then each seed's occurrence list
    is folded into the running cluster list with addSeedLocationToClusters2.
    """
    found = getPresentSeeds(htmlPageContent, seedsSet)
    grouped = []
    for occurrences in getSeedsPositions(htmlPageContent, found):
        grouped = addSeedLocationToClusters2(grouped, occurrences)
    return grouped


#get left contexts of elements in cluster
def getLeftContexts(clusters, htmlPageContent):
    """Collect, per cluster, the reversed text preceding each occurrence.

    Up to 100 characters before the occurrence's start are taken, stripped
    and reversed; occurrences whose stripped context is empty are skipped.
    """
    contexts = []
    for cluster in clusters:
        reversedContexts = []
        for (keyStart, _keyEnd, _rs, _rEnd) in cluster:
            windowStart = max(0, keyStart - 100)
            before = htmlPageContent[windowStart:keyStart].strip()
            if before:
                reversedContexts.append(before[::-1])
        contexts.append(reversedContexts)
    return contexts
    

#get right contexts for each cluster    
def getRightContexts(clusters, htmlPageContent):
    """Collect, per cluster, the text following each occurrence's right value.

    The right value ends at le + rEnd (rEnd is an offset from le). Up to 100
    characters after that point are taken and stripped; occurrences whose
    stripped context is empty are skipped.
    """
    contexts = []
    for cluster in clusters:
        following = []
        for (_ls, keyEnd, _rs, valueEndOffset) in cluster:
            valueEnd = keyEnd + valueEndOffset
            after = htmlPageContent[valueEnd:valueEnd + 100].strip()
            if after:
                following.append(after)
        contexts.append(following)
    return contexts


def getMiddlePatterns(clusters, pageContent):
    """Collect, per cluster, the stripped text between each left key and its right value.

    For an occurrence (ls, le, rs, rEnd) the middle text is
    pageContent[le:le + rs]; occurrences whose stripped middle text is empty
    are skipped.
    """
    def between(occurrence):
        _ls, keyEnd, valueOffset, _rEnd = occurrence
        return pageContent[keyEnd:keyEnd + valueOffset].strip()

    return [[gap for gap in map(between, cluster) if gap] for cluster in clusters]



#reverse strings in list
def reverseListElements(l):
    """Return a new list holding each element of l reversed."""
    return [item[::-1] for item in l]



#get the prefix of clusters present and return them as output
def getClusterPrefix(clusters):
    """Return the commonprefix of every cluster, one result per cluster."""
    return [commonprefix(cluster) for cluster in clusters]

def commonprefix(m):
    """Return the longest leading substring shared by every string in m ('' when m is empty).

    Only the smallest and largest strings need comparing: every other string
    sorts between them and therefore shares whatever prefix they share.
    """
    if not m:
        return ''
    lowest, highest = min(m), max(m)
    for i, (a, b) in enumerate(zip(lowest, highest)):
        if a != b:
            return lowest[:i]
    return lowest



#we might get empty string present inside left and right element
def getMiddlePatternIntersection(middleContext):
    """Reduce each cluster's middle texts to a single entry.

    For every list in middleContext the result holds:
      * the text itself (a string) when all entries are identical,
      * [""] when the list is empty,
      * [] when the entries differ.
    """
    results = []
    for context in middleContext:
        distinct = list(set(context))
        if not distinct:
            results.append([""])
        elif len(distinct) == 1:
            results.append(distinct[0])
        else:
            results.append([])
    return results



#return only those patterns whose three parts each have at least 2 characters and whose combined length exceeds 15
def clubPatterns(leftPatterns, middlePatterns, rightPatterns):
    """Zip the per-cluster left, middle and right patterns into (l, m, r) triples.

    The three list lengths are printed first; if they differ, [] is returned.
    A triple is kept only when each part has length of at least 2 and the
    three lengths add up to more than 15.
    """
    leftCount = len(leftPatterns)
    rightCount = len(rightPatterns)
    middleCount = len(middlePatterns)
    print("Lengths are " + str(leftCount) + " " + str(rightCount) + " " + str(middleCount))
    if not (leftCount == rightCount and leftCount == middleCount):
        return []
    combined = []
    for left, middle, right in zip(leftPatterns, middlePatterns, rightPatterns):
        if min(len(left), len(middle), len(right)) < 2:
            continue
        if len(left) + len(middle) + len(right) > 15:
            combined.append((left, middle, right))
    return combined




def getPatterns(pageContent, seedsSet):
    """Derive (left, middle, right) extraction patterns from one page.

    The seeds on the page are clustered, the text around and between each
    occurrence is collected per cluster, and each cluster is reduced to the
    common suffix of its left contexts, its single shared middle text and the
    common prefix of its right contexts. Progress is printed along the way;
    the triples accepted by clubPatterns are returned.
    """
    print("Oye hoye")
    seedClusters = getClusters(pageContent, seedsSet)
    clusterStrings = getStrOfCluster(seedClusters, pageContent)
    print("String clusters: ")
    print(clusterStrings)
    print("NUmber of clusters: " + str(len(seedClusters)))

    lefts = getLeftContexts(seedClusters, pageContent)
    rights = getRightContexts(seedClusters, pageContent)
    middles = getMiddlePatterns(seedClusters, pageContent)
    print("Patterns are ")
    print("Middle contexts")
    print(middles)
    print("Number of middle contexts " + str(len(middles)))

    # Left contexts are stored reversed, so their common prefix is really the
    # common suffix of the original text; reversing it restores reading order.
    leftPatterns = reverseListElements(getClusterPrefix(lefts))
    middlePatterns = getMiddlePatternIntersection(middles)
    rightPatterns = getClusterPrefix(rights)
    print("left patterns: " + str(leftPatterns))
    print("Right Patterns: " + str(rightPatterns))
    print("Middle Patterns: " + str(middlePatterns))
    return clubPatterns(leftPatterns, middlePatterns, rightPatterns)




#get fileName,list and outputdirectory and write list onto file
def writeOutputToFile(fileName, results, outputSeedsLocation):
    """Write the strings in results, separated by newlines, to outputSeedsLocation/fileName.

    The file is overwritten; no newline is added after the last entry.
    """
    targetPath = outputSeedsLocation + "/" + fileName
    with open(targetPath, 'w') as outFile:
        outFile.write('\n'.join(results))



In [60]:
#initialise user-specific variables: input/output locations and the iteration count
seedsLocation  = "flipkart/fashion/seeds.tsv"  # tab-separated seed pairs, passed to readRelationSeeds
corpusLocation = "flipkart/fashion/corpus"  # directory of pages from which patterns are learned
outputSeedsLocation = "flipkart/fashion/results"  # root directory for all written results
unseenCorpus = "flipkart/fashion/unseenCorpus"  # directory of pages the learned patterns are applied to
iterations     = 1  # number of passes over the corpus

In [61]:
from os import listdir
from os.path import isfile, join
#give it the name of directory and get back list of files present in directory
def getAllFilesInDirectory(directory):
    """Return the paths (directory joined with name) of the regular files directly inside directory."""
    paths = []
    for entry in listdir(directory):
        fullPath = join(directory, entry)
        if isfile(fullPath):
            paths.append(fullPath)
    return paths

#create the output directories if they are not already present
def mkDirsInResults(outputSeedsLocation, seedsDirectory, patternsDirectory, unseenDataSeedsDirectory):
    """Create each of the four directories, in argument order, unless it already exists."""
    wanted = (outputSeedsLocation, seedsDirectory, patternsDirectory, unseenDataSeedsDirectory)
    for directory in wanted:
        if not os.path.exists(directory):
            os.makedirs(directory)


        
#remove elements which were actually tag
def isTag(pair):
    """Tell whether both strings of an (a, b) pair look like markup.

    Args:
        pair: (a, b) tuple of strings.

    Returns:
        True only when a and b each contain both "<" and ">".
    """
    # Unpacked here instead of in the signature: tuple parameters were removed
    # in Python 3, and this form works on Python 2 as well.
    a, b = pair
    return "<" in a and ">" in a and "<" in b and ">" in b


#preprocess results before writing it to file
def preprocessResults(output):
    """Return the pairs of output, in order, for which isTag is false."""
    return [pair for pair in output if not isTag(pair)]




#Pattern is (l, m, r) and match them to htmlPageContent
def findRelationSetwrtPattern(htmlPageContent, pattern):
    """Extract (e1, e2) pairs that match an (l, mid, r) pattern on a page.

    For every non-overlapping occurrence of l, the text up to the next
    occurrence of r is split at the first occurrence of mid; the part before
    mid is e1 and the part after it is e2.

    Args:
        htmlPageContent: page text to search.
        pattern: (l, mid, r) tuple of literal strings.

    Returns:
        The set of (e1, e2) pairs in which both strings are longer than 1 and
        shorter than 100 characters.
    """
    # Unpacked here instead of in the signature: tuple parameters were removed
    # in Python 3, and this form works on Python 2 as well.
    l, mid, r = pattern
    results = set()
    for m in re.finditer(re.escape(l), htmlPageContent):
        end = m.end()
        # Search from `end` instead of slicing off the rest of the page for
        # every match.
        rightLoc = htmlPageContent.find(r, end)
        if rightLoc == -1:
            # No r after this l means none after any later l either.
            break
        substr = htmlPageContent[end:rightLoc]
        mLoc = substr.find(mid)
        if mLoc == -1:
            continue
        e1 = substr[:mLoc]
        e2 = substr[mLoc + len(mid):]
        if 1 < len(e1) < 100 and 1 < len(e2) < 100:
            results.add((e1, e2))
    return results



#input: all the pattern found and new html page
#output: extract set of elements from the page that are in our set(whose definition we know)
def extractSet(patterns, htmlPageContent):
    """Apply every pattern to the page and return all extracted pairs as one list.

    Pairs found by different patterns are concatenated, so the same pair may
    appear more than once.
    """
    extracted = []
    for pattern in patterns:
        extracted += findRelationSetwrtPattern(htmlPageContent, pattern)
    return extracted



def getCsvRecords(l):
    """Turn each (a, b) pair of l into one tab-separated line."""
    return [a + "\t" + b for (a, b) in l]


def convertTripleToRow(l):
    """Turn each (a, b, c) triple of l into one tab-separated line."""
    return [a + "\t" + b + "\t" + c for (a, b, c) in l]


#try removing anything except class="value" FLIPKART troubled us to write this pattern processing
def deleteEveryValueWithoutClass(s):
    """Drop every double-quoted value from s except the ones that follow class=".

    The text is scanned once with a three-state machine:
      * KEEP_TAKING: copy characters; a double quote starts a value that is
        discarded together with both of its quotes.
      * KEEP_REMOVING: inside a discarded value; skip until the closing quote.
      * KEEP_TAKING_TWICE: inside a class="..." value; copy characters and
        the closing quote, then go back to KEEP_TAKING.

    Two debug traces are printed: "Boom" on entry and a status line at each
    position where "Brand: " starts.

    Returns:
        The filtered string.
    """
    KEEP_REMOVING = 0
    KEEP_TAKING = 1
    KEEP_TAKING_TWICE = 2
    CLASS_PREFIX = "class=\""
    status = KEEP_TAKING
    # Pieces are collected and joined once, and prefixes are tested with
    # startswith(prefix, index), so the scan no longer copies the remainder
    # of the string for every character.
    kept = []
    index = 0
    print("Boom")
    while index < len(s):
        if s.startswith("Brand: ", index):
            print("Hi with status " + str(status))
        ch = s[index]
        if ch == "\"":
            if status == KEEP_TAKING_TWICE:
                # Closing quote of a class value: keep it.
                kept.append(ch)
                status = KEEP_TAKING
            elif status == KEEP_TAKING:
                # Opening quote of a value to discard.
                status = KEEP_REMOVING
            else:
                # Closing quote of a discarded value.
                status = KEEP_TAKING
            index += 1
        elif status == KEEP_REMOVING:
            index += 1
        elif s.startswith(CLASS_PREFIX, index):
            kept.append(CLASS_PREFIX)
            index += len(CLASS_PREFIX)
            status = KEEP_TAKING_TWICE
        else:
            kept.append(ch)
            index += 1
    return "".join(kept)





def writeStrToFile(s, loc):
    """Write the string s to the file at loc, replacing any existing content.

    The file is opened in a with-block so the handle is closed even when the
    write raises.
    """
    with open(loc, "w") as text_file:
        text_file.write(s)
    
def doProcessingWithoutClass(s):
    """Blank every double-quoted attribute value in s except those of class attributes.

    First "class = " is normalised to "class=" (whitespace around the equals
    sign removed). Then every ="..." that is not directly preceded by the
    word class is replaced by ="".

    The earlier pattern used [^class], which is a character set rather than
    the word "class": values of attributes whose names end in c, l, a or s
    (src, rel, data, ...) were left in place. A negative look-behind on the
    whole word fixes that.

    Returns:
        The rewritten string.
    """
    normalised = re.sub(r"class\s*=\s*", "class=", s)
    return re.sub(r'(?<!\bclass)="[^"]*"', '=""', normalised)



# s = "my name is sanjeev class = \"gaga\" and value=\"kua\"mr"
# doProcessingWithoutClass(s)

In [101]:
##directory construction
# Driver: learn (left, middle, right) patterns from the seed pairs found on the
# corpus pages, extract further pairs with those patterns, then apply the
# learned patterns to the unseen corpus and write everything under
# outputSeedsLocation.
seedsDirectory = outputSeedsLocation +"/iterationSeeds"
patternsDirectory = outputSeedsLocation +"/patterns"
unseenDataSeedsDirectory = outputSeedsLocation + "/unseenDataSeeds"                   
mkDirsInResults(outputSeedsLocation, seedsDirectory, patternsDirectory, unseenDataSeedsDirectory)
seedsSet = readRelationSeeds(seedsLocation)
corpusFiles = getAllFilesInDirectory(corpusLocation)
unseenCorpusFiles = getAllFilesInDirectory(unseenCorpus)
# Both accumulators live outside the iteration loop, so they keep growing
# across iterations.
results = []
patternsFound = []
# NOTE(review): seedsSet is never extended with the extracted results, so with
# iterations > 1 every pass works from the same seeds - confirm intent.
for iteration in range(1, iterations+1):
    print("Iteration# " + str(iteration))
    for productPage in corpusFiles:
        pageContent = preprocessDocument(readPage(productPage))
        print("Present seeds are: ")
        print(getPresentSeeds(pageContent, seedsSet))
        # Debug output only: positions are computed for every seed and the
        # matching text is printed.
        seedsPositions = getSeedsPositions(pageContent, seedsSet)
#         print("Seeds positions are " + str(seedsPositions))
        getStringFromTuple(seedsPositions, pageContent)
        pageStatus  = isPageReady(pageContent, seedsSet)
        print("Page status is " + str(pageStatus))
        if pageStatus == True:
            patternsFoundEachPage = getPatterns(pageContent, seedsSet)
            print("Patterns found are ")
            print(patternsFoundEachPage)
            if len(patternsFoundEachPage)<=0:
                # No pattern survived: blank the non-class attribute values
                # and try the page once more.
                print("Hey hey")
                pageContent = doProcessingWithoutClass(pageContent)
                # NOTE(review): hard-coded debug dump into "snapdeal/" although
                # the configured corpus is elsewhere; open() will fail if that
                # directory does not exist - confirm this is still wanted.
                writeStrToFile(pageContent, "snapdeal/pageContent")
                if isPageReady(pageContent, seedsSet):
                    print("Page is ready again")
                    patternsFoundEachPage = getPatterns(pageContent, seedsSet)
                else:
                    print("Page is not ready")
                    # Skip this page entirely.
                    continue
            # Apply this page's patterns to the page itself and drop pairs
            # that are markup.
            moreSetElements = preprocessResults(list(extractSet(patternsFoundEachPage, pageContent)))
            results.extend(moreSetElements)
            patternsFound.extend(patternsFoundEachPage)
    # De-duplicate everything extracted so far and write it for this iteration.
    results2 = set(results)
    results2 = getCsvRecords(list(results2))
    
    writeOutputToFile("results_"+str(iteration)+".tsv", results2, seedsDirectory)
    print("Seen Results written at location: " + seedsDirectory + " for iteration " + str(iteration))
# Apply every learned pattern to the unseen pages.
unseenResults = []
for productPage in unseenCorpusFiles:
    pageContent = preprocessDocument(readPage(productPage))
    resultsPerPage = preprocessResults(list(extractSet(patternsFound, pageContent)))
    if len(resultsPerPage)<=0:
        # Nothing matched: retry on the page with non-class attribute values blanked.
        pageContent = doProcessingWithoutClass(pageContent)
        resultsPerPage = preprocessResults(list(extractSet(patternsFound, pageContent)))
    unseenResults.extend(resultsPerPage)
unseenResults = list(set(unseenResults))
print("Total number of unseen results: " + str(len(unseenResults)))
writeOutputToFile("unseenSeeds.tsv", getCsvRecords(unseenResults), unseenDataSeedsDirectory)
print("unseen seeds written at " + unseenDataSeedsDirectory)
#patternsFound = [("Left patterns", "Right Patterns")].extend(patternsFound)
# Put a header row in front of the patterns before writing them.
patternsFound.insert(0, ("Left Patterns", "MiddlePattern", "Right Patterns"))
writeOutputToFile("patterns.tsv", convertTripleToRow(patternsFound), patternsDirectory)
print("Patterns written at location " + patternsDirectory)

Iteration# 1
Present seeds are: 
[('Style Code', 'AJ0004NB'), ('Pattern', 'Solid'), ('Collar', 'Mandarin')]
('Style Code', 'AJ0004NB')
('Pattern', 'Solid')
('Pattern', 'Solid')
('Collar', 'Mandarin')
('Collar', 'Mandarin')
Page status is True
Oye hoye
String clusters: 
[[('Style Code', 'AJ0004NB'), ('Pattern', 'Solid'), ('Collar', 'Mandarin')], [('Pattern', 'Solid'), ('Collar', 'Mandarin')]]
NUmber of clusters: 2
Patterns are 
Middle contexts
[['</div><ul class="_3dG3ix col col-9-12" data-reactid="390"><li class="sNqDog" data-reactid="391">', '</div><ul class="_3dG3ix col col-9-12" data-reactid="406"><li class="sNqDog" data-reactid="407">', '</div><ul class="_3dG3ix col col-9-12" data-reactid="414"><li class="sNqDog" data-reactid="415">'], [':', 'Type:']]
Number of middle contexts 2
left patterns: ['">', '">']
Right Patterns: ['</li></ul></li><li class="_1KuY3T row" data-reactid="', '</li><li class="_2-riNZ" data-reactid="32']
Middle Patterns: [[], []]
Lengths are 2 2 2
Patterns found 

['amazon/corpus/Amazon.in_ Buy HP 14q-BU005TU 2017 14-inch Lightweight, Laptop (Core i3-6006U_4GB_1TB_Windows 10_Integrated Graphics), Jet Black Online at Low Prices in India _ HP Reviews & Ratings.html']
1
