In [1]:
import os
import re
import math
import functools
import csv
import pandas
from multiprocessing import Pool
import time

In [4]:
# Folder layout: <root>/wiki holds the input pages, <root>/results receives the CSV output
rootDataFolder = "Data"
dataSourceFolder = "wiki"
dataResultsFolder = "results"

# Both working paths are derived from the same root folder
dataSourcePath, dataResultsPath = (
    os.path.join(rootDataFolder, subFolder)
    for subFolder in (dataSourceFolder, dataResultsFolder)
)

**Printing the number of wikipedia files in the wiki folder**

In [5]:
# Every directory entry is counted as-is (no filtering by type or extension)
file_names = os.listdir(dataSourcePath)
numFiles = len(file_names)
print(f"The wiki folder has {numFiles} files")

The wiki folder has 999 files


**Printing the contents of the first file in the list**

In [6]:
with open(os.path.join(dataSourcePath, file_names[0]),encoding='UTF-8') as f:
    # Pull only the first 10 lines from the file instead of loading all of it;
    # zip() stops as soon as range(10) is exhausted (or the file ends earlier).
    for _, line in zip(range(10), f):
        # Each line still carries its own newline; drop it so print() does not
        # emit an extra blank line after every line.
        print(line.rstrip("\n"))

<!DOCTYPE html>

<html class="client-nojs" lang="en" dir="ltr">

<head>

<meta charset="UTF-8"/>

<title>Bay of Concepción - Wikipedia</title>

<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>

<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgM

**Observation: The Wikipedia file content is in HTML format**

**The following utility class searches for a specific target string in all the files in a particular folder. Using map-reduce, it creates a CSV file with the columns below, based on the results of the search**

* *File*: Wikipedia file name containing the target string
* *Line*: Zero-based line number in this file containing the target string
* *Index*: Character offset within that line at which the target string starts
* *Context*: A few characters around the target string, showing the context in which it was used

In [9]:
class GrepUtility:
    """
    Searches every file of a folder for a target pattern with a map-reduce
    scheme running on a multiprocessing pool, and optionally writes the hits
    to a CSV file.

    The target is handed to the ``re`` module, so it is interpreted as a
    regular expression (a plain word simply matches itself).
    """

    # Encoding used for reading the source files and writing the CSV
    _ENCODING = "UTF-8"
    # Number of characters shown on each side of a match in the CSV "Context" column
    _CONTEXT_DELTA = 30

    def __init__(self,target,sourceDataPath,resultDataPath,numProcess):
        """
        Parameters
        ----------
        target : str
            The string (regular expression) to be searched
        sourceDataPath : str
            The path that contains the files that will be searched
        resultDataPath : str
            The path that will contain the resultant CSV file
        numProcess : int
            The number of processes to be used in parallel (at least 1)

        Raises
        ------
        ValueError
            If numProcess is smaller than 1
        """
        if numProcess < 1:
            raise ValueError("numProcess must be at least 1")
        self._target = target
        self._sourceDataPath = sourceDataPath
        # Only regular files are kept: os.listdir also returns sub-directories,
        # which cannot be opened for reading.
        self._fileNames = [
            name for name in os.listdir(sourceDataPath)
            if os.path.isfile(os.path.join(sourceDataPath, name))
        ]
        self._resultDataPath = resultDataPath
        self._numProcess = numProcess

    def _makeChunks(self):
        """
        Divides the list of files into at most numProcess chunks, one per process

        Returns
        -------
        list
            List of lists of file names; empty when there are no files
        """
        data = self._fileNames
        if not data:
            # A chunk size of 0 would make range() fail, so bail out early
            return []
        chunkSize = math.ceil(len(data) / self._numProcess)
        return [data[i:i+chunkSize] for i in range(0, len(data), chunkSize)]

    def _grepChunk(self,fileChunk,flags):
        """
        Shared worker of both map methods: scans every file of the chunk

        Parameters
        ----------
        fileChunk: list
           List of files to be searched
        flags: int
           Flags passed to re.compile (0 or re.IGNORECASE)

        Returns
        -------
        dict
           Maps each file name with at least one hit to a list of
           (zero-based line number, start offset of the match in that line)
        """
        pattern = re.compile(self._target, flags)
        searchResult = {}
        for file in fileChunk:
            with open(os.path.join(self._sourceDataPath, file), encoding=self._ENCODING) as f:
                matchIndexes = [
                    (idx, match.start())
                    for idx,line in enumerate(f)
                    for match in pattern.finditer(line)
                ]
            if matchIndexes:
                searchResult.setdefault(file, []).extend(matchIndexes)
        return searchResult

    def _mapGrepWoCase(self,fileChunk):
        """
        Map method to search for target in fileChunk in case insensitive way

        Parameters
        ----------
        fileChunk: list
           List of files to be searched
        """
        # re.IGNORECASE leaves both the pattern and the text untouched;
        # upper-casing them instead would turn escapes such as \\d into \\D
        # and could shift the reported offsets.
        return self._grepChunk(fileChunk, re.IGNORECASE)

    def _mapGrepWiCase(self,fileChunk):
        """
        Map method to search for target in fileChunk in case sensitive way

        Parameters
        ----------
        fileChunk: list
           List of files to be searched
        """
        return self._grepChunk(fileChunk, 0)

    def reduceGrep(self,result1, result2):
        """
        Reduce method to combine results obtained from two different map methods

        Parameters
        ----------
        result1: dict
           result produced by Map method (updated in place and returned)
        result2: dict
           result produced by Map method
        """
        result1.update(result2)
        return result1

    def _mapReduce(self,withCase,fileName):
        """
        This method will trigger the map and reduce functionality for each chunk

        Parameters
        ----------
        withCase: Bool
          To indicate if search is case sensitive or case insensitive
        fileName: str
           The name of the CSV file that will hold the search results;
           no CSV is written when it is empty
        """
        chunks = self._makeChunks()
        mapper = self._mapGrepWiCase if withCase else self._mapGrepWoCase
        if chunks:
            # The context manager shuts the worker processes down once the
            # (blocking) map call has returned all chunk results.
            with Pool(self._numProcess) as pool:
                chunkResults = pool.map(mapper, chunks)
        else:
            chunkResults = []
        # The initial {} keeps reduce() well-defined when there are no chunks
        self._results = functools.reduce(self.reduceGrep, chunkResults, {})
        if fileName:
            self._createCsv(fileName)

    def searchTarget(self,withCase=False,fileName=""):
        """
        This method should get called from outside of the class to search for target

        Parameters
        ----------
        withCase: Bool
          To indicate if search is case sensitive or case insensitive
        fileName: str
           The name of the CSV file that will hold the search results
        """
        self._mapReduce(withCase,fileName)

    def _createCsv(self,csvFileName):
        """
        Creates a csv file based on the result of search using map-reduce

        Parameters
        ----------
        csvFileName: str
          Name of the csv file that will be created
        """
        context_delta = self._CONTEXT_DELTA
        if self._resultDataPath:
            os.makedirs(self._resultDataPath, exist_ok=True)
        # newline="" lets the csv module control line endings itself
        csvPath = os.path.join(self._resultDataPath, csvFileName)
        with open(csvPath, "w", newline="", encoding=self._ENCODING) as f:
            writer = csv.writer(f)
            writer.writerow(["File", "Line", "Index", "Context"])
            for file_name, hits in self._results.items():
                with open(os.path.join(self._sourceDataPath, file_name), encoding=self._ENCODING) as file:
                    # Only the line terminator is removed: the stored offsets
                    # refer to the unstripped line, so leading whitespace must stay.
                    lines = [line.rstrip("\n") for line in file]
                for line, index in hits:
                    start = max(index - context_delta, 0)
                    # The window width is based on the length of the target text
                    end   = index + len(self._target) + context_delta
                    writer.writerow([file_name, line, index, lines[line][start:end]])
        

**Running a case insensitive search for all the files in the wiki folder by varying the number of parallel processes from 1 to 10**

In [11]:
for numProcesses in range(1, 11):
    # perf_counter() is the clock intended for measuring elapsed intervals;
    # time.time() follows the wall clock, which can be adjusted while the search runs.
    start = time.perf_counter()
    search = GrepUtility(
               target="data", 
               sourceDataPath=dataSourcePath,
               resultDataPath=dataResultsPath, 
               numProcess=numProcesses
             )
    # No fileName is passed, so only the search itself is timed (no CSV is written)
    search.searchTarget(withCase=False)
    end = time.perf_counter()
    print(f"The search completed in {end - start:.2f} seconds using {numProcesses} processes")

The search completed in 1.83 seconds using 1 processes
The search completed in 0.96 seconds using 2 processes
The search completed in 0.68 seconds using 3 processes
The search completed in 0.60 seconds using 4 processes
The search completed in 0.70 seconds using 5 processes
The search completed in 0.83 seconds using 6 processes
The search completed in 0.85 seconds using 7 processes
The search completed in 0.84 seconds using 8 processes
The search completed in 0.85 seconds using 9 processes
The search completed in 0.85 seconds using 10 processes


**Observation: The case insensitive search performed best for 4 parallel processes. Increasing the number of processes made the performance worse due to parallelization overhead**

**Running a case sensitive search for all the files in the wiki folder by varying the number of parallel processes from 1 to 10**

In [31]:
for numProcesses in range(1, 11):
    # perf_counter() is the clock intended for measuring elapsed intervals;
    # time.time() follows the wall clock, which can be adjusted while the search runs.
    start = time.perf_counter()
    search = GrepUtility(
               target="time",
               sourceDataPath=dataSourcePath,
               resultDataPath=dataResultsPath,
               numProcess=numProcesses
    )
    # No fileName is passed, so only the search itself is timed (no CSV is written)
    search.searchTarget(withCase=True)
    end = time.perf_counter()
    print(f"The search completed in {end - start:.2f} seconds using {numProcesses} processes")

The search completed in 1.40 seconds using 1 processes
The search completed in 0.72 seconds using 2 processes
The search completed in 0.51 seconds using 3 processes
The search completed in 0.63 seconds using 4 processes
The search completed in 0.62 seconds using 5 processes
The search completed in 0.75 seconds using 6 processes
The search completed in 0.73 seconds using 7 processes
The search completed in 0.74 seconds using 8 processes
The search completed in 0.75 seconds using 9 processes
The search completed in 0.80 seconds using 10 processes


**Observation: The case sensitive search performed best for 3 parallel processes. Increasing the number of processes further made the performance worse due to parallelization overhead**

**Case insensitive search of target "This" in all the files within the wiki folder**

In [14]:
results1File = "results1.csv"
search1 = GrepUtility(
            target="This", 
            sourceDataPath=dataSourcePath,
            resultDataPath=dataResultsPath,
            numProcess=3
          )
search1.searchTarget(withCase=False,fileName=results1File)

# The CSV is written inside dataResultsPath, so it has to be read back from
# there rather than from the current working directory.
df1 = pandas.read_csv(os.path.join(dataResultsPath, results1File))
df1.head(10)

Unnamed: 0,File,Line,Index,Context
0,Bay_of_ConcepciC3B3n.html,18,60,"plication/x-wiki"" title=""Edit this page"" href=..."
1,Bay_of_ConcepciC3B3n.html,19,29,"<link rel=""edit"" title=""Edit this page"" href=""..."
2,Bay_of_ConcepciC3B3n.html,58,456,"al photos, and other data for this location""><..."
3,Bay_of_ConcepciC3B3n.html,58,701,"al photos, and other data for this location"">3..."
4,Bay_of_ConcepciC3B3n.html,63,7,"<td><i>This <a href=""/wiki/B%C3%ADo_B%C3%"
5,Bay_of_ConcepciC3B3n.html,66,122,"o-geo-stub""><abbr title=""View this template"">v..."
6,Bay_of_ConcepciC3B3n.html,67,135,"eo-stub""><abbr title=""Discuss this template"">t..."
7,Bay_of_ConcepciC3B3n.html,68,158,"ction=edit""><abbr title=""Edit this template"">e..."
8,Bay_of_ConcepciC3B3n.html,115,137,"ssion about edits from this IP address [n]"" ac..."
9,Bay_of_ConcepciC3B3n.html,115,283,"ist of edits made from this IP address [y]"" ac..."


**Case sensitive search of target "However" in all the files within the wiki folder**

In [15]:
results2File = "results2.csv"
search2 = GrepUtility(
           target="However", 
           sourceDataPath=dataSourcePath,
           resultDataPath=dataResultsPath,
           numProcess=4
          )

search2.searchTarget(withCase=True,fileName=results2File)

# The CSV is written inside dataResultsPath, so it has to be read back from
# there rather than from the current working directory.
df2 = pandas.read_csv(os.path.join(dataResultsPath, results2File))
df2.head(10)

Unnamed: 0,File,Line,Index,Context
0,Coalville_Town_railway_station.html,156,209,"ne"">Ivanhoe Line</a> project. However, after t..."
1,Urban_chicken.html,110,265,"reason to keep a ban on them. However, the ave..."
2,Elgin_National_Watch_Company.html,119,75,"re operated, mostly in Elgin. However, additio..."
3,Supermoon.html,107,284,"ull moon will be a supermoon. However, halfway..."
4,Supermoon.html,111,523,"needed</span></a></i>]</sup> However, because..."
5,Meydane_Jahad_Metro_Station.html,111,604,"two months of its operation. However, due to ..."
6,Yarkant_County.html,316,330,"is brothers and killed Junde. However, another..."
7,Yemeni_rial.html,135,1263,"een stable for several years. However, since 2..."
8,2011_ITU_Duathlon_World_Championships.html,86,269,"leaders during the bike leg. However, Silva c..."
9,Discretionary_trust.html,297,312,"=""#cite_note-8"">[8]</a></sup> However, it seem..."
