In [1]:
import os,re
import math
import functools
import csv
import pandas
from multiprocessing import Pool

**Printing the number of Wikipedia files in the wiki folder**

In [2]:
# List the entries of the "wiki" folder once; `file_names` is reused by later cells.
file_names = os.listdir("wiki")
file_count = len(file_names)
print(f"The wiki folder has {file_count} files")

The wiki folder has 999 files


**Printing the contents of the first file in the list**

In [3]:
# Show the raw content of the first listed file. The page declares UTF-8 in its
# <meta charset> tag, so decode explicitly instead of relying on the platform's
# default encoding.
with open(os.path.join("wiki", file_names[0]), encoding="utf-8") as f:
    for line in f:
        # Each line already ends with its own newline; suppress print's extra one
        # so the output is not double-spaced.
        print(line, end="")

<!DOCTYPE html>

<html class="client-nojs" lang="en" dir="ltr">

<head>

<meta charset="UTF-8"/>

<title>Bay of Concepción - Wikipedia</title>

<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>

<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgM

**Observation: The Wikipedia file content is in HTML format**

**The following utility class searches for a specific target string in all the files of a given folder using map-reduce. Based on the search results, it creates a CSV file with the columns below**

* *File*: Name of the Wikipedia file containing the target string
* *Line*: Zero-based number of the line in this file that contains the target string
* *Index*: Zero-based character offset, within that line, at which the target string starts
* *Context*: A few characters around the target string (up to 30 on each side), showing the context in which the string was used

In [4]:
class GrepUtility:
    """
    Grep-like search for a target in every file of a folder, using map-reduce.

    The list of files is split into chunks (``make_chunks``), each chunk is
    searched by a worker process (map), and the per-chunk dictionaries are
    merged into one (reduce). The merged result can be written to a CSV file
    with ``create_csv``.

    NOTE(review): ``map_reduce`` sends bound methods of this instance to a
    ``multiprocessing.Pool``. Under the "spawn" start method, a class defined
    interactively in a notebook may not be importable by the workers --
    confirm on Windows/macOS.
    """

    # Encoding used both to read the searched files and to write the CSV.
    _ENCODING = "utf-8"

    def __init__(self,target,folderName,numProcess=5):
        """
        Parameters
        ----------
        target : str
            The string to be searched. It is handed to the ``re`` module, so
            regular-expression metacharacters in it are interpreted as such
        folderName : str
            Path of the folder whose files will be searched
        numProcess : int, optional
            The number of processes to be used in parallel

        Raises
        ------
        ValueError
            If numProcess is smaller than 1
        """
        if numProcess < 1:
            raise ValueError("numProcess must be at least 1")
        self._target = target
        # Storing all files in folder in _fileNames attribute
        self._fileNames = os.listdir(folderName)
        self._folderName = folderName
        self._numProcess = numProcess
        # Filled by map_reduce: {file name: [(line number, match start index), ...]}
        self._results = {}

    def make_chunks(self):
        """
        Divides the number of files into chunks to be used by each process

        Returns
        -------
        list of list of str
            At most ``numProcess`` consecutive slices of the file list;
            an empty list when the folder contains no files
        """
        data = self._fileNames
        if not data:
            # A chunk size of 0 would make range() raise; there is nothing to split.
            return []
        chunk_size = math.ceil(len(data) / self._numProcess)
        return [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

    def _map_grep(self, file_chunk, flags=0):
        """
        Shared implementation of both map methods.

        Parameters
        ----------
        file_chunk: list
           List of files to be searched
        flags: int
           Flags passed to ``re.compile`` (e.g. ``re.IGNORECASE``)

        Returns
        -------
        dict
            Maps each file with at least one match to a list of
            ``(line number, match start index)`` tuples, both zero-based
        """
        pattern = re.compile(self._target, flags)
        searchResult = {}
        for file in file_chunk:
            with open(os.path.join(self._folderName, file), encoding=self._ENCODING) as f:
                for idx, line in enumerate(f):
                    match_indexes = [(idx, match.start()) for match in pattern.finditer(line)]
                    if match_indexes:
                        searchResult.setdefault(file, []).extend(match_indexes)
        return searchResult

    def map_grep_wo_case(self,file_chunk):
        """
        Map method to search for target in file_chunk in case insensitive way
        Parameters
        ----------
        file_chunk: list
           List of files to be searched

        Returns
        -------
        dict
            ``{file name: [(line number, match start index), ...]}``
        """
        # re.IGNORECASE leaves the pattern itself intact; upper-casing the
        # pattern would corrupt escapes such as \d and could shift offsets.
        return self._map_grep(file_chunk, re.IGNORECASE)

    def map_grep_wi_case(self,file_chunk):
        """
        Map method to search for target in file_chunk in case sensitive way
        Parameters
        ----------
        file_chunk: list
           List of files to be searched

        Returns
        -------
        dict
            ``{file name: [(line number, match start index), ...]}``
        """
        return self._map_grep(file_chunk)

    def reduce_grep(self,result1, result2):
        """
        Reduce method to combine results obtained from 2 different map methods
        Parameters
        ----------
        result1: dict
           result produced by Map method (updated in place and returned)
        result2: dict
           result produced by Map method

        Returns
        -------
        dict
            ``result1`` after the entries of ``result2`` were added to it
        """
        result1.update(result2)
        return result1

    def map_reduce(self,withCase=False):
        """
        This method will trigger the map and reduce functionality for each chunk
        and store the merged result on the instance for ``create_csv``
        Parameters
        ----------
        withCase: Bool
          To indicate if search is case sensitive or case insensitive
        """
        chunks = self.make_chunks()
        if not chunks:
            # Nothing to search: avoid starting worker processes for no work.
            self._results = {}
            return
        mapper = self.map_grep_wi_case if withCase else self.map_grep_wo_case
        # The context manager releases the worker processes once map() returned.
        with Pool(self._numProcess) as pool:
            chunk_results = pool.map(mapper, chunks)
        # The initial {} keeps reduce well-defined and leaves chunk results unmodified.
        self._results = functools.reduce(self.reduce_grep, chunk_results, {})

    def create_csv(self,csvFileName):
        """
        Creates a csv file based on the result of search using map-reduce.
        Only the header row is written if ``map_reduce`` has not been run
        or found nothing.
        Parameters
        ----------
        csvFileName: str
          Name of the file that will be created
        """
        context_delta = 30
        rows = [["File", "Line", "Index", "Context"]]
        for file_name, matches in self._results.items():
            with open(os.path.join(self._folderName, file_name), encoding=self._ENCODING) as file:
                # Strip only the right-hand side: the match indexes were computed
                # on the unstripped line, so leading whitespace must be kept.
                lines = [line.rstrip() for line in file]
            for line, index in matches:
                start = max(index - context_delta, 0)
                end   = index + len(self._target) + context_delta
                rows.append([file_name, line, index, lines[line][start:end]])
        # newline="" is what the csv module expects for files it writes to.
        with open(csvFileName, "w", newline="", encoding=self._ENCODING) as f:
            csv.writer(f).writerows(rows)
        

In [5]:
# Case-insensitive search for "data" in the wiki folder, written to a CSV file.
results1_csv = 'results1.csv'
search1 = GrepUtility("data", "wiki", numProcess=5)
search1.map_reduce(withCase=False)
search1.create_csv(results1_csv)

# Read the CSV back with pandas and show its first ten rows.
df = pandas.read_csv(results1_csv)
df.head(10)

Unnamed: 0,File,Line,Index,Context
0,Bay_of_ConcepciC3B3n.html,6,422,"egories"":[""Coordinates on Wikidata"",""All stub ..."
1,Bay_of_ConcepciC3B3n.html,45,628,"78-sj18-04-quiriquina.jpg 2x"" data-file-width=..."
2,Bay_of_ConcepciC3B3n.html,45,650,"jpg 2x"" data-file-width=""960"" data-file-height..."
3,Bay_of_ConcepciC3B3n.html,58,447,"aps, aerial photos, and other data for this lo..."
4,Bay_of_ConcepciC3B3n.html,58,692,"aps, aerial photos, and other data for this lo..."
5,Bay_of_ConcepciC3B3n.html,60,18,"<table class=""metadata plainlinks stub"" role=""..."
6,Bay_of_ConcepciC3B3n.html,62,568,"o_Region%2C_Chile.svg.png 2x"" data-file-width=..."
7,Bay_of_ConcepciC3B3n.html,62,590,"png 2x"" data-file-width=""600"" data-file-height..."
8,Bay_of_ConcepciC3B3n.html,105,40,"atlinks"" class=""catlinks"" data-mw=""interface"">..."
9,Bay_of_ConcepciC3B3n.html,105,748,"tegory:Coordinates_on_Wikidata"" title=""Categor..."


In [6]:
# Case-sensitive search for "data" in the wiki folder, written to results2.csv.
search2 = GrepUtility(target="data", folderName="wiki", numProcess=5)
search2.map_reduce(withCase=True)
search2.create_csv('results2.csv')

# Loading the output file into pandas for visualization.
# Read results2.csv (the file this cell just wrote), not the results1.csv
# produced by the case-insensitive search.
df = pandas.read_csv("results2.csv")
df.head(10)

Unnamed: 0,File,Line,Index,Context
0,Bay_of_ConcepciC3B3n.html,6,422,"egories"":[""Coordinates on Wikidata"",""All stub ..."
1,Bay_of_ConcepciC3B3n.html,45,628,"78-sj18-04-quiriquina.jpg 2x"" data-file-width=..."
2,Bay_of_ConcepciC3B3n.html,45,650,"jpg 2x"" data-file-width=""960"" data-file-height..."
3,Bay_of_ConcepciC3B3n.html,58,447,"aps, aerial photos, and other data for this lo..."
4,Bay_of_ConcepciC3B3n.html,58,692,"aps, aerial photos, and other data for this lo..."
5,Bay_of_ConcepciC3B3n.html,60,18,"<table class=""metadata plainlinks stub"" role=""..."
6,Bay_of_ConcepciC3B3n.html,62,568,"o_Region%2C_Chile.svg.png 2x"" data-file-width=..."
7,Bay_of_ConcepciC3B3n.html,62,590,"png 2x"" data-file-width=""600"" data-file-height..."
8,Bay_of_ConcepciC3B3n.html,105,40,"atlinks"" class=""catlinks"" data-mw=""interface"">..."
9,Bay_of_ConcepciC3B3n.html,105,748,"tegory:Coordinates_on_Wikidata"" title=""Categor..."
