# Map Reduce

## Given a big data set of wikipedia pages, the goal is to find the top occuring words.

### To achieve this, words associated with metadata such as HTML tags are discounted.  

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
from bs4 import BeautifulSoup as bs
import re
import heapq as hq
import mwparserfromhell

WORD_RE = re.compile(r"\w+")
START_RE = re.compile('.*<page>.*')
END_RE= re.compile('.*</page>.*')

class stripper(MRJob):

        def mapper_init(self):
                self.page = ''
                self.key = 0

        def mapper(self,_,line):

                if END_RE.match(line):
                        self.page = self.page + line.strip()
                        yield (self.key, self.page)
                        self.key += 1

                elif START_RE.match(line):
                        self.page = ''
                        self.page = self.page + line.strip()
                else:
                        self.page = self.page + line.strip()

        def reducer(self, key, page):
                yield (key, page)

class stripper_2(MRJob):

        def mapper(self, _, pages):
                for page in pages:
                        soup = bs(page, 'lxml')
                        for item in soup.find_all('text'):
                                parsed = mwparserfromhell.parse(item.text)
                                for word in WORD_RE.findall(parsed.strip_code()):
                                        yield (word.lower(), 1)

        def reducer(self, word, counts):
                yield (word, sum(counts))


class stripper_3(MRJob):

        def mapper_init(self):
                self.lst_tuples = []

        def mapper(self, word, counts):
                hq.heappush(self.lst_tuples, (counts, word))

        def mapper_final(self):
                for item in hq.nlargest(100, self.lst_tuples):
                        yield (item[1], item[0])

        def reducer_init(self):
                self.lst_tuples_ = []

        def reducer(self, word, counts):
                hq.heappush(self.lst_tuples_, (sum(counts), word))

        def reducer_final(self):
                for item in hq.nlargest(100, self.lst_tuples_):
                        yield (item[1], item[0])


class SteppedJob(MRJob):
        def steps(self):
                return stripper().steps() + stripper_2().steps() + stripper_3().steps()

if __name__ == '__main__':
        SteppedJob.run()


### I ran the code above in CLI and stored the output in output.txt

In [1]:
with open('output.txt', 'rb') as file:
    words = []
    counts = []
    for line in file:
        line = line.strip('\n')
        splited = line.split('\t')
        words.append(splited[0])
        counts.append(int(splited[1]))

In [2]:
trimmed_words = []
for word in words:
    trimmed_words.append(word.split('"')[1])
print(trimmed_words)

['the', 'of', 'in', 'and', 'a', 'to', 'is', 'was', 'it', 'for', 'on', '0', 'that', 's', 'as', 'align', 'by', 'are', '1', 'from', '2', 'he', 'with', 'this', 'be', 'i', 'or', 'category', 'at', 'an', 'center', 'not', 'style', '3', 'other', 'they', 'his', 'have', 'utc', 'people', 'has', 'talk', 'also', 'american', 'bgcolor', 'one', '4', 'right', 'which', 'can', 'but', 'were', 'new', 'first', '5', 'there', 'you', 'b', 'references', 'rowspan', 'left', '6', 'd', 'about', 'redirect', 't', 'if', 'all', 'may', 'font', 'when', 'their', 'who', 'thumb', 'used', 'had', 'after', '10', 'more', 'many', 'color', 'some', '2009', 'she', 'made', 'united', 'user', '7', 'time', 'city', 'background', 'two', '2008', 'no', 'world', 'its', 'most', 'called', '8', 'english']


In [3]:
zipped = zip(trimmed_words, counts)
print(zipped)

[('the', 1430692), ('of', 747428), ('in', 586232), ('and', 547669), ('a', 517123), ('to', 417330), ('is', 391092), ('was', 209485), ('it', 206636), ('for', 185137), ('on', 163090), ('0', 157286), ('that', 154468), ('s', 152065), ('as', 148802), ('align', 141510), ('by', 132262), ('are', 129040), ('1', 126803), ('from', 126512), ('2', 125455), ('he', 122979), ('with', 121300), ('this', 109145), ('be', 101359), ('i', 101152), ('or', 95304), ('category', 95291), ('at', 94522), ('an', 94144), ('center', 92033), ('not', 86868), ('style', 80096), ('3', 77762), ('other', 76109), ('they', 75353), ('his', 72995), ('have', 66943), ('utc', 65867), ('people', 65017), ('has', 63434), ('talk', 61312), ('also', 60630), ('american', 59581), ('bgcolor', 59079), ('one', 58981), ('4', 57901), ('right', 57859), ('which', 54563), ('can', 54264), ('but', 53599), ('were', 52494), ('new', 51013), ('first', 50759), ('5', 50719), ('there', 50628), ('you', 45617), ('b', 45561), ('references', 44545), ('rowspan',

In [19]:
new_zipped = []
for n in zipped:
    if n[0] in  ['de', 'bgcolor', 'page', 'category', 'style', 'references', 'infobox', 'image', 'redirect', 'width', 'colspan', 'rowspan', 'thumb', 'www', 'http', 'jpg', 'color', 'utc']:
        continue
    else:
        new_zipped.append(n)
        
new_zipped

[('the', 2927387),
 ('of', 1748781),
 ('in', 1199343),
 ('and', 1147630),
 ('a', 1065119),
 ('to', 824917),
 ('is', 751640),
 ('ref', 678208),
 ('s', 532012),
 ('1', 438216),
 ('0', 401109),
 ('it', 398436),
 ('was', 394008),
 ('for', 385220),
 ('text', 381110),
 ('2', 367002),
 ('on', 344698),
 ('name', 324024),
 ('br', 301598),
 ('that', 295848),
 ('as', 290810),
 ('font', 288553),
 ('by', 274275),
 ('align', 270913),
 ('user', 270367),
 ('from', 263904),
 ('are', 247744),
 ('with', 243735),
 ('he', 235118),
 ('this', 221476),
 ('i', 221453),
 ('com', 221437),
 ('3', 220144),
 ('talk', 209190),
 ('title', 208315),
 ('at', 204630),
 ('center', 201554),
 ('be', 200936),
 ('or', 194045),
 ('small', 188935),
 ('an', 185689),
 ('not', 171186),
 ('space', 170689),
 ('new', 169601),
 ('people', 165587),
 ('football', 165265),
 ('4', 164687),
 ('b', 162805),
 ('xml', 156476),
 ('preserve', 155883),
 ('united', 153944),
 ('first', 153701),
 ('cite', 149836),
 ('date', 148415),
 ('5', 147804),