<h1 class="text-center">Analyzing Wiki Pages </h1>

## List all the files in the wiki folder

In [6]:
import os
# Names (not paths) of the entries directly inside wiki/; later cells prepend
# 'wiki/' to build paths. Only the first ten are displayed.
# NOTE(review): os.listdir makes no ordering guarantee, so this preview may
# differ between machines.
wiki_files = os.listdir('wiki/')
wiki_files[:10]

['Furubira_District,_Hokkaido.html',
 'Valentin_Yanin.html',
 'Kings_XI_Punjab_in_2014.html',
 'William_Harvey_Lillard.html',
 'Radial_Road_3.html',
 'George_Weldrick.html',
 'Zgornji_Otok.html',
 'Blue_Heelers_(season_8).html',
 'Taggen_Nunatak.html',
 '1951_National_League_tie-breaker_series.html']

## Number of files in wiki folder

In [8]:
# Report how many directory entries were found in the wiki folder.
print('Number of files to analyze in wiki folder: {}'.format(len(wiki_files)))

Number of files to analyze in wiki folder: 1002


## Let's display the beginning of one of the files

In [21]:
# Peek at the first 100 characters of one page to see what the markup looks like.
# Decode explicitly as UTF-8 (the charset the page itself declares) instead of
# relying on the platform's default encoding.
with open('wiki/Blue_Heelers_(season_8).html', encoding='utf-8') as f:
    print(f.read(100))

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title


## Let's time the single-threaded read performance

In [48]:
%%time
# Baseline: read every page sequentially in the main thread.
# `content` collects the raw HTML of each page and `articles` the matching
# article names, both in the same order as `wiki_files`.
content, articles = [], []
for file in wiki_files:
    # Decode explicitly as UTF-8 rather than with the platform default encoding.
    with open('wiki/' + file, mode='r', encoding='utf-8') as f:
        content.append(f.read())
    # Strip only a trailing '.html'; str.replace would also remove that text
    # if it appeared anywhere else inside the file name.
    articles.append(file[:-len('.html')] if file.endswith('.html') else file)

CPU times: user 63.4 ms, sys: 37.8 ms, total: 101 ms
Wall time: 106 ms


### Create a function to read a file and return the content for map

In [29]:
def read_wikifile(file, encoding='utf-8'):
    """Read a whole text file and return its content as a single string.

    Takes the path as its first positional argument so it can be handed
    directly to ``map`` / ``Executor.map`` over a list of paths.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The complete decoded content of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(file, mode='r', encoding=encoding) as f:
        return f.read()

## Let's time the multi-threaded read performance with max_workers=2

In [52]:
%%time
import concurrent.futures
filenames = ['wiki/' + file for file in wiki_files]

# Read the pages on two worker threads. The `with` block shuts the executor
# down once the reads are done, so its threads do not stay alive in the kernel.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
    # Executor.map returns results in input order, so `content` lines up with
    # `filenames`.
    content = list(pool.map(read_wikifile, filenames))

# Strip only a trailing '.html'; str.replace would also remove that text if it
# appeared anywhere else inside the file name.
articles = [file[:-len('.html')] if file.endswith('.html') else file
            for file in wiki_files]

CPU times: user 249 ms, sys: 226 ms, total: 475 ms
Wall time: 334 ms


> # Creating multiple threads does not seem to help the read performance here (334 ms with 2 threads vs. 106 ms single-threaded)

## Let's create a function to parse the files

In [55]:
from bs4 import BeautifulSoup
def html_parser(file_content):
    """Return the markup of the first ``<div id="content">`` in an HTML page.

    Args:
        file_content: HTML markup of one page, as a string.

    Returns:
        The matched ``div`` element (including its children) rendered back to
        a string.

    Raises:
        IndexError: If the page contains no ``div`` with ``id="content"``.
    """
    soup = BeautifulSoup(file_content, 'html.parser')

    # Only the first match is needed, so stop searching as soon as it is found
    # instead of collecting every matching div in the document.
    matches = soup.find_all('div', id='content', limit=1)
    if not matches:
        raise IndexError("no <div id='content'> element found in the document")
    return str(matches[0])

## Let's run it in a single thread and see how long it takes

In [56]:
%%time
# Parse the pages one after another in the current process.
parsed = [html_parser(page) for page in content]

CPU times: user 59.1 s, sys: 0 ns, total: 59.1 s
Wall time: 1min 1s


## Let's run it in multiple processes and see how long it takes

In [61]:
%%time
# Parse the pages in two worker processes. The `with` block shuts the executor
# down when the work is finished, so the worker processes are not left running
# behind the notebook kernel.
with concurrent.futures.ProcessPoolExecutor(max_workers=2) as pool:
    # Executor.map returns results in input order, so `parsed` lines up with
    # `content`.
    parsed = list(pool.map(html_parser, content))

CPU times: user 776 ms, sys: 435 ms, total: 1.21 s
Wall time: 31.6 s


> # We can see that multi-processing improves on the single-process run (31.6 s with 2 workers vs. 1 min 1 s)

In [82]:
import collections
def html_number_tags(file_content):
    """Count how often each tag name occurs in an HTML page.

    Args:
        file_content: HTML markup of one page, as a string.

    Returns:
        A collections.Counter mapping each tag name to the number of elements
        with that name in the parsed page.
    """
    document = BeautifulSoup(file_content, 'html.parser')

    # find_all() without a filter returns every tag in the document.
    names = []
    for element in document.find_all():
        names.append(element.name)

    return collections.Counter(names)

## Let's run it in a single thread

In [86]:
%%time
# Count the tags of each page one after another in the current process, then
# merge the per-page counters into one overall total.
list_tag_counters = list(map(html_number_tags, content))
final_tag_counter = collections.Counter()
for tc in list_tag_counters:
    # update() adds the counts in place. `+=` gives the same totals here (every
    # per-page count is at least 1) but additionally rescans the whole running
    # total after each page to drop non-positive entries.
    final_tag_counter.update(tc)
final_tag_counter

CPU times: user 43.4 s, sys: 0 ns, total: 43.4 s
Wall time: 45.3 s


In [88]:
# The ten most frequent tag names across all pages, most frequent first.
final_tag_counter.most_common(10)

[('a', 215498),
 ('li', 134053),
 ('span', 75468),
 ('div', 59113),
 ('td', 57798),
 ('tr', 27416),
 ('ul', 24238),
 ('i', 18289),
 ('th', 14538),
 ('b', 14479)]

## Let's run it in multiple processes

In [90]:
%%time
# Count the tags of each page in three worker processes. The `with` block shuts
# the executor down afterwards so the worker processes are not left running.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    list_tag_counters = list(pool.map(html_number_tags, content))

# Merge the per-page counters in the parent process.
final_tag_counter = collections.Counter()
for tc in list_tag_counters:
    # update() adds the counts in place; unlike `+=` it does not rescan the
    # whole running total after every page (the totals are the same because
    # every per-page count is at least 1).
    final_tag_counter.update(tc)
final_tag_counter

CPU times: user 910 ms, sys: 316 ms, total: 1.23 s
Wall time: 23.7 s


> # Multi-processing certainly helps here, as this is a CPU-bound task

> # There are a lot of `a` tags, which certainly means the pages link to many other articles

## Let's find the most common words across all the wiki pages provided

In [112]:
import collections, re
def find_words(file_content, len_words=0, exclude_tags=()):
    """Count the words found in the text of an HTML page.

    The page is parsed with BeautifulSoup's 'html.parser', its text is
    lower-cased, and every run of characters other than ASCII letters, digits
    and underscore is treated as a word separator.

    Args:
        file_content: HTML markup of one page, as a string.
        len_words: Minimum length (inclusive) a word must have to be kept.
            The default of 0 keeps every word.
        exclude_tags: Optional iterable of tag names, e.g.
            ``('script', 'style')``, whose elements are removed from the
            parsed page before its text is extracted. The default removes
            nothing, which leaves the text extraction exactly as
            ``get_text()`` performs it on the full page.

    Returns:
        A tuple ``(number_of_words, counter)``: the number of kept words
        (duplicates included) and a collections.Counter mapping each kept word
        to its number of occurrences.
    """
    # Anything that is not an ASCII letter, digit or underscore separates words.
    separators = re.compile('[^a-zA-Z_0-9]+')

    # Parse the HTML content.
    soup = BeautifulSoup(file_content, 'html.parser')

    # Optionally detach whole elements whose text should not be counted.
    unwanted = list(exclude_tags)
    if unwanted:
        for element in soup.find_all(unwanted):
            element.extract()

    text = separators.sub(' ', soup.get_text().lower())

    # Keep only the words that reach the minimum length.
    words = [word for word in text.split() if len(word) >= len_words]
    words_counter = collections.Counter(words)

    # Return the total number of kept words together with the per-word counts.
    return len(words), words_counter

In [118]:
%%time
# Time the word count for a single page; the result is assigned to `_` so that
# it is not displayed.
_ = find_words(content[0])

CPU times: user 102 ms, sys: 3.98 ms, total: 106 ms
Wall time: 108 ms


## Let's run it in a single thread

In [114]:
%%time
# Count the words of each page one after another in the current process.
# Each result is a (number_of_words, per-word Counter) tuple.
words_tuple = list(map(find_words, content))

# Merge the per-page results into one overall total.
final_words_counter = collections.Counter()
final_count_words = 0
for page_word_count, page_counter in words_tuple:
    # update() adds the counts in place. `+=` gives the same totals here (every
    # per-page count is at least 1) but rescans the whole, steadily growing
    # vocabulary after each page to drop non-positive entries.
    final_words_counter.update(page_counter)
    final_count_words += page_word_count
final_words_counter

CPU times: user 59.1 s, sys: 143 ms, total: 59.2 s
Wall time: 1min 2s


In [109]:
# Total number of counted words, and the ten most frequent words.
final_count_words, final_words_counter.most_common(10)

(1622420,
 [('the', 38296),
  ('ext', 29758),
  ('of', 23388),
  ('ready', 19013),
  ('in', 17047),
  ('and', 16469),
  ('a', 15957),
  ('true', 14269),
  ('1', 14127),
  ('to', 13371)])

## Let's process only words that are at least 7 characters long

In [120]:
%%time
# Count only the words with at least 7 characters, one page after another in
# the current process. Each result is a (number_of_words, Counter) tuple.
words_tuple = [find_words(page, 7) for page in content]

# Merge the per-page results into one overall total.
final_words_counter = collections.Counter()
final_count_words = 0
for page_word_count, page_counter in words_tuple:
    # update() adds the counts in place. `+=` gives the same totals here (every
    # per-page count is at least 1) but rescans the whole, steadily growing
    # vocabulary after each page to drop non-positive entries.
    final_words_counter.update(page_counter)
    final_count_words += page_word_count
final_words_counter

CPU times: user 55 s, sys: 0 ns, total: 55 s
Wall time: 57.6 s


In [121]:
# Total number of counted words (minimum length 7), and the ten most frequent.
final_count_words, final_words_counter.most_common(10)

(547307,
 [('mediawiki', 11522),
  ('template', 8507),
  ('wikipedia', 8020),
  ('function', 5048),
  ('articles', 4861),
  ('globalcssjs', 4008),
  ('retrieved', 3220),
  ('navigation', 3020),
  ('options', 3013),
  ('visualeditor', 3006)])

## Let's run it in multiple processes

In [122]:
%%time
# Count only the words with at least 7 characters, using three worker
# processes. Executor.map pairs every page with its own minimum-length
# argument, hence the list of 7s. The `with` block shuts the executor down
# afterwards so the worker processes are not left running.
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as pool:
    words_tuple = list(pool.map(find_words, content, [7] * len(content)))

# Merge the per-page (number_of_words, Counter) results in the parent process.
final_words_counter = collections.Counter()
final_count_words = 0
for page_word_count, page_counter in words_tuple:
    # update() adds the counts in place; unlike `+=` it does not rescan the
    # whole running total after every page (the totals are the same because
    # every per-page count is at least 1).
    final_words_counter.update(page_counter)
    final_count_words += page_word_count
final_words_counter

CPU times: user 7.35 s, sys: 444 ms, total: 7.79 s
Wall time: 31.7 s


> # Multi-processing certainly helps here, as this is a CPU-bound task