In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np
import io

In [6]:
# Glob pattern that selects every outage.report* capture directory.
# To run on a subset only, point root_dir at a single directory instead, e.g.
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/outage.report/2019/20194'
root_dir = (
    '/media/shane/cloud-availability-sacheen-2020-05-11'
    '/outage.report*'
)

#### Excuse the dirtiness...

In [7]:
def extract_html_data(html_doc):
    """Scrape the report figures from one archived service page.

    Every lookup is best-effort: when an expected element is absent a
    placeholder is used instead of raising, so one odd page cannot abort a
    whole batch.

    Parameters
    ----------
    html_doc : str or bytes
        Raw HTML of a single service page, handed to BeautifulSoup as-is.

    Returns
    -------
    list
        Always exactly five items, in this order:

        0. text of the time-zone span, or '?' if the span is missing (str)
        1. text of the gauge counter, or '?' if it is missing (str)
        2. first all-digit token of the report summary (#reports in 24hrs),
           or '0' if the summary is missing or contains no number (str)
        3. number of countries reporting problems (int)
        4. outage-cause strings, one per list item; may be empty (list of str)
    """
    soup = BeautifulSoup(html_doc, 'html.parser')

    def squash(node):
        # Collapse all whitespace runs in the node's text to single spaces.
        # Raises AttributeError when `node` is None, i.e. find() had no match.
        return ' '.join(node.text.split())

    result = []

    # find() returns None when nothing matches, so a missing element shows up
    # as AttributeError. Only that error is treated as "element absent";
    # anything else (including KeyboardInterrupt) is allowed to propagate.
    try:
        result.append(squash(soup.find('span', {'class': 'ServicePage__TimeZoneWrap-l00bv6-0 dPxUZH'})))
    except AttributeError:
        result.append('?')

    try:
        result.append(squash(soup.find('text', {'class': 'Gauge__Count-s1qahqgd-5 gzhUJh'})))
    except AttributeError:
        result.append('?')

    try:
        section = soup.find('section', {'class': 'OverlayLoader__Wrap-yneds0-0 hEfbHu'})
        report_stats = squash(section.find('div', {'class': 'sc-bdVaJa frYeKI'}))
    except AttributeError:
        # assume the tag does not always exist...
        report_stats = '0'

    # Number of countries reporting problems.
    # The site html was updated at some point, so two layouts are handled.
    try:
        # Layout A: the countries are listed separately as <li> elements.
        cns = soup.find('span', {'data-message': 'CNS'})
        country_list = cns.find('ul', {'class': 'DominantMessage2__CommaList-s109d9sz-0 harauJ'})
        count = len(country_list.find_all('li'))
    except AttributeError:
        # Layout B: everything is in one sentence, e.g.
        # "Received 4940 reports, originating from Malaysia, United Kingdom,
        #  Singapore, Spain, United States and 111 more countries"
        # Each comma after the first introduces one named country.
        count = len(report_stats.split(',')) - 1

    # First number is #reports in 24hrs; the second (if present) is the
    # number of additional, unnamed countries reporting problems.
    numbers = [token for token in report_stats.split() if token.isdigit()]
    result.append(numbers[0] if numbers else '0')
    extra_countries = int(numbers[1]) if len(numbers) > 1 else 0
    # Explicit slots (instead of "pad until len == 4") keep the row length
    # fixed even when the summary has no number or more than two numbers.
    result.append(extra_countries + count)

    try:
        cause_list = soup.find('ul', {'class': 'OutageSubjectsList__Ul-s10w08k0-0 jfDAxG'})
        outage_causes = [li.get_text() for li in cause_list.find_all('li')]
    except AttributeError:
        # No cause list on this page.
        outage_causes = []
    result.append(outage_causes)

    return result

In [8]:
def read_file(file):
    """Parse every service page stored in one zip archive.

    Parameters
    ----------
    file : str
        '/'-separated path of the archive. The component at index 4 of
        ``file.split('/')`` is used as the source label, so the path must be
        at least that deep.

    Returns
    -------
    list of list
        One row per archive member whose name contains 'html' but not
        'overview'. Each row is ``[source, service, timestamp_dir]`` followed
        by the items returned by ``extract_html_data`` for that member.
        Member names are expected to look like ``<timestamp>/<service>.<ext>``.

    Raises
    ------
    IndexError
        If ``file`` has fewer than five '/'-separated components, or a
        matching member name has no '/' in it.
    """
    source = file.split('/')[4]
    rows = []
    # The context manager closes the archive even if reading or parsing a
    # member raises, so a failing file does not leak an open handle.
    with ZipFile(file, 'r') as archive:
        for item in archive.namelist():
            if 'html' not in item or 'overview' in item:
                continue
            html = archive.read(item)
            parts = item.split('/')
            ts = parts[0]
            service = parts[1].split('.')[0]
            rows.append([source, service, ts] + extract_html_data(html))
    return rows

In [9]:
# Collect every zip archive below the capture directories (recursive glob).
files = glob.glob(root_dir + '/**/*.zip', recursive=True)

# One worker process per CPU. The context manager shuts the workers down on
# exit, including when read_file raises inside map(), so a failed run does
# not leave worker processes behind in the kernel.
with mp.Pool(mp.cpu_count()) as pool:
    start_time = time.time()
    # map() blocks until every archive is parsed; results[i] belongs to files[i].
    results = pool.map(read_file, files)
    end_time = time.time()

In [10]:
# Wall-clock seconds between the start_time and end_time stamps taken around
# the parallel parse.
elapsed_seconds = end_time - start_time
print(elapsed_seconds)

15.659606456756592


In [11]:
# `results` holds one list of rows per archive; merge them into a single
# flat list of rows and report how many rows were parsed.
res = [row for archive_rows in results for row in archive_rows]
print(len(res))

4066


In [12]:
# Column order mirrors the row layout: the three identifiers first, then the
# scraped page values.
column_names = [
    'source',
    'service',
    'timestamp_dir',
    'timezone',
    'reports_20mins',
    'reports_24hrs',
    'reports_countries24hrs',
    'causes',
]
df = pd.DataFrame(data=res, columns=column_names)
# Preview at most the first 50 rows.
df.head(50)

Unnamed: 0,source,service,timestamp_dir,timezone,reports_20mins,reports_24hrs,reports_countries24hrs,causes
0,outage.report,youtube,20190430T110001,EDT (GMT -04:00),0,22,14,"[Mobile app not working - 36%, Website down - ..."
1,outage.report,facebook,20190430T110001,EDT (GMT -04:00),2,223,35,"[Mobile app not working - 45%, Mobile app cras..."
2,outage.report,twitter,20190430T110001,EDT (GMT -04:00),0,21,11,"[Mobile app not working - 55%, Website down - ..."
3,outage.report,gmail,20190430T110001,EDT (GMT -04:00),0,43,17,"[Email down - 63%, Website down - 13%, Email r..."
4,outage.report,netflix,20190430T110001,EDT (GMT -04:00),0,28,11,"[Stream not working - 58%, Everything is down ..."
5,outage.report,facebook-messenger,20190430T110001,EDT (GMT -04:00),0,9,5,"[Message read problems - 50%, Message send pro..."
6,outage.report,whatsapp,20190430T110001,EDT (GMT -04:00),1,35,14,"[Message send problems - 43%, Mobile app not w..."
7,outage.report,skype,20190430T110001,EDT (GMT -04:00),0,6,4,"[Message send problems - 33%, Everything is do..."
8,outage.report,instagram,20190430T110001,EDT (GMT -04:00),1,82,21,"[Mobile app not working - 49%, Can't upload pi..."
9,outage.report,snapchat,20190430T110001,EDT (GMT -04:00),0,14,5,"[Mobile app crashes - 40%, Can't login - 20%, ..."


In [None]:
# Persist the parsed table as CSV; index=False leaves the row index out so
# only the named columns are written.
output_csv = r'/home/shane/Documents/thesis/output/parsed/outage-report.csv'
df.to_csv(output_csv, index=False)