In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np
import io

In [2]:
# Glob pattern for the top-level scrape directories: the trailing '*' matches every
# directory whose name starts with 'outage.report'.
# NOTE: read_file() takes the source name from the fifth '/'-separated component of each
# archive path (index 4), so the depth of this root matters.
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/outage.report/2019/20194' # subset of the pattern below
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/outage.report*' # all directories

#### HTML parsing helpers (quick and dirty; please excuse the mess)

In [3]:
def _squashed_text(node, default):
    """Return the text of ``node`` with whitespace runs collapsed to single spaces.

    ``default`` is returned when ``node`` is None, i.e. the tag was not found.
    """
    if node is None:
        return default
    return ' '.join(node.text.split())


def extract_html_data(html_doc):
    """Pull the outage statistics out of one saved service page.

    Every lookup is best-effort: the CSS class names are hard-coded, and a tag
    that cannot be found yields a placeholder instead of an error.

    Parameters
    ----------
    html_doc : str or bytes
        Raw HTML of the page (passed straight to BeautifulSoup).

    Returns
    -------
    list
        Always exactly five items, in this order:

        0. str  - text of the time-zone span, or '?' if the span is missing
        1. str  - text of the gauge counter, or '?' if it is missing
        2. str  - first integer token of the report-summary text, '0' if none
        3. int  - second integer token of the summary (0 if absent) plus the
                  number of individually named countries
        4. list - text of every <li> in the outage-subjects list (may be empty)
    """
    soup = BeautifulSoup(html_doc, 'html.parser')

    # soup.find() returns None for a missing tag, so explicit None checks replace
    # the former bare ``except:`` clauses (which also swallowed KeyboardInterrupt).
    timezone = _squashed_text(
        soup.find('span', {'class': 'ServicePage__TimeZoneWrap-l00bv6-0 dPxUZH'}), '?')
    gauge_count = _squashed_text(
        soup.find('text', {'class': 'Gauge__Count-s1qahqgd-5 gzhUJh'}), '?')

    # Free-text summary; assume the tag does not always exist and fall back to '0'.
    section = soup.find('section', {'class': 'OverlayLoader__Wrap-yneds0-0 hEfbHu'})
    stats_div = None
    if section is not None:
        stats_div = section.find('div', {'class': 'sc-bdVaJa frYeKI'})
    report_stats = _squashed_text(stats_div, '0')

    # Number of countries named explicitly. The site HTML was updated at some
    # point, so two layouts have to be handled.
    cns_span = soup.find('span', {'data-message': 'CNS'})
    country_list = None
    if cns_span is not None:
        country_list = cns_span.find(
            'ul', {'class': 'DominantMessage2__CommaList-s109d9sz-0 harauJ'})
    if country_list is not None:
        # Layout A: each country is its own <li>.
        listed_countries = len(country_list.find_all('li'))
    else:
        # Layout B: everything is in one sentence, e.g.
        # "Received 4940 reports, originating from Malaysia, United Kingdom,
        #  Singapore, Spain, United States and 111 more countries"
        # where the number of commas equals the number of named countries.
        listed_countries = report_stats.count(',')

    # First integer token is #reports in 24hrs; the second (if present) is the
    # number of additional, unnamed countries. Only these two are used, so the
    # returned row has a fixed length even when the summary contains no integer
    # token or more than two of them.
    numbers = [token for token in report_stats.split() if token.isdigit()]
    reports_24hrs = numbers[0] if numbers else '0'
    extra_countries = int(numbers[1]) if len(numbers) > 1 else 0

    causes_list = soup.find('ul', {'class': 'OutageSubjectsList__Ul-s10w08k0-0 jfDAxG'})
    if causes_list is not None:
        outage_causes = [li.get_text() for li in causes_list.find_all('li')]
    else:
        outage_causes = []

    return [
        timezone,
        gauge_count,
        reports_24hrs,
        extra_countries + listed_countries,
        outage_causes,
    ]

In [4]:
def read_file(file, source_index=4):
    """Parse every service page stored in one zip archive.

    Parameters
    ----------
    file : str
        '/'-separated path of the zip archive.
    source_index : int, optional
        Position of the source name among the '/'-separated components of
        ``file``. The default of 4 is the value this function always used.

    Returns
    -------
    list of list
        One row per archive member whose name contains 'html' but not
        'overview'. Each row is ``[source, service, timestamp_dir]`` followed
        by the values returned by ``extract_html_data`` for that member.

    Raises
    ------
    IndexError
        If ``file`` has too few path components, or a qualifying member name
        has no '/' in it.
    """
    source = file.split('/')[source_index]
    ret = []
    # The context manager closes the archive even if reading or parsing a
    # member raises; the previous explicit close() was skipped in that case.
    with ZipFile(file, 'r') as archive:
        for item in archive.namelist():
            if 'html' in item and 'overview' not in item:
                html = archive.read(item)
                # Member names look like '<timestamp_dir>/<service>.<ext>'.
                parts = item.split('/')
                service = parts[1].split('.')[0]
                ts = parts[0]
                ret.append([source, service, ts, *extract_html_data(html)])
    return ret

In [5]:
# Every .zip archive anywhere below the directories matched by root_dir.
files = glob.glob(root_dir + '/**/*.zip', recursive=True)

# One worker process per CPU core. Using the pool as a context manager ensures
# the workers are shut down even if a worker raises inside map(); previously
# pool.close() was skipped in that case and the processes lingered in the kernel.
with mp.Pool(mp.cpu_count()) as pool:
    start_time = time.time()
    # map() blocks until every archive is parsed and keeps the order of `files`.
    results = pool.map(read_file, files)
    end_time = time.time()

In [6]:
# Wall-clock seconds spent parsing all archives in the worker pool.
elapsed_seconds = end_time - start_time
print(elapsed_seconds)

374.31628131866455


In [7]:
# pool.map() returned one list of rows per archive; flatten them into a single
# list of rows and report how many rows were parsed in total.
res = [row for archive_rows in results for row in archive_rows]
print(len(res))

120018


In [8]:
# Column labels in the order read_file() assembles each row: three path-derived
# fields first, then the values returned by extract_html_data().
column_names = [
    'source',
    'service',
    'timestamp_dir',
    'timezone',
    'reports_20mins',
    'reports_24hrs',
    'reports_countries24hrs',
    'causes',
]
df = pd.DataFrame(data=res, columns=column_names)
# Preview the first rows as the cell's displayed output.
df.head(n=50)

Unnamed: 0,source,service,timestamp_dir,timezone,reports_20mins,reports_24hrs,reports_countries24hrs,causes
0,outage.report,apple-servers,20200508T150001,EDT (GMT -04:00),0,3,2,[]
1,outage.report,youtube,20200508T150001,EDT (GMT -04:00),4,106,36,"[Website down - 75%, Other - 9%, Videos won't ..."
2,outage.report,github,20200508T150001,EDT (GMT -04:00),0,3,2,[Website down - 100%]
3,outage.report,facebook,20200508T150001,EDT (GMT -04:00),2,20,7,"[Can't login - 33%, Newsfeed - 17%, Other - 17..."
4,outage.report,twitter,20200508T150001,EDT (GMT -04:00),25,22,7,[Website down - 100%]
5,outage.report,gmail,20200508T150001,EDT (GMT -04:00),1,15,7,"[Website down - 63%, Can't login - 13%, Email ..."
6,outage.report,netflix,20200508T150001,EDT (GMT -04:00),0,8,4,"[Website down - 50%, Freezing up - 25%, Buffer..."
7,outage.report,facebook-messenger,20200508T150001,EDT (GMT -04:00),1,3,3,[Mobile app not working - 100%]
8,outage.report,whatsapp,20200508T150001,EDT (GMT -04:00),0,5,4,"[Message send problems - 33%, Mobile app crash..."
9,outage.report,skype,20200508T150001,EDT (GMT -04:00),0,3,3,[Message read problems - 100%]


In [10]:
# Write the parsed table to disk, leaving out the positional index column.
output_csv = r'/home/shane/Documents/thesis/output/parsed/outage-report.csv'
df.to_csv(output_csv, index=False)