In [1]:
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [2]:
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector' # one directory
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector/2017/201711' # subset of above
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector*' # all directories

In [3]:
def extract_html_data(html_doc, source, service):
    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []
    
    result.append(source)
    result.append(service)
    
    timestamp = soup.find('meta', attrs={'name':'generated'})
    result.append(timestamp.get('content'))
    
    # status of the service in last 24hrs (no problems, possible problems, problems)
    for status in soup.find_all("div", {"class": "alert"}):
        result.append(' '.join(status.text.split()))
    
    # time since problems started (empty unless problems are ongoing)
    for problems_since in soup.find_all("div", {"class": "event"}):
        try:
            status = ' '.join(problems_since.text.split())
            date = str(dparser.parse(status, fuzzy=True))
            result.append([date.split()[0].split('-')[1:], date.split()[1]])
        except:
            # some months are apparently out of range
            result.append('')
    
    # TECHNICAL DEBT: problems_since is not appended if tag does not exist in the html_doc
    if len(result) != 5:
        result.append('')
    
    # most reported problems at this time
    mrp = []
    for most_reported in soup.find_all("li"):
        if '%' in most_reported.text:
            mrp.append(' '.join(most_reported.text.split()))
    result.append(mrp)

    return result

In [4]:
def read_file(file):
    source = file.split('/')[4]
    archive = ZipFile(file, 'r')
    namelist = archive.namelist()
    for item in namelist:
        if 'html' in item:
            html = archive.read(item)
            service = item.split('/')[1].split('.')[0]
            data = extract_html_data(html, source, service)
            return data
    archive.close()

In [5]:
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
pool = mp.Pool(mp.cpu_count())
results = pool.map(read_file, [file for file in files])
pool.close()



In [6]:
res = np.array([np.array(x, dtype='object') for x in results if x is not None])
print(len(res))

2772


In [10]:
column_names = ['source','service','timestamp','status','problems_since','most_reported_problems']
df = pd.DataFrame(res, columns=column_names)
df.head

<bound method NDFrame.head of                             source  service                         timestamp  \
0         downdetector-deu-germany  spotify  2017-11-21T20:00:45.027659+01:00   
1         downdetector-deu-germany  spotify  2017-11-24T00:02:24.103913+01:00   
2         downdetector-deu-germany  spotify  2017-11-18T01:00:14.774816+01:00   
3         downdetector-deu-germany  spotify  2017-11-30T11:01:20.891668+01:00   
4         downdetector-deu-germany  spotify  2017-11-19T03:00:10.084361+01:00   
...                            ...      ...                               ...   
2767  downdetector-che-switzerland  spotify  2018-09-07T18:00:06.436682+02:00   
2768  downdetector-che-switzerland  spotify  2018-09-17T04:01:20.046055+02:00   
2769  downdetector-che-switzerland  spotify  2018-09-10T00:01:26.258390+02:00   
2770  downdetector-che-switzerland  spotify  2018-09-11T18:01:57.619019+02:00   
2771  downdetector-che-switzerland  netflix  2018-03-07T01:00:27.839313+01:00  

In [11]:
print(len(df))
df.to_csv(r'/home/shane/Documents/thesis/output/downdetector.csv', index=False)

2772
