In [9]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [10]:
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector' # one directory
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector/2017/201711' # subset of above
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/downdetector*' # all directories

In [11]:
def extract_html_data(html_doc, source, service):
    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []
    
    result.append(source)
    result.append(service)
    
    timestamp = soup.find('meta', attrs={'name':'generated'})
    result.append(timestamp.get('content'))
    
    # status of the service in last 24hrs (no problems, possible problems, problems)
    for status in soup.find_all("div", {"class": "alert"}):
        result.append(' '.join(status.text.split()))
    
    # time since problems started (empty unless problems are ongoing)
    for problems_since in soup.find_all("div", {"class": "event"}):
        try:
            status = ' '.join(problems_since.text.split())
            date = str(dparser.parse(status, fuzzy=True))
            result.append([date.split()[0].split('-')[1:], date.split()[1]])
        except:
            # some months are apparently out of range
            result.append('')
    
    # TECHNICAL DEBT: problems_since is not appended if tag does not exist in the html_doc
    if len(result) != 5:
        result.append('')
    
    # most reported problems at this time
    mrp = []
    for most_reported in soup.find_all("li"):
        if '%' in most_reported.text:
            mrp.append(' '.join(most_reported.text.split()))
    result.append(mrp)

    return result

In [12]:
def read_file(file):
    source = file.split('/')[4]
    archive = ZipFile(file, 'r')
    namelist = archive.namelist()
    ret = []
    for item in namelist:
        if 'html' in item:
            html = archive.read(item)
            service = item.split('/')[1].split('.')[0]
            data = extract_html_data(html, source, service)
            ret.append(data)
    archive.close()
    return(ret)

In [13]:
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
pool = mp.Pool(mp.cpu_count())

start_time = time.time()
results = pool.map(read_file, [file for file in files])
pool.close()
end_time = time.time()

print(end_time - start_time)



3627.567561864853


In [14]:
res = []
for outer in results:
    for inner in outer:
        res.append(inner)

In [15]:
column_names = ['source','service','timestamp','status','problems_since','most_reported_problems']
df = pd.DataFrame(res, columns=column_names)
df.head

<bound method NDFrame.head of                               source   service  \
0           downdetector-deu-germany   spotify   
1           downdetector-deu-germany    reddit   
2           downdetector-deu-germany   youtube   
3           downdetector-deu-germany     zynga   
4           downdetector-deu-germany    airbnb   
...                              ...       ...   
323703  downdetector-che-switzerland   spotify   
323704  downdetector-che-switzerland   youtube   
323705  downdetector-che-switzerland   netflix   
323706  downdetector-che-switzerland  snapchat   
323707  downdetector-che-switzerland   netflix   

                               timestamp                              status  \
0       2017-11-21T20:00:45.027659+01:00                 Störung bei Spotify   
1       2017-11-21T20:01:17.940977+01:00            Keine Störung bei Reddit   
2       2017-11-21T20:00:19.627327+01:00  Möglicherweise Störung bei Youtube   
3       2017-11-21T20:01:13.231012+01:00         

In [16]:
df.to_csv(r'/home/shane/Documents/thesis/output/downdetector.csv', index=False)