In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [2]:
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status' # one directory
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status/2019/20196' # subset of above
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status*' # all directories

In [3]:
def extract_html_data(html_doc):
    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []
    
    try:
        for status in soup.find_all('div', {'class': 'component-inner-container'}):
            region = ' '.join(status.find('span').text.split())
            result.append([region, status['data-component-status']])
    except:
        result.append('?')

    return [result]

In [4]:
def read_file(file):
    source = file.split('/')[4]
    archive = ZipFile(file, 'r')
    namelist = archive.namelist()
    ret = []
    for item in namelist:
        if 'html' in item and 'history' not in item:
            html = archive.read(item)
            service = item.split('/')[1].split('.')[0]
            ts = item.split('/')[0]
            data = extract_html_data(html)
            data.insert(0, ts)
            data.insert(0, service)
            data.insert(0, source)
            ret.append(data)
    archive.close()
    return(ret)

In [5]:
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
pool = mp.Pool(mp.cpu_count())
start_time = time.time()
results = pool.map(read_file, [file for file in files])
pool.close()
end_time = time.time()

In [6]:
print(end_time - start_time)

82.35500311851501


In [7]:
res = []
for outer in results:
    for inner in outer:
        res.append(inner)
print(len(res))

8433


In [8]:
column_names = ['source','service','timestamp','status']
df = pd.DataFrame(res, columns=column_names)
df.head(50)

Unnamed: 0,source,service,timestamp,status
0,cloudflare-status,global-status,20200508T150001,"[[Cloudflare Sites and Services, operational],..."
1,cloudflare-status,global-status,20200508T060001,"[[Cloudflare Sites and Services, operational],..."
2,cloudflare-status,global-status,20200508T130001,"[[Cloudflare Sites and Services, operational],..."
3,cloudflare-status,global-status,20200508T200001,"[[Cloudflare Sites and Services, operational],..."
4,cloudflare-status,global-status,20200508T030001,"[[Cloudflare Sites and Services, operational],..."
5,cloudflare-status,global-status,20200508T090001,"[[Cloudflare Sites and Services, operational],..."
6,cloudflare-status,global-status,20200508T180001,"[[Cloudflare Sites and Services, operational],..."
7,cloudflare-status,global-status,20200508T220001,"[[Cloudflare Sites and Services, operational],..."
8,cloudflare-status,global-status,20200508T010001,"[[Cloudflare Sites and Services, operational],..."
9,cloudflare-status,global-status,20200508T000001,"[[Cloudflare Sites and Services, operational],..."


In [9]:
df.to_csv(r'/home/shane/Documents/thesis/output/cloudflare-status.csv', index=False)

### Blowup statuses

In [3]:
df2 = pd.read_csv('/home/shane/Documents/thesis/output/parsed/cloudflare-status.csv')
df2.head()

Unnamed: 0,source,service,timestamp_dir,status
0,cloudflare-status,global-status,20200508T150001,"[['Cloudflare Sites and Services', 'operationa..."
1,cloudflare-status,global-status,20200508T060001,"[['Cloudflare Sites and Services', 'operationa..."
2,cloudflare-status,global-status,20200508T130001,"[['Cloudflare Sites and Services', 'operationa..."
3,cloudflare-status,global-status,20200508T200001,"[['Cloudflare Sites and Services', 'operationa..."
4,cloudflare-status,global-status,20200508T030001,"[['Cloudflare Sites and Services', 'operationa..."


In [4]:
# Get names of the sub-services
status_types = set()
statuses = set()
for status in df2['status']:
    replaced = status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '')
    for part in replaced.split('||'):
        status_types.add(part.split(',')[0])
        statuses.add(part.split(',')[1])

In [5]:
statuses

{' AB',
 ' AL',
 ' AZ',
 ' Angola - (LAD)',
 ' Argentina - (EZE)',
 ' Armenia - (EVN)',
 ' Australia - (ADL)',
 ' Austria - (VIE)',
 ' Azerbaijan - (GYD)',
 ' BC',
 ' Bahrain - (BAH)',
 ' Bangladesh - (CGP)',
 ' Bangladesh - (DAC)',
 ' Belgium - (BRU)',
 ' Bhutan - (PBH)',
 ' Brazil - (CWB)',
 ' Brazil - (FOR)',
 ' Brazil - (GIG)',
 ' Brazil - (GRU)',
 ' Brazil - (POA)',
 ' Brunei - (BWN)',
 ' Bulgaria - (SOF)',
 ' CA',
 ' CO',
 ' Cambodia - (PNH)',
 ' Chile - (ARI)',
 ' Chile - (SCL)',
 ' China - (CAN)',
 ' China - (CGO)',
 ' China - (CKG)',
 ' China - (CSX)',
 ' China - (CTU)',
 ' China - (FOC)',
 ' China - (FUO)',
 ' China - (HGH)',
 ' China - (HNY)',
 ' China - (LYA)',
 ' China - (NAY)',
 ' China - (NBG)',
 ' China - (NNG)',
 ' China - (SHA)',
 ' China - (SHE)',
 ' China - (SJW)',
 ' China - (SZV)',
 ' China - (SZX)',
 ' China - (TAO)',
 ' China - (TNA)',
 ' China - (TSN)',
 ' China - (WUH)',
 ' China - (WUX)',
 ' China - (XIY)"',
 ' Colombia - (BOG)',
 ' Colombia - (MDE)',
 ' Croa

In [6]:
len(status_types) # new columns

248

In [7]:
replaced = []
for status in df2['status']:
    replaced.append(status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", ''))
replaced[0]

'Cloudflare Sites and Services,operational||Cloudflare Dashboard,operational||Cloudflare Teams Dashboard,operational||Cloudflare Marketing Site,operational||"Cloudflare Developers Site", operational||Cloudflare API,operational||Cloudflare Registrar,operational||Cloudflare Workers,operational||Cloudflare Storage,operational||Cloudflare Access,operational||Cloudflare Spectrum,operational||Cloudflare Stream,operational||Cloudflare Logs,operational||Argo Tunnel,operational||Argo Smart Routing,operational||Analytics,operational||Billing,operational||CDN/Cache,operational||CDN Cache Purge,operational||SSL Certificate Provisioning,operational||SSL for SaaS Provisioning,operational||Geo-Key Manager,operational||Image Resizing,operational||Load Balancing and Monitoring,operational||Cloudflare Recursive DNS,operational||Cloudflare Authoritative DNS,operational||DNS Root Servers,operational||DNS Updates,operational||Always Online,operational||WARP,operational||Africa,partial_outage||Antananarivo,

In [8]:
df3 = pd.DataFrame(replaced)
df3.columns = ['status']
df3.head()

Unnamed: 0,status
0,"Cloudflare Sites and Services,operational||Clo..."
1,"Cloudflare Sites and Services,operational||Clo..."
2,"Cloudflare Sites and Services,operational||Clo..."
3,"Cloudflare Sites and Services,operational||Clo..."
4,"Cloudflare Sites and Services,operational||Clo..."


In [9]:
# New dataframe for the statuses
df_status = pd.DataFrame(columns=status_types)
df_status.head()

Unnamed: 0,Durban,Mombasa,Enterprise Log Share (ELS),Cloudflare Stream,Richmond,Cloudflare Teams Dashboard,Düsseldorf,Hong Kong - (HKG),WARP,Jacksonville,...,Chennai,Shijiazhuang,Brussels,Guangzhou,McAllen,Honolulu,Phnom Penh,Belgrade,Paris,Nagpur


In [None]:
# This is a bit ugly
for row, part in enumerate(df3['status']):
    for d in part.split('||'):
        d2 = d.split(',')
        df_status.at[row, d2[0]] = d2[-1]
    if row % 1000 == 0:
        print(row)