# System Status Parsing

In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [2]:
# Narrower inputs kept for reference (a single directory, and a subset of it):
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status' # one directory
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status/2019/20196' # subset of above
# Active setting: the trailing '*' is a glob wildcard, expanded when the zip files are listed.
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status*' # all directories

In [3]:
def extract_html_data(html_doc):
    """Extract the issue time and the per-component statuses from one status page.

    Parameters
    ----------
    html_doc : str or bytes
        Raw HTML of a single status page.

    Returns
    -------
    tuple
        ``(ts_uct, [result])``. ``ts_uct`` is the content of the
        ``<meta name="issued">`` tag, read as an integer Unix timestamp and
        formatted as ``'%Y-%m-%d %H:%M:%S'`` in UTC. ``result`` is a list of
        ``[component_name, status]`` pairs, one per
        ``div.component-inner-container``; it is wrapped in a one-element
        list so the caller can prepend further fields to the row.
        If a component cannot be parsed, the pairs collected so far are kept
        and a single ``'?'`` marker is appended.

    Raises
    ------
    AttributeError
        If the page has no ``<meta name="issued">`` tag.
    ValueError, TypeError
        If that tag's content is not an integer.
    """
    # Function-scope import: the notebook only imports the `datetime` class.
    from datetime import timezone

    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []

    timestamp = soup.find('meta', attrs={'name': 'issued'}).get('content')
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp();
    # the formatted string is the same.
    ts_uct = datetime.fromtimestamp(int(timestamp), tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    try:
        for status in soup.find_all('div', {'class': 'component-inner-container'}):
            # Collapse runs of whitespace/newlines inside the component name.
            region = ' '.join(status.find('span').text.split())
            result.append([region, status['data-component-status']])
    except Exception:
        # Best effort: flag the page as partially parsed, but let
        # KeyboardInterrupt / SystemExit propagate (a bare `except:` hid them).
        result.append('?')

    return ts_uct, [result]

In [4]:
def read_file(file):
    """Parse every status page stored in one zip archive.

    Parameters
    ----------
    file : str
        '/'-separated path of the zip archive.

    Returns
    -------
    list
        One row per matching archive member, laid out as
        ``[source, service, timestamp_dir, timestamp_site, statuses]``.

    Raises
    ------
    IndexError
        If the path has fewer than five '/'-separated components, or a
        matching member name contains no '/'.
    """
    # Fifth path component; for the absolute root_dir used in this notebook
    # that is the 'cloudflare-status...' directory name.
    source = file.split('/')[4]
    ret = []
    # The context manager closes the archive even when a page fails to parse.
    with ZipFile(file, 'r') as archive:
        for item in archive.namelist():
            # Keep HTML members, skipping anything whose name mentions 'history'.
            if 'html' in item and 'history' not in item:
                html = archive.read(item)
                parts = item.split('/')
                ts = parts[0]                     # leading directory of the member
                service = parts[1].split('.')[0]  # file name up to the first '.'
                ts_site, data = extract_html_data(html)
                ret.append([source, service, ts, ts_site] + data)
    return ret

In [5]:
# Every zip archive below any directory matching root_dir
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
# The context manager shuts the worker processes down even if a worker raises;
# close() alone leaves them running on failure.
with mp.Pool(mp.cpu_count()) as pool:
    start_time = time.time()
    # One read_file call per archive; results keeps the order of `files`.
    results = pool.map(read_file, files)
    end_time = time.time()

In [6]:
# Wall-clock seconds between the two time.time() samples taken around pool.map
print(end_time - start_time)

82.53425192832947


In [7]:
# pool.map returned one list of rows per archive; merge them into a single flat list.
res = [row for archive_rows in results for row in archive_rows]
print(len(res))

8433


In [8]:
# Column labels, in the order the fields were placed into each parsed row
column_names = [
    'source',
    'service',
    'timestamp_dir',
    'timestamp_site',
    'status',
]
df = pd.DataFrame(data=res, columns=column_names)
df.head(50)

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,cloudflare-status,global-status,20200508T150001,2020-05-08 14:49:28,"[[Cloudflare Sites and Services, operational],..."
1,cloudflare-status,global-status,20200508T060001,2020-05-08 05:48:02,"[[Cloudflare Sites and Services, operational],..."
2,cloudflare-status,global-status,20200508T130001,2020-05-08 12:58:36,"[[Cloudflare Sites and Services, operational],..."
3,cloudflare-status,global-status,20200508T200001,2020-05-08 19:57:30,"[[Cloudflare Sites and Services, operational],..."
4,cloudflare-status,global-status,20200508T030001,2020-05-08 02:46:12,"[[Cloudflare Sites and Services, operational],..."
5,cloudflare-status,global-status,20200508T090001,2020-05-08 08:53:45,"[[Cloudflare Sites and Services, operational],..."
6,cloudflare-status,global-status,20200508T180001,2020-05-08 17:56:26,"[[Cloudflare Sites and Services, operational],..."
7,cloudflare-status,global-status,20200508T220001,2020-05-08 21:59:27,"[[Cloudflare Sites and Services, operational],..."
8,cloudflare-status,global-status,20200508T010001,2020-05-08 00:55:11,"[[Cloudflare Sites and Services, operational],..."
9,cloudflare-status,global-status,20200508T000001,2020-05-07 23:54:28,"[[Cloudflare Sites and Services, operational],..."


In [22]:
# Persist the parsed rows without the index; the nested lists in `status`
# are written as their text representation.
df.to_csv(r'/home/shane/Documents/thesis/output/parsed/cloudflare-status.csv', index=False)

### Blow up statuses into one column per component

In [23]:
# Reload the parsed rows; `status` now holds the text form of the nested
# list (a plain string), which the cells below take apart with str.replace.
df2 = pd.read_csv('/home/shane/Documents/thesis/output/parsed/cloudflare-status.csv')
df2.head()

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,cloudflare-status,global-status,20200508T150001,2020-05-08 14:49:28,"[['Cloudflare Sites and Services', 'operationa..."
1,cloudflare-status,global-status,20200508T060001,2020-05-08 05:48:02,"[['Cloudflare Sites and Services', 'operationa..."
2,cloudflare-status,global-status,20200508T130001,2020-05-08 12:58:36,"[['Cloudflare Sites and Services', 'operationa..."
3,cloudflare-status,global-status,20200508T200001,2020-05-08 19:57:30,"[['Cloudflare Sites and Services', 'operationa..."
4,cloudflare-status,global-status,20200508T030001,2020-05-08 02:46:12,"[['Cloudflare Sites and Services', 'operationa..."


In [24]:
# Get names of the sub-services
status_types = set()
statuses = set()
for status in df2['status']:
    replaced = status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '')
    for part in replaced.split('||'):
        status_types.add(part.split(',')[0])
        statuses.add(part.split(',')[1])

In [25]:
# Inspect the distinct values collected in `statuses`
statuses

{' AB',
 ' AL',
 ' AZ',
 ' Angola - (LAD)',
 ' Argentina - (EZE)',
 ' Armenia - (EVN)',
 ' Australia - (ADL)',
 ' Austria - (VIE)',
 ' Azerbaijan - (GYD)',
 ' BC',
 ' Bahrain - (BAH)',
 ' Bangladesh - (CGP)',
 ' Bangladesh - (DAC)',
 ' Belgium - (BRU)',
 ' Bhutan - (PBH)',
 ' Brazil - (CWB)',
 ' Brazil - (FOR)',
 ' Brazil - (GIG)',
 ' Brazil - (GRU)',
 ' Brazil - (POA)',
 ' Brunei - (BWN)',
 ' Bulgaria - (SOF)',
 ' CA',
 ' CO',
 ' Cambodia - (PNH)',
 ' Chile - (ARI)',
 ' Chile - (SCL)',
 ' China - (CAN)',
 ' China - (CGO)',
 ' China - (CKG)',
 ' China - (CSX)',
 ' China - (CTU)',
 ' China - (FOC)',
 ' China - (FUO)',
 ' China - (HGH)',
 ' China - (HNY)',
 ' China - (LYA)',
 ' China - (NAY)',
 ' China - (NBG)',
 ' China - (NNG)',
 ' China - (SHA)',
 ' China - (SHE)',
 ' China - (SJW)',
 ' China - (SZV)',
 ' China - (SZX)',
 ' China - (TAO)',
 ' China - (TNA)',
 ' China - (TSN)',
 ' China - (WUH)',
 ' China - (WUX)',
 ' China - (XIY)"',
 ' Colombia - (BOG)',
 ' Colombia - (MDE)',
 ' Croa

In [26]:
len(status_types) # number of distinct sub-service names, i.e. the new columns of the status table

248

In [27]:
# Rewrite every status string as "name,state||name,state||..." so it can be split later.
replaced = [
    raw.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '')
    for raw in df2['status']
]
replaced[0]

'Cloudflare Sites and Services,operational||Cloudflare Dashboard,operational||Cloudflare Teams Dashboard,operational||Cloudflare Marketing Site,operational||"Cloudflare Developers Site", operational||Cloudflare API,operational||Cloudflare Registrar,operational||Cloudflare Workers,operational||Cloudflare Storage,operational||Cloudflare Access,operational||Cloudflare Spectrum,operational||Cloudflare Stream,operational||Cloudflare Logs,operational||Argo Tunnel,operational||Argo Smart Routing,operational||Analytics,operational||Billing,operational||CDN/Cache,operational||CDN Cache Purge,operational||SSL Certificate Provisioning,operational||SSL for SaaS Provisioning,operational||Geo-Key Manager,operational||Image Resizing,operational||Load Balancing and Monitoring,operational||Cloudflare Recursive DNS,operational||Cloudflare Authoritative DNS,operational||DNS Root Servers,operational||DNS Updates,operational||Always Online,operational||WARP,operational||Africa,partial_outage||Antananarivo,

In [28]:
# Single-column frame holding the flattened "name,state||..." strings, one per row of df2
df3 = pd.DataFrame(replaced)
df3.columns = ['status']
df3.head()

Unnamed: 0,status
0,"Cloudflare Sites and Services,operational||Clo..."
1,"Cloudflare Sites and Services,operational||Clo..."
2,"Cloudflare Sites and Services,operational||Clo..."
3,"Cloudflare Sites and Services,operational||Clo..."
4,"Cloudflare Sites and Services,operational||Clo..."


In [29]:
# New dataframe for the statuses
df_status = pd.DataFrame(columns=status_types)
df_status.head()

Unnamed: 0,Cebu,Cloudflare Workers,Chennai,Thimphu,Vienna,Tegucigalpa,Richmond,Ho Chi Minh City,Brussels,Port Louis,...,Bangalore,Cairo,Shanghai,Auckland,Riyadh,Athens,Sofia,New Delhi,Adelaide,DNS Updates


In [30]:
# This is a bit ugly
for row, part in enumerate(df3['status']):
    for d in part.split('||'):
        d2 = d.split(',')
        df_status.at[row, d2[0]] = d2[-1]
    if row % 1000 == 0:
        print(row)

0
1000
2000
3000
4000
5000
6000
7000
8000


In [31]:
# Join on the row index: row i of df_status was parsed from row i of df2['status']
df_final = df2.join(df_status)

In [32]:
# The raw status string is no longer needed once it has been expanded into columns
df_final = df_final.drop(columns=['status'])

In [33]:
# Replace the service name derived from the page file name (e.g. 'global-status')
# with one constant label for every row
df_final['service'] = 'cloudflare'
df_final

Unnamed: 0,source,service,timestamp_dir,timestamp_site,Cebu,Cloudflare Workers,Chennai,Thimphu,Vienna,Tegucigalpa,...,Bangalore,Cairo,Shanghai,Auckland,Riyadh,Athens,Sofia,New Delhi,Adelaide,DNS Updates
0,cloudflare-status,cloudflare,20200508T150001,2020-05-08 14:49:28,operational,operational,operational,partial_outage,operational,operational,...,operational,,operational,operational,operational,operational,operational,operational,operational,operational
1,cloudflare-status,cloudflare,20200508T060001,2020-05-08 05:48:02,operational,operational,operational,partial_outage,operational,operational,...,operational,,operational,operational,operational,operational,operational,operational,operational,operational
2,cloudflare-status,cloudflare,20200508T130001,2020-05-08 12:58:36,operational,operational,operational,partial_outage,operational,operational,...,operational,,operational,operational,operational,operational,operational,operational,operational,operational
3,cloudflare-status,cloudflare,20200508T200001,2020-05-08 19:57:30,operational,operational,operational,partial_outage,operational,operational,...,operational,,operational,operational,operational,operational,operational,operational,operational,operational
4,cloudflare-status,cloudflare,20200508T030001,2020-05-08 02:46:12,operational,operational,operational,partial_outage,operational,operational,...,operational,,operational,operational,operational,operational,operational,operational,operational,operational
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8428,cloudflare-status,cloudflare,20190709T220001,2019-07-09 21:57:11,operational,operational,operational,,operational,,...,,operational,operational,operational,partial_outage,operational,operational,operational,,operational
8429,cloudflare-status,cloudflare,20190709T200001,2019-07-09 19:48:57,operational,operational,operational,,operational,,...,,operational,operational,operational,partial_outage,operational,operational,operational,,operational
8430,cloudflare-status,cloudflare,20190709T080001,2019-07-09 07:48:45,operational,operational,operational,,operational,,...,,operational,operational,operational,operational,operational,operational,operational,,operational
8431,cloudflare-status,cloudflare,20190709T050001,2019-07-09 04:47:30,operational,operational,operational,,operational,,...,,operational,operational,operational,operational,operational,operational,operational,,operational


In [34]:
# Write the wide table (one column per sub-service) without the index
df_final.to_csv(r'/home/shane/Documents/thesis/output/parsed/final/cloudflare-status.csv', index=False)

# Incidents Parsing

In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [2]:
def extract_history(html_doc):
    """Return the label of the first resolved incident update found on a page.

    Parameters
    ----------
    html_doc : str or bytes
        Raw HTML of a single page.

    Returns
    -------
    list
        One-element list holding the whitespace-normalised text of the first
        ``<strong>`` inside the first ``div`` whose class attribute is
        ``'update font-regular resolved'``, or ``''`` when it cannot be read.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')

    try:
        update = soup.find('div', {'class': 'update font-regular resolved'})
        # find() gives None when nothing matches; the attribute access then
        # raises and the empty label below is used.
        label = ' '.join(update.find('strong').text.split())
    except Exception:
        # Best effort, but KeyboardInterrupt / SystemExit are no longer swallowed.
        label = ''

    return [label]

In [3]:
def read_file(file):
    """Collect the incident label of every matching page in one zip archive.

    Parameters
    ----------
    file : str
        '/'-separated path of the zip archive.

    Returns
    -------
    list
        One ``[source, timestamp_dir, label]`` row per matching archive member.

    Raises
    ------
    IndexError
        If the path has fewer than five '/'-separated components.
    """
    # NOTE(review): this redefines a `read_file` that appears earlier in the
    # notebook; whichever definition ran last is the one the pool uses.
    source = file.split('/')[4]  # fifth path component
    ret = []
    # The context manager closes the archive even when a page fails to parse.
    with ZipFile(file, 'r') as archive:
        for item in archive.namelist():
            # NOTE(review): members whose name contains 'history' are skipped
            # although the parser is called extract_history; confirm intended.
            if 'html' in item and 'history' not in item:
                html = archive.read(item)
                ts = item.split('/')[0]  # leading directory of the member
                ret.append([source, ts] + extract_history(html))
    return ret

In [4]:
# The trailing '*' is a glob wildcard: every directory starting with 'cloudflare-status'
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/cloudflare-status*'
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
# The context manager shuts the worker processes down even if a worker raises;
# close() alone leaves them running on failure.
with mp.Pool(mp.cpu_count()) as pool:
    start_time = time.time()
    # One read_file call per archive; results keeps the order of `files`.
    results = pool.map(read_file, files)
    end_time = time.time()

In [7]:
# Nested result: one inner list per zip archive, each holding [source, timestamp_dir, label] rows
results

[[['cloudflare-status', '20200508T150001', 'Resolved'],
  ['cloudflare-status', '20200508T060001', 'Resolved'],
  ['cloudflare-status', '20200508T130001', 'Resolved'],
  ['cloudflare-status', '20200508T200001', 'Resolved'],
  ['cloudflare-status', '20200508T030001', 'Resolved'],
  ['cloudflare-status', '20200508T090001', 'Resolved'],
  ['cloudflare-status', '20200508T180001', 'Resolved'],
  ['cloudflare-status', '20200508T220001', 'Resolved'],
  ['cloudflare-status', '20200508T010001', 'Resolved'],
  ['cloudflare-status', '20200508T000001', 'Resolved'],
  ['cloudflare-status', '20200508T020001', 'Resolved'],
  ['cloudflare-status', '20200508T050001', 'Resolved'],
  ['cloudflare-status', '20200508T140001', 'Resolved'],
  ['cloudflare-status', '20200508T210001', 'Resolved'],
  ['cloudflare-status', '20200508T190001', 'Resolved'],
  ['cloudflare-status', '20200508T080001', 'Resolved'],
  ['cloudflare-status', '20200508T040001', 'Resolved'],
  ['cloudflare-status', '20200508T100001', 'Reso