In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

In [2]:
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/atlassian' # one directory
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/atlassian/2020/20203' # subset of above
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/atlassian*' # all directories

In [3]:
def extract_html_data(html_doc):
    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []
    
    timestamp = soup.find('meta', attrs={'name':'issued'}).get('content')
    ts_uct = datetime.utcfromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    
    components = []
    for component in soup.find_all("span", {"class": "name"}):
        s = ' '.join(component.text.split())
        if 'Signup' in s:
            # we don't want commas in component names, for reasons
            s = s.replace('Signup,', 'Signup &')
        components.append(s)

    statuses = []
    for status in soup.find_all("span", {"class": "component-status"}):
        statuses.append(' '.join(status.text.split()))

    for x in range(len(statuses)):
        result.append([components[x], statuses[x]])

    return ts_uct, [result]

In [4]:
def read_file(file):
    source = file.split('/')[4]
    archive = ZipFile(file, 'r')
    namelist = archive.namelist()
    ret = []
    for item in namelist:
        if 'html' in item:
            html = archive.read(item)
            service = item.split('/')[1].split('.')[0]
            ts_dir = item.split('/')[0]
            ts_site, data = extract_html_data(html)
            data.insert(0, ts_site)
            data.insert(0, ts_dir)
            data.insert(0, service)
            data.insert(0, source)
            ret.append(data)
    archive.close()
    return(ret)

In [5]:
files = glob.glob(root_dir + '/**/*.zip', recursive=True)

pool = mp.Pool(mp.cpu_count())

start_time = time.time()
results = pool.map(read_file, [file for file in files])
pool.close()
end_time = time.time()

In [6]:
print(end_time - start_time)

204.4103639125824


In [7]:
res = []
for outer in results:
    for inner in outer:
        res.append(inner)
print(len(res))

23864


In [8]:
column_names = ['source','service','timestamp_dir','timestamp_site','status']
df = pd.DataFrame(res, columns=column_names)
df.head(50)

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,atlassian,confluence,20200508T150001,2020-05-08 15:00:09,"[[View Content, Operational], [Create and Edit..."
1,atlassian,jira-software,20200508T150001,2020-05-08 15:00:08,"[[Viewing content, Operational], [Create and e..."
2,atlassian,jira-core,20200508T150001,2020-05-08 15:00:10,"[[Viewing content, Operational], [Create and e..."
3,atlassian,jira-align,20200508T150001,2020-05-08 15:00:26,"[[Pod 1, Operational], [Pod 2, Operational], [..."
4,atlassian,bitbucket,20200508T150001,2020-05-08 14:46:49,"[[Website, Operational], [API, Operational], [..."
5,atlassian,support,20200508T150001,2020-05-08 14:00:07,"[[Support Portal, Operational], [Ticketing, Op..."
6,atlassian,developers,20200508T150001,2020-05-08 14:35:53,"[[APIs, Operational], [Bitbucket Cloud APIs, O..."
7,atlassian,statuspage,20200508T150001,2020-05-08 15:00:11,"[[Hosted Pages, Operational], [HTTP Pages, Ope..."
8,atlassian,trello,20200508T150001,2020-05-08 14:44:34,"[[Trello.com, Operational], [API, Operational]..."
9,atlassian,partners,20200508T150001,2020-05-08 15:00:21,"[[Partner Portal, Operational], [Partner Suppo..."


In [9]:
df.to_csv(r'/home/shane/Documents/thesis/output/parsed/atlassian.csv', index=False)

### Blow-Up Status Column

In [10]:
df2 = pd.read_csv('/home/shane/Documents/thesis/output/parsed/atlassian.csv')
df2.head()

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,atlassian,confluence,20200508T150001,2020-05-08 15:00:09,"[['View Content', 'Operational'], ['Create and..."
1,atlassian,jira-software,20200508T150001,2020-05-08 15:00:08,"[['Viewing content', 'Operational'], ['Create ..."
2,atlassian,jira-core,20200508T150001,2020-05-08 15:00:10,"[['Viewing content', 'Operational'], ['Create ..."
3,atlassian,jira-align,20200508T150001,2020-05-08 15:00:26,"[['Pod 1', 'Operational'], ['Pod 2', 'Operatio..."
4,atlassian,bitbucket,20200508T150001,2020-05-08 14:46:49,"[['Website', 'Operational'], ['API', 'Operatio..."


In [22]:
# Get names of the sub-services
status_types = set()
statuses = set()
for status in df2['status']:
    replaced = status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '')
    for part in replaced.split('||'):
        status_types.add(part.split(',')[0])
        statuses.add(part.split(',')[1])

In [23]:
statuses

{'Degraded Performance',
 'Major Outage',
 'Operational',
 'Partial Outage',
 'Under Maintenance'}

In [24]:
len(status_types) # new columns

175

In [25]:
replaced = []
for status in df2['status']:
    replaced.append(status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", ''))
replaced[0]

'View Content,Operational||Create and Edit,Operational||Comments,Operational||Authentication and User Management,Operational||Search,Operational||Administration,Operational||Notifications,Operational||Marketplace Apps,Operational||Mobile,Operational||iOS App,Operational||Android App,Operational||Purchasing & Licensing,Operational'

In [26]:
df3 = pd.DataFrame(replaced)
df3.columns = ['status']
df3.head()

Unnamed: 0,status
0,"View Content,Operational||Create and Edit,Oper..."
1,"Viewing content,Operational||Create and edit,O..."
2,"Viewing content,Operational||Create and edit,O..."
3,"Pod 1,Operational||Pod 2,Operational||Pod 3,Op..."
4,"Website,Operational||API,Operational||SSH,Oper..."


In [27]:
# New dataframe for the statuses
df_status = pd.DataFrame(columns=status_types)
df_status.head()

Unnamed: 0,Trello.com,Partner Directory,Source downloads,Marketplace,Pod 18,Mobile Notification Delivery,Public Website,Service Desk Web,Confluence Cloud APIs,Pod 19,...,Support Sites,Management,Category landing pages,App submissions,Web Portal,Jira Cloud APIs,Vendor Home Page,Pod 4,Reporting & Analytics,Authentication and User Management


In [28]:
# This is a bit ugly
for row, part in enumerate(df3['status']):
    for d in part.split('||'):
        d2 = d.split(',')
        df_status.at[row, d2[0]] = d2[1]
    if row % 1000 == 0:
        print(row)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000


In [29]:
df_final = df2.join(df_status)

In [30]:
df_final.drop(['status'], axis=1, inplace=True)

In [31]:
df_final

Unnamed: 0,source,service,timestamp_dir,timestamp_site,Trello.com,Partner Directory,Source downloads,Marketplace,Pod 18,Mobile Notification Delivery,...,Support Sites,Management,Category landing pages,App submissions,Web Portal,Jira Cloud APIs,Vendor Home Page,Pod 4,Reporting & Analytics,Authentication and User Management
0,atlassian,confluence,20200508T150001,2020-05-08 15:00:09,,,,,,,...,,,,,,,,,,Operational
1,atlassian,jira-software,20200508T150001,2020-05-08 15:00:08,,,,Operational,,,...,,,,,,,,,,Operational
2,atlassian,jira-core,20200508T150001,2020-05-08 15:00:10,,,,Operational,,,...,,,,,,,,,,Operational
3,atlassian,jira-align,20200508T150001,2020-05-08 15:00:26,,,,,Operational,,...,,,,,,,,Operational,,
4,atlassian,bitbucket,20200508T150001,2020-05-08 14:46:49,,,Operational,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23859,atlassian,trello,20200604T130001,2020-06-04 12:45:12,Operational,,,,,,...,,,,,,,,,,
23860,atlassian,partners,20200604T130001,2020-06-04 12:00:21,,Operational,,,,,...,,,,,,,,,,
23861,atlassian,access,20200604T130001,2020-06-04 13:00:07,,,,,,,...,,,,,,,,,,
23862,atlassian,jira-service-desk,20200604T130001,2020-06-04 13:00:05,,,,,,,...,,,,,,,,,,Operational


In [32]:
df_final.to_csv(r'/home/shane/Documents/thesis/output/parsed/final/atlassian.csv', index=False)