In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np
import io

In [2]:
# root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/github-status/2019/20199' #subset of above
root_dir = '/media/shane/cloud-availability-sacheen-2020-05-11/github-status*' # all directories

In [3]:
def extract_html_data(html_doc):
    soup = BeautifulSoup(html_doc, 'html.parser')
    result = []
    
    timestamp = soup.find('meta', attrs={'name':'issued'}).get('content')
    ts_uct = datetime.utcfromtimestamp(int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    
    try:
        components = []
        for name in soup.find_all('span', {'class':'name'}):
            components.append(' '.join(name.text.split()))
        statuses = []
        for status in soup.find_all('span', {'class':'component-status'}):
            statuses.append(' '.join(status.text.split()))
    except:
        result.append('?')
    
    for x in range(len(statuses)):
        result.append([components[x], statuses[x]])
    
    return ts_uct, [result]

In [4]:
def read_file(file):
    source = file.split('/')[4]
    archive = ZipFile(file, 'r')
    namelist = archive.namelist()
    ret = []
    for item in namelist:
        if 'html' in item and 'history' not in item:
            html = archive.read(item)
            service = item.split('/')[1].split('.')[0]
            ts_dir = item.split('/')[0]
            ts_site, data = extract_html_data(html)
            data.insert(0, ts_site)
            data.insert(0, ts_dir)
            data.insert(0, service)
            data.insert(0, source)
            ret.append(data)
    archive.close()
    return(ret)

In [5]:
files = glob.glob(root_dir + '/**/*.zip', recursive=True)
pool = mp.Pool(mp.cpu_count())
start_time = time.time()
try:
    results = pool.map(read_file, [file for file in files])
except:
    print('Error')
pool.close()
end_time = time.time()

In [6]:
print(end_time - start_time)

54.51297163963318


In [7]:
res = []
for outer in results:
    for inner in outer:
        res.append(inner)
print(len(res))

9612


In [8]:
column_names = ['source','service','timestamp_dir','timestamp_site','status']
df = pd.DataFrame(res, columns=column_names)
df.head(50)

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,github-status,global-status,20200508T150001,2020-05-08 14:54:02,"[[Git Operations, Operational], [API Requests,..."
1,github-status,global-status,20200508T060001,2020-05-08 05:50:15,"[[Git Operations, Operational], [API Requests,..."
2,github-status,global-status,20200508T130001,2020-05-08 12:53:22,"[[Git Operations, Operational], [API Requests,..."
3,github-status,global-status,20200508T200001,2020-05-08 19:50:28,"[[Git Operations, Operational], [API Requests,..."
4,github-status,global-status,20200508T030001,2020-05-08 02:49:28,"[[Git Operations, Operational], [API Requests,..."
5,github-status,global-status,20200508T090001,2020-05-08 08:52:07,"[[Git Operations, Operational], [API Requests,..."
6,github-status,global-status,20200508T180001,2020-05-08 17:49:47,"[[Git Operations, Operational], [API Requests,..."
7,github-status,global-status,20200508T220001,2020-05-08 21:47:52,"[[Git Operations, Operational], [API Requests,..."
8,github-status,global-status,20200508T010001,2020-05-08 00:50:50,"[[Git Operations, Operational], [API Requests,..."
9,github-status,global-status,20200508T000001,2020-05-07 23:57:42,"[[Git Operations, Operational], [API Requests,..."


In [9]:
df.to_csv(r'/home/shane/Documents/thesis/output/parsed/github-status.csv', index=False)

### Blow-Up Status Column

In [4]:
# df2 = pd.read_csv('/home/shane/Documents/thesis/output/parsed/github-status.csv')
df2 = df.copy()
df2.head()

Unnamed: 0,source,service,timestamp_dir,timestamp_site,status
0,github-status,global-status,20200508T150001,2020-05-08 14:54:02,"[['Git Operations', 'Operational'], ['API Requ..."
1,github-status,global-status,20200508T060001,2020-05-08 05:50:15,"[['Git Operations', 'Operational'], ['API Requ..."
2,github-status,global-status,20200508T130001,2020-05-08 12:53:22,"[['Git Operations', 'Operational'], ['API Requ..."
3,github-status,global-status,20200508T200001,2020-05-08 19:50:28,"[['Git Operations', 'Operational'], ['API Requ..."
4,github-status,global-status,20200508T030001,2020-05-08 02:49:28,"[['Git Operations', 'Operational'], ['API Requ..."


In [11]:
# Get names of the sub-services
status_types = set()
for status in df2['status']:
    replaced = status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '').replace('Issues, PRs, Projects', 'Issues/PRs/Projects')
    for part in replaced.split('||'):
        status_types.add(part.split(',')[0])

In [12]:
len(status_types) # new columns

12

In [13]:
replaced = []
for status in df2['status']:
    replaced.append(status.replace("], [","||").replace('[','').replace(']','').replace("', '", ',').replace("'", '').replace('Issues, PRs, Projects', 'Issues/PRs/Projects'))
replaced[0]

'Git Operations,Operational||API Requests,Operational||Webhooks,Operational||Visit www.githubstatus.com for more information,Operational||Issues/PRs/Projects,Operational||GitHub Actions,Operational||GitHub Packages,Operational||GitHub Pages,Operational||Other,Operational'

In [14]:
df3 = pd.DataFrame(replaced)
df3.columns = ['status']
df3.head()

Unnamed: 0,status
0,"Git Operations,Operational||API Requests,Opera..."
1,"Git Operations,Operational||API Requests,Opera..."
2,"Git Operations,Operational||API Requests,Opera..."
3,"Git Operations,Operational||API Requests,Opera..."
4,"Git Operations,Operational||API Requests,Opera..."


In [15]:
# New dataframe for the statuses
df_status = pd.DataFrame(columns=status_types)
df_status.head()

Unnamed: 0,Other,Gists,Issues/PRs/Projects,Issues,GitHub Packages,Visit www.githubstatus.com for more information,Webhooks,API Requests,GitHub Pages,Notifications,Git Operations,GitHub Actions


In [16]:
# This is a bit ugly
for row, part in enumerate(df3['status']):
    for d in part.split('||'):
        d2 = d.split(',')
        df_status.at[row, d2[0]] = d2[1]
    if row % 1000 == 0:
        print(row)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [17]:
df_final = df2.join(df_status)
df_final.drop(['status'], axis=1, inplace=True)

In [20]:
df_final['service'] = 'github'
df_final

Unnamed: 0,source,service,timestamp_dir,timestamp_site,Other,Gists,Issues/PRs/Projects,Issues,GitHub Packages,Visit www.githubstatus.com for more information,Webhooks,API Requests,GitHub Pages,Notifications,Git Operations,GitHub Actions
0,github-status,github,20200508T150001,2020-05-08 14:54:02,Operational,,Operational,,Operational,Operational,Operational,Operational,Operational,,Operational,Operational
1,github-status,github,20200508T060001,2020-05-08 05:50:15,Operational,,Operational,,Operational,Operational,Operational,Operational,Operational,,Operational,Operational
2,github-status,github,20200508T130001,2020-05-08 12:53:22,Operational,,Operational,,Operational,Operational,Operational,Operational,Operational,,Operational,Operational
3,github-status,github,20200508T200001,2020-05-08 19:50:28,Operational,,Operational,,Operational,Operational,Operational,Operational,Operational,,Operational,Operational
4,github-status,github,20200508T030001,2020-05-08 02:49:28,Operational,,Operational,,Operational,Operational,Operational,Operational,Operational,,Operational,Operational
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9607,github-status,github,20190709T220001,2019-07-09 21:56:19,,Operational,,PRs,,Operational,,Operational,Operational,Operational,Operational,
9608,github-status,github,20190709T200001,2019-07-09 19:52:46,,Operational,,PRs,,Operational,,Operational,Operational,Operational,Operational,
9609,github-status,github,20190709T080001,2019-07-09 07:54:24,,Operational,,PRs,,Operational,,Operational,Operational,Operational,Operational,
9610,github-status,github,20190709T050001,2019-07-09 04:47:53,,Operational,,PRs,,Operational,,Operational,Operational,Operational,Operational,


In [21]:
df_final.to_csv(r'/home/shane/Documents/thesis/output/parsed/final/github-status.csv', index=False)