# Label the service statuses in a uniform way

In [8]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
from datetime import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np

### Number of records per CSV

In [9]:
# Directory that is globbed for the parsed per-source CSV files.
root_dir = '/home/shane/Documents/thesis/output/parsed/final/'
files = glob.glob(root_dir + '*.csv')

# Report the number of rows in every parsed CSV.
for file in files:
    df = pd.read_csv(file)
    # Use the last path component rather than a fixed index so the label
    # stays correct if root_dir is moved to a different directory depth.
    print(file.rsplit('/', 1)[-1], len(df))

cloud-amazon-web-services.csv 291
slack.csv 1835
outage-report.csv 120018
downdetector_v2.csv 323708
downrightnow.csv 11904


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


atlassian.csv 23864
github-status.csv 9612
cloud-google-apps.csv 549
cloudflare-status.csv 8433
discord.csv 1836
cloud-google-cloud-platform.csv 22811


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Number of unique statuses and status features per service

In [10]:
# Number of leading non-status columns in each parsed CSV; every column after
# them is treated as a status feature.
N_META_COLS = 4

# Every distinct status label (as a string) seen across the surveyed files.
unique_statuses = set()

for file in files:
    # Match on the file name only, so a directory in the path that happens to
    # contain one of these substrings cannot cause every file to be skipped.
    file_name = file.rsplit('/', 1)[-1]
    if 'outage' in file_name or 'google-apps' in file_name or 'cloud-amazon' in file_name:
        continue

    df = pd.read_csv(file)
    statuses = df.iloc[:, N_META_COLS:]

    # str() makes missing values (NaN) collapse into the single label 'nan'.
    file_statuses = {
        str(value)
        for column in statuses
        for value in statuses[column].unique()
    }
    unique_statuses |= file_statuses

    print(file_name, '; Unique:', len(file_statuses), '; Features:', statuses.shape[1])

slack.csv ; Unique: 2 ; Features: 10
downdetector_v2.csv ; Unique: 4 ; Features: 1
downrightnow.csv ; Unique: 5 ; Features: 1
atlassian.csv ; Unique: 6 ; Features: 175
github-status.csv ; Unique: 5 ; Features: 11
cloudflare-status.csv ; Unique: 8 ; Features: 248
discord.csv ; Unique: 4 ; Features: 21
cloud-google-cloud-platform.csv ; Unique: 25 ; Features: 1


### The unique status types we should encode

In [11]:
# Display every distinct status label collected by the survey loop; these are
# the raw strings that need an entry in the encoding dictionary.
unique_statuses

{' operational',
 ' partial_outage',
 '?',
 'All services available',
 'Cloud Developer Tools reporting issues',
 'Cloud Machine Learning reporting issues',
 'Cloud Run reporting issues',
 'Cloud Spanner reporting issues',
 'Degraded Performance',
 'Google App Engine reporting issues',
 'Google BigQuery reporting issues',
 'Google Cloud Composer reporting issues',
 'Google Cloud Console reporting issues',
 'Google Cloud Dataflow reporting issues',
 'Google Cloud Dataproc reporting issues',
 'Google Cloud Datastore reporting issues',
 'Google Cloud Functions reporting issues',
 'Google Cloud Networking reporting issues',
 'Google Cloud Pub/Sub reporting issues',
 'Google Cloud SQL reporting issues',
 'Google Cloud Storage reporting issues',
 'Google Cloud Support reporting issues',
 'Google Cloud infrastructure components reporting issues',
 'Google Compute Engine reporting issues',
 'Google Kubernetes Engine reporting issues',
 'Google Stackdriver reporting issues',
 'Identity & Securi

### Encodings go in a dictionary, as follows:
#### 0 -> operational; 1 -> partial outage/issues; 2 -> full outage/major problems; 3 -> maintenance; 9 -> unknown

In [12]:
# Numeric status code -> every raw label (kept exactly as it appears in the
# data, including leading spaces) that is encoded with that code.
#   0 operational, 1 partial outage/issues, 2 full outage/major problems,
#   3 maintenance, 9 unknown
_labels_by_code = {
    0: [
        ' operational',
        'All services available',
        'No issues',
        'Operational',
        'Up',
        'alert-success',
        'operational',
    ],
    1: [
        ' partial_outage',
        'Cloud Developer Tools reporting issues',
        'Cloud Machine Learning reporting issues',
        'Cloud Run reporting issues',
        'Cloud Spanner reporting issues',
        'Degraded Performance',
        'Google App Engine reporting issues',
        'Google BigQuery reporting issues',
        'Google Cloud Composer reporting issues',
        'Google Cloud Console reporting issues',
        'Google Cloud Dataflow reporting issues',
        'Google Cloud Dataproc reporting issues',
        'Google Cloud Datastore reporting issues',
        'Google Cloud Functions reporting issues',
        'Google Cloud Networking reporting issues',
        'Google Cloud Pub/Sub reporting issues',
        'Google Cloud SQL reporting issues',
        'Google Cloud Storage reporting issues',
        'Google Cloud Support reporting issues',
        'Google Cloud infrastructure components reporting issues',
        'Google Compute Engine reporting issues',
        'Google Kubernetes Engine reporting issues',
        'Google Stackdriver reporting issues',
        'Identity & Security reporting issues',
        'Likely Service Disruption',
        'Multiple services reporting issues',
        'Operations reporting issues',
        'Partial Outage',
        'Possible Service Trouble',
        'Recent Signs of Service Trouble',
        "Something's not quite right View details",
        'alert-warning',
        'degraded_performance',
        'partial_outage',
    ],
    2: [
        'Major Outage',
        'alert-danger',
        'major_outage',
    ],
    3: [
        'Under Maintenance',
        'under_maintenance',
    ],
    9: [
        '?',
        'nan',
    ],
}

# Flatten the groups into the raw-label -> code lookup.
labels_dict = {
    label: code
    for code, labels in _labels_by_code.items()
    for label in labels
}

### Replace status with our encoding, add features for dataset unification

In [29]:
# Number of leading non-status columns in each parsed CSV; every column after
# them is treated as a status feature.
N_META_COLS = 4

# Summary-feature prefix -> numeric status code
# (0 operational, 1 partial, 2 major, 3 maintenance, 9 unknown).
STATUS_CODES = {
    'success': 0,
    'partial': 1,
    'major': 2,
    'maintenance': 3,
    'unknown': 9,
}

for file in files:
    # Match on the file name only, so a directory in the path that happens to
    # contain one of these substrings cannot cause every file to be skipped.
    file_name = file.rsplit('/', 1)[-1]
    if 'outage' in file_name or 'google-apps' in file_name or 'cloud-amazon' in file_name:
        continue

    df = pd.read_csv(file)

    # Encode only the status columns, leaving the metadata columns untouched.
    # read_csv yields real NaN for missing cells, which the string key 'nan'
    # in labels_dict can never match, so map those to "unknown" explicitly.
    statuses = (
        df.iloc[:, N_META_COLS:]
        .replace(labels_dict)
        .fillna(STATUS_CODES['unknown'])
    )

    # Per-row count of status columns holding each code. The unknown count
    # compares against 9; it previously compared against 3, which duplicated
    # the maintenance count and never counted unknowns.
    counts = {
        label: (statuses == code).sum(axis=1)
        for label, code in STATUS_CODES.items()
    }
    total_count = sum(counts.values())

    # Column order: <label>_count, <label>_pct for each label in STATUS_CODES.
    summary = {}
    for label, count in counts.items():
        summary[label + '_count'] = count
        # A row whose total_count is 0 (no recognised status) yields NaN here.
        summary[label + '_pct'] = count / total_count

    # Keep the metadata columns plus the ten summary features. rename()
    # ignores a missing 'placeholder' column, so no try/except is needed.
    df_final = pd.concat(
        [df.iloc[:, :N_META_COLS], pd.DataFrame(summary, index=df.index)],
        axis=1,
    ).rename(columns={'placeholder': 'timestamp_site'})

    df_final.to_csv(r'/home/shane/Documents/thesis/output/cleaned/' + file_name, index=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [31]:
# Preview the cleaned frame. df_final is rebound on every iteration of the
# loop above, so this shows only the last file that was processed.
df_final.head()

Unnamed: 0,source,service,timestamp,timestamp_site,success_count,success_pct,partial_count,partial_pct,major_count,major_pct,maintenance_count,maintenance_pct,unknown_count,unknown_pct
0,cloud-google-cloud-platform,google-cloud,20200508T150001,,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
1,cloud-google-cloud-platform,google-cloud,20200508T060001,,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
2,cloud-google-cloud-platform,google-cloud,20200508T130001,,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
3,cloud-google-cloud-platform,google-cloud,20200508T200001,,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
4,cloud-google-cloud-platform,google-cloud,20200508T030001,,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
