In [1]:
import time
import glob as glob
from zipfile import ZipFile
from bs4 import BeautifulSoup
import dateutil.parser as dparser
import datetime
import pandas as pd
import multiprocessing as mp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
from sortedcontainers import SortedSet
import statistics
import operator
from statsmodels.distributions.empirical_distribution import ECDF

%matplotlib inline

## Set types, drop the unused `timestamp_site` column and rows with missing values

In [2]:
# Load the failures CSV, parsing `timestamp_dir` as datetimes.
df = pd.read_csv(
    '/home/shane/Documents/thesis/output/final/failures2.csv',
    parse_dates=['timestamp_dir'],
)
# Cast the four status columns to float and multiply by 100
# (presumably converting fractions to percentages -- confirm against the CSV).
df = df.assign(**{
    name: df[name].astype('float') * 100
    for name in ('operational_pct', 'partial_pct', 'major_pct', 'maint_pct')
})
# Discard the `timestamp_site` column, then every row that still has a missing value.
df = df.drop(columns=['timestamp_site']).dropna()

## Mean aggregation by hour

In [3]:
# Average the four status columns over all rows that share a
# (service, timestamp_dir) pair, then summarise the aggregated frame.
df_grp = (
    df.reset_index()
    .groupby(['service', 'timestamp_dir'], as_index=False)
    .agg({
        name: 'mean'
        for name in ('operational_pct', 'partial_pct', 'major_pct', 'maint_pct')
    })
)
df_grp.describe()

Unnamed: 0,operational_pct,partial_pct,major_pct,maint_pct
count,449279.0,449279.0,449279.0,449279.0
mean,95.619856,4.279417,0.099078,0.001648
std,19.193069,19.001882,2.46838,0.310195
min,0.0,0.0,0.0,0.0
25%,100.0,0.0,0.0,0.0
50%,100.0,0.0,0.0,0.0
75%,100.0,0.0,0.0,0.0
max,100.0,100.0,100.0,100.0


In [4]:
# Display names of the monitored services, in alphabetical order.
# Presumably copied from the output of
# timeseries['operational_pct'].columns.tolist() -- confirm if regenerating.
cols = [
    'Access (Atlassian)', 'Airbnb', 'Amazon',
    'Bitbucket', 'Blogger',
    'Cloudflare', 'Confluence',
    'Developers (Atlassian)', 'Discord',
    'Facebook', 'Foursquare',
    'GitHub', 'Gmail', 'Google Cloud',
    'Hotmail',
    'Jira Align', 'Jira Core', 'Jira Service Desk', 'Jira Software',
    'LinkedIn', 'LiveJournal', 'Lyft',
    'Netflix', 'Ning',
    'Opsgenie',
    'Partners (Atlassian)', 'PayPal', 'Pinterest',
    'Reddit',
    'Skype', 'Slack', 'Snapchat', 'Spotify', 'Statuspage', 'Support (Atlassian)',
    'Trello', 'Tumblr', 'Twitter', 'Typepad',
    'Yahoo! Mail', 'YouTube',
    'Zynga',
]

In [5]:
# One group of rows per service; get_state_changes() iterates over this global.
# NOTE(review): this groups the cleaned `df`, not the aggregated `df_grp` -- confirm intended.
grouped = df.groupby(by='service')

In [6]:
def get_state_changes(what):
    """Collect, for every service in the global ``grouped``, the timestamps
    at which a status column changed between consecutive rows.

    Parameters
    ----------
    what : str
        Selects which row-to-row differences count as a change:

        * ``'all'`` -- any non-zero difference in any of the four status
          columns (the first row of a group has no predecessor, so its
          difference is NaN and it is included as well);
        * ``'mtbf_po'`` / ``'mtbf_mo'`` / ``'mtbf_ma'`` -- a positive
          difference in ``partial_pct`` / ``major_pct`` / ``maint_pct``;
        * ``'mttr_po'`` / ``'mttr_mo'`` / ``'mttr_ma'`` -- a negative
          difference in the same columns.

        Any other value selects nothing, so every service gets an empty set.

    Returns
    -------
    list of [service, row_count, SortedSet_of_change_timestamps, gap_timestamps]
        ``gap_timestamps`` lists the ``timestamp_dir`` values that follow the
        previous row of the same service by more than one day.
    """
    # (mode, column, comparison against zero) for the single-column modes.
    directional_rules = (
        ('mtbf_po', 'partial_pct', operator.gt),
        ('mtbf_mo', 'major_pct', operator.gt),
        ('mtbf_ma', 'maint_pct', operator.gt),
        ('mttr_po', 'partial_pct', operator.lt),
        ('mttr_mo', 'major_pct', operator.lt),
        ('mttr_ma', 'maint_pct', operator.lt),
    )
    one_day = datetime.timedelta(days=1)

    results = []
    for service, frame in grouped:
        stamps = frame['timestamp_dir']
        # Rows arriving more than a day after their predecessor.
        gap_stamps = stamps[stamps.diff() > one_day].tolist()

        selected = []
        if what == 'all':
            for column in ('operational_pct', 'partial_pct', 'major_pct', 'maint_pct'):
                selected.extend(stamps[operator.ne(frame[column].diff(), 0)].tolist())
        else:
            for mode, column, compare in directional_rules:
                if what == mode:
                    selected = stamps[compare(frame[column].diff(), 0)].tolist()
                    break

        # SortedSet de-duplicates the timestamps and keeps them in ascending order.
        results.append([service, len(frame), SortedSet(selected), gap_stamps])
    return results

In [7]:
def get_timedeltas(state_changes):
    """Compute the mean time between consecutive state changes per service.

    Parameters
    ----------
    state_changes : iterable of [service, num_records, changes, time_gaps]
        As produced by ``get_state_changes``. ``changes`` is an ordered
        collection of change timestamps; ``time_gaps`` holds timestamps whose
        interval from the preceding change is excluded from the mean.
        Neither collection is modified.

    Returns
    -------
    list of [service, num_records, num_changes, pct_changes, mean_delta]
        ``num_changes`` is ``len(changes) - 1`` and ``pct_changes`` is
        ``num_changes / num_records * 100`` rounded to two decimals.
        ``mean_delta`` is ``np.nan`` when ``changes`` is empty, ``NaT`` when
        no interval survives the gap filter, and a ``Timedelta`` otherwise.
    """
    mtbsc_stats = []
    for group in state_changes:
        service, num_records, changes, time_gaps = group[:4]

        # Work on a snapshot: popping from `changes` would empty the caller's set.
        ordered = list(changes)
        # NOTE(review): this is -1 (and a negative percentage) for a service
        # with no changes; kept as-is because downstream output relies on it.
        num_changes = len(ordered) - 1
        pct_changes = round(num_changes / num_records * 100, 2)

        if not ordered:
            mtbsc_stats.append([service, num_records, num_changes, pct_changes, np.nan])
            continue

        # Set membership is O(1); the list scan it replaces ran once per change.
        gap_marks = set(time_gaps)
        # Pair each change with its successor, skipping intervals that end on a gap.
        mtbsc = [
            current - previous
            for previous, current in zip(ordered, ordered[1:])
            if current not in gap_marks
        ]
        mtbsc_mean = pd.to_timedelta(mtbsc).mean()
        mtbsc_stats.append([service, num_records, num_changes, pct_changes, mtbsc_mean])
    return mtbsc_stats

In [11]:
def _mtbf_summary(what):
    """Run the state-change pipeline for one mode.

    Returns the raw state changes, the per-service statistics, and a
    DataFrame of those statistics with an extra column named ``what`` that
    holds ``mean_td`` expressed in days, rounded to two decimals.
    """
    seconds_per_day = 24 * 60 * 60
    changes = get_state_changes(what=what)
    stats = get_timedeltas(changes)
    frame = pd.DataFrame(
        stats,
        columns=['service', 'records', 'state_changes', 'pct_state_changes', 'mean_td'],
    )
    frame[what] = round(frame['mean_td'].dt.total_seconds() / seconds_per_day, 2)
    return changes, stats, frame


# Partial outages: this frame also collects the columns of the other two modes.
col = 'mtbf_po'
mtbf_all, mean_mtbf_po, df_mtbf_all = _mtbf_summary(col)

# Major outages.
col = 'mtbf_mo'
mtbf_mo, mean_mtbf_mo, df_mtbf_mo = _mtbf_summary(col)
df_mtbf_all[col] = df_mtbf_mo[col]

# Maintenance.
col = 'mtbf_ma'
mtbf_ma, mean_mtbf_ma, df_mtbf_ma = _mtbf_summary(col)
df_mtbf_all[col] = df_mtbf_ma[col]

# Keep only the three per-mode day columns for the summary table.
mtbf_plot = df_mtbf_all.drop(
    columns=['service', 'records', 'state_changes', 'pct_state_changes', 'mean_td']
)

mtbf_plot.describe()

['facebook', 22298, 77, 0.35, Timedelta('11 days 11:09:21.038961039')]


Unnamed: 0,mtbf_po,mtbf_mo,mtbf_ma
count,32.0,13.0,2.0
mean,23.996875,19.270769,22.48
std,32.121815,24.143147,7.806459
min,0.14,0.91,16.96
25%,1.03,2.19,19.72
50%,9.47,10.25,22.48
75%,28.2725,36.02,25.24
max,118.56,78.58,28.0
