In [1]:
from dask.delayed import delayed
import dask.dataframe as dd
import pandas as pd
from PySquashfsImage import SquashFsImage
import zipfile
import tempfile
import os
import shutil
from dask.distributed import get_worker
import re
import itertools
from dateutil.tz import gettz
from bs4 import BeautifulSoup
import dateutil
import datetime
import json
import numpy as np

In [2]:
# Timezone abbreviations that appear in provider status text, mapped to fixed
# UTC offsets and handed to dateutil's parser through its `tzinfos` argument.
#
# Explicit offsets are used on purpose: the POSIX-style "Etc/GMT±N" zone names
# have an inverted sign ("Etc/GMT-8" is eight hours AHEAD of UTC), so using
# them for PST/PDT/EST/EDT/CET yields the opposite offset of the intended one.
tzinfos = {
    'PST': datetime.timezone(datetime.timedelta(hours=-8), 'PST'),
    'PDT': datetime.timezone(datetime.timedelta(hours=-7), 'PDT'),
    # https://www.timeanddate.com/time/zones/est
    'EST': datetime.timezone(datetime.timedelta(hours=-5), 'EST'),
    # https://www.timeanddate.com/time/zones/edt
    'EDT': datetime.timezone(datetime.timedelta(hours=-4), 'EDT'),
    # https://www.timeanddate.com/time/zones/cet
    'CET': datetime.timezone(datetime.timedelta(hours=1), 'CET'),
}

def process_aws_feed(file_obj):
    """Parse an AWS status-history JSON feed into a normalized event table.

    The JSON document must hold a list of records under the ``archive`` key.
    Each record is read for ``date`` (epoch seconds), ``service``,
    ``service_name``, ``status`` and ``description`` (an HTML fragment);
    ``details`` and ``summary`` are dropped.

    For every record the notification times are taken from the ``<span>``
    elements (those carrying a ``class`` attribute) of the description, and the
    event window is taken from a "between X and Y" sentence in the last
    ``<div>`` when one can be parsed. Missing date parts are filled in from the
    record timestamp / the notification times.

    Args:
        file_obj: Open text file object containing the JSON feed.

    Returns:
        pandas.DataFrame with the columns ``service_id``, ``service_name``,
        ``location``, ``status``, ``event_start_time``, ``event_end_time``,
        ``first_notification``, ``last_notification`` and ``description``
        (HTML tags stripped), with duplicate rows removed. The four time
        columns hold the values returned by ``datetime.timestamp()``.

    Raises:
        Exception: any error raised while processing a record is re-raised
            after the offending description has been printed.
    """
    json_archive = json.load(file_obj)
    df = pd.DataFrame(json_archive['archive'])
    df['date'] = pd.to_datetime(df['date'], unit='s')
    df.rename(columns={
        'date': 'timestamp',
        'service': 'service_id',
    }, inplace=True)
    df.drop(columns=['details', 'summary'], inplace=True)

    # Service names of the form "Name (Location)" carry their location in the
    # trailing parentheses; every other service is labelled 'Global'.
    mask_is_location_present = df['service_name'].str.endswith(')')

    df['location'] = 'Global'

    # Guard: splitting an empty selection yields a frame without columns 0/1,
    # which would make the .loc lookups below fail.
    if mask_is_location_present.any():
        separated_name_and_loc = \
            df.loc[mask_is_location_present, 'service_name'].str.rsplit('(', expand=True, n=1)

        df.loc[mask_is_location_present, 'service_name'] = separated_name_and_loc.loc[:, 0]
        df.loc[mask_is_location_present, 'location'] = \
            separated_name_and_loc.loc[:, 1].str.rsplit(')', expand=True, n=1).loc[:, 0]

    df['status'] = df['status'].astype(np.int32)

    # dateutil fills missing date parts with "today"; a parsed value that lands
    # on today's date (or in the current year) is therefore treated below as
    # "date (or year) missing in the text".
    present_date = datetime.date.today()

    with_event_time_info = []
    for (_, row) in df.iterrows():
        desc = row['description']
        try:
            soup = BeautifulSoup(desc, 'lxml')
            all_spans = soup.find_all('span', {
                'class': re.compile(r'.*')
            })

            first_notification = dateutil.parser.parse(all_spans[0].contents[0].strip(), tzinfos=tzinfos)
            last_notification = dateutil.parser.parse(all_spans[-1].contents[-1].strip(), tzinfos=tzinfos)

            last_div_content = str(soup.find_all('div')[-1].contents[1])[1:]

            # Only verified on one file. Need to reverify later.
            # Tried from the most to the least specific terminator of the end time.
            good_times = re.match(r'\s*.*[Bb]etween(.+?)and(.+?)(?<=[0-9],)', last_div_content)
            if good_times is None:
                good_times = re.match(r'\s*.*[Bb]etween(.+?)and(.+?)(?<=T[, ])', last_div_content)
            if good_times is None:
                good_times = re.match(r'\s*.*[Bb]etween(.+?)and(.+?)(?<=M[, ])', last_div_content)

            event_start_time = None
            event_end_time = None
            # Best effort: a window that cannot be parsed falls back to the
            # notification times further down. Only ordinary exceptions are
            # ignored so that KeyboardInterrupt/SystemExit still propagate.
            try:
                if good_times is not None:
                    # There are max 8 characters in just time. Example: 11:21 PM
                    event_start_time = dateutil.parser.parse(good_times.group(1).strip(), tzinfos=tzinfos)
                    event_end_time = dateutil.parser.parse(good_times.group(2).strip(), tzinfos=tzinfos)
            except Exception:
                pass

            record_date = row['timestamp']

            # Be careful not to run this script at midnight.
            # present date might be different from date given to events.
            if first_notification.date() == present_date:
                first_notification = first_notification.replace(year=record_date.year,
                                                                month=record_date.month,
                                                                day=record_date.day)

            if last_notification.date() == present_date:
                last_notification = last_notification.replace(year=first_notification.year,
                                                              month=first_notification.month,
                                                              day=first_notification.day)

            if first_notification.year == present_date.year:
                first_notification = first_notification.replace(year=record_date.year)

            if last_notification.year == present_date.year:
                last_notification = last_notification.replace(year=record_date.year)

            # The order matters. Typically, end time has more information associated with it than start time.
            if event_end_time is None:
                event_end_time = last_notification
            if event_end_time.date() == present_date:
                event_end_time = event_end_time.replace(year=last_notification.year,
                                                        month=last_notification.month,
                                                        day=last_notification.day,
                                                        tzinfo=last_notification.tzinfo)

            if event_start_time is None:
                event_start_time = first_notification
            if event_start_time.date() == present_date:
                event_start_time = event_start_time.replace(year=event_end_time.year,
                                                            month=event_end_time.month,
                                                            day=event_end_time.day,
                                                            tzinfo=event_end_time.tzinfo)

            if event_start_time.tzinfo is None:
                event_start_time = event_start_time.replace(tzinfo=first_notification.tzinfo)

            if event_end_time.tzinfo is None:
                event_end_time = event_end_time.replace(tzinfo=event_start_time.tzinfo)

            if event_start_time.year == present_date.year:
                event_start_time = event_start_time.replace(year=record_date.year)

            if event_end_time.year == present_date.year:
                event_end_time = event_end_time.replace(year=record_date.year)

            # Make sure we parsed the times correct; repair the event window
            # when either pair is out of order. An explicit condition is used
            # (rather than assert) so the repair also runs under `python -O`.
            if not (first_notification <= last_notification
                    and event_start_time <= event_end_time):
                if event_start_time.year > event_end_time.year:
                    event_start_time = event_start_time.replace(year=event_end_time.year)

                if event_start_time.hour > event_end_time.hour and event_start_time.day == event_end_time.day:
                    new_hour = event_end_time.hour + 12
                    if new_hour >= 24:
                        # Shifting the end by 12h would leave the day (hour=24
                        # is not a valid hour), so the start is assumed to be
                        # on the previous day. timedelta handles month/year
                        # boundaries, unlike replace(day=day-1).
                        event_start_time = event_start_time - datetime.timedelta(days=1)
                    else:
                        # End time was most likely a PM time read as AM.
                        event_end_time = event_end_time.replace(hour=new_hour)

            row['first_notification'] = first_notification.timestamp()
            row['last_notification'] = last_notification.timestamp()
            row['event_start_time'] = event_start_time.timestamp()
            row['event_end_time'] = event_end_time.timestamp()
            # Strip HTML tags, then turn non-breaking-space entities into line breaks.
            row['description'] = re.sub(r'&nbsp;', ' \n', re.sub(r'<[^>]*>', '', desc))

            with_event_time_info.append(row)
        except Exception:
            print('Overall catch')
            print(desc)
            raise

    df = pd.DataFrame(with_event_time_info, columns=[
        'timestamp',
        'service_id',
        'service_name',
        'location',
        'status',
        'event_start_time',
        'event_end_time',
        'first_notification',
        'last_notification',
        'description'
    ]).drop(columns=['timestamp']).drop_duplicates()

    return df

In [3]:
def process_azure_feed(file_obj, metadata):
    """Parse an Azure status-history HTML page into a normalized event table.

    Every ``div`` matched by
    ``section[aria-label="Service status history"] .row .row div`` that
    contains an ``<h3>`` heading is treated as one incident. The heading is
    split on ``-`` to derive the service name and location; the incident window
    is extracted from the first paragraph containing "between" using a cascade
    of regular expressions.

    Args:
        file_obj: Open file object with the HTML page.
        metadata: Dict providing at least ``metadata['year']`` (compared with
            and used as a calendar year) for descriptions that omit the year.

    Returns:
        pandas.DataFrame with the columns ``service_id``, ``service_name``,
        ``location``, ``status``, ``event_start_time``, ``event_end_time``,
        ``first_notification``, ``last_notification`` and ``description``,
        with duplicate rows removed. ``service_id`` is always ``''`` and
        ``status`` / the notification columns are always ``-1``; the event
        times are ``datetime.timestamp()`` values, or ``-1`` when no window
        was found.

    Raises:
        Exception: when a "between" paragraph matches none of the known
            patterns, or parsing/validation of the matched window fails. The
            description and the inspected text are printed before re-raising.
    """
    def _mentions_region(text):
        """Return True if a heading fragment looks like an Azure region name."""
        lowered = text.lower()
        return (any(word in lowered for word in ('east', 'west', 'north', 'south', 'central'))
                or 'US' in text)

    data_string = file_obj.read()
    html_data = BeautifulSoup(data_string, 'lxml')

    ls = []

    filtered_items = html_data.select('section[aria-label="Service status history"] .row .row div')

    for fil_item in filtered_items:
        h3s = fil_item.select('h3')
        if len(h3s) == 0:
            continue

        location = ''
        service = ''

        # Extract service name and location from heading
        one_heading = h3s[0]
        hsplits = one_heading.getText().split('-')

        # A leading "RCA" segment is a prefix, not a service name. It is only
        # skipped when another segment follows, so a bare "RCA" heading does
        # not index past the end of the list.
        hstart_index = 0
        if hsplits[hstart_index].strip() == 'RCA' and len(hsplits) > 1:
            hstart_index = 1

        if _mentions_region(hsplits[hstart_index]):
            # Heading starts with a region: the incident covers all services there.
            location = hsplits[hstart_index].strip()
            service = 'all'
        elif hstart_index < len(hsplits)-1:
            service = hsplits[hstart_index].strip()

            if _mentions_region(hsplits[hstart_index + 1]):
                location = hsplits[hstart_index + 1].strip()
            else:
                location = 'global'
        else:
            service = hsplits[hstart_index].strip()
            location = 'global'

        event_start_time = None
        event_end_time = None

        # Extract time from description text
        description = fil_item.getText()
        paragraphs = description.split('\n')
        for para_index, para in enumerate(paragraphs):
            if 'between' in para.lower():
                if 'Latency between North Europe and North America' in para:
                    continue
                if '30 on Jan' in para:
                    # Known oddly formatted message ("30 on Jan" instead of "Jan 30").
                    # Will fix it if more appear
                    continue
                try:
                    # Sometimes the dates of interest are split on two lines. Join remaining lines to include them
                    # Only consider the first 150 characters to prevent accidental matches
                    partial_content = " ".join(paragraphs[para_index:])[:150]

                    # The order of the clauses below is important:
                    # the first one is the most selective, the last one the
                    # least selective (it will match almost anything).
                    # Between 03:30 and 15:20 UTC, and then again between 17:00 and 17:32 UTC on the 19 Apr 2019,
                    good_times = re.match(r'.*(?:(?:Between)|(?:From)).*?([0-9].+?) (?:(?:and)|(?:to)) ([0-9].+?) and .*? on ?(?:the)? (.+?[0-9]{4})', partial_content)
                    if good_times is not None:
                        event_day = dateutil.parser.parse(good_times.group(3).strip())
                        event_start_time = dateutil.parser.parse(good_times.group(1).strip()).replace(year=event_day.year, month=event_day.month, day=event_day.day)
                        event_end_time = dateutil.parser.parse(good_times.group(2).strip()).replace(year=event_day.year, month=event_day.month, day=event_day.day)
                        # sometimes the timezones are different from UTC
                        event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        # Between 22:10 on 28 Mar 2019 and 03:23 UTC on 29 Mar 2019,
                        good_times = re.match(r'.*(?:(?:Between)|(?:From)).*?([0-9].+?) on ([0-9].+?[0-9]+) (?:(?:and)|(?:to)).*?([0-9].+?[0-9]{4})', partial_content)
                        if good_times is not None:
                            # Between 21:03 CST (UTC+8) on 05 Mar 2020 and 16:03 CST on 06 Mar 2020
                            start_day_string = good_times.group(2).strip()
                            if "(" in start_day_string:
                                start_day_string = start_day_string.split("(")[0].strip()

                            start_time_string = good_times.group(1).strip()
                            if "(" in start_time_string:
                                start_time_string = start_time_string.split("(")[0].strip()

                            event_start_day = dateutil.parser.parse(start_day_string)
                            event_start_time = dateutil.parser.parse(start_time_string).replace(year=event_start_day.year, month=event_start_day.month, day=event_start_day.day)
                            event_end_time = dateutil.parser.parse(good_times.group(3).strip())
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        # Between 05:55 UTC on 22 Jan and 00:56 UTC on 23 Jan 2020,
                        good_times = re.match(r'.*Between.*?([0-9].+?) on ([0-9].+?) (?:(?:and)|(?:to)) ([0-9].+?) on ([0-9]{1,2}[a-zA-Z ]+(?:[0-9]{4})?)', partial_content)
                        if good_times is not None:
                            event_end_day = dateutil.parser.parse(good_times.group(4).strip())
                            event_start_day = dateutil.parser.parse(good_times.group(2).strip()).replace(year=event_end_day.year)
                            event_start_time = dateutil.parser.parse(good_times.group(1).strip()).replace(year=event_start_day.year, month=event_start_day.month, day=event_start_day.day)
                            event_end_time = dateutil.parser.parse(good_times.group(3).strip()).replace(year=event_end_day.year, month=event_end_day.month, day=event_end_day.day)
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        # Between approximately 07:12 and 08:02 UTC on 16 Apr 2019,
                        good_times = re.match(r'.*[Bb]etween.*?([0-9].+?) (?:(?:and)|(?:to)) ([0-9].+?) on ([0-9]{1,2}[a-zA-Z ]+(?:[0-9]{4})?)', partial_content)
                        if good_times is not None:
                            g1 = good_times.group(1)
                            if '(approx.)' in g1:
                                g1 = g1.split('(')[0].strip()
                            g2 = good_times.group(2)
                            if '(approx.)' in g2:
                                g2 = g2.split('(')[0].strip()

                            # The replace is a fix for dates like 10.18 UTC
                            g1 = g1.replace(".", ":")
                            g2 = g2.replace(".", ":")

                            event_day = dateutil.parser.parse(good_times.group(3).strip())
                            # If year is not available while parsing, current year is set
                            # If metadata is from before current year, metadata is likely correct
                            if event_day.year > metadata["year"]:
                                event_day = event_day.replace(year=metadata["year"])
                            event_start_time = dateutil.parser.parse(g1).replace(year=event_day.year, month=event_day.month, day=event_day.day)
                            event_end_time = dateutil.parser.parse(g2).replace(year=event_day.year, month=event_day.month, day=event_day.day)
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        # Between 10:12 UTC 4 Dec and 15:14 UTC 6 Dec 2017,
                        good_times = re.match(r'.*Between.*?([0-9].+?) ([0-9].+?) and ([0-9].+?) ([0-9].+?[0-9]{4})', partial_content)
                        if good_times is not None:
                            event_start_day = dateutil.parser.parse(good_times.group(2).strip())
                            event_end_day = dateutil.parser.parse(good_times.group(4).strip())
                            # Notice that we take the end year for both times
                            event_start_time = dateutil.parser.parse(good_times.group(1).strip()).replace(year=event_end_day.year, month=event_start_day.month, day=event_start_day.day)
                            event_end_time = dateutil.parser.parse(good_times.group(3).strip()).replace(year=event_end_day.year, month=event_end_day.month, day=event_end_day.day)
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        good_times = re.match(r'.*Between.*?(.+? UTC) (?:(?:and)|(?:to)) (.+? UTC)', partial_content)
                        if good_times is not None:
                            event_start_time = dateutil.parser.parse(good_times.group(1).strip()).replace(year=metadata['year'])
                            event_end_time = dateutil.parser.parse(good_times.group(2).strip()).replace(year=metadata['year'])
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        # Between as early as 09:00 UTC on Sep 05 and as late as 05:50 UTC on Sep 10,
                        good_times = re.match(r'.*(?:(?:Between)|(?:From)).*?([0-9].+?) on (.+?[0-9]+) (?:(?:and)|(?:to)).*?([0-9].+?) on (.+?[0-9]+)', partial_content)
                        if good_times is not None:
                            event_start_day = dateutil.parser.parse(good_times.group(2).strip())
                            event_end_day = dateutil.parser.parse(good_times.group(4).strip())
                            # No year in this format: take it from the archive
                            # metadata (event_start_time/event_end_time are
                            # still None here, so they cannot supply one).
                            # NOTE(review): a window spanning New Year would get
                            # the same year on both ends — confirm if it occurs.
                            event_start_time = dateutil.parser.parse(good_times.group(1).strip()).replace(year=metadata['year'], month=event_start_day.month, day=event_start_day.day)
                            event_end_time = dateutil.parser.parse(good_times.group(3).strip()).replace(year=metadata['year'], month=event_end_day.month, day=event_end_day.day)
                            # sometimes the timezones are different from UTC
                            event_start_time = event_start_time.replace(tzinfo=event_end_time.tzinfo)

                    if good_times is None:
                        raise Exception("New kind of description does not match existing regexes")

                    # Sanity check of the parsed window; Multi-Factor
                    # Authentication incidents are exempt from it.
                    if 'Multi-Factor Authentication' not in description:
                        assert event_start_time <= event_end_time

                except Exception:
                    print(description)
                    print(partial_content)
                    raise

                # Only the first usable "between" paragraph is considered.
                break

        # Incidents without a parsed window keep -1 in both event time columns.
        has_window = event_start_time is not None
        ls.append({
            'service_id': '',
            'service_name': service,
            'location': location,
            'status': -1,
            'event_start_time': event_start_time.timestamp() if has_window else -1,
            'event_end_time': event_end_time.timestamp() if has_window else -1,
            'first_notification': -1,
            'last_notification': -1,
            'description': description
        })

    df = pd.DataFrame(ls, columns=[
        'service_id',
        'service_name',
        'location',
        'status',
        'event_start_time',
        'event_end_time',
        'first_notification',
        'last_notification',
        'description'
    ]).drop_duplicates()

    return df

In [4]:
def process_gcloud_feed(file_obj, metadata):
    """Convert a Google Cloud incident JSON feed into the common event table.

    Args:
        file_obj: File-like object with the JSON feed, read by ``pd.read_json``.
        metadata: Accepted for signature parity with the other parsers; it is
            not read by this function.

    Returns:
        pandas.DataFrame with the columns ``service_id``, ``service_name``,
        ``location`` (always ``None``), ``status`` (0 = high, 1 = medium,
        2 = low, 3 = anything else), ``event_start_time``, ``event_end_time``,
        ``first_notification``, ``last_notification`` (int64, obtained by
        integer-dividing the datetime64 integer values by 10**9) and
        ``description`` (``external_desc`` followed by ``str(updates)``).
    """
    time_columns = [
        'event_start_time',
        'event_end_time',
        'first_notification',
        'last_notification',
    ]

    def map_status(as_string):
        # Position in the tuple is the numeric status; unknown values map to 3.
        for code, label in enumerate(('high', 'medium', 'low')):
            if as_string == label:
                return code
        return 3

    incidents = pd.read_json(file_obj)
    incidents = incidents.rename(columns={
        'begin': 'event_start_time',
        'end': 'event_end_time',
        'created': 'first_notification',
        'modified': 'last_notification',
        'service_key': 'service_id',
        'service_name': 'service_name',
    })
    incidents = incidents.drop(columns=['most-recent-update', 'number', 'public', 'uri'])

    incidents['description'] = incidents['external_desc'] + incidents['updates'].apply(str)
    incidents['status'] = incidents['severity'].apply(map_status)
    incidents = incidents.drop(columns=['severity', 'external_desc', 'updates'])
    incidents['location'] = None

    # Incidents without an end time are given their start time as end time.
    still_open = incidents['event_end_time'].isnull()
    incidents.loc[still_open, 'event_end_time'] = incidents['event_start_time']

    # Integer representation of each datetime, scaled down by 10**9.
    for column in time_columns:
        raw_ticks = pd.to_datetime(incidents[column]).values.astype(np.int64)
        incidents[column] = (raw_ticks // 10**9).astype(np.int64)

    return incidents[[
        'service_id',
        'service_name',
        'location',
        'status',
    ] + time_columns + ['description']]

In [5]:
# Prefix given to every temporary file/directory this notebook creates.
APP_TEMP_PREFIX = 'ctzph-01-'

# Splits archive paths on both '/' and '.', so the extension and every path
# component end up as separate elements.
pathSplitRegex = re.compile(r'[./]')

# Single-row sentinel frame returned wherever no events could be extracted.
# It carries every output column, with '' for text and -1 for numeric fields.
PLACEHOLDER_DF = pd.DataFrame([dict(
    service_id='',
    service_name='',
    location='',
    status=np.int32(-1),
    event_start_time=np.int64(-1),
    event_end_time=np.int64(-1),
    first_notification=np.int64(-1),
    last_notification=np.int64(-1),
    description='',
    vendor='',
    monitor='',
    org_type='',
)])

def populateMetadataFromSourceDirName(metadata, sourceInfo):
    """Fill ``metadata`` in place with what the source directory name implies.

    Args:
        metadata: Dict that receives ``vendor``/``monitor``/``org_type`` for
            self-reporting providers, ``monitor`` and ``location`` for Down
            Detector sources, or only ``monitor`` for Down Right Now and
            Outage Report.
        sourceInfo: Name of the top-level source directory.

    Returns:
        True when the source is recognised and should be processed; False for
        the sources that are skipped on purpose (``cloud-apple-consumer*`` and
        ``gpanel*``), in which case ``metadata`` is left untouched.

    Raises:
        Exception: 'Unknown location' for a Down Detector source with an
            unrecognised country suffix (``monitor`` has already been set by
            then), or 'Unknown data source: <name>' for any other unknown
            directory name.
    """
    # (directory prefix, vendor == monitor label, organisation type)
    self_reporting_sources = (
        ('cloud-amazon-web-services', 'AWS', 'cloud'),
        ('cloud-google-apps', 'Google Apps', 'application'),
        ('cloud-google-cloud-platform', 'GCP', 'cloud'),
        ('cloud-microsoft-azure', 'Azure', 'cloud'),
        ('cloudflare-status', 'Cloudflare', 'cloud'),
        ('github-status', 'Github', 'application'),
        ('atlassian', 'Atlassian', 'application'),
        ('docker', 'Docker', 'application'),
        ('slack', 'Slack', 'application'),
        ('minecraft', 'Minecraft', 'application'),
        ('nintendo', 'Nintendo', 'application'),
        ('discord', 'Discord', 'application'),
    )
    # Third-party monitors for which only the monitor name is known here.
    monitor_only_sources = (
        ('downrightnow', 'Down Right Now'),
        ('outage', 'Outage Report'),
    )
    # (directory suffix, location label) for the per-country Down Detector feeds.
    downdetector_countries = (
        ('united-arab-emirates', 'UAE'),
        ('argentina', 'Argentina'),
        ('australia', 'Australia'),
        ('austria', 'Austria'),
        ('belgium', 'Belgium'),
        ('brazil', 'Brazil'),
        ('canada', 'Canada'),
        ('switzerland', 'Switzerland'),
        ('chile', 'Chile'),
        ('denmark', 'Denmark'),
        ('germany', 'Germany'),
        ('spain', 'Spain'),
        ('finland', 'Finland'),
        ('france', 'France'),
        ('great-britain', 'Great Britain'),
        ('india', 'India'),
        ('ireland', 'Ireland'),
        ('italy', 'Italy'),
        ('japan', 'Japan'),
        ('mexico', 'Mexico'),
        ('netherlands', 'Netherlands'),
        ('norway', 'Norway'),
        ('new-zealand', 'New Zealand'),
        ('poland', 'Poland'),
        ('portugal', 'Portugal'),
        ('russia', 'Russia'),
        ('singapore', 'Singapore'),
        ('sweden', 'Sweden'),
        ('south-africa', 'South Africa'),
    )

    for prefix, label, org_type in self_reporting_sources:
        if sourceInfo.startswith(prefix):
            metadata['vendor'] = label
            metadata['monitor'] = label
            metadata['org_type'] = org_type
            return True

    if sourceInfo.startswith('downdetector'):
        metadata['monitor'] = 'Down Detector'

        # The bare directory name is the USA feed; the others end in a country.
        if sourceInfo == 'downdetector':
            metadata['location'] = 'USA'
            return True
        for suffix, country in downdetector_countries:
            if sourceInfo.endswith(suffix):
                metadata['location'] = country
                return True
        raise Exception('Unknown location')

    for prefix, monitor in monitor_only_sources:
        if sourceInfo.startswith(prefix):
            metadata['monitor'] = monitor
            return True

    # Sources that are deliberately not processed.
    if sourceInfo.startswith(('cloud-apple-consumer', 'gpanel')):
        return False

    raise Exception('Unknown data source: ' + sourceInfo)

def processHourDir(oneHourDirPath, metadata):
    """Parse the snapshot stored in one extracted hour directory.

    Args:
        oneHourDirPath: Path of the hour directory; its last component is
            expected to look like ``20171114T000001``.
        metadata: Dict describing the source. ``metadata['monitor']`` selects
            the parser; ``metadata['hour']`` is set here (characters 9-10 of
            the directory name, kept as a string).

    Returns:
        The parsed events with ``vendor``, ``monitor`` and ``org_type`` columns
        added (and ``location`` overwritten when ``metadata`` has one), or the
        shared ``PLACEHOLDER_DF`` when the monitor has no parser, the Azure
        history page is absent, or the parser produced no rows.
    """
    # ['', 'tmp', 'tmp9_ms31tl', '20171114T000001']
    metadata['hour'] = oneHourDirPath.split('/')[-1][9:11]

    def feed_path(file_name):
        return os.path.join(oneHourDirPath, file_name)

    # Pick the parser from the monitor name; anything else stays unparsed.
    monitor_name = metadata['monitor']
    parsed = None
    if monitor_name == 'AWS':
        with open(feed_path('json-feed.html'), 'r') as feed:
            parsed = pd.DataFrame(process_aws_feed(feed))
    elif monitor_name == 'Azure' and 'history.html' in os.listdir(oneHourDirPath):
        with open(feed_path('history.html'), 'r') as feed:
            parsed = pd.DataFrame(process_azure_feed(feed, metadata))
    elif monitor_name == 'GCP':
        with open(feed_path('json-feed.html'), 'r') as feed:
            parsed = pd.DataFrame(process_gcloud_feed(feed, metadata))

    if parsed is None or len(parsed) == 0:
        return PLACEHOLDER_DF

    # Stamp every row with the provenance of this source.
    for column in ('vendor', 'monitor', 'org_type'):
        parsed[column] = metadata[column]
    if 'location' in metadata:
        parsed['location'] = metadata['location']

    return parsed

def processDayZip(oneDayZipPath, metadata):
    """Extract one per-day zip archive and parse every entry inside it.

    Args:
        oneDayZipPath: Path of the day archive; the component before the
            extension is expected to look like ``20171114``.
        metadata: Dict describing the source; ``metadata['day']`` is set here
            (day of month as int) and the dict is passed on to
            ``processHourDir``.

    Returns:
        List with one DataFrame per top-level entry of the archive, as
        returned by ``processHourDir``.
    """
    # ['', 'tmp', 'tmp9_ms31tl', '20171114', 'zip']
    metadata['day'] = int(pathSplitRegex.split(oneDayZipPath)[-2][6:])

    with tempfile.TemporaryDirectory(prefix=APP_TEMP_PREFIX) as dayTempDir:
        # Close the archive handle as soon as the extraction is done instead
        # of leaving it to the garbage collector.
        with zipfile.ZipFile(oneDayZipPath) as oneDayZip:
            oneDayZip.extractall(dayTempDir)

        # Parsing must finish before the temporary directory is removed, so
        # the result list is built eagerly inside the `with` block.
        return [
            processHourDir(os.path.join(dayTempDir, entry), metadata)
            for entry in os.listdir(dayTempDir)
        ]

def processMonthZip(squashFsImage, monthZipPath):
    """Extract one per-month zip from the SquashFS image and parse its days.

    Args:
        squashFsImage: Opened image object; ``root.select(path)`` must return
            a handle whose ``getContent()`` yields the archive bytes.
        monthZipPath: Path of the month archive inside the image, e.g.
            ``/cloud-amazon-web-services/2017/201711.zip``.

    Returns:
        Flat list of DataFrames (one per hour snapshot, see ``processDayZip``).
        An empty list is returned when the path cannot be parsed (the error
        and the path are printed) or when the source is skipped on purpose.

    Raises:
        Exception: propagated from ``populateMetadataFromSourceDirName`` for
            unknown sources, and from the parsers.
    """
    # Collecting info for the dataframe
    # ['', 'cloud-amazon-web-services', '2017', '201711', 'zip']
    try:
        monthZipPathSplits = pathSplitRegex.split(monthZipPath)
        sourceInfo = monthZipPathSplits[1]
        # outage.report has a period in the name. Thus, will lead to additional splits
        yearIndex = 3 if sourceInfo.startswith('outage') else 2
        yearInfo = int(monthZipPathSplits[yearIndex])
        monthInfo = int(monthZipPathSplits[yearIndex + 1][4:])
    except Exception as e:
        # Report the actual error (not the Exception class) together with the
        # offending path, then skip this archive.
        print(repr(e))
        print(monthZipPath)
        return []

    metadata = {
        'year': yearInfo,
        'month': monthInfo
    }

    validSource = populateMetadataFromSourceDirName(metadata, sourceInfo)
    if not validSource:
        return []

    monthZipHandle = squashFsImage.root.select(monthZipPath)
    with tempfile.TemporaryFile(prefix=APP_TEMP_PREFIX) as monthZipTempFile:
        monthZipTempFile.write(monthZipHandle.getContent())

        with tempfile.TemporaryDirectory(prefix=APP_TEMP_PREFIX) as monthTempDir:
            # Close the archive handle right after extraction.
            with zipfile.ZipFile(monthZipTempFile) as oneMonthZip:
                oneMonthZip.extractall(monthTempDir)

            # Built eagerly: the day archives disappear with the temp directory.
            dayZipPaths = [os.path.join(monthTempDir, entry) for entry in os.listdir(monthTempDir)]
            return list(itertools.chain.from_iterable(
                processDayZip(oneDayZipPath, metadata) for oneDayZipPath in dayZipPaths
            ))

In [6]:
# Year to export and location of the SquashFS archive holding the raw traces.
year_of_interest = 2020
archive_path = "/var/scratch/atlarge/traces/cloud-availability-sacheen-2021-05-20.sqsh"

# Install pysquashfs directly from git. The version on pypi has bug and hasn't been fixed in over 4 years.
image = SquashFsImage(archive_path)

# Paths of all non-folder entries whose path contains the year of interest.
# There is a zipfile for each month
zipfiles = [
    entry.getPath()
    for entry in image.root.findAll()
    if not entry.isFolder() and str(year_of_interest) in entry.getPath()
]

In [7]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client

In [8]:
# Describe the SLURM-backed Dask cluster: 16 cores, 64 GB and 16 worker
# processes per job, a two-hour walltime, the 'ib0' network interface, and the
# dashboard requested on port 8787 (the captured output below shows a random
# port was used instead because 8787 was taken).
cluster = SLURMCluster(cores=16, memory="64 GB", processes=16,
                       local_directory="./scheduler_spill",
                       scheduler_options={'dashboard_address': ':8787'},
                       interface='ib0', walltime='02:00:00')
# Ask for a scale-out of 10.
# NOTE(review): whether this counts jobs or worker processes depends on the
# installed dask_jobqueue version — confirm.
cluster.scale_up(10)
# Client bound to this cluster.
client = Client(cluster)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [9]:
def metaProcessMonthZip(path):
    """Build the DataFrame for one month archive (one Dask task).

    The SquashFS image is opened inside the function from ``archive_path``
    rather than passed in as an argument.

    Args:
        path: Path of the month zip inside the SquashFS image.

    Returns:
        The concatenation of all frames produced by ``processMonthZip``, or
        ``PLACEHOLDER_DF`` when it returned something other than a non-empty
        list.
    """
    month_frames = processMonthZip(SquashFsImage(archive_path), path)
    if type(month_frames) is list and len(month_frames) > 0:
        return pd.concat(month_frames)
    return PLACEHOLDER_DF

# Column name -> dtype declared for the lazy frame built from the tasks.
output_schema = {
    'service_id': str,
    'service_name': str,
    'location': str,
    'status': np.int32,
    'event_start_time': np.int64,
    'event_end_time': np.int64,
    'first_notification': np.int64,
    'last_notification': np.int64,
    'description': str,
    'vendor': str,
    'monitor': str,
    'org_type': str
}

# One delayed task per month archive.
build_month_frame = delayed(metaProcessMonthZip)
tasks = [build_month_frame(month_zip_path) for month_zip_path in zipfiles]

df2 = dd.from_delayed(tasks, meta=output_schema)

# The same incident shows up in many snapshots; keep one row per
# (service, description, location, start time).
df3 = df2.drop_duplicates(subset=['service_name', 'description', 'location', 'event_start_time'])

In [10]:
# Show the column dtypes of the lazy, de-duplicated frame before exporting it.
df3.dtypes

service_id            object
service_name          object
location              object
status                 int32
event_start_time       int64
event_end_time         int64
first_notification     int64
last_notification      int64
description           object
vendor                object
monitor               object
org_type              object
dtype: object

In [11]:
# Disabled debugging aid: persist the frame and replay a failing task locally.
# fut = client.persist(df3)
# client.recreate_error_locally(fut)
# Write the de-duplicated frame as a parquet dataset and record wall-clock
# times around the call; the elapsed time is printed in the next cell.
start = datetime.datetime.now()
df3.to_parquet('/var/scratch/stalluri/provider_failures_2020')
end = datetime.datetime.now()
# Disabled alternatives for shutting the cluster down / computing in memory.
# await cluster.scale_down(cluster.workers)
# cluster.close()
# df3.compute()



In [12]:
# Wall-clock duration of the parquet export measured in the previous cell.
print(end - start)

0:23:56.342744


In [13]:
# Release the cluster's workers now that the export is done. The top-level
# `await` relies on the notebook kernel supporting it.
await cluster.scale_down(cluster.workers)