# ToR Piloting Exit Analysis

In [None]:
# This notebook collects key data for all devices in the pilot schedule for you to make a determination on whether or not to exit the pilot

import pandas as pd
import ipywidgets as widgets

In [None]:
# Pilot schedules that can be analysed, keyed by platform. Each entry pairs the
# target OS version with its schedule identifier so the two values are always
# switched together (previously the alternate pilot lived in commented-out
# lines, which made it possible to uncomment one value but not the other).
PILOT_SCHEDULES = {
    "7060": {
        "os_version": "20240510.25",
        "schedule_identifier": "FirmwareUpgrade_31824801_vadixit_Pilot_ToRRouter_SONiC-Arista-7060-ToRRouter_SONiC.20240510.25_HitlessReload_20250314",
    },
    "7260": {
        "os_version": "20240510.23",
        "schedule_identifier": "FirmwareUpgrade_31411035_vadixit_Pilot_ToRRouter_SONiC-Arista-7260CX364-ToRRouter_SONiC.20240510.23_HitlessReload_20250222",
    },
}

# Change this single value to analyse a different pilot.
pilot_platform = "7060"

# The rest of the notebook reads these two names.
os_version = PILOT_SCHEDULES[pilot_platform]["os_version"]
schedule_identifier = PILOT_SCHEDULES[pilot_platform]["schedule_identifier"]

In [None]:
from utilities.fuse import get_devices_in_schedule, get_successful_FirmwareUpgrades_to_version_for_schedule

# Everything scheduled in the pilot, and the subset of upgrade records that
# reached the target OS version. Later cells iterate over `successful_upgrades`.
devices_in_schedule = get_devices_in_schedule(schedule_identifier)
successful_upgrades = get_successful_FirmwareUpgrades_to_version_for_schedule(schedule_identifier, os_version)

# Count distinct devices rather than rows: the message reports devices, and the
# other cells de-duplicate the "device" column before using it, so a device
# with more than one successful upgrade record must only be counted once here.
upgraded_device_count = successful_upgrades["device"].nunique()

print(f"{upgraded_device_count} of {len(devices_in_schedule)} devices in the schedule have successfully upgraded to {os_version}")

### Check for IcM
There should not be any active or mitigated IcMs. If there are, they need to be resolved and root-caused, and any necessary fixes applied, to ensure they don't impact the fleet-wide rollout.

In [None]:
from utilities.icm import get_all_icms_since_time_ago
from IPython.display import display

icm_raised_devices = {}

# Fetch IcMs for every upgraded device, passing a "90d" look-back argument.
all_devices: list = successful_upgrades["device"].unique().tolist()
all_device_icms = get_all_icms_since_time_ago(all_devices, "90d")

# Pair each IcM with the upgrade record of the device it occurred on, then keep
# only the IcMs whose CreateDate is later than that upgrade's startTime.
all_device_icms_with_upgrades = all_device_icms.merge(
    successful_upgrades,
    how="inner",
    left_on="OccurringDeviceName",
    right_on="device",
).loc[lambda merged: merged["startTime"] < merged["CreateDate"]]

# Split the remaining IcMs by the two statuses that block a pilot exit.
all_device_mitigated_icms = all_device_icms_with_upgrades.loc[
    all_device_icms_with_upgrades["Status"].eq("MITIGATED")
]
all_device_active_icms = all_device_icms_with_upgrades.loc[
    all_device_icms_with_upgrades["Status"].eq("ACTIVE")
]

# Report mitigated IcMs first, then active ones.
for icms, found_message, clean_message in (
    (all_device_mitigated_icms, "All device mitigated ICMs:", "No device mitigated ICMs ‚úÖ"),
    (all_device_active_icms, "All device active ICMs:", "No device active ICMs ‚úÖ"),
):
    if len(icms) == 0:
        print(clean_message)
    else:
        print(found_message)
        display(icms)

### Check for BGP flaps
There should not be any LAG flaps between T0<->T1; if there are, the cause needs to be investigated in depth. Note that this is not a bullet-proof query: logs from the T1 neighbours may not have been upstreamed to Kusto, so a possible LAG flap could be missed. We recommend spot-checking a few T1 neighbours to gain confidence.

In [None]:
from utilities.dataplane_drop import get_t1_peers_bgp_flap_logs_in_time_window

from IPython.display import display
import time

# Maps device name -> the non-empty BGP flap logs returned for its upgrade window.
lag_flapping_devices = {}

total_upgrades = len(successful_upgrades)

# Create widgets for progress tracking
progress = widgets.FloatProgress(min=0, max=total_upgrades, description='Progress:')
status_text = widgets.HTML(f"Checking BGP flaps: 0/{total_upgrades}")
progress_box = widgets.VBox([widgets.HBox([progress, status_text])])
display(progress_box)

# Process each device and update progress.
# `iterrows()` yields the index *label*, which is only a usable position when
# the frame has a default 0..n-1 index, so progress is driven by a separate
# 1-based counter from `enumerate` instead.
for position, (index, row) in enumerate(successful_upgrades.iterrows(), start=1):
    device = row['device']
    start_time = row['startTime']
    end_time = row['endTime']

    # Update progress and status text
    progress.value = position
    status_text.value = f"Checking device {device} for BGP flap logs: {position}/{total_upgrades}"

    # Get BGP flap logs for the device's upgrade window
    bgp_flap_logs = get_t1_peers_bgp_flap_logs_in_time_window(device, start_time, end_time)
    if not bgp_flap_logs.empty:
        status_text.value += " <span style='color:red'>‚ö†Ô∏è BGP flaps found!</span>"
        lag_flapping_devices[device] = bgp_flap_logs

    # Small delay to allow UI update
    time.sleep(0.01)

# Final status update and report
if lag_flapping_devices:
    status_text.value = f"Completed: Found {len(lag_flapping_devices)} devices with BGP flap logs üí•"
    print(f"Found {len(lag_flapping_devices)} devices with BGP flap logs üí•:")
    for device, logs in lag_flapping_devices.items():
        print(f"Device: {device}")
        # Rich display instead of print() so the logs render as a table.
        display(logs)
else:
    status_text.value = "Completed: No devices with BGP flap logs found ‚úÖ"
    print("No devices with BGP flap logs found ‚úÖ")


### Cumulative Runtime
We are looking for more than 10,000 hours of runtime here (ideally per HwSku).

In [None]:
from utilities.device import get_devices_by_names, get_runtime_hours_by_version_and_hwsku

# Resolve the upgraded devices to their device records, then collect the
# distinct hardware SKUs present among them.
all_device_names = successful_upgrades["device"].drop_duplicates().tolist()
all_devices_df = get_devices_by_names(all_device_names)
all_hwskus = all_devices_df["HardwareSku"].drop_duplicates().tolist()

# Show the runtime hours for the pilot OS version, one SKU at a time.
for hwsku in all_hwskus:
    print(f"HardwareSku: {hwsku}")
    runtime_hours = get_runtime_hours_by_version_and_hwsku(os_version, hwsku)
    display(runtime_hours)

### Syslog
#### Excessive Syslog

In [None]:
from utilities.logs import get_devices_with_excessive_syslog

# Ask the helper which of the upgraded devices produced excessive syslog.
device_names = successful_upgrades["device"].drop_duplicates().tolist()
device_syslogs = get_devices_with_excessive_syslog(device_names)

# Report the clean case first; otherwise show the offending devices.
if len(device_syslogs) == 0:
    print("No devices found with excessive syslog messages. ‚úÖ")
else:
    print(f"Found {len(device_syslogs)} devices with excessive syslog messages ‚ö†Ô∏è:")
    display(device_syslogs)

#### Unexpected Error/Warning syslogs

In [None]:
# TODO: Implement this check.
# Placeholder for the patterns of unexpected error/warning syslogs; it is
# intentionally empty until the check is written.
unexpected_syslog_rgxs: list = []

### Diversity
#### Syncd restore count
We want to see devices that have made 1, 2, 3, etc. hops to land on the pilot version.

In [None]:
from utilities.logs import get_syncd_restore_count

from collections import Counter
from IPython.display import display
import time

# Maps device name -> the "warm-reboot count: ..." text returned for it.
devices_and_restore_counts = {}
# Maps device name -> whatever was returned when the result was not a
# "warm-reboot count:" line. Kept so failures stay visible after the loop;
# the status line alone is overwritten on the next iteration.
devices_with_restore_count_errors = {}

total_upgrades = len(successful_upgrades)

# Create widgets for progress tracking
progress = widgets.FloatProgress(min=0, max=total_upgrades, description='Progress:')
status_text = widgets.HTML(f"Checking syncd restore counts: 0/{total_upgrades}")
progress_box = widgets.VBox([widgets.HBox([progress, status_text])])
display(progress_box)

# Process each device and update progress.
# `iterrows()` yields the index *label*, which is only a usable position when
# the frame has a default 0..n-1 index, so progress is driven by a separate
# 1-based counter from `enumerate` instead.
for position, (index, row) in enumerate(successful_upgrades.iterrows(), start=1):
    device = row['device']
    start_time = row['startTime']
    end_time = row['endTime']

    # Update progress and status text
    progress.value = position
    status_text.value = f"Checking device {device} for syncd restore count: {position}/{total_upgrades}"

    # Get restore count; anything not starting with the expected prefix is
    # treated as an error message.
    restore_count = get_syncd_restore_count(device, start_time, end_time)
    if restore_count.startswith("warm-reboot count:"):
        devices_and_restore_counts[device] = restore_count
    else:
        status_text.value += f" <span style='color:red'>‚ö†Ô∏è Error: {restore_count}</span>"
        devices_with_restore_count_errors[device] = restore_count

    # Small delay to allow UI update
    time.sleep(0.01)

# Final status update
status_text.value = f"Completed: Checked syncd restore counts for {total_upgrades} devices ‚úÖ"

# Surface lookups that failed so they are not silently dropped from the summary
if devices_with_restore_count_errors:
    print(f"Could not determine the restore count for {len(devices_with_restore_count_errors)} devices:")
    for device, error in devices_with_restore_count_errors.items():
        print(f"Device: {device}, Error: {error}")

# Aggregate by restore counts: how many devices reported each distinct value
restore_counts_summary = Counter(devices_and_restore_counts.values())
print("Restore counts summary:")
for restore_count, count in restore_counts_summary.items():
    print(f"Restore count: {restore_count}, Count: {count}")


#### Dataplane drops
The following captures ToRs that had dataplane impact as per NetVMA data during the FUSE upgrade window. False positives do occur in this window and require manual checking, but we'd rather do that than over-filter and miss true positives. If there are no drops during the warm-upgrade downtime, it is not a true dataplane impact and can be ignored.

In [None]:
from utilities.dataplane_drop import apply_all_dataplane_drop_info_on_row

from IPython.display import display
import time

total_upgrades = len(successful_upgrades)

# Create widgets for progress tracking
progress = widgets.FloatProgress(min=0, max=total_upgrades, description='Progress:')
status_text = widgets.HTML(f"Checking dataplane drops: 0/{total_upgrades}")
progress_box = widgets.VBox([widgets.HBox([progress, status_text])])
display(progress_box)

# Process each row with progress tracking.
# `iterrows()` yields the index *label*, which is only a usable position when
# the frame has a default 0..n-1 index, so progress is driven by a separate
# 1-based counter from `enumerate` instead.
processed_rows = []
for position, (index, row) in enumerate(successful_upgrades.iterrows(), start=1):
    # Update progress and status
    progress.value = position
    status_text.value = f"Checking device {row['device']} for dataplane drops: {position}/{total_upgrades}"

    # Process the row; the helper's return value is collected as-is
    processed_row = apply_all_dataplane_drop_info_on_row(row)
    processed_rows.append(processed_row)

    # Small delay to allow UI update
    time.sleep(0.01)

# Build the frame once from the collected rows instead of growing it in the loop
upgrades_with_dataplane_drop_info = pd.DataFrame(processed_rows)

# Final status update
status_text.value = f"Completed: Checked {total_upgrades} devices for dataplane drops ‚úÖ"

In [None]:
from utilities.dataplane_drop import Availability

# Sort by consolidated_status
upgrades_with_dataplane_drop_info = upgrades_with_dataplane_drop_info.sort_values(by="consolidated_status")

# Keep only the rows whose status is one of the three drop values
upgrades_with_drops = upgrades_with_dataplane_drop_info[
    upgrades_with_dataplane_drop_info["consolidated_status"].isin([Availability.NODE_DROP, Availability.TOR_DROP, Availability.BOTH_DROP])
]

# List the devices with drops and the NetVMA link to view
if len(upgrades_with_drops) > 0:
    print(f"Found {len(upgrades_with_drops)} devices with dataplane drops ‚ö†Ô∏è:")
    # Show the full URLs, but only for this one table: option_context restores
    # the previous column width on exit, so other cells (including re-runs of
    # earlier ones) keep their normal display settings.
    with pd.option_context('display.max_colwidth', None):
        display(upgrades_with_drops[['device', 'netvma_url']])
else:
    print("No devices with dataplane drops found ‚úÖ")

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo
from IPython.display import Markdown, display

# Record when this report was produced, in US Pacific time (swap in your own
# timezone if needed). %Z renders the zone abbreviation in effect at that time.
now_pst = datetime.now(tz=ZoneInfo("America/Los_Angeles"))
stamp = f"{now_pst:%Y-%m-%d %H:%M %Z}"
print("Report generated on:", stamp)