#!/bin/bash
# Expect base collection path as an argument
BASE_COLLECTION_PATH=$1
# Expect time option as an argument
SINCE_TIME=$2
# Use PWD as base path if no argument is passed
if [ "${BASE_COLLECTION_PATH}" = "" ]; then
BASE_COLLECTION_PATH=$(pwd)
fi
CEPH_COLLECTION_PATH="${BASE_COLLECTION_PATH}/ceph"
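# Pod templates used to spawn the ceph command helper pod; which variant is
# applied depends on the OCS version detected below.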
POD_TEMPLATE_LATEST="/templates/pod.template.latest"
POD_TEMPLATE_STANDARD="/templates/pod.template.standard"
SED_DELIMITER=$(echo -en "\001");
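# safe_replace substitutes $1 with $2 on stdin; the \001 delimiter keeps
# replacement values that contain '/' (such as image paths) from breaking sed.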
safe_replace () {
sed "s${SED_DELIMITER}${1}${SED_DELIMITER}${2}${SED_DELIMITER}g"
}
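# Render a helper pod manifest from the template, filling in the namespace,
# the rook operator image and this must-gather pod's hostname, then apply it.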
apply_standard_helper_pod() {
< ${POD_TEMPLATE_STANDARD} safe_replace "NAMESPACE" "$1" | safe_replace "IMAGE_NAME" "$2" | safe_replace "MUST_GATHER" "$HOSTNAME" > pod_helper.yaml
oc apply -f pod_helper.yaml
}
apply_latest_helper_pod() {
< ${POD_TEMPLATE_LATEST} safe_replace "NAMESPACE" "$1" | safe_replace "IMAGE_NAME" "$2" | safe_replace "MUST_GATHER" "$HOSTNAME" > pod_helper.yaml
oc apply -f pod_helper.yaml
}
# Ceph resources
ceph_resources=()
ceph_resources+=(cephblockpools)
ceph_resources+=(cephclusters)
ceph_resources+=(cephfilesystems)
ceph_resources+=(cephobjectstores)
ceph_resources+=(cephobjectstoreusers)
# Ceph commands
ceph_commands=()
ceph_commands+=("ceph auth list")
ceph_commands+=("ceph balancer dump")
ceph_commands+=("ceph balancer pool ls")
ceph_commands+=("ceph balancer status")
ceph_commands+=("ceph config dump")
ceph_commands+=("ceph config-key ls")
ceph_commands+=("ceph crash ls")
ceph_commands+=("ceph crash stat")
ceph_commands+=("ceph device ls")
ceph_commands+=("ceph df detail")
ceph_commands+=("ceph fs dump")
ceph_commands+=("ceph fs ls")
ceph_commands+=("ceph fs status")
ceph_commands+=("ceph fs subvolumegroup ls ocs-storagecluster-cephfilesystem")
ceph_commands+=("ceph fs subvolume ls ocs-storagecluster-cephfilesystem csi")
ceph_commands+=("ceph health detail")
ceph_commands+=("ceph mds stat")
ceph_commands+=("ceph mgr dump")
ceph_commands+=("ceph mgr module ls")
ceph_commands+=("ceph mgr services")
ceph_commands+=("ceph mon stat")
ceph_commands+=("ceph mon dump")
ceph_commands+=("ceph osd df tree")
ceph_commands+=("ceph osd tree")
ceph_commands+=("ceph osd stat")
ceph_commands+=("ceph osd dump")
ceph_commands+=("ceph osd utilization")
ceph_commands+=("ceph osd crush show-tunables")
ceph_commands+=("ceph osd crush dump")
ceph_commands+=("ceph osd crush weight-set ls")
ceph_commands+=("ceph osd crush weight-set dump")
ceph_commands+=("ceph osd crush rule dump")
ceph_commands+=("ceph osd crush rule ls")
ceph_commands+=("ceph osd crush class ls")
ceph_commands+=("ceph osd perf")
ceph_commands+=("ceph osd numa-status")
ceph_commands+=("ceph osd getmaxosd")
ceph_commands+=("ceph osd drain status")
ceph_commands+=("ceph osd pool ls detail")
ceph_commands+=("ceph osd lspools")
ceph_commands+=("ceph osd df")
ceph_commands+=("ceph osd blocked-by")
ceph_commands+=("ceph osd blacklist ls")
ceph_commands+=("ceph pg dump")
ceph_commands+=("ceph pg stat")
ceph_commands+=("ceph pool autoscale-status")
ceph_commands+=("ceph progress")
ceph_commands+=("ceph progress json")
ceph_commands+=("ceph quorum_status")
ceph_commands+=("ceph report")
ceph_commands+=("ceph service dump")
ceph_commands+=("ceph status")
ceph_commands+=("ceph time-sync-status")
ceph_commands+=("ceph versions")
# Ceph volume commands
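# These commands are executed inside each OSD pod later on, since ceph-volume
# inspects the node-local devices backing the OSD.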
ceph_volume_commands=()
ceph_volume_commands+=("ceph-volume lvm list")
ceph_volume_commands+=("ceph-volume raw list")
# Inspecting ceph related custom resources for all namespaces
for resource in "${ceph_resources[@]}"; do
echo "collecting dump ${resource}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
{ oc adm --dest-dir="${CEPH_COLLECTION_PATH}" inspect "${resource}" --all-namespaces --"${SINCE_TIME}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
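# Find the namespaces where the OCS operator is installed by matching the
# olm.owner label on deployments.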
namespaces=$(oc get deploy --all-namespaces -o go-template --template='{{range .items}}{{if .metadata.labels}}{{printf "%s %v" .metadata.namespace (index .metadata.labels "olm.owner")}} {{printf "\n"}}{{end}}{{end}}' | grep ocs-operator | awk '{print $1}' | uniq)
# Inspect each namespace where the OCS operator is installed
for ns in $namespaces; do
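# The helper pod runs with the rook-ceph-operator image, which ships the ceph
# CLI used by the commands below.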
operatorImage=$(oc get pods -l app=rook-ceph-operator -n openshift-storage -o jsonpath="{range .items[*]}{@.spec.containers[0].image}+{end}" | tr "+" "\n" | head -n1)
cephClusterCount=$(oc get cephcluster -n "${ns}" -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}" | wc -l)
if [ "${operatorImage}" = "" ]; then
echo "not able to find the rook's operator image. Skipping collection of ceph command output" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
elif [[ $cephClusterCount -gt 0 ]]; then
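# Derive a numeric version from the CSV (e.g. 4.6.0 -> 460); OCS 4.6 and later
# use the latest pod template.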
current_version=$(oc get csv -n "${ns}" --no-headers | awk '{print $5}' | tr -dc '0-9')
if [[ $current_version -ge 460 ]]; then
apply_latest_helper_pod "$ns" "$operatorImage"
else
apply_standard_helper_pod "$ns" "$operatorImage"
fi
fi
COMMAND_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/must_gather_commands
COMMAND_JSON_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/must_gather_commands_json_output
mkdir -p "${COMMAND_OUTPUT_DIR}"
mkdir -p "${COMMAND_JSON_OUTPUT_DIR}"
if [ "${operatorImage}" != "" ]; then
for i in {1..50};do
if [ "$(oc get pods "${HOSTNAME}"-helper -n "${ns}" -o jsonpath='{.status.phase}')" = "Running" ]; then
echo "helper pod got deployed successfully." | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
break
fi
echo "waiting for helper pod to come up in ${ns} namespace. Retrying ${i}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
sleep 5
done
# add jq command to the helper pod
timeout 120 oc -n "$ns" exec "${HOSTNAME}"-helper -- bash -c "wget 'http://stedolan.github.io/jq/download/linux64/jq' && chmod 755 jq"
timeout 120 oc -n "$ns" exec "${HOSTNAME}"-helper -- bash -c "mv jq /usr/bin/"
# Collecting output of ceph commands
for ((i = 0; i < ${#ceph_commands[@]}; i++)); do
printf "collecting command output for: %s\n" "${ceph_commands[$i]}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/${ceph_commands[$i]// /_}
JSON_COMMAND_OUTPUT_FILE=${COMMAND_JSON_OUTPUT_DIR}/${ceph_commands[$i]// /_}_--format_json-pretty
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "${ceph_commands[$i]} --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "${ceph_commands[$i]} --connect-timeout=15 --format json-pretty" >> "${JSON_COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
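# List the RBD images in every pool reported by 'ceph osd lspools'.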
for i in $(timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph osd lspools --connect-timeout=15"|awk '{print $2}'); do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd ls -p $i" >> "${COMMAND_OUTPUT_DIR}/pools_rbd_$i"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1;
done
# Collecting snapshot info for ceph rbd volumes
printf "collecting snapshot info for ceph rbd volumes \n" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/rbd_vol_and_snap_info
# Inspecting ceph block pools for ceph rbd
blockpools=$(oc get cephblockpools.ceph.rook.io -n openshift-storage -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}")
for bp in $blockpools; do
images=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "rbd ls -p $bp")
for image in $images; do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd info $image --pool $bp" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd snap ls --all $image --pool $bp" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
# Collecting snapshot information for ceph subvolumes
printf "collecting snapshot info for cephFS subvolumes \n" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/cephfs_subvol_and_snap_info
# Inspecting CephFS filesystems
filesystems=$(oc get cephfilesystems.ceph.rook.io -n openshift-storage -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}")
# Default subvolumegroup in OCS is 'csi'
svg="csi"
for fs in $filesystems; do
subvols=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume ls $fs $svg | jq --raw-output '.[].name' ")
for subvol in $subvols; do
{ printf "Information for subvolume: %s\n" "${subvol}" >> "${COMMAND_OUTPUT_FILE}"; }
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume info $fs $subvol $svg --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
snaps=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot ls $fs $subvol $svg | jq --raw-output '.[].name'")
count=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot ls $fs $subvol $svg | jq --raw-output '.[].name' | wc -l")
{ printf "Snapshot count in subvolume: %s=%s\n" "${subvol}" "${count}" >> "${COMMAND_OUTPUT_FILE}"; }
for snap in $snaps; do
{ printf "Information for snapshot: %s\n" "${snap}" >> "${COMMAND_OUTPUT_FILE}"; }
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot info $fs $subvol $snap $svg --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
done
fi
# Collecting output of ceph volume commands
for ((i = 0; i < ${#ceph_volume_commands[@]}; i++)); do
printf "collecting command output for: %s\n" "${ceph_volume_commands[$i]}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
for osdPod in $(oc get pods -n "${ns}" -l app=rook-ceph-osd --no-headers | awk '{print $1}'); do
pod_status=$(oc get po "${osdPod}" -n "${ns}" -o jsonpath='{.status.phase}')
if [ "${pod_status}" != "Running" ]; then
continue
fi
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/${ceph_volume_commands[$i]// /_}
{ timeout 120 oc -n "${ns}" exec "${osdPod}" -- bash -c "${ceph_volume_commands[$i]}" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
# Add Ready nodes to the list
nodes=$(oc get nodes -l cluster.ocs.openshift.io/openshift-storage='' --no-headers | awk '/\yReady\y/{print $1}')
# Collecting ceph prepare volume logs
volume_collection(){
printf "collecting prepare volume logs from node %s \n" "${node}"
oc rsync -n "${ns}" "$(oc get pods -n "${ns}"| grep "${node//./}-debug"| awk '{print $1}')":/host/var/lib/rook/openshift-storage/log "${NODE_OUTPUT_DIR}"
}
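# Collect a detailed report for every crash listed by 'ceph crash ls'.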
for i in $(timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph crash ls --connect-timeout=15"| awk '{print $1}'); do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph crash info $i --connect-timeout=15" >> "${COMMAND_OUTPUT_DIR}"/crash_"${i}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1;
done
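# Remove the helper pod now that all in-pod command output has been collected.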
oc delete -f pod_helper.yaml
crash_collection(){
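# Spawn a node debug pod in the background and keep it alive for 5 minutes,
# long enough to rsync the crash and prepare-volume logs off the host.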
oc debug nodes/"${node}" --to-namespace="${ns}" -- bash -c "sleep 5m" &
printf "debugging node %s \n" "${node}"
debug_pod_ready=false
for i in {0..300..3}; do
if [ "$(oc get pods -n openshift-storage | grep "${node//./}-debug" | awk '{print $2}')" == "1/1" ] ; then
debug_pod_ready=true
break
else
sleep 3
echo "waiting for the debug pod to be in ready state"
fi
done
if $debug_pod_ready; then
echo "debug pod is ready"
volume_collection
oc rsync -n "${ns}" "$(oc get pods -n "${ns}"| grep "${node//./}-debug"| awk '{print $1}')":/host/var/lib/rook/openshift-storage/crash/ "${CRASH_OUTPUT_DIR}"
fi
}
# Counter used to index the array of background-job PIDs
idx=0
# Collecting ceph crash dump
for node in ${nodes}; do
printf "collecting crash logs from node %s \n" "${node}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
CRASH_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/crash_${node}
mkdir -p "${CRASH_OUTPUT_DIR}"
NODE_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/volume_collection_${node}
mkdir -p "${NODE_OUTPUT_DIR}"
crash_collection &
# record the PID of the background job
pids[${idx}]=$!
idx=$((idx+1))
done
# wait for all pids
echo "waiting for ${pids[*]} to terminate"
wait "${pids[@]}"
echo "ceph core dump collection completed"
done