#!/bin/bash
# Expect base collection path as an argument
BASE_COLLECTION_PATH=$1
# Expect time option as an argument
SINCE_TIME=$2
# Use PWD as base path if no argument is passed
if [ "${BASE_COLLECTION_PATH}" = "" ]; then
BASE_COLLECTION_PATH=$(pwd)
fi
CEPH_COLLECTION_PATH="${BASE_COLLECTION_PATH}/ceph"
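# Pod templates used to spawn the ceph command helper pod; which variant is
# applied depends on the OCS version detected below.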
POD_TEMPLATE_LATEST="/templates/pod.template.latest"
POD_TEMPLATE_STANDARD="/templates/pod.template.standard"
SED_DELIMITER=$(echo -en "\001");
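# safe_replace substitutes $1 with $2 on stdin; the \001 delimiter keeps
# replacement values that contain '/' (such as image paths) from breaking sed.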
safe_replace () {
sed "s${SED_DELIMITER}${1}${SED_DELIMITER}${2}${SED_DELIMITER}g"
}
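# Render a helper pod manifest from the template, filling in the namespace,
# the rook operator image and this must-gather pod's hostname, then apply it.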
apply_standard_helper_pod() {
< ${POD_TEMPLATE_STANDARD} safe_replace "NAMESPACE" "$1" | safe_replace "IMAGE_NAME" "$2" | safe_replace "MUST_GATHER" "$HOSTNAME" > pod_helper.yaml
oc apply -f pod_helper.yaml
}
apply_latest_helper_pod() {
< ${POD_TEMPLATE_LATEST} safe_replace "NAMESPACE" "$1" | safe_replace "IMAGE_NAME" "$2" | safe_replace "MUST_GATHER" "$HOSTNAME" > pod_helper.yaml
oc apply -f pod_helper.yaml
}
# Ceph resources
ceph_resources=()
ceph_resources+=(cephblockpools)
ceph_resources+=(cephclusters)
ceph_resources+=(cephfilesystems)
ceph_resources+=(cephobjectstores)
ceph_resources+=(cephobjectstoreusers)
# Ceph commands
ceph_commands=()
ceph_commands+=("ceph auth list")
ceph_commands+=("ceph balancer dump")
ceph_commands+=("ceph balancer pool ls")
ceph_commands+=("ceph balancer status")
ceph_commands+=("ceph config dump")
ceph_commands+=("ceph config-key ls")
ceph_commands+=("ceph crash ls")
ceph_commands+=("ceph crash stat")
ceph_commands+=("ceph device ls")
ceph_commands+=("ceph df detail")
ceph_commands+=("ceph fs dump")
ceph_commands+=("ceph fs ls")
ceph_commands+=("ceph fs status")
ceph_commands+=("ceph fs subvolumegroup ls ocs-storagecluster-cephfilesystem")
ceph_commands+=("ceph fs subvolume ls ocs-storagecluster-cephfilesystem csi")
ceph_commands+=("ceph health detail")
ceph_commands+=("ceph mds stat")
ceph_commands+=("ceph mgr dump")
ceph_commands+=("ceph mgr module ls")
ceph_commands+=("ceph mgr services")
ceph_commands+=("ceph mon stat")
ceph_commands+=("ceph mon dump")
ceph_commands+=("ceph osd df tree")
ceph_commands+=("ceph osd tree")
ceph_commands+=("ceph osd stat")
ceph_commands+=("ceph osd dump")
ceph_commands+=("ceph osd utilization")
ceph_commands+=("ceph osd crush show-tunables")
ceph_commands+=("ceph osd crush dump")
ceph_commands+=("ceph osd crush weight-set ls")
ceph_commands+=("ceph osd crush weight-set dump")
ceph_commands+=("ceph osd crush rule dump")
ceph_commands+=("ceph osd crush rule ls")
ceph_commands+=("ceph osd crush class ls")
ceph_commands+=("ceph osd perf")
ceph_commands+=("ceph osd numa-status")
ceph_commands+=("ceph osd getmaxosd")
ceph_commands+=("ceph osd drain status")
ceph_commands+=("ceph osd pool ls detail")
ceph_commands+=("ceph osd lspools")
ceph_commands+=("ceph osd df")
ceph_commands+=("ceph osd blocked-by")
ceph_commands+=("ceph osd blacklist ls")
ceph_commands+=("ceph pg dump")
ceph_commands+=("ceph pg stat")
ceph_commands+=("ceph pool autoscale-status")
ceph_commands+=("ceph progress")
ceph_commands+=("ceph progress json")
ceph_commands+=("ceph quorum_status")
ceph_commands+=("ceph report")
ceph_commands+=("ceph service dump")
ceph_commands+=("ceph status")
ceph_commands+=("ceph time-sync-status")
ceph_commands+=("ceph versions")
# Ceph volume commands
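# These commands are executed inside each OSD pod later on, since ceph-volume
# inspects the node-local devices backing the OSD.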
ceph_volume_commands=()
ceph_volume_commands+=("ceph-volume lvm list")
ceph_volume_commands+=("ceph-volume raw list")
# Inspecting ceph related custom resources for all namespaces
for resource in "${ceph_resources[@]}"; do
echo "collecting dump ${resource}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
{ oc adm --dest-dir="${CEPH_COLLECTION_PATH}" inspect "${resource}" --all-namespaces --"${SINCE_TIME}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
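# Find the namespaces where the OCS operator is installed by matching the
# olm.owner label on deployments.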
namespaces=$(oc get deploy --all-namespaces -o go-template --template='{{range .items}}{{if .metadata.labels}}{{printf "%s %v" .metadata.namespace (index .metadata.labels "olm.owner")}} {{printf "\n"}}{{end}}{{end}}' | grep ocs-operator | awk '{print $1}' | uniq)
# Inspect each namespace where the OCS operator is installed
for ns in $namespaces; do
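# The helper pod runs with the rook-ceph-operator image, which ships the ceph
# CLI used by the commands below.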
operatorImage=$(oc get pods -l app=rook-ceph-operator -n openshift-storage -o jsonpath="{range .items[*]}{@.spec.containers[0].image}+{end}" | tr "+" "\n" | head -n1)
cephClusterCount=$(oc get cephcluster -n "${ns}" -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}" | wc -l)
if [ "${operatorImage}" = "" ]; then
echo "not able to find the rook's operator image. Skipping collection of ceph command output" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
elif [[ $cephClusterCount -gt 0 ]]; then
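# Derive a numeric version from the CSV (e.g. 4.6.0 -> 460); OCS 4.6 and later
# use the latest pod template.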
current_version=$(oc get csv -n "${ns}" --no-headers | awk '{print $5}' | tr -dc '0-9')
if [[ $current_version -ge 460 ]]; then
apply_latest_helper_pod "$ns" "$operatorImage"
else
apply_standard_helper_pod "$ns" "$operatorImage"
fi
fi
COMMAND_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/must_gather_commands
COMMAND_JSON_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/must_gather_commands_json_output
mkdir -p "${COMMAND_OUTPUT_DIR}"
mkdir -p "${COMMAND_JSON_OUTPUT_DIR}"
if [ "${operatorImage}" != "" ]; then
for i in {1..50};do
if [ "$(oc get pods "${HOSTNAME}"-helper -n "${ns}" -o jsonpath='{.status.phase}')" = "Running" ]; then
echo "helper pod got deployed successfully." | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
break
fi
echo "waiting for helper pod to come up in ${ns} namespace. Retrying ${i}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
sleep 5
done
# add jq command to the helper pod
timeout 120 oc -n "$ns" exec "${HOSTNAME}"-helper -- bash -c "wget 'http://stedolan.github.io/jq/download/linux64/jq' && chmod 755 jq"
timeout 120 oc -n "$ns" exec "${HOSTNAME}"-helper -- bash -c "mv jq /usr/bin/"
# Collecting output of ceph commands
for ((i = 0; i < ${#ceph_commands[@]}; i++)); do
printf "collecting command output for: %s\n" "${ceph_commands[$i]}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/${ceph_commands[$i]// /_}
JSON_COMMAND_OUTPUT_FILE=${COMMAND_JSON_OUTPUT_DIR}/${ceph_commands[$i]// /_}_--format_json-pretty
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "${ceph_commands[$i]} --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "${ceph_commands[$i]} --connect-timeout=15 --format json-pretty" >> "${JSON_COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
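# List the RBD images in every pool reported by 'ceph osd lspools'.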
for i in $(timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph osd lspools --connect-timeout=15"|awk '{print $2}'); do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd ls -p $i" >> "${COMMAND_OUTPUT_DIR}/pools_rbd_$i"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1;
done
# Collecting snapshot info for ceph rbd volumes
printf "collecting snapshot info for ceph rbd volumes \n" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/rbd_vol_and_snap_info
# Inspecting ceph block pools for ceph rbd
blockpools=$(oc get cephblockpools.ceph.rook.io -n openshift-storage -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}")
for bp in $blockpools; do
images=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "rbd ls -p $bp")
for image in $images; do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd info $image --pool $bp" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "rbd snap ls --all $image --pool $bp" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
# Collecting snapshot information for ceph subvolumes
printf "collecting snapshot info for cephFS subvolumes \n" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/cephfs_subvol_and_snap_info
# Inspecting CephFS filesystems
filesystems=$(oc get cephfilesystems.ceph.rook.io -n openshift-storage -o jsonpath="{range .items[*]}{@.metadata.name}{'\n'}{end}")
# Default subvolumegroup in OCS is 'csi'
svg="csi"
for fs in $filesystems; do
subvols=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume ls $fs $svg | jq --raw-output '.[].name' ")
for subvol in $subvols; do
{ printf "Information for subvolume: %s\n" "${subvol}" >> "${COMMAND_OUTPUT_FILE}"; }
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume info $fs $subvol $svg --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
snaps=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot ls $fs $subvol $svg | jq --raw-output '.[].name'")
count=$(oc -n openshift-storage exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot ls $fs $subvol $svg | jq --raw-output '.[].name' | wc -l")
{ printf "Snapshot count in subvolume: %s=%s\n" "${subvol}" "${count}" >> "${COMMAND_OUTPUT_FILE}"; }
for snap in $snaps; do
{ printf "Information for snapshot: %s\n" "${snap}" >> "${COMMAND_OUTPUT_FILE}"; }
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph fs subvolume snapshot info $fs $subvol $snap $svg --connect-timeout=15" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
done
fi
# Collecting output of ceph volume commands
for ((i = 0; i < ${#ceph_volume_commands[@]}; i++)); do
printf "collecting command output for: %s\n" "${ceph_volume_commands[$i]}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
for osdPod in $(oc get pods -n "${ns}" -l app=rook-ceph-osd --no-headers | awk '{print $1}'); do
pod_status=$(oc get po "${osdPod}" -n "${ns}" -o jsonpath='{.status.phase}')
if [ "${pod_status}" != "Running" ]; then
continue
fi
COMMAND_OUTPUT_FILE=${COMMAND_OUTPUT_DIR}/${ceph_volume_commands[$i]// /_}
{ timeout 120 oc -n "${ns}" exec "${osdPod}" -- bash -c "${ceph_volume_commands[$i]}" >> "${COMMAND_OUTPUT_FILE}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1
done
done
# Add Ready nodes to the list
nodes=$(oc get nodes -l cluster.ocs.openshift.io/openshift-storage='' --no-headers | awk '/\yReady\y/{print $1}')
# Collecting ceph prepare volume logs
volume_collection(){
printf "collecting prepare volume logs from node %s \n" "${node}"
oc rsync -n "${ns}" "$(oc get pods -n "${ns}"| grep "${node//./}-debug"| awk '{print $1}')":/host/var/lib/rook/openshift-storage/log "${NODE_OUTPUT_DIR}"
}
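# Collect a detailed report for every crash listed by 'ceph crash ls'.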
for i in $(timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph crash ls --connect-timeout=15"| awk '{print $1}'); do
{ timeout 120 oc -n "${ns}" exec "${HOSTNAME}"-helper -- bash -c "ceph crash info $i --connect-timeout=15" >> "${COMMAND_OUTPUT_DIR}"/crash_"${i}"; } >> "${BASE_COLLECTION_PATH}"/gather-debug.log 2>&1;
done
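# Remove the helper pod now that all in-pod command output has been collected.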
oc delete -f pod_helper.yaml
crash_collection(){
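# Spawn a node debug pod in the background and keep it alive for 5 minutes,
# long enough to rsync the crash and prepare-volume logs off the host.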
oc debug nodes/"${node}" --to-namespace="${ns}" -- bash -c "sleep 5m" &
printf "debugging node %s \n" "${node}"
debug_pod_ready=false
for i in {0..300..3}; do
if [ "$(oc get pods -n openshift-storage | grep "${node//./}-debug" | awk '{print $2}')" == "1/1" ] ; then
debug_pod_ready=true
break
else
sleep 3
echo "waiting for the debug pod to be in ready state"
fi
done
if $debug_pod_ready; then
echo "debug pod is ready"
volume_collection
oc rsync -n "${ns}" "$(oc get pods -n "${ns}"| grep "${node//./}-debug"| awk '{print $1}')":/host/var/lib/rook/openshift-storage/crash/ "${CRASH_OUTPUT_DIR}"
fi
}
# Counter used to index the array of background-job PIDs
idx=0
# Collecting ceph crash dump
for node in ${nodes}; do
printf "collecting crash logs from node %s \n" "${node}" | tee -a "${BASE_COLLECTION_PATH}"/gather-debug.log
CRASH_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/crash_${node}
mkdir -p "${CRASH_OUTPUT_DIR}"
NODE_OUTPUT_DIR=${CEPH_COLLECTION_PATH}/volume_collection_${node}
mkdir -p "${NODE_OUTPUT_DIR}"
crash_collection &
# record the PID of the background job
pids[${idx}]=$!
idx=$((idx+1))
done
# wait for all pids
echo "waiting for ${pids[*]} to terminate"
wait "${pids[@]}"
echo "ceph core dump collection completed"
done