/
steps.txt
81 lines (59 loc) · 3.56 KB
/
steps.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
## kubernetes images used in this test:
## openshift 4.9 (kubernetes 1.22) + debug lines: https://github.com/openshift/kubernetes/pull/1177
## openshift 4.10 (kubernetes 1.23) + debug lines + modified logic in node cache updates: https://github.com/openshift/kubernetes/pull/1174
## BZ link
## https://bugzilla.redhat.com/show_bug.cgi?id=2040715
## look at the node names
$ kubectl get nodes
NAME STATUS ROLES AGE VERSION
ci-ln-3qh25qk-1d09d-5m9q8-master-0 Ready master 22m v1.22.3+2cb6068
ci-ln-3qh25qk-1d09d-5m9q8-master-1 Ready master 22m v1.22.3+2cb6068
ci-ln-3qh25qk-1d09d-5m9q8-master-2 Ready master 22m v1.22.3+2cb6068
ci-ln-3qh25qk-1d09d-5m9q8-worker-centralus2-zg6hc Ready worker 10m v1.22.3+2cb6068
ci-ln-3qh25qk-1d09d-5m9q8-worker-centralus3-8gvj9 Ready worker 11m v1.22.3+2cb6068
## the cluster name used for Azure (AZ) is the common prefix of the node names listed above
$ export CLUSTER=ci-ln-3qh25qk-1d09d-5m9q8
## create backend server (two pods)
$ kubectl apply -f server.yaml
deployment.apps/hello-dep1 created
## create load balancer
$ kubectl apply -f service-lb.yaml
service/hello1 created
## see where the server pods are running
$ kubectl get pods -o wide
NAME READY STATUS RESTARTS AGE
hello-dep1-5866c8558f-qc7x9 1/1 Running 0 11m
hello-dep1-5866c8558f-rss9r 1/1 Running 0 11m
## note the external IP...
$ kubectl get svc -w
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
hello1 LoadBalancer 172.30.130.159 52.230.216.80 80:32044/TCP 17s
kubernetes ClusterIP 172.30.0.1 <none> 443/TCP 30m
openshift ExternalName <none> kubernetes.default.svc.cluster.local <none> 19m
## curl the (cloud-provided) external IP
$ curl 52.230.216.80
NOW: 2022-02-11 14:28:50.789022265 +0000 UTC m=+140.980251690
## dump the AZ LB state to a file (baseline, to be diffed after the node exclusion below)
$ az network lb list -g ${CLUSTER}-rg | jq > t0_az_network_lb.json
...
## find the kube-controller-manager pods and get their logs (redirect the output to a file to save it)
$ kubectl get pods -A -o wide | grep kube-controller-manager
[...]
$ kubectl logs kube-controller-manager-ci-ln-8f7c2jk-1d09d-svprg-master-2 -n openshift-kube-controller-manager
### TRY EXCLUSION BY LABEL
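# add the exclusion label to a worker node
# (NOTE: the node/pod names in the commands below do not match the node listing above; substitute names from your own cluster)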
$ kubectl label nodes ci-ln-8f7c2jk-1d09d-svprg-worker-centralus3-82ck2 "node.kubernetes.io/exclude-from-external-load-balancers"=""
# show current labels
$ kubectl get nodes ci-ln-8f7c2jk-1d09d-svprg-worker-centralus3-82ck2 -o jsonpath='{.metadata.labels}' | jq
# verify that the cloud provider code reacts to this (check the kube-controller-manager logs again)
$ kubectl logs kube-controller-manager-ci-ln-8f7c2jk-1d09d-svprg-master-2 -n openshift-kube-controller-manager
# in particular look for this line, which is printed when the code detects a node that needs to be removed from the LB backend pool:
# I0211 14:38:44.651083 1 azure_loadbalancer.go:1142] reconcileLoadBalancer for service (openshift-ingress/router-default)(true): lb backendpool - found unwanted node ci-ln-3qh25qk-1d09d-5m9q8-worker-centralus2-zg6hc, decouple it from the LB
# see if anything changed on AZ LB...
$ az network lb list -g ${CLUSTER}-rg | jq> t1_az_network_lb.json
$ diff -uB t0_az_network_lb.json t1_az_network_lb.json
# you should see that in the second file the excluded node no longer appears (a whole block of ~10 lines should be missing)
### TRY EXCLUSION BY KUBELET FAILURE (REBOOT)
## apply machine config!
kubectl apply -f 1-worker-testfile.yaml
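## repeat the verification steps above (check the kube-controller-manager logs, dump the AZ LB again and diff it against the previous dump)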