diff --git a/doc/system_health_monitoring/diagrams/System-ready-HLD.drawio b/doc/system_health_monitoring/diagrams/System-ready-HLD.drawio new file mode 100644 index 00000000000..b8ea4a3ee6c --- /dev/null +++ b/doc/system_health_monitoring/diagrams/System-ready-HLD.drawio @@ -0,0 +1 @@ +7VxZk5s4EP41U7X7MBSSOB/nyCRVyWZTm9pK8siAbJPB4ACOPfvrV4CE0TEGHxzJ2A8zVgON1f31idAVultu36beavFXEuDoCurB9grdX0EIDGiRfwXluaI4JqwI8zQMKpK+I3wO/8P0SkZdhwHOKK0i5UkS5eGKJ/pJHGM/52hemiYb/rRZEgUcYeXNsUT47HuRTP0SBvmCUoGu7w68w+F8kbP50QNLj51MCdnCC5JNg4TeXKG7NEny6ttye4ejQnhMLvD7t6ebDz8+ftx8efc9D97fX7/dXFfMHg65pJ5CiuP8vKypLn960ZrKi841f2YCxHFwU+iBjPzIy7LQv0K3i3wZEQIgX2dJnFO1A4uO75IoScvLkV5+CD3L0+QJc0fAzcNDfYRph8j1Fm/D/Cv5rmsmHX0rRvT7/bY5eGaDOE+fvzYH33YciuHusnLErsuecO4v6GSy3Etzebol+SGM2JyJTBqjSmQ4kIDYoi96XpasUx/vUxJVCvkRc7yPoVujkpgzTpaYzJNct9nhnoF70UA8o6U48vLwJz8Jj5rfvGZX3+FTEpLpQZ26CsOlfKijgJarWcDQ6w/iOVbTpkyauBX5WkgjzOuPw90FsF/P2FZCktgSnXrPjdNWxQnZntnYiL8PNJvsyJeKIxs1BL4jlRZ4gDWiUa2RO6KyRtC0xdoy1dbIWdXONDUAUNM8r3VNt5wWCy1Hn3AaEvnilBL7NjpgdzQ6YIxpdSYxMgpNBlXTPM7QLN5+gSvwecGyzgV+6yDwx0lMiLeBly1wQGEmmEED3Ma966phLxxpwp6D8ElwG80l62cBBtBNqCETugYAtmXYOtrLtmec2BJO/s2IVyBAWXnEokXM5HibC9ggkZshgMLIi8J5XLhUoszCw9z+xGkekiTyhh5YhkFQcLzdLMIcfy7vhO43JGUmtDRZx0GJQl2E5Es4LN00/ZGi2zbkpESEJhUB+ZF4exY8Ui5QCOFUsw201nlzE65QfxmZnPoP1bWr8AlWlFOBcYq2fqwTduA6K0V5Q04A1mq7O1jWGUwbOyIqPrNZk2TNi//Zc7ZM4jAnV9D7Pqbs2B8L7EX5IvhTPsQoZM7V72Rk0Z9FESmDXoJVA7IK5yQ6suJTZdOy2zo/UEydR4orIwUCRzNlrFh9YQWAaWZPL9cX1Pk0iwvJgRyTe9XpVp17dauEek+pjI4p1ah1jG3y0EbgyLDpigmVyOhMpYorFF6DlCrgsM4BhfpAbQOgkZM4g3HKe0ykeUBuLFYzu4bCEIbYuaEARrVES0eaUNoYiJQ7x5mjbUGJG0lr7UGTV9ChxJ+TfLJKK3MivaRIP6G9g4UimNPeqffIWNS9zZOCPDEaTagtke1othzrkWErUEDivwsd0wZG+de0+or8RrtQdzk6OCbdUtQMXJiWvNTdna7f3YkeortK21WH1LqjitI1nZO96Rygtb70ZCr0xKXzyuTc93Xd97kkvjXthyZJ+8tEXMjo769v11nX/HxStePDg+sidHrtOAAGtyxz0hzdQbuPzecprto/NGsIh3ggkpCJPeQmYAs2fQG2Q0+q8tYDe2bL5lM+w7U0RQVmKGIzgEZ/8pJ7M6M74tpszlUXW60pkVLuLupL6I4k9Ow59oMKfbkXxkV37DV4tvMrlbXFAO+4dENhbYA9mRqkMwZaW2MHNbrYtdnKizu31VTxtYSeli32NsWq
u/xGvbL98EFCVxWo0OPsqaPOjh4GywZ6/sFe0MFN8LKkjqJv3xGmJIZWKcsGZzmfxNTtdOooiqeJkfeIo1vPf5qXnNhvC/DMW0f5abreq2rXdOv6iHkLQ9a2pVA26k3ZHRqjo2QxyBTre4QUOaAqmkKnrxyGCed3zmFg+1NJtdTNvpIYKPdFAo8AziMe/5LHnKxYdR4DrbGzGKjq3Bz2gA8aL2QiKQ7C7DrD6c8CN92SjGzhrYqv/nMUEqWnqN28Hyt4fHisCXXY+XudEzYMBBkFgMlhCohwaOKXYCrwsDPzVU7B8h38OCuOsHWJpxXzLdgxRE8tQ0fVCga20xd2VN2kKYQ1sThHigRg8NIcdmhlDB3WdJ2tvTlTWDOPC2u9leZQ1Q9pL9KoXCSPtkiy3J/NSXlFnFpYrHV5xW3M88NkusU+lFs8kyj2GSBfVbHfAp8JFvuqVtEUAyVQrSIaOlAiuTXy+wXK9sf6wwZKpGpRHB8owyKEzYhCsmsC3Vk4v0TMXvAy3YiJVF2cCURMBTJfV6u8BUrTi56o84qdgaOnya/CNxqLdUaNnxNcjHPu/il6Yd3DePFT7oVk8XJ1aZ2ertMJxzhVS2cCMa5A3msLay3omWBY67BUZ8w3v49632GAxduVkPpbms2MunVpdqWvsZZmQ9fVXPell7JNG9a5yKHLtJEFNd1ssDYl1sCBPO+eF20jVfttOFt5Re9ljwVnFyAN8vk0cs2jXzWoVoFAiR/xe43lrsNiuEMTcDoYto8DsQugCGIIBgdxFy+vd/Ty424u4ELJLAydmAVouOcjLcSSLaRg3bQQKLwv0rOFGHLjd0WkN0vSJaEGKRFjmkk2w5c4LYmp8Gwfg8DEtspgXMtGnqUufgUj5Gxh4fmLdYrfFje+RydltC2rASxOdZZiJQliS1SbwDT6ymcNuYMc43yTpE8X5UnViAPatQcVbqU/7cntWq8IPxfVCaqzgT011cn90VnZQCjYkvkmm3JzjmoU4eC8quyyPGqiqhRcqK16QQ8oXtDrT5Ny5zZPvTjz8flN8BfWm+g9lYob1gTl7m/lPS8KU/nMCSjsstfV+fe6EnIa48j+E9BNtGezK4lv31XJYX3aC1QOh4pt9QMViW/fUJHblO9xGpf7+V72Rjt9bzQh8waKRziDbo5myC29d14abMg0L5o+RdNivoDG1rQpt6Y6BIEBN1tCXPcWQnZ8/Od1v9BmS9Uq77GCYrHZkm1wwDdNqBmGfWqv19b5Z4VAuImwKOboKEmGu83Yq9N3W9qjN/8D5Vxdc5s4FP01nuk+xIMkJPBj7Hxsd9I2TbbbNi8ZbGSHKQYKcmL316+wRYwkShwMGKeTzARdywKOjq6Orq7SQ6P58jJ2oocPoUv9HjTcZQ+d9SAEwLb5n9Sy2lgsYGwMs9hzN6ac4db7RcU3M+vCc2kibBsTC0OfeZFsnIRBQCdMsjlxHD7J1aah70qGyJlRzXA7cXzd+tVz2cPGakNra/+berOH7M6ADDafzJ2ssniT5MFxw6ecCZ330CgOQ7a5mi9H1E/By3C5Zx/Q4+W1sbqKJv8lyfh2hZ5ONo1dvOYrz68Q04DV2zTcNP3o+AuB12kUifslbJWByF89Si8Xc/90wsK4h4aPNGYeh/nKGVP/Okw85oUBrzIOGQvnuQqnvjdLP2BhxK0PbO7zAuCXyQ/KJg+isOMrikdL26bLXAeLV76k4ZyyeMWriE+hoK9gLzTNTflpywUk+vchRwMibI5g3+y54S3C/EKA/ArATQ1wDezWIbKwhBEAOkYQF4CEB4OGUMIaSjc0CmPGbTF13NUaMIfRg2OHTAU7omMHQBF2TRGMFEDnuAKwRcIvwmkPEp8/x3Ac86tZekUdDhU0FoGXYvzOiaK/enCk10to/OhNuod70bhuF3fr95RNVgmjcx3MblMZwUNDav+WykcJqGkcGtDsZjlEbwWUz3516nONA7kXQbw2Gi4SejJxEh1NDkIK
uSNm9wkHj6a6YBoG7FZUAwUyYO65bvrhMImciRfMrug0fW20tdwIJNYmFoc/6Cj0U81xFoRB+s045P3rjNe3SJ8xCr2ArbHCQ/7LX3Jk9HEPn6UuDA/Btsx/0+oxG4UBb9rx1j1OnYQ90YQdbsK1CyZcoBMDkqaIoQ81rcMni/iRugIaGrinqThPe953ksSblGg7Dl+8+iY6a134nhb6EGfls2X+07OVKL2uA5JwEU/Ewy7/GTvXPy+d0erq25kBr7zItjKxy5x4RsvaE71DXWlloXdnrrsK5ZGwxdR3mPcor0eKulDc4Tql85YtIJvasqkOQbmJzXuLb+UXAEpDCBGZdrYpN7QBRmuId7SzylUTw+23Dwxt+YEBhgpBNy1u6fqM6R4MHhyCwVZzDC5brh0bg7PFdkYIy66HwZAoC5CaGAwGirpEqHkGQ31y3pnBYmJ8LX1fYi/XUTHTR8nafOH5/vZBRKkBp23tSHnSKcpDQ2YqMhRfW5XyyGyG8hAplBdDq1nKZ9KpVafdsurYlcBWtwkMayKwCZsicPFAaZbAerSlTgIvPfYtd51Xzby4pW9aaEhx4B3ZC0Cn6GsqGhQCpYld6Wtp48BqhL7EVtaEgzYkhx60qktyVJIOxYS3WiU8OVLCE1UaV/TXxIaKH22I8IrgQGYbhN8jztEI4TOJAtqSKKVcPjKNQpTNPY2qO3N+oDh5dfDUxHmsBP4QaYPze0RGXlpXFvrrcnfd8pqybGPh2H28iSr7eIXve0cCpejz60jrspsvd+Fn83xxf/N5ao+v7+6/nOixkFEYTL0Zt02pwxYx7XV2mycDs41tnkLwYHMDvmokvwuRpJ1nObtTo/45WpoNVjXrYveVuKzssNGMsuMtywMCWLXOcq75fXx9sjq/P/3kjdApufxh4YK8GrFX3HV3gQno4xYdRhF4Rek2a/DSfJAkgy7pAnZ9GT2CrANjt0cE6A1H7QHUfW3ZuO2KrzUGMr0GlaOeiq+19/W1+yisIuCbC/38Ebzt2A6rwlvLVDJFduatKfOWkKY0gnofKXm5GY3QbJJLhbVwdfKWLZqOb1mr+sqKAtc2gEwqdc+qrtClgeQHNurdaiok7x8dxSmTDsdHd3lrilSN1FsDmYYEK2k1ddEdypMLtupdz5Xl6skHU3TKv5mDKaAgObipgyllaQo5wJNVMg/5ApCj+nZgH8ipaNBGh4U9c0453F0acb+ZvreW4O5EkW58lwTz9LjGW+2j54DXwfpId0aLhL7hUYFxUUyqXcxRdb1zNKlhhavNEmXfEQWTJXRv6VI1SoKVWOiAZLxrOL0Rw3p3Xou7d4fDnn+YZj/WdDLyIlF3Vu1AHjx8OdDn1u1PMwOAEGVtbMBaBwA2n05ulv8ulj8/Rncfn4Lg68o+0bXN2clwoe8jHO/ESQwlbRq3pycLIdenTb5uQm8IcUWqANyeOCwE/CAn55rTKWWkOrZjR7ZMlQHuI9OEJL0ybFB5G10RLcBuTLRo56ZI8z672XN0HUjwLZAgZZNXR8isKpDK9LUsOfxoGlafWAAjaJvrv41QGSs5IbBm/V0qIvOxlfTgeNFx9uOdEC0ih4HNjFyHmhELYN/ZhVRa9rxwIqYLGacFpwqOw+mo+TYWrOZ2CLL7nKmGgS3bQCaRZzazoYO8WD3VaNXrdi4u/OV7cDOGH6aEwLvx7aer9wdIv3zxGHpXUyuOIf0SKbHRAejbXPTZtgmgAaqOhxZDX0DNT95rDPDi9l/ebapv/3EgOv8f7Vtdc5s4FP01frSHb8RjYqfJdpMmG3e2TV86GGSbFiMW5Djur18JhI2QjE0ciD2TPLToIiTQOffcqw/39OHi5Tpx4/kd8mHY0xT/paePepqmKo5N/qOWdW6xTDM3zJLAz03K1jAO/sDiSWZdBj5MmS03YYRCHMS80UNRBD3M2dwkQSu+2hSFPmeI3RkUDGPPDUXrt8DHc2ZVFWV74wYGsznrGpjsxsItKjND
Ond9tCqZ9KuePkwQwvnV4mUIQzp4xbiMYmPo/Bhf/Zl9fRj+B65A/7PVzxv71OSRzSckMMJv27SRN/3shks2Xuxb8boYwAQtIx/SRpSefrmaBxiOY9ejd1eEMsQ2x4uQlFRy6bvpPKtLC+lviL05K0yDMByiECWkHKGIPH554Gexz3+GCYYvJVDZZ15DtIA4WZMq7K7FAFvzxdUWfsdhtnkJeb0wuoxys03L22ElF2xkG4yyKYxyuk4xXPjCYBOOxfRyuQhvgykMg2ygYpgE5E0gHbuQmR+2tkviO9gltoSNtYfC0I3TYJI1S2FLoLdM0uAZPsI0d9HMykG74XdWwAn6vfGYKoA9TZ9OoeV5m5qlO77tTBSlin9bYKsqjzYQ0d74Ooe21Rba1n6filEQ4axf87JnjioQowTP0QxFblgG+QTBMpuCZZgcVrYMKxEqrTWobAGqCZXzKlz0AwMSUi7CYBYR0wRhjBbZcLoJvqBRisJGmiE2GPmFZRIi73dRjYVGsB+8I6AgHc1gXT1GTuhzUVMELIGhi4le8PFYMv7s0QfK6C3QfUOVIl20kKJl4kH2UAXEzVu8HlcgCi5MngOPJCJnrLjAg3InngDTMDtT3A2YJ6O4zgkqbjtggaZgnZriqqqAVaaOQTSj3WUNwMglfuRTAHd57U5FlqlvA7WFLwH+Xrp+oo8NFB2w8uiFtZMV1qVCiTiZrZn35Xq4X7f36rvTjb6rls3xSq0SpmWBVzWBRqP+5fKc5d13IZhKFcPyAJxMu5J3Qz81eVf109P3ttDSGsN1agIvLikkMCYAbOU8Q87FEm99Q1nfKPlA57RcN6zGWn6E3+0VdudAYS+coG1l1yvurzok/pX+Cr7tEXqhXUMxuHa16mpLPgKtRYxiFa/Ey+H9l09/Xf8cXZ5x1ICqb0JbpkOOZeuuJSzDkXdkM1GiNe1FERPwuvT+UUQTM893jyJN0TsQrQ3XzzaKaGLIXyUBpqFj/DT+enVHGyDxQ7n/u80gIp0bED00LS6mWEaX84P5+DI2btdPjzc/Zne33+aaefO9qLc3jBRe0HYYsTRjYFocqXTTGZims/17ZSSxDD5CCZKxI5IQ7N11qRpz9jZijZgD3aCUfsL9WKAr8VDMb6LwYsA2SyT7Jy4juEcolElRlfmLwPdpN9JdG163dm/ctBUhKrs0uig5liw+vIHiSB1ITA/SdbpAUYDJkJ9vftB0Haqr/MA23nGWWaegp5QeHL+IWEv1c8gOblfjoYvBn/tnMP23v0qTOFzJfHU5Sb0kmNAEASPyjz9pd3opzQz6yqCycGjb55IYdDW/NBx+HqjrYADsUlbwuqTAdkxumqo16aTLhEFKaMmieJXQJHhMg1nG7PfntQo+Et4PXu/ntbhKn0aL+IzTKTJdB74hi8hAm+jWuy23AOcd0ykp9Ke4Zt8QPBGsWpKfQzo1U7zHKP6c/log49e18s/9KphL0ikBqtLs9DWoHQnELHTTdGcYeRtQTN6BVPMwVFoDRUwJvrkBDQxTOh+13AWdw0eTNM7GxArpGsKE3pnRq/x8YRbVXH9dB2eT3IEMebLON1XUopgnBEDXC8M2I8hK63KpdmNFvvdOd5qPTzXqOFJONer8o5xq1NXbm2kwxikDwh5eClSRdIWSH5memJWODtyvFxff+HNd6oG7OLtTjh3nCxRF+r67XqtaX1O4+uQif4PK08XroOk0hcduO0k5IR79/dBW2YnsTrVVPKE7pmeQiGn85e5BVNOLOA6JRuIARR9SepSUSo6p1tU7SSntA73SVVUEDxXTPtAc3jO0buS0yDkPlVNd7V5OpTwTN1bqF0rYbuHPx6uL0ZPgux2smyiWw/mg/gYeeOi6Sd287GTWTWzHqFng0BQwAOCVqYqpNGy5y/USqeyJ6yWbs1NuHPe6PDelDHjiGs55npvqise6Dnh+Gc7AKe92a69jsQGMQakVp1kvb3eoSspXcZFnw1c8p4pMkstT
IK+lOB/krSVv5XAeADytXqnARq0C13fSMnXFn36JB4vIsHqQjKrfeeZAZgWAT95tzewwdahlY5m2dTnG+6YOugEGttVG6iBrucXUgRS3PzLPq29/qq9f/Q8=7Vxbc5s6EP41frQHJC7iMbHTtNPTJtOcmdM+dTDINlOMPCDH8fn1RwJhI4Rt5BhfTpuHFC1Cgv12v9Wu1PTgcP72mPqL2RcS4rgHjPCtB0c9AEzDc9k/XLIuJI5tF4JpGoWFyNgKXqJ/cfmkkC6jEGdCVogoITGNFrIwIEmCAyrJ/DQlK7nbhMShJFj4U6wIXgI/VqX/RCGdCalpGNsbH3E0nYmpkS1uzP2ysxBkMz8kq4oIPvTgMCWEFlfztyGOufJKvXwfwdEj+PhMYfbUfzLoEIHv/WKwDzqPbD4hxQk97dCgGPrVj5dCX+Jb6bpUYEqWSYj5IEYP3q9mEcUvCz/gd1fMZJhsRucxa5nsMvSzWd6XN7JfmAYz0ZhEcTwkMUlZOyEJe/y+5WeJz3/FKcVvFVDFZz5iMsc0XbMu4q4jAFvLzdUWfs8TslkFeVgKfWFy083IW7WyC6FZDS1DRcvZOqN4HirKZja24JfLefxXNMFxlCtqgdOIvQnmuouF+Hkru2e+Q30mS4WuAxLH/iKLxvmwHLYUB8s0i17xN5wVLppLJWg39p03aEp+bTymDmAPwMkEO0Gw6Vm5E7re2DDq+HcFtmnKaCMV7Y2vS2g7XaFtHfapBYkSms9r3/fsUQ1iktIZmZLEj6sgXyFYUBcsy5awcpuwUqECnUFlK1CNOZ3X4eIfGLGQchdH04SJxoRSMs/V6af0jkcpDhsbhslwEpaScUyCX2U3ERrRYfDeAQWbaIr39RPGiUMpaqqApTj2KeMLOR436F88+swtegt03zIbkS5HyMgyDbB4qAbi5i2Ox9VRCRenr1HAFiI3zLgowM1OPEa2ZZ+NcTdgXg3julfIuN2A5eiCdW2Mi1TP5OQYJVM+W/48TnzmRiHHb5fT7iTkJvLVIFv8FtHvlesf/LGBAZFoj97EOHljXWlU7CaX6TlfQYeHafsgvbvnoXfTcSWzMuv20jG/e4oVjfr3y1sm99DHaNLIF06A8HhyLnK34LWRe8lP18TuHaHlaaN1ZexumgpWKV4w/W/JPAfOpw3OekJS3/D4AEpMDi1Hm8nf4XYHad1tSeslil3zOqx5v+mx6Ff5Ke3tAM0r41qGJY0L6qWWQgOdxQtTrXMNn75++PT4c3R/w0EDm6GN3SYa8hwX+o5Sg2PvKNJQppHugoiNZF66giCiluAuHkR00WuLFtCF6+qiiFpBW6UR5aHj5cfL3w9f+AAsfhhfn/5mv58+dxlKGvODvjGQUwQX6AeWE8cSs3UwgecJJg6wBrYjmRa0vYFte9ufI+OJY8lxSiGOHfGEYe+vK92Ey3cRcdTS4keS8U94elHMlfkplfdRZEoQ+yUNWyi+MPCA2VBOSHXLn0dhyKdp3LiR2Wv33k1XcaK2UQNV4nGaokRnvNNQNlxnc5JElOn8dpcJurWocy0TXOvqcs3fp5Jo3lIpEbzdzT9RvP48nLv9+cOz7y2+9dVcU8GqwqjHwPZOJKaxn2WawV4bFRtIqJgeaAVLZ6iomRZl+iRLdVetAo7Oeo0pMF0Xyb1ZNotFGIKwFGxXYXlrXW3tTfCbK8C83qm7vGttPEVqrW8j1RXhPgepLgj39Tu4HhQWx9RhAU9mbsXmSi5/7z4ix1Sy79LLdZeMfQRqNWu7NpL2mnFHbdyQPRKI9q4XU/pbUn92UbxB7enydchkkuFuiiYNezXLcRak0ZinYpSwX+G420Jeu+zLcPXdU29Rc8Ls60ylPMuTS24QogFyK6nXcW7kerZUEQQ6k1w+K1M3jhSTZiv0STTNbfvylm2ii289Xl1d4Y9lNx3kVHfJsmS+UCz4drLW0MYotJoWTAiMoXOx4jbyri1rBS0yobPvkGqi1xKtjZnfQtba
/AVqgqSAddq0VReKI9JWfVhs2YnMEqYzpK3NX6BuEK38iIeHCa/9OTEv0I751ZRfFee381jmh+t9AP4fU1tNqzi4xChd4OASo+zYPlVlFiMTgKkSwIlyVbs2kX3s5oZ8dNZEtXFOlqcaje+7O0+tnw2Q+l8uTwUtDtz/fowKUDtGrR/pOB2jqjtgL/yoJ59rsYgZJ9KIJH+o853U2fZkaNlRo8rn1k7tuF0xZx9BmTuBeSR39lGtMqnYd0fsCTw99oTgSqp8oGHTc29JRJzB+Pnt4W70Q3HfM1RIGNVJbghccOkaCWgotV+0RuJ61p5iBjDQAKEjVye2oTny5Wsj6q7u5lQqC0W9c55INQaeZLyWd6MnUs9kyBCiGp97A696gggcZ8YWsgaVUTy9WTo+rgrUMvWug29MvwFm6g3PzsTmwDDl9RAyLr8N0/b/QYK2m56dEjG00MB1uiDippE7JGLW3P4hhKL79s9JwIf/AA==7Vpbd6o4FP41PtYFhHB5rNV6zkxvq55ZM52XsxCiUpFYiFXn108CQQmhCFovXat96CLbEMj+vr33x4YWuJmt+pEzn9xjDwUtTfFWLdBtaZptAfqfGdapAVpWahhHvpealK1h4P+HUqOaWRe+h2JuS00E44D4c9Ho4jBELhFsThThpThthANPMMydMZIMA9cJZOvfvkcm3KoqyvaHH8gfT/ilLch/mDnZZG6IJ46HlzkT6LXATYQxSY9mqxsUMN9t/PLwFip/va7hKu55y5dF8Mfr21W62G2TUzZbiFBIPndpLV363QkW3F98r2SdOTDCi9BDbBGlBTrLiU/QYO647NclZQy1TcgsoCOVHnpOPEnmskE8RcSd8MHID4IbHOCIjkMc0tM7NbfFt/+OIoJWOVD5NvsIzxCJ1nQK/9XggK3F4XILP8zYOckhD2xudDjlxpuVt26lB9yzTQCU3Xzz+HD7s/+725H8TWk2Z4eLWXDnj1DgJ76ao8inN4OY+wJuftraOjR8iENtEXe3i4PAmcf+MFmWIRchdxHF/jt6RnEapYlVQHdD8WRAIjzdBE0Rw5YGkOpBZG5m5n6xDRM4hkQBeo88QVCPHA9/DYoEsGQCbMJfIIBxNAKA3XE2x35IkgvDTgt2C5jjiEzwGIdOkEf9lOjVRUtrCpcOBbTMMrRksLTjgaVLYC0jmvWoafAy+NW7ZwvE9N/jnxKIbNc+LT7XgT8OqWmICcEz6j0UetesmjFbgN3pbrgE56OVT/7JHb+w09rQsPm4u+LrJIN1bpCjS2JrFnQxXkQuqppn8pruRGNUuSAPAeQJ9VpmRYQCh9A0JSqBEpD5qU8sbnLJX9OpXwRGAWi3IbS3f1BcNN0kXydfQYtL62JekfJF6gRpIQq8s85N45Fe5OnGBwdQF0rU/YFjtoXHgcRVGp5ErN5iJuBVuqRwO5zdLuVQkoeKtJ/5nscuUyoXxKT1sWI4kTwAcr4xyorD0dKNIWEWr+MZDn1Cff511cHIcpHrltWXoQV1qJxLHWxK/+WoA/Py1EFT9OqiZTSF6+LUgS2H62IYu5E/RMmzbfosO/LH7KF5eHKFoLQzTZBqBNWC3xqhUMjVQgnQlAJb6ioC04ZtJfeniZUFWG3LzOmOC1MLslgYsmZGbcbGdAck4yymy3zAYjaNZ3arIaubMXE3wcwTiVBbIIIK6/FrD2BvByR8I9p0Oh3fDd2nvhP0lrwRlAf2GbmIbUdTnmit+Em1RZdJuVMnpyulrQnZSVfPn53AhZFHNcSSB3SzbVv7JSig7F7rlHno36t+19OW13fm/fBVu39/fOr3Sugah7P5F5a+HkSWp5eJJ0sbAuNsjTH9nNK3FHpVgv7syrcpeDJYlST/CsK3dANyD1uCKtdK2Ae1A4EYB04cf1gjPgeUQmdZzUDagcrRQJH7ykvHZ/l/xHoHRsAaPEN2NGZH8TomaJZUKMdbV8HXRAhQF0frpDnZVrNhWt0tADLDtrwno3V+
VCzwNTqfqmF+gm6o4kReN1TFQ142VM3bqRo4w6hCMgpFW5VDP8vcB0oNWLgQ3LczqorrWIV1GkuMD5RRoX5l9/vRbUnzbWE+PUjvoHB2djt4NIrRoQK9lBPyG4bvXFr2nvakuVR+HB6wJ1d2qfk8oNmQ+Dj8TpoHJU29ZtLUmyZN1QZAoJN9rJx5ZYFC1ix+N1A3a15Zmtgn2HxqcuS8mYnJunkTqML88+XNklcllb1XOp/dO+ikL25/P/euuy9SCJ+i66FBIRQ1U/uEUKzb96h69LqYpqxp6xW9VE2x2taeTZDie1uWL0wtdylwYNR9Ns3l90HihwcPj79a6ecHG4JHaYPPOzm5xRJjqqfkde1+XlUAHL0XrNPqpG7JJkpl9m2CpebeEeh7chwqVW8idKttGidjOR1uP4tMp2+/LQW9/wE= \ No newline at end of file diff --git a/doc/system_health_monitoring/diagrams/system-chart.png b/doc/system_health_monitoring/diagrams/system-chart.png new file mode 100644 index 00000000000..e28aa94ca0f Binary files /dev/null and b/doc/system_health_monitoring/diagrams/system-chart.png differ diff --git a/doc/system_health_monitoring/diagrams/system-ready-disabled-flow.png b/doc/system_health_monitoring/diagrams/system-ready-disabled-flow.png new file mode 100644 index 00000000000..11cb206d0d5 Binary files /dev/null and b/doc/system_health_monitoring/diagrams/system-ready-disabled-flow.png differ diff --git a/doc/system_health_monitoring/diagrams/system-ready-ok-flow.png b/doc/system_health_monitoring/diagrams/system-ready-ok-flow.png new file mode 100644 index 00000000000..5df5bd46b3b Binary files /dev/null and b/doc/system_health_monitoring/diagrams/system-ready-ok-flow.png differ diff --git a/doc/system_health_monitoring/diagrams/system-ready-timeout-flow.png b/doc/system_health_monitoring/diagrams/system-ready-timeout-flow.png new file mode 100644 index 00000000000..6dd99323030 Binary files /dev/null and b/doc/system_health_monitoring/diagrams/system-ready-timeout-flow.png differ diff --git a/doc/system_health_monitoring/diagrams/system-use-case.png b/doc/system_health_monitoring/diagrams/system-use-case.png new file mode 100644 index 00000000000..06e2aabe3e5 Binary files /dev/null and b/doc/system_health_monitoring/diagrams/system-use-case.png differ diff --git a/doc/system_health_monitoring/system-ready-HLD.md 
b/doc/system_health_monitoring/system-ready-HLD.md index c55698c1ec8..73076aea982 100644 --- a/doc/system_health_monitoring/system-ready-HLD.md +++ b/doc/system_health_monitoring/system-ready-HLD.md @@ -40,26 +40,36 @@ - [8 Unit Test Cases ](#8-unit-test-cases) - [9 References ](#9-references) +# List of Figures + +- [Figure 1: System ready system chart](#figure-1-system-ready-system-chart) +- [Figure 2: System ready use-cause diagrams](#figure-2-system-ready-use-cause-diagram) +- [Figure 3: System status OK sequence diagram](#figure-3-system-status-ok-sequence-diagram) +- [Figure 4: System status DOWN sequence diagram](#figure-4-system-status-down-sequence-diagram) +- [Figure 5: System ready feature disabled flow](#figure-5-system-ready-feature-disabled-flow) + # List of Tables -[Table 1: Abbreviations](#table-1-abbreviations) +- [Table 1: Abbreviations](#table-1-abbreviations) # Revision -| Rev | Date | Author | Change Description | -|:--:|:--------:|:-----------------:|:------------------------------------------------------------:| -| 0.1 | | Senthil Kumar Guruswamy | Initial version | -| 0.2 | | Senthil Kumar Guruswamy | Update as per review comments | -| 0.3 | | Senthil Kumar Guruswamy | Integrate systemready to system-health | +| Rev | Date | Author | Change Description | +|:---:|:----------------:|:-----------------------:|:------------------------------------------------------------:| +| 0.1 | | Senthil Kumar Guruswamy | Initial version | +| 0.2 | | Senthil Kumar Guruswamy | Update as per review comments | +| 0.3 | | Senthil Kumar Guruswamy | Integrate systemready to system-health | +| 0.4 | 16 June 2023 | Yevhen Fastiuk 🇺🇦 | Report host daemons status. System status permanent. 
System ready admin state | # Definition/Abbreviation ### Table 1: Abbreviations -| **Term** | **Meaning** | -| -------- | ----------------------------------------- | -| FEATURE | Docker/Service | -| App | Docker/Service | +| **Term** | **Meaning** | +| ----------- | --------------------------------------------- | +| FEATURE | Docker/Service | +| App | Docker/Service | +| Host daemon | A daemonized application running on the host | # About this Manual @@ -79,6 +89,8 @@ A new python based System monitor tool is introduced to monitor all the essentia This framework gives provision for docker apps to notify its closest up status. CLIs are provided to fetch the current system status and also service running status and its app ready status along with failure reason if any. This feature will be part of system-health framework. +![System chart](diagrams/system-chart.png "Figure 1: System ready system chart") +###### Figure 1: System ready system chart ## 1.1 Limitation of Existing tools: - Monit tool is a poll based approach which monitors the configured services for every 1 minute. @@ -90,7 +102,7 @@ This feature will be part of system-health framework. - Event based model where the feedback is immediate - Know the overall system status through syslog and as well through CLIs - It brings in the concept of application readiness to allow each application/service/docker to declare themselves as ready based on different application specific criteria. - - Combatibility with application extension framework. + - Compatibility with application extension framework. SONiC package installation process will register new feature in CONFIG DB. Third party dockers(signature verified) gets integrated into sonic os and runs similar to the existing dockers accessing db etc. Now, once the feature is enabled, it becomes part of either sonic.target or multi-user.target and when it starts, it automatically comes under the system monitor framework watchlist. 
@@ -106,15 +118,22 @@ Following requirements are addressed by the design presented in this document: 1. Identify the list of sonic services to be monitored. 2. system-health to include the sysmon framework to check system status of all the service units and receive service state change notifications to declare the system ready status. 3. Provision for apps to notify its closest up status in STATE DB. This should internally cover Port ready status. Also support application extension framework. -4. Appropriate system ready syslogs to be raised. -5. New CLI to be introduced to know the current system status all services. +4. Allow host daemons to report their app's ready status +5. Appropriate system ready syslogs to be raised. +6. New CLI to be introduced to know the current system status all services. - "show system-health sysready-status" covers the overall system status. -6. During the techsupport data collection, the new CLI to be included for debugging. +7. During the techsupport data collection, the new CLI to be included for debugging. +8. The feature should have enable/disable configuration. + - By default it is enabled, so it preserves all the behavior described in this document. + - In the disabled state it will still report system ready status, but it will wait for only one event - `PortInitDone` +9. The feature should respect multi-asic according to [this design](https://github.com/sonic-net/SONiC/blob/master/doc/multi_asic/SONiC_multi_asic_hld.md#2421-systemd-services). If a service is configured to be ignored, or the system ready feature should wait for its app status, this applies to all instances of that service. +![System ready use-case diagram](diagrams/system-use-case.png "Figure 2: System ready use-cause diagram") +###### Figure 2: System ready use-cause diagram ## 2.2 Configuration and Management Requirements -This feature will support CLI and no configuration command is provided for any congiruations. 
+This feature supports CLI commands, and one configuration command is provided. ## 2.3 Scalability Requirements @@ -131,8 +150,8 @@ warmboot-finalizer sonic service to be monitored as part of all services. This feature provides framework to determine the current system status to declare the system is (almost) ready for network traffic. System ready is arrived at considering the following factors. -1. All sonic docker services and its UP status(including Portready status) -2. All sonic host services +1. Configured sonic docker services and its UP status (including Portready status) +2. Configured sonic host services # 4 Feature Design @@ -142,6 +161,18 @@ System ready is arrived at considering the following factors. - When sysmonitor daemon boots up, it polls for the service list status once and maintains the operational data in STATE_DB and publishes the system ready status in form of syslog and as well as in STATE_DB. - Subsequently, when any service state changes, sysmonitor gets the event notification for that service to be checked for its status and update the STATE_DB promptly. - Hence the system status is always up-to-date to be notifed to user in the form of syslog, STATE_DB update and as well as could be fetched by appropriate CLIs. +- Once the system declares its status (any of `UP`, `DOWN`, or `FAILED`), the applications which were waiting for it can continue execution and take actions according to the received status. + - `UP` system status should be considered a healthy system status. + - `DOWN` system status means that the required daemon(s) didn't report their ready status within the timeout period. + - `FAILED` system status means that some daemon failed during its execution or a SONiC application reported `false` `up_status`. 
+ +System status OK flow: +![System status OK sequence diagram](diagrams/system-ready-ok-flow.png "Figure 3: System status OK sequence diagram") +###### Figure 3: System status OK sequence diagram + +System status DOWN (by timeout): +![System status DOWN sequence diagram](diagrams/system-ready-timeout-flow.png "Figure 4: System status DOWN sequence diagram") +###### Figure 4: System status DOWN sequence diagram ## 4.2 Sysmonitor @@ -153,10 +184,14 @@ Sysmonitor is the subtask of system-health service which does the job of checkin 1. subscribe to system dbus - With the dbus subscription, any systemd events gets notified to this task and it puts the event in the multiprocessing queue. -2. subscribe to the new FEATURE table in STATE_DB of Redis database +1. subscribe to the new FEATURE table in STATE_DB of Redis database - With the STATE_DB feature table subscription, any input to the FEATURE table gets notified to this task and it puts the event in the queue. -3. Main task +1. Timeout task + - The timeout can be configured through the platform's `system_health_monitoring_config.json` file via the `timeout` field. + The system will be declared DOWN once the timeout is reached. + +1. Main task - Runs through the polling of all service status check once and listen for events in queue populated by dbus task and statedb task to take appropriate action of checking the specific event unit status and updating system status in the STATE_DB. @@ -165,8 +200,12 @@ Sysmonitor is the subtask of system-health service which does the job of checkin ## 4.3 Service Identification - It covers the enabled services from FEATURE table of CONFIG_DB. -- Also, since the idea is to cover only the sonic services but not the general system services, sysmonitor tracks services under "multi-user.target" and "sonic.target" +- Also, since the idea is to cover only the sonic services but not the general system services, sysmonitor tracks services under "multi-user.target" and "sonic.target". 
It is also important to track all "generated" systemd services from the `/run/systemd/generator/` folder, such as `ntp-config`, `interfaces-config`, etc. - This covers all the sonic docker services and most of the sonic host services. +- Additionally, in `system_health_monitoring_config.json` we introduce new fields: `services_to_wait` and `services_to_report_app_status`. + - `services_to_wait` - holds an explicit list of services we would like to wait for in order to declare system ready state. This list shouldn't include the SONiC applications, because it is up to them to specify the effect on system ready by parameterizing the FEATURE table. + - `services_to_report_app_status` - some daemons may notify systemd of their readiness earlier than they are functionally ready. + This parameter holds all services that should report their app ready state themselves, using the same mechanism as SONiC applications. ## 4.4 System ready Framework logic @@ -177,14 +216,21 @@ but align the services within framework to flag the status as "Down" if the serv - For services: - Loaded, enabled/enabled-runtime/static, active & running, active & exited state services are considered 'OK'. - For active and running services, up_status marked by docker app should be True to be considered 'OK'. - - Failed state services are considered 'Down'. + - Failed state services are considered 'Failed'. - Activating state services are considered as 'Starting'. - Deactivating state services are considered as 'Stopping'. - Inactive state services category: - oneshot services are considered as 'OK'. - Special services with condition pathexists check failure are considered as 'OK'. - Other services in inactive state are considered to be 'Down'. + - Services that exited with an error code are considered 'Failed'. - Any service type other than oneshot if by default goes to inactive state, RemainAfterExit=yes entry needs to be added to their service unit file to be inline with the framework. 
+ - Host daemons that set the `up_status` field in STATE_DB to `true` are considered 'OK'. + - Host daemons that set the `up_status` field in STATE_DB to `false` are considered 'Failed'. + +System ready feature disabled flow: +![System ready feature disabled flow](diagrams/system-ready-disabled-flow.png "Figure 5: System ready feature disabled flow") +###### Figure 5: System ready feature disabled flow ## 4.5 Provision for apps to mark closest UP status @@ -197,6 +243,7 @@ In simple, each app is responsible in marking its closest up status in STATE_DB. Docker apps marking their UP status in STATE_DB will input an entry in FEATURE table of CONFIG_DB with check_up_status flag set to true through /etc/sonic/init_cfg.json file change. Sysmonitor checks for the check_up_status flag in CONFIG_DB before reading the app ready status from STATE_DB. If the flag does not exist or if set to False, then sysmonitor will not read the app ready status but just checks the running status of the service. +Docker applications can set the `irrel_for_sysready` field in the `FEATURE` table to instruct sysmonitor to ignore the application's status. For application extension package support, a new manifest variable is introduced to control whether "check_up_status" should be up true or false which will also be an indication whether docker implements marking the up_status flag in STATE_DB. @@ -210,19 +257,48 @@ a new manifest variable is introduced to control whether "check_up_status" shoul "": { ... "state": "enabled", - "check_up_status": "true" + "check_up_status": "true", + "irrel_for_sysready": "true" } } } ``` +The feature configuration is controlled by the `sysready_state` field of the `DEVICE_METADATA` table. +```yang +module sonic-device_metadata { + ... + + container sonic-device_metadata { + + container DEVICE_METADATA { + + description "DEVICE_METADATA part of config_db.json"; + + container localhost { + ... 
+ + leaf sysready_state { + type stypes:state; + } + } + /* end of container localhost */ + } + /* end of container DEVICE_METADATA */ + } + /* end of top level container */ +} +/* end of module sonic-device_metadata */ +``` + + ### 4.5.2 STATE_DB Changes - Docker apps which rely on config, can mark 'up_status' to true in STATE_DB when they are ready to receive configs from CONFIG_DB and/or some extra dependencies are met. - Respective apps should mark its up_status considering Port ready status. Hence there is no separate logic check needed by system monitoring tool - Any docker app which has multiple independent daemons can maintain a separate intermediate key-value in the redis-db for each of the daemons and the startup script that invokes each of these daemons can determine the status from the redis entries by each daemon and finally update the STATE_DB up_status. - Along with up_status, docker apps should update the fail_reason field with appropriate reason in case of failure or empty string in case of success. - Also, update_time field to be fed in as well in the format of epoch time. - +- Daemon applications mentioned in `services_to_report_app_status` must report their status in the `up_status` field of the `SERVICE_APP` table in `STATE_DB`. For instances, - swss docker app can wait for port init done and wait for Vrfmgr, Intfmgr and Vxlanmgr to be ready before marking its up status. @@ -232,14 +308,16 @@ For instances, STATE_DB: +- For SONiC application the `` is `FEATURE` +- For daemon's applications mentioned in `services_to_report_app_status` the `
` is `SERVICE_APP` ``` -- sonic-db-cli STATE_DB HSET "FEATURE|" up_status true -- sonic-db-cli STATE_DB HSET "FEATURE|" fail_reason "" / "" -- sonic-db-cli STATE_DB HSET "FEATURE|" update_time "" +- sonic-db-cli STATE_DB HSET "
|" up_status true +- sonic-db-cli STATE_DB HSET "
|" fail_reason "" / "" +- sonic-db-cli STATE_DB HSET "
|" update_time "" - Schema in STATE_DB sonic-db-dump -n STATE_DB output - "FEATURE|": { + "
|": { "type": "hash", "value": { "up_status": "true", @@ -250,7 +328,7 @@ STATE_DB: }, - Example: - "FEATURE|bgp": { + "
|bgp": { "type": "hash", "value": { "fail_reason": "", @@ -268,12 +346,36 @@ In addition to this, sysmonitor posts the system status to SYSTEM_READY table in "SYSTEM_READY|SYSTEM_STATE": { "type": "hash", "value": { - "status": "up" + "Status": "UP" } } ``` -### 4.5.3 Feature yang Changes +### 4.5.3 Health configuration file changes +As mentioned earlier, this feature uses the `system_health_monitoring_config.json` file for its configuration. +An example of that file: +```json +{ + "services_to_ignore": ["rsyslog", "syncd", "redis", "orchagent", "portsyncd", "portmgrd", "pmon"], + "services_to_wait": ["ntp-config", "interfaces-config", "hostcfgd"], + "services_to_report_app_status": ["hostcfgd"], + "timeout": 10, + "devices_to_ignore": [], + "user_defined_checkers": [], + "polling_interval": 3, + "led_color": { + "fault": "orange", + "normal": "green", + "booting": "orange_blink" + } +} +``` +- `services_to_ignore` - used to filter out services we don't want to wait for +- `services_to_wait` - an explicit list of services we would like to wait for +- `services_to_report_app_status` - the list of services which must report their status in order to declare system ready +- `timeout` - the timeout after which sysmonitor will consider the system `DOWN` + +### 4.5.4 Feature yang Changes Following field is added to the sonic-feature.yang file.