Skip to content

Commit

Permalink
Fix system health reporting
Browse files Browse the repository at this point in the history
  • Loading branch information
ohsayan committed Mar 4, 2024
1 parent 3109330 commit 10e1f50
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 7 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

All changes in this project will be noted in this file.

## Version 0.8.1

### Fixes

- Fixed migration from v1 SE (released with v0.8.0-beta) to v2 SE (released in v0.8.0)
- Fixed health reporting

## Version 0.8.0

> This is the first release of Skytable Octave, and it changes the query API entirely, making all previous versions incompatible
Expand Down
8 changes: 7 additions & 1 deletion server/src/engine/core/dcl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,13 @@ pub fn exec<G: GlobalInstanceLike>(
SysctlCommand::CreateUser(new) => create_user(&g, new),
SysctlCommand::DropUser(drop) => drop_user(&g, current_user, drop),
SysctlCommand::AlterUser(usermod) => alter_user(&g, current_user, usermod),
SysctlCommand::ReportStatus => Ok(()),
SysctlCommand::ReportStatus => {
if g.health().status_okay() {
Ok(())
} else {
Err(QueryError::SysServerError)
}
}
}
}

Expand Down
8 changes: 6 additions & 2 deletions server/src/engine/core/space.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,13 @@ impl Space {
// UNSAFE(@ohsayan): I want to try what the borrow checker has been trying
core::mem::transmute(EntityIDRef::new(space_name.as_str(), &model))
};
models.st_delete(&e);
let mdl = models.st_delete_return(&e).unwrap();
// no need to purge model driver since the dir itself is deleted. our work here is to just
// remove this from the linked models from the model ns
// remove this from the linked models from the model ns. but we should update the global state
if mdl.driver().status().is_iffy() {
// yes this driver had a fault but it's being purged anyway so update global status
global.health().report_removal_of_faulty_source();
}
}
let _ = spaces.st_delete(space_name.as_str());
if if_exists {
Expand Down
3 changes: 2 additions & 1 deletion server/src/engine/fractal/drivers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,11 @@ impl FractalGNSDriver {
match f(&mut txn_driver) {
Ok(v) => Ok(v),
Err(e) => compiler::cold_call(|| {
error!("GNS driver failed with: {e}");
self.status.set_iffy();
g.health().report_fault();
on_failure();
g.taskmgr_post_high_priority(Task::new(CriticalTask::CheckGNSDriver));
error!("GNS driver failed with: {e}");
Err(QueryError::SysServerError)
}),
}
Expand Down
2 changes: 2 additions & 0 deletions server/src/engine/fractal/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ impl FractalMgr {
Ok(()) => {
info!("GNS driver has been successfully auto-recovered");
global.state().gns_driver().status().set_okay();
global.health().report_recovery();
}
Err(e) => {
error!("failed to autorecover GNS driver with error `{e}`. will try again");
Expand All @@ -357,6 +358,7 @@ impl FractalMgr {
match drv.__lwt_heartbeat() {
Ok(()) => {
mdl.driver().status().set_okay();
global.health().report_recovery();
info!("model driver for {mdl_id} has been successfully auto-recovered");
}
Err(e) => {
Expand Down
45 changes: 43 additions & 2 deletions server/src/engine/fractal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ use {
},
},
crate::{engine::error::RuntimeResult, util::compiler},
std::{fmt, mem::MaybeUninit},
std::{
fmt,
mem::MaybeUninit,
sync::atomic::{AtomicUsize, Ordering},
},
tokio::sync::mpsc::unbounded_channel,
};

Expand Down Expand Up @@ -87,9 +91,34 @@ pub unsafe fn load_and_enable_all(gns: GlobalNS) -> GlobalStateStart {
global access
*/

/// Tracks overall system health as a count of fault sources that have been
/// reported and not yet recovered (or removed).
pub struct GlobalHealth {
    /// Number of currently outstanding faults; zero means healthy.
    faults: AtomicUsize,
}

impl GlobalHealth {
    /// Returns `true` when no faults are currently outstanding.
    pub fn status_okay(&self) -> bool {
        self.faults.load(Ordering::Acquire) == 0
    }
    /// Creates a tracker with zero outstanding faults.
    const fn new() -> Self {
        Self {
            faults: AtomicUsize::new(0),
        }
    }
    /// Records one new fault.
    fn report_fault(&self) {
        self.faults.fetch_add(1, Ordering::Release);
    }
    /// Records that one previously reported fault has been resolved.
    ///
    /// The counter saturates at zero: a recovery report with no matching fault
    /// is ignored. A plain `fetch_sub` would wrap to `usize::MAX` in that case
    /// and make [`GlobalHealth::status_okay`] report a fault that does not exist.
    fn report_recovery(&self) {
        // `checked_sub` yields `None` at zero, which makes `fetch_update` leave
        // the counter untouched; that `Err` outcome is intentionally discarded.
        let _ = self
            .faults
            .fetch_update(Ordering::Release, Ordering::Relaxed, |outstanding| {
                outstanding.checked_sub(1)
            });
    }
    /// Records that a faulty source was removed altogether; this is accounted
    /// for exactly like a recovery.
    pub fn report_removal_of_faulty_source(&self) {
        self.report_recovery()
    }
}

/// Something that represents the global state
pub trait GlobalInstanceLike {
// stat
fn health(&self) -> &GlobalHealth;
fn get_max_delta_size(&self) -> usize;
// global namespace
fn state(&self) -> &GlobalNS;
Expand Down Expand Up @@ -150,6 +179,13 @@ impl GlobalInstanceLike for Global {
fn state(&self) -> &GlobalNS {
self._namespace()
}
/// Returns the health tracker held by the global state.
fn health(&self) -> &GlobalHealth {
    // UNSAFE(@ohsayan): we expect the system to be initialized
    let global_state = unsafe { self.__gref() };
    &global_state.health
}
// taskmgr
fn taskmgr_post_high_priority(&self, task: Task<CriticalTask>) {
self._post_high_priority_task(task)
Expand Down Expand Up @@ -258,11 +294,16 @@ impl Global {
/// The state bundled behind the global handle: the namespace, the task
/// manager and the health tracker.
struct GlobalState {
    // the global namespace
    gns: GlobalNS,
    // the task manager
    task_mgr: mgr::FractalMgr,
    // system health tracker (starts out via `GlobalHealth::new()`)
    health: GlobalHealth,
}

impl GlobalState {
fn new(gns: GlobalNS, task_mgr: mgr::FractalMgr) -> Self {
Self { gns, task_mgr }
Self {
gns,
task_mgr,
health: GlobalHealth::new(),
}
}
pub(self) fn fractal_mgr(&self) -> &mgr::FractalMgr {
&self.task_mgr
Expand Down
7 changes: 6 additions & 1 deletion server/src/engine/fractal/test_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

use {
super::{
drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask,
drivers::FractalGNSDriver, CriticalTask, FractalModelDriver, GenericTask, GlobalHealth,
GlobalInstanceLike, Task,
},
crate::engine::{
Expand All @@ -47,6 +47,7 @@ pub struct TestGlobal {
gns: GlobalNS,
lp_queue: RwLock<Vec<Task<GenericTask>>>,
max_delta_size: usize,
health: GlobalHealth,
}

impl TestGlobal {
Expand All @@ -55,6 +56,7 @@ impl TestGlobal {
gns,
lp_queue: RwLock::default(),
max_delta_size: usize::MAX,
health: GlobalHealth::new(),
}
}
pub fn set_max_data_pressure(&mut self, max_data_pressure: usize) {
Expand Down Expand Up @@ -110,6 +112,9 @@ impl TestGlobal {
}

impl GlobalInstanceLike for TestGlobal {
fn health(&self) -> &GlobalHealth {
&self.health
}
fn state(&self) -> &GlobalNS {
&self.gns
}
Expand Down

0 comments on commit 10e1f50

Please sign in to comment.