Skip to content

Commit

Permalink
Merge pull request xapi-project#38 from koushikcgit/CA-115733
Browse files Browse the repository at this point in the history
CA-115733: Mark a stuck VM and do not retry ballooning it
  • Loading branch information
robhoes committed Dec 22, 2015
2 parents 3474573 + 4aa65d0 commit f0853cc
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 19 deletions.
21 changes: 16 additions & 5 deletions lib/squeeze.ml
Expand Up @@ -60,6 +60,8 @@ type domain = {
domid: int;
(** true if the domain has ballooning capability i.e. is not paused etc. *)
can_balloon: bool;
(** true if the domain has been declared stuck by squeezed *)
is_stuck: bool;
(** admin-imposed lower-limit on the balloon target *)
dynamic_min_kib: int64;
(** current balloon target requested by the system *)
Expand All @@ -75,10 +77,11 @@ type domain = {
}

let domain_make
domid can_balloon dynamic_min_kib target_kib dynamic_max_kib memory_actual_kib memory_max_kib inaccuracy_kib =
domid can_balloon is_stuck dynamic_min_kib target_kib dynamic_max_kib memory_actual_kib memory_max_kib inaccuracy_kib =
{
domid = domid;
can_balloon = can_balloon;
is_stuck = is_stuck;
dynamic_min_kib = dynamic_min_kib;
target_kib = target_kib;
dynamic_max_kib = dynamic_max_kib;
Expand All @@ -92,6 +95,7 @@ let domain_to_string_pairs (x: domain) =
[
"domid", i x.domid;
"can_balloon", string_of_bool x.can_balloon;
"is_stuck", string_of_bool x.is_stuck;
"dynamic_min_kib", i64 x.dynamic_min_kib;
"target_kib", i64 x.target_kib;
"dynamic_max_kib", i64 x.dynamic_max_kib;
Expand Down Expand Up @@ -180,7 +184,7 @@ let has_hit_target inaccuracy_kib memory_actual_kib target_kib =
let short_string_of_domain domain =
Printf.sprintf "%d T%Ld A%Ld M%Ld %s%s" domain.domid
domain.target_kib domain.memory_actual_kib domain.memory_max_kib
(if domain.can_balloon then "B" else "?")
(if (domain.can_balloon && not domain.is_stuck) then "B" else if (domain.can_balloon && domain.is_stuck) then "X" else "?")
(string_of_direction (direction_of_actual domain.inaccuracy_kib domain.memory_actual_kib domain.target_kib))

(** Generic code to guesstimate if a balloon driver is stuck *)
Expand Down Expand Up @@ -226,15 +230,18 @@ module Stuckness_monitor = struct
let makingprogress = (delta_actual > 0L && direction = Some Down) || (delta_actual < 0L && direction = Some Up) in
(* We keep track of the last time we were making progress. A domain that is making progress now
is not stuck, unless it has already been declared stuck: that flag is not cleared here. *)
if makingprogress then begin
if makingprogress && not state.stuck then begin
state.last_makingprogress_time <- now;
state.stuck <- false;
end;
(* If there is a request (i.e. work to do) and we haven't made progress for longer than
assume_balloon_driver_stuck_after, then declare this domain stuck. *)
let request = direction <> None in (* ie target <> actual *)
if request && (now -. state.last_makingprogress_time > assume_balloon_driver_stuck_after)
then state.stuck <- true;
then begin
debug "domain = %d is marked stuck" domain.domid;
state.stuck <- true;
end;
)
host.domains;
(* Clear out dead domains just in case someone keeps *)
Expand Down Expand Up @@ -338,7 +345,7 @@ module Squeezer = struct
Stuckness_monitor.update x.stuckness host now;
let active_domains =
List.filter (fun domain ->
domain.can_balloon
domain.can_balloon && not domain.is_stuck
&& (Stuckness_monitor.domid_is_active x.stuckness domain.domid now))
host.domains in
let non_active_domids = List.map (fun d -> d.domid) (set_difference host.domains active_domains) in
Expand Down Expand Up @@ -478,6 +485,7 @@ type io = {
execute_action: action -> unit;
wait: float -> unit;
gettimeofday: unit -> float;
declare_domain_stuck: int -> unit;

target_host_free_mem_kib: int64;
free_memory_tolerance_kib: int64;
Expand Down Expand Up @@ -536,6 +544,8 @@ let change_host_free_memory ?fistpoints io required_mem_kib success_condition =
let debug_string = String.concat "; " (host_debug_string :: (List.map (fun domain -> short_string_of_domain domain ^ (new_target_direction domain)) host.domains)) in
debug "%s" debug_string;

List.iter (io.declare_domain_stuck ) declared_inactive_domids;

(* For each domid, decide what maxmem should be *)
let maxmems = IntMap.mapi
(fun domid domain ->
Expand Down Expand Up @@ -587,6 +597,7 @@ let free_memory_range ?fistpoints io min_kib max_kib =
(* First compute the 'ideal' amount of free memory based on the proportional allocation policy *)
let domain = { domid = -1;
can_balloon = true;
is_stuck = false;
dynamic_min_kib = min_kib; dynamic_max_kib = max_kib;
target_kib = min_kib;
memory_actual_kib = 0L;
Expand Down
18 changes: 16 additions & 2 deletions src/squeeze_xen.ml
Expand Up @@ -37,6 +37,7 @@ let _feature_balloon = "/control/feature-balloon" (* immutable *)
let _data_updated = "/data/updated" (* mutable: written by guest agent *)
let _memory_offset = "/memory/memory-offset" (* immutable *)
let _uncooperative = "/memory/uncooperative" (* mutable: written by us *)
let _domain_stuck = "/memory/squeezed-declared-stuck" (* mutable: written by us *)

let _dynamic_min = "/memory/dynamic-min" (* mutable: written by domain manager *)
let _dynamic_max = "/memory/dynamic-max" (* mutable: written by domain manager *)
Expand Down Expand Up @@ -103,6 +104,7 @@ module Domain = struct
[ "memory"; "uncooperative" ];
[ "memory"; "dynamic-min" ];
[ "memory"; "dynamic-max" ];
[ "memory"; "squeezed-declared-stuck" ];
] in
let watches domid =
List.map (fun p -> Printf.sprintf "/local/domain/%d/%s" domid (String.concat "/" p)) interesting_paths in
Expand Down Expand Up @@ -336,6 +338,12 @@ module Domain = struct
(** Fetch the value stored under [_memory_offset] for [domid] and parse it
    as an [int64]. Nothing is caught here: whatever [read] raises, and the
    [Failure] raised by [Int64.of_string] on unparsable text, propagate to
    the caller. *)
let get_memory_offset cnx domid =
  let raw_offset = read cnx domid _memory_offset in
  Int64.of_string raw_offset

(** Record [value] under the [_domain_stuck] key of [domid]. The write goes
    through [write_noexn], so a domain that has already been destroyed is
    not expected to cause an exception here. *)
let set_domain_stuck_noexn cnx domid value =
  write_noexn cnx domid _domain_stuck value

(** Report whether [domid] has been marked stuck. Only the presence of the
    [_domain_stuck] key is tested: its contents are read and discarded.
    A missing key (reported by [read] as [Xs_protocol.Enoent], for instance
    because the domain has been destroyed) means the domain is not stuck;
    any other exception raised by [read] propagates. *)
let get_domain_stuck cnx domid =
  try
    let _ = read cnx domid _domain_stuck in
    true
  with Xs_protocol.Enoent _ -> false

(** Set a domain's maxmem. Don't throw an exception if the domain has been destroyed *)
let set_maxmem_noexn cnx domid target_kib =
let maxmem_kib = xen_max_offset_kib (get_hvm cnx domid) +* target_kib in
Expand All @@ -360,8 +368,11 @@ module Domain = struct
Int64.of_string (read cnx domid _dynamic_max)
end



(** Flag [domid] as declared stuck by squeezed: stores the string "true"
    via [Domain.set_domain_stuck_noexn], passing [xc] as the connection
    argument. *)
let declare_domain_stuck ~xc domid =
  Domain.set_domain_stuck_noexn xc domid "true"

(** Record when the domain was last co-operative *)
let when_domain_was_last_cooperative : (int, float) Hashtbl.t = Hashtbl.create 10

Expand Down Expand Up @@ -449,6 +460,7 @@ let make_host ~verbose ~xc =
(* Misc other stuff appears in max_memory_pages *)
let memory_max_kib = max 0L (memory_max_kib -* (xen_max_offset_kib di.Xenctrl.hvm_guest)) in
let can_balloon = Domain.get_feature_balloon cnx di.Xenctrl.domid in
let is_stuck = Domain.get_domain_stuck cnx di.Xenctrl.domid in
let has_guest_agent = Domain.get_guest_agent cnx di.Xenctrl.domid in
let has_booted = can_balloon || has_guest_agent in
(* Once the domain tells us it has booted, we assume it's not currently ballooning and
Expand Down Expand Up @@ -482,6 +494,7 @@ let make_host ~verbose ~xc =
{ Squeeze.
domid = di.Xenctrl.domid;
can_balloon = can_balloon;
is_stuck = is_stuck;
dynamic_min_kib = 0L;
dynamic_max_kib = 0L;
target_kib = 0L;
Expand Down Expand Up @@ -605,6 +618,7 @@ let io ~xc ~verbose = {
execute_action = (fun action -> execute_action ~xc action);
target_host_free_mem_kib = target_host_free_mem_kib;
free_memory_tolerance_kib = free_memory_tolerance_kib;
declare_domain_stuck = (fun domid -> declare_domain_stuck ~xc domid);
}

let change_host_free_memory ~xc required_mem_kib success_condition =
Expand Down
25 changes: 13 additions & 12 deletions test/squeeze_test.ml
Expand Up @@ -179,9 +179,9 @@ let scenario_a = {
should_succeed = true;
scenario_domains = [
new idealised_vm_with_limit
(domain_make 0 true 1000L 1500L 2000L 1500L 1500L 4L) 100L 1250L;
(domain_make 0 true false 1000L 1500L 2000L 1500L 1500L 4L) 100L 1250L;
new intermittently_stuck_vm
(domain_make 1 true 2500L 3500L 4500L 3500L 3500L 4L) 500L 0.25;
(domain_make 1 true false 2500L 3500L 4500L 3500L 3500L 4L) 500L 0.25;
];
host_free_mem_kib = 0L;
required_mem_kib = 1000L;
Expand All @@ -195,9 +195,9 @@ let scenario_b = {
should_succeed = true;
scenario_domains = [
new intermittently_stuck_vm
(domain_make 1 true 500L 3500L 4500L 3500L 3500L 4L) 100L 3.;
(domain_make 1 true false 500L 3500L 4500L 3500L 3500L 4L) 100L 3.;
new intermittently_stuck_vm
(domain_make 0 true 500L 1500L 2500L 1500L 1500L 4L) 100L 1.5;
(domain_make 0 true false 500L 1500L 2500L 1500L 1500L 4L) 100L 1.5;
];
host_free_mem_kib = 0L;
required_mem_kib = 1000L;
Expand All @@ -210,8 +210,8 @@ let scenario_c = {
freed";
should_succeed = false;
scenario_domains = [
new idealised_vm (domain_make 0 true 1000L 1500L 2000L 1500L 1500L 0L) 100L;
new idealised_vm (domain_make 1 true 2000L 2500L 3000L 2500L 2500L 0L) 100L;
new idealised_vm (domain_make 0 true false 1000L 1500L 2000L 1500L 1500L 0L) 100L;
new idealised_vm (domain_make 1 true false 2000L 2500L 3000L 2500L 2500L 0L) 100L;
];
host_free_mem_kib = 0L;
required_mem_kib = 1500L;
Expand All @@ -225,9 +225,9 @@ let scenario_d = {
should_succeed = false;
scenario_domains = [
new idealised_vm
(domain_make 0 true 1000L 1500L 2000L 1500L 1500L 0L) 100L;
(domain_make 0 true false 1000L 1500L 2000L 1500L 1500L 0L) 100L;
new idealised_vm_with_limit
(domain_make 1 true 2000L 2500L 3000L 2500L 2500L 0L) 100L 2250L;
(domain_make 1 true false 2000L 2500L 3000L 2500L 2500L 0L) 100L 2250L;
];
host_free_mem_kib = 0L;
required_mem_kib = 1000L;
Expand All @@ -244,10 +244,10 @@ let scenario_e = {
scenario_domains = [
(* The stuck domain is using more than it should be if the memory was freed and everything balanced *)
new stuck_vm
(domain_make 0 true (*min*)5000L (*target*)7000L (*max*)7000L (*actual*)7000L 7000L 0L);
(domain_make 0 true false (*min*)5000L (*target*)7000L (*max*)7000L (*actual*)7000L 7000L 0L);
(* The working domain is using less than it should be if the memory was freed and everything balanced *)
new idealised_vm
(domain_make 1 true (*min*)5000L (*target*)6000L (*max*)11000L (*actual*)6000L 6000L 0L) 100L;
(domain_make 1 true false (*min*)5000L (*target*)6000L (*max*)11000L (*actual*)6000L 6000L 0L) 100L;

];
host_free_mem_kib = 0L;
Expand Down Expand Up @@ -290,9 +290,9 @@ let scenario_h = {
should_succeed = true;
scenario_domains = [
new idealised_vm_with_upper_limit
(domain_make 0 true 1000L 1500L 2000L 1500L 1500L 4L) 100L 1500L;
(domain_make 0 true false 1000L 1500L 2000L 1500L 1500L 4L) 100L 1500L;
new idealised_vm
(domain_make 1 true 1000L 1500L 2000L 1500L 1500L 4L) 100L; (* this one can take up the slack *)
(domain_make 1 true false 1000L 1500L 2000L 1500L 1500L 4L) 100L; (* this one can take up the slack *)
];
host_free_mem_kib = 1000L;
required_mem_kib = 0L;
Expand Down Expand Up @@ -415,6 +415,7 @@ let simulate scenario =
execute_action = execute_action;
target_host_free_mem_kib = scenario.required_mem_kib;
free_memory_tolerance_kib = 0L;
declare_domain_stuck = (fun domid -> ());
} in

finally
Expand Down

0 comments on commit f0853cc

Please sign in to comment.