-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve ncd status logging #13
Changes from 6 commits
32f6945
df625a3
d8790c2
1bbdf9d
94421f2
a3f6a8c
0fe244c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -268,8 +268,10 @@ use LC::Exception qw ( throw_error); | |
use EDG::WP4::CCM::CacheManager; | ||
use CDISPD::Application; | ||
use CDISPD::Utils qw(COMP_CONFIG_PATH compare_profiles add_component clean_ICList); | ||
use CAF::FileEditor; | ||
|
||
use constant CONFIG_ROOT => "/"; | ||
use constant NCD_EXECUTABLE => "/usr/sbin/ncm-ncd"; | ||
|
||
our $this_app; | ||
our %SIG; | ||
|
@@ -563,7 +565,7 @@ sub launch_ncd { | |
|
||
my $result = 0; # Assume success | ||
|
||
my @cmd = ( '/usr/sbin/ncm-ncd', '--configure' ); | ||
my $p = CAF::Process->new([NCD_EXECUTABLE, '--configure'] , log => $this_app ); | ||
if ( defined( $this_app->{ICLIST} ) && scalar(@{$this_app->{ICLIST}}) ) { | ||
# At this point, ICLIST should contain only components present in the last profile received. | ||
# The only case where a component may be in the list without being part of the configuration | ||
|
@@ -574,69 +576,104 @@ sub launch_ncd { | |
# X failed with profile N+1) and thus X removal is not detected. | ||
# As a result, X remains on the list of component to run. This should be harmless as ncm-ncd will ignore it. | ||
# This is probably rare enough to avoid complex processing to handle this in ncm-dispd. | ||
push @cmd, @{ $this_app->{ICLIST} }; | ||
$p->pushargs(@{$this_app->{ICLIST}}); | ||
} else { | ||
$this_app->info("no components to be run by NCM - ncm-ncd won't be called"); | ||
return (0); | ||
} | ||
|
||
# ncd options | ||
if ( $this_app->option('state') ) { | ||
push @cmd, "--state", $this_app->option('state'); | ||
$p->pushargs("--state", $this_app->option('state')); | ||
} | ||
if ( $this_app->option('ncd-retries') ) { | ||
push @cmd, "--retries", $this_app->option('ncd-retries'); | ||
$p->pushargs("--retries", $this_app->option('ncd-retries')); | ||
} | ||
if ( $this_app->option('ncd-timeout') ) { | ||
push @cmd, "--timeout", $this_app->option('ncd-timeout'); | ||
$p->pushargs("--timeout", $this_app->option('ncd-timeout')); | ||
} | ||
if ( $this_app->option('ncd-useprofile') ) { | ||
push @cmd, "--useprofile", $this_app->option('ncd-useprofile'); | ||
$p->pushargs("--useprofile", $this_app->option('ncd-useprofile')); | ||
} | ||
|
||
my $cmd_line = join( " ", @cmd ); | ||
if ( $this_app->option('noaction') ) { | ||
$this_app->info( "would run (noaction mode): " . $cmd_line ); | ||
$this_app->info( "would run (noaction mode): $p"); | ||
} else { | ||
$this_app->info( "about to run: " . $cmd_line ); | ||
my $verb = $cmd[0]; | ||
if ( -x $verb ) { | ||
$this_app->info( "about to run: $p"); | ||
if ( $p->is_executable() ) { | ||
# Delay processing of some signals | ||
delay_signals(); | ||
|
||
# Execute ncm-ncd and report exit status | ||
my $p = CAF::Process->new( \@cmd, log => $this_app ); | ||
sub act { | ||
my ($logger, $message) = @_; | ||
$logger->debug(1, $message); | ||
$logger->verbose($message); | ||
} | ||
my $errormsg = $p->stream_output(\&act, mode => 'line', arguments => [$this_app]); | ||
|
||
my $ec = $?; | ||
$this_app->debug(3, "ncm-ncd finished with full message $errormsg"); | ||
my $msg = "ncm-ncd finished with status: exitcode ". ($ec >> 8) . " (ec $ec)"; | ||
my $msg = "ncm-ncd finished with status: ". ($ec >> 8) . " (ec $ec"; | ||
my $log_level = 'info'; | ||
if ( $ec ) { | ||
$msg .= " (some configuration modules failed to run successfully)"; | ||
log_failed_components(); | ||
$log_level = 'warn'; | ||
$msg .= ", some configuration modules failed to run successfully)"; | ||
$result = 1; | ||
} else { | ||
$msg .= " (all configuration modules ran successfully)"; | ||
$msg .= ", all configuration modules ran successfully)"; | ||
} | ||
$this_app->info($msg); | ||
$this_app->$log_level($msg); | ||
|
||
# Process delayed signal if any and reestablish | ||
# immediate processing of signals | ||
process_signal(); | ||
immediate_signals(); | ||
|
||
} else { | ||
$this_app->error("Command $verb not found"); | ||
$this_app->error("Command ". ${$p->get_command()}[0] . " not found or not executable"); | ||
$result = 1; | ||
} | ||
} | ||
return $result; | ||
} | ||
|
||
|
||
# log_failed_components() | ||
# | ||
# Scan the ncm-ncd component state directory and for each failed component | ||
# (component with an entry in the directory), print a line with the name of the | ||
# component and the failure reason. | ||
# | ||
# Arguments: none | ||
# | ||
# Return value: none | ||
# | ||
sub log_failed_components { | ||
|
||
unless ( $this_app->option('state') ) { | ||
$this_app->debug(1,"No component state file defined: cannot list failed components"); | ||
return; | ||
} | ||
|
||
my $comp_state_dir = $this_app->option('state'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so i wasn't aware of the state dir, but i just looked at the default on one of our nodes
so this contains 2 files, one empty one and one with just a newline (not really useful); but they have very different timestamps. we should make sure that this dir is cleaned up etc etc; the documentations suggests it's only cleaned up when components become inactive, so we'd need some support of "recent" failures. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May be we should fix the documentation but for me (checked on my test system), there is a file left by ncm-ncd only for failed components (components with errors, not warning) and the file is removed after a succesful execution of the component. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not that the file is created when ncm-ncd determined that the component should be run and is empty until it actually ran and encounters some error. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so what i have is some leftovers i guess. well, if it causes issues, i'll open an issue with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only issue you'll have with leftover if that they will appear as failed components even if they are not... but only in the case they are other failed components (ncm-ncd returns a non 0 exit status). When ncm-ncd succeeds, there is no attempt to look at the state directory. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I confirm that |
||
if ( opendir(my $dh, $comp_state_dir) ) { | ||
my @comps = grep { -f "$comp_state_dir/$_" } readdir($dh); | ||
$this_app->warn("No failed component found in the component state directory ($comp_state_dir)") if ( @comps == 0 ); | ||
foreach my $component (sort(@comps)) { | ||
my $fh = CAF::FileEditor->new("$comp_state_dir/$component"); | ||
my $comp_msg = "$fh"; | ||
chomp $comp_msg; | ||
$this_app->warn("Component $component failed with message: $comp_msg"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe deal with empty There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should not happen... but if there is an empty message, the file with be empty and $fh will be an empty string so that should not cause an error. the fact that there is no message after ':' will mean the message was empty. Is it work building a different log message when the message is empty (making less clear it was empty)? If you think that yes, I'll happily do it! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, if this is some leftover on our side, and empty is the exception, this is ok for me. |
||
$fh->close(); | ||
} | ||
closedir $dh; | ||
} else { | ||
$this_app->error("Failed to open component state directory ($comp_state_dir)"); | ||
} | ||
|
||
} | ||
|
||
|
||
#------------------------------------------------------------ | ||
# main loop | ||
#------------------------------------------------------------ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm, if this is going to be frequently used, i propose to add to CAF (one or more) of:
$p->get_executable()
is_executable
to log the failure$p->execute_if_executable()
(better to find shorter name likeeie
😄 ) which is basicallyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 for
get_executable()
... In fact in another issue I advocated for this!As for the
execute_if_exists()
proposal, I'm wondering why it would not be the standard behaviour forexecute()
. Is there really a use case to executing a non executable command?Probably better to open an issue in CAF for this discussion...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe use the constant
NCD_EXECUTABLE
here?