From bd12511cebd5a7c446883c5417509bac2c657963 Mon Sep 17 00:00:00 2001 From: Sotiris Fragkiskos Date: Mon, 7 Dec 2015 17:58:31 +0200 Subject: [PATCH] anonymize function attempt #1 --- common_module.py | 39 +++++++++++++++++++++++++++++++++++++-- plugin_sge.py | 19 +++++++++++++------ qtop.py | 6 ++++-- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/common_module.py b/common_module.py index 01c2633..e6b076a 100755 --- a/common_module.py +++ b/common_module.py @@ -5,6 +5,7 @@ from tempfile import mkstemp import os import re +from itertools import count import errno try: import ujson as json @@ -58,6 +59,7 @@ class StatMaker: def __init__(self, config): self.l = list() self.config = config + self.anonymize = anonymize_func() self.stat_mapping = { 'txtyaml': (self.stat_write_lines, {}, 'yaml'), @@ -339,5 +341,38 @@ def get_jobs_info(fn, write_method=options.write_method): return job_ids, usernames, job_states, queue_names -def anonymize(strng): - return strng \ No newline at end of file +def anonymize_func(): + """ + creates and returns an _anonymize_func object (closure) + """ + counters = {} + stored_dict = {} + for key in ['users', 'wns', 'qs']: + counters[key] = count() + + maps = { + 'users': '_anon_user_', + 'wns': '_anon_wn_', + 'qs': '_anon_q_' + } + + def _anonymize_func(s, a_type, stored_dict=stored_dict): + """ + d4-p4-04 --> d_anon_wn_0 + d4-p4-05 --> d_anon_wn_1 + biomed017--> b_anon_user_0 + alice --> a_anon_q_0 + """ + dup_counter = counters[a_type] + + s_type = maps[a_type] + cnt = '0' + new_name_parts = [s[0], s_type, cnt] + if s not in stored_dict: + cnt = str(dup_counter.next()) + new_name_parts.pop() + new_name_parts.append(cnt) + stored_dict.setdefault(s, (''.join(new_name_parts), s_type)) + return stored_dict[s][0] + + return _anonymize_func diff --git a/plugin_sge.py b/plugin_sge.py index a510a4a..db100d5 100755 --- a/plugin_sge.py +++ b/plugin_sge.py @@ -1,6 +1,6 @@ __author__ = 'sfranky' from xml.etree import ElementTree as etree -from common_module import logging, check_empty_file, StatMaker, get_new_temp_file, options, anonymize +from common_module import logging, check_empty_file, StatMaker, get_new_temp_file, options, anonymize_func import os @@ -24,6 +24,7 @@ def extract_job_info(elem, elem_text): def _get_statq_from_xml(fn, write_method=options.write_method): logging.debug("Parsing tree of %s" % fn) check_empty_file(fn) + anonymize = anonymize_func() with open(fn, mode='rb') as fin: try: tree = etree.parse(fin) @@ -43,7 +44,7 @@ def _get_statq_from_xml(fn, write_method=options.write_method): queue_names = queue_elem.findall('resource') for _queue_name in queue_names: if _queue_name.attrib.get('name') == 'qname': - queue_name = _queue_name.text + queue_name = _queue_name.text if not options.ANONYMIZE else anonymize(_queue_name.text, 'qs') break else: raise ValueError("No such resource") @@ -125,7 +126,8 @@ def convert_qstat_to_yaml(self, orig_file, out_file, write_method): queue_name_elems = queue_elem.findall('resource') for queue_name_elem in queue_name_elems: if queue_name_elem.attrib.get('name') == 'qname': - queue_name = queue_name_elem.text + queue_name = queue_name_elem.text if not options.ANONYMIZE else self.anonymize(queue_name_elem.text, 'qs') + # import wdb; wdb.set_trace() break else: raise ValueError("No such queue name") @@ -152,7 +154,7 @@ def _extract_job_info(self, elem, elem_text, queue_name): qstat_values = dict() qstat_values['JobId'] = subelem.find('./JB_job_number').text qstat_values['UnixAccount'] = subelem.find('./JB_owner').text \ - if not options.ANONYMIZE else anonymize(subelem.find('./JB_owner').text) + if not options.ANONYMIZE else self.anonymize(subelem.find('./JB_owner').text, 'users') qstat_values['S'] = subelem.find('./state').text qstat_values['Queue'] = queue_name self.l.append(qstat_values) @@ -177,6 +179,7 @@ def dump_all(fd, write_func_args): def _calc_everything(fn, write_method): logging.debug('Parsing tree of %s' % fn) + anonymize = anonymize_func() with open(fn, 'rb') as fin: tree = etree.parse(fin) root = tree.getroot() @@ -196,11 +199,15 @@ def _calc_everything(fn, write_method): # worker_node.setdefault('qname', []) for resource in resources: if resource.attrib.get('name') == 'hostname': - worker_node['domainname'] = resource.text + worker_node['domainname'] = resource.text if not options.ANONYMIZE else anonymize(resource.text, 'wns') count += 1 if count == 2: break elif resource.attrib.get('name') == 'qname': - worker_node['qname'] = set(resource.text[0]) if slots_used else set() + if not slots_used: + worker_node['qname'] = set() + else: + worker_node['qname'] = set(resource.text[0]) \ + if not options.ANONYMIZE else set(anonymize(resource.text[0], 'qs')) count += 1 if count == 2: break else: diff --git a/qtop.py b/qtop.py index 8566573..bda8890 100755 --- a/qtop.py +++ b/qtop.py @@ -27,7 +27,7 @@ # modules from constants import * import common_module -from common_module import logging, options, sections_off +from common_module import logging, options, sections_off, anonymize_func import plugin_pbs, plugin_oar, plugin_sge from plugin_pbs import * from plugin_oar import * @@ -139,7 +139,7 @@ def calculate_cluster(worker_nodes): _all_letters = [] _all_str_digits_with_empties = [] - re_nodename = r'(^[A-Za-z0-9-]+)(?=\.|$)' + re_nodename = r'(^[A-Za-z0-9-]+)(?=\.|$)' if not options.ANONYMIZE else r'\w_anon_\w+' for node in worker_nodes: nodename_match = re.search(re_nodename, node['domainname']) _nodename = nodename_match.group(0) @@ -1608,11 +1608,13 @@ def safe_exit_with_file_close(handle, name, stdout, delete_file=False): input_filenames = get_input_filenames() # reset_yaml_files() # either that or having a pid appended in the filename + # anonymize = anonymize_func() if not options.YAML_EXISTS: convert_to_yaml(scheduler, INPUT_FNs_commands, input_filenames) yaml_files = get_yaml_files(scheduler, input_filenames) worker_nodes = get_worker_nodes(scheduler)(*yaml_files['get_worker_nodes']) + # import wdb; wdb.set_trace() job_ids, user_names, job_states, _ = get_jobs_info(scheduler)(*yaml_files['get_jobs_info']) total_running_jobs, total_queued_jobs, qstatq_lod = get_queues_info(scheduler)(*yaml_files['get_queues_info']) deprecate_old_yaml_files(*yaml_files['get_jobs_info'])