Skip to content

Commit

Permalink
anonymize function attempt #1
Browse files Browse the repository at this point in the history
  • Loading branch information
sfranky committed Dec 7, 2015
1 parent fe3c443 commit bd12511
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 10 deletions.
39 changes: 37 additions & 2 deletions common_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from tempfile import mkstemp
import os
import re
from itertools import count
import errno
try:
import ujson as json
Expand Down Expand Up @@ -58,6 +59,7 @@ class StatMaker:
def __init__(self, config):
self.l = list()
self.config = config
self.anonymize = anonymize_func()

self.stat_mapping = {
'txtyaml': (self.stat_write_lines, {}, 'yaml'),
Expand Down Expand Up @@ -339,5 +341,38 @@ def get_jobs_info(fn, write_method=options.write_method):
return job_ids, usernames, job_states, queue_names


def anonymize(strng):
return strng
def anonymize_func():
"""
creates and returns an _anonymize_func object (closure)
"""
counters = {}
stored_dict = {}
for key in ['users', 'wns', 'qs']:
counters[key] = count()

maps = {
'users': '_anon_user_',
'wns': '_anon_wn_',
'qs': '_anon_q_'
}

def _anonymize_func(s, a_type, stored_dict=stored_dict):
"""
d4-p4-04 --> d_anon_wn_0
d4-p4-05 --> d_anon_wn_1
biomed017--> b_anon_user_0
alice --> a_anon_q_0
"""
dup_counter = counters[a_type]

s_type = maps[a_type]
cnt = '0'
new_name_parts = [s[0], s_type, cnt]
if s not in stored_dict:
cnt = str(dup_counter.next())
new_name_parts.pop()
new_name_parts.append(cnt)
stored_dict.setdefault(s, (''.join(new_name_parts), s_type))
return stored_dict[s][0]

return _anonymize_func
19 changes: 13 additions & 6 deletions plugin_sge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__author__ = 'sfranky'
from xml.etree import ElementTree as etree
from common_module import logging, check_empty_file, StatMaker, get_new_temp_file, options, anonymize
from common_module import logging, check_empty_file, StatMaker, get_new_temp_file, options, anonymize_func
import os


Expand All @@ -24,6 +24,7 @@ def extract_job_info(elem, elem_text):
def _get_statq_from_xml(fn, write_method=options.write_method):
logging.debug("Parsing tree of %s" % fn)
check_empty_file(fn)
anonymize = anonymize_func()
with open(fn, mode='rb') as fin:
try:
tree = etree.parse(fin)
Expand All @@ -43,7 +44,7 @@ def _get_statq_from_xml(fn, write_method=options.write_method):
queue_names = queue_elem.findall('resource')
for _queue_name in queue_names:
if _queue_name.attrib.get('name') == 'qname':
queue_name = _queue_name.text
queue_name = _queue_name.text if not options.ANONYMIZE else anonymize(_queue_name.text, 'qs')
break
else:
raise ValueError("No such resource")
Expand Down Expand Up @@ -125,7 +126,8 @@ def convert_qstat_to_yaml(self, orig_file, out_file, write_method):
queue_name_elems = queue_elem.findall('resource')
for queue_name_elem in queue_name_elems:
if queue_name_elem.attrib.get('name') == 'qname':
queue_name = queue_name_elem.text
queue_name = queue_name_elem.text if not options.ANONYMIZE else self.anonymize(queue_name_elem.text, 'qs')
# import wdb; wdb.set_trace()
break
else:
raise ValueError("No such queue name")
Expand All @@ -152,7 +154,7 @@ def _extract_job_info(self, elem, elem_text, queue_name):
qstat_values = dict()
qstat_values['JobId'] = subelem.find('./JB_job_number').text
qstat_values['UnixAccount'] = subelem.find('./JB_owner').text \
if not options.ANONYMIZE else anonymize(subelem.find('./JB_owner').text)
if not options.ANONYMIZE else self.anonymize(subelem.find('./JB_owner').text, 'users')
qstat_values['S'] = subelem.find('./state').text
qstat_values['Queue'] = queue_name
self.l.append(qstat_values)
Expand All @@ -177,6 +179,7 @@ def dump_all(fd, write_func_args):

def _calc_everything(fn, write_method):
logging.debug('Parsing tree of %s' % fn)
anonymize = anonymize_func()
with open(fn, 'rb') as fin:
tree = etree.parse(fin)
root = tree.getroot()
Expand All @@ -196,11 +199,15 @@ def _calc_everything(fn, write_method):
# worker_node.setdefault('qname', [])
for resource in resources:
if resource.attrib.get('name') == 'hostname':
worker_node['domainname'] = resource.text
worker_node['domainname'] = resource.text if not options.ANONYMIZE else anonymize(resource.text, 'wns')
count += 1
if count == 2: break
elif resource.attrib.get('name') == 'qname':
worker_node['qname'] = set(resource.text[0]) if slots_used else set()
if not slots_used:
worker_node['qname'] = set()
else:
worker_node['qname'] = set(resource.text[0]) \
if not options.ANONYMIZE else set(anonymize(resource.text[0], 'qs'))
count += 1
if count == 2: break
else:
Expand Down
6 changes: 4 additions & 2 deletions qtop.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
# modules
from constants import *
import common_module
from common_module import logging, options, sections_off
from common_module import logging, options, sections_off, anonymize_func
import plugin_pbs, plugin_oar, plugin_sge
from plugin_pbs import *
from plugin_oar import *
Expand Down Expand Up @@ -139,7 +139,7 @@ def calculate_cluster(worker_nodes):
_all_letters = []
_all_str_digits_with_empties = []

re_nodename = r'(^[A-Za-z0-9-]+)(?=\.|$)'
re_nodename = r'(^[A-Za-z0-9-]+)(?=\.|$)' if not options.ANONYMIZE else r'\w_anon_\w+'
for node in worker_nodes:
nodename_match = re.search(re_nodename, node['domainname'])
_nodename = nodename_match.group(0)
Expand Down Expand Up @@ -1608,11 +1608,13 @@ def safe_exit_with_file_close(handle, name, stdout, delete_file=False):
input_filenames = get_input_filenames()

# reset_yaml_files() # either that or having a pid appended in the filename
# anonymize = anonymize_func()
if not options.YAML_EXISTS:
convert_to_yaml(scheduler, INPUT_FNs_commands, input_filenames)
yaml_files = get_yaml_files(scheduler, input_filenames)

worker_nodes = get_worker_nodes(scheduler)(*yaml_files['get_worker_nodes'])
# import wdb; wdb.set_trace()
job_ids, user_names, job_states, _ = get_jobs_info(scheduler)(*yaml_files['get_jobs_info'])
total_running_jobs, total_queued_jobs, qstatq_lod = get_queues_info(scheduler)(*yaml_files['get_queues_info'])
deprecate_old_yaml_files(*yaml_files['get_jobs_info'])
Expand Down

0 comments on commit bd12511

Please sign in to comment.