diff --git a/scala_helpers/MeasureDataQuality.scala b/scala_helpers/MeasureDataQuality.scala
new file mode 100644
index 0000000..91ffe34
--- /dev/null
+++ b/scala_helpers/MeasureDataQuality.scala
@@ -0,0 +1,77 @@
+/**
+ * Investigate various output quality measures supplied by ARX
+ *
+ */
+package examples.example_3
+
+
+import base.DefaultConfiguration
+import org.deidentifier.arx.Data
+import postprocessor.ResultPrinter.printHandleTop
+
+//import scala.collection.JavaConversions._
+//import collection.convert.ImplicitConversionsToScala.`map AsScala`
+import collection.JavaConverters.* // asScala
+import collection.convert.ImplicitConversions.*
+import java.io.File
+import java.nio.charset.Charset
+
+object MeasureDataQuality extends App {
+
+  def loadData: Tuple2[Data, Data] = {
+
+    val dataFileOrg: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1")
+    val dataOrg: Data = Data.create(dataFileOrg, Charset.defaultCharset, ',')
+
+    val dataFileDist: File = new File("/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-2")
+    val dataDist: Data = Data.create(dataFileDist, Charset.defaultCharset, ',')
+
+    require(dataOrg.getHandle.getNumRows == dataDist.getHandle.getNumRows)
+    require(dataOrg.getHandle.getNumColumns == dataDist.getHandle.getNumColumns)
+
+    // report the shape of the loaded datasets
+    System.out.println(s"Number of rows ${dataOrg.getHandle.getNumRows}")
+    System.out.println(s"Number of cols ${dataOrg.getHandle.getNumColumns}")
+
+    printHandleTop(handle = dataOrg.getHandle, n = 5)
+    System.out.println("Done...")
+
+    (dataOrg, dataDist)
+  }
+
+  def experiment1: Unit = {
+
+    val data = loadData
+
+    val dataHandleOrg = data._1.getHandle
+    val dataHandleDist = data._2.getHandle
+
+    val summaryStatsDist = dataHandleDist.getStatistics().getSummaryStatistics(true)
+    val summaryStatsOrg = dataHandleOrg.getStatistics().getSummaryStatistics(true)
+    // getEquivalenceClassStatistics()
+
+    for ((key, value) <- summaryStatsDist) {
+      println(s"Column: ${key}")
+      println("-----------------------Distorted/Original")
+      println(s"distinctNumberOfValues ${value.getNumberOfDistinctValuesAsString}/${summaryStatsOrg.get(key).getNumberOfDistinctValuesAsString}")
+      println(s"Mode ${value.getModeAsString}/${summaryStatsOrg.get(key).getModeAsString}")
+      if (value.isMaxAvailable) {
+        println(s"Max ${value.getMaxAsString}/${summaryStatsOrg.get(key).getMaxAsString}")
+        println(s"Min ${value.getMinAsString}/${summaryStatsOrg.get(key).getMinAsString}")
+      }
+    }
+  }
+
+  def runKAnonymity: Unit = {
+
+    val data = loadData
+
+    // create the hierarchies for the ethnicity
+    // and salary columns
+
+  }
+
+  // execute Experiment 1
+  experiment1
+
+}
diff --git a/scala_helpers/build.sbt b/scala_helpers/build.sbt
new file mode 100644
index 0000000..7cbf069
--- /dev/null
+++ b/scala_helpers/build.sbt
@@ -0,0 +1,8 @@
+name := "data_anonymizer_scala"
+
+version := "0.1"
+
+scalaVersion := "3.0.2"
+
+libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.10"
+libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.10" % "test"
diff --git a/src/algorithms/q_learning.py b/src/algorithms/q_learning.py
index 65a5a96..b011e12 100644
--- a/src/algorithms/q_learning.py
+++ b/src/algorithms/q_learning.py
@@ -74,7 +74,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:
         # set the q_table for the policy
         self.config.policy.q_table = self.q_table
 
-        total_dist = env.total_average_current_distortion()
+        total_dist = env.total_current_distortion()
         while stop_criterion.continue_itr(total_dist):
 
             if stop_criterion.iteration_counter == 12:
@@ -87,7 +87,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:
             print("{0} At state={1} with distortion={2} select action={3}".format("INFO: ", state_idx, total_dist,
                                                                                    action.column_name + "-" + action.action_type.name))
             env.step(action=action)
-            total_dist = env.total_average_current_distortion()
+            total_dist = env.total_current_distortion()
 
     def train(self, env: Env, **options) -> tuple:
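
Note on the rename above: the old name `total_average_current_distortion` reflected a per-column mean (the commented-out `np.mean` variant still visible in `discrete_state_environment.py` further down), while `total_current_distortion` reports whatever aggregate the configured distortion calculator returns. A minimal sketch of the distinction, using hypothetical column distances:

```python
# Hypothetical per-column distances, mirroring env.column_distances.
column_distances = {"ethnicity": 0.2, "salary": 0.05, "diagnosis": 0.0}

# Old semantics: the mean over columns (the commented-out np.mean variant).
average_distortion = sum(column_distances.values()) / len(column_distances)  # ~0.083

# New semantics: the calculator's aggregate over the distance list;
# a plain sum is assumed here purely for illustration.
total_distortion = sum(column_distances.values())  # 0.25
```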
diff --git a/src/datasets/dataset_wrapper.py b/src/datasets/dataset_wrapper.py
index 55f6cd5..92c6cd7 100644
--- a/src/datasets/dataset_wrapper.py
+++ b/src/datasets/dataset_wrapper.py
@@ -29,7 +29,6 @@ def read(self, filename: Path, **options) -> None:
 
 
 class PandasDSWrapper(DSWrapper[pd.DataFrame]):
-
     """
     Simple wrapper to a pandas DataFrame object.
     Facilitates various actions on the original dataset
@@ -60,15 +59,15 @@ def n_columns(self) -> int:
     def schema(self) -> dict:
         return pd.io.json.build_table_schema(self.ds)
 
-    def save_to_csv(self, filename: Path) -> None:
+    def save_to_csv(self, filename: Path, save_index: bool) -> None:
         """
         Save the underlying dataset in a csv format
         :param filename:
         :return:
         """
-        self.ds.to_csv(filename)
+        self.ds.to_csv(filename, index=save_index)
 
-    def read(self, filename: Path, **options) -> None:
+    def read(self, filename: Path, **options) -> None:
         """
         Load a data set from a file
         :param filename:
@@ -145,14 +144,14 @@ def get_column(self, col_name: str):
         return self.ds.loc[:, col_name]
 
     def get_column_unique_values(self, col_name: str):
-        """
+        """
         Returns the unique values for the column
         :param col_name:
         :return:
         """
-        col = self.get_column(col_name=col_name)
-        vals = col.values.ravel()
-        return pd.unique(vals)
+        col = self.get_column(col_name=col_name)
+        vals = col.values.ravel()
+        return pd.unique(vals)
 
     def get_columns_types(self):
         return list(self.ds.dtypes)
@@ -181,8 +180,3 @@ def apply_column_transform(self, column_name: str, transform: Transform) -> None:
         column = self.get_column(col_name=column_name)
         column = transform.act(**{"data": column.values})
         self.ds[transform.column_name] = column
-
-
-
-
-
diff --git a/src/examples/__init__.py b/src/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/examples/create_hierarchies_arx.py b/src/examples/create_hierarchies_arx.py
new file mode 100644
index 0000000..22904c5
--- /dev/null
+++ b/src/examples/create_hierarchies_arx.py
@@ -0,0 +1,66 @@
+"""
+This example shows how to create hierarchies suitable
+for loading into the ARX tool
+"""
+import csv
+from src.datasets.datasets_loaders import MockSubjectsLoader
+
+
+def get_ethnicity_hierarchy():
+
+    ethnicity_hierarchy = {}
+
+    ethnicity_hierarchy["Mixed White/Asian"] = ["White/Asian", "Mixed"]
+    ethnicity_hierarchy["Chinese"] = ["Asian", "Asian"]
+    ethnicity_hierarchy["Indian"] = ["Asian", "Asian"]
+    ethnicity_hierarchy["Mixed White/Black African"] = ["White/Black", "Mixed"]
+    ethnicity_hierarchy["Black African"] = ["Black", "African"]
+    ethnicity_hierarchy["Asian other"] = ["Asian", "Other"]
+    ethnicity_hierarchy["Black other"] = ["Black", "Other"]
+    ethnicity_hierarchy["Mixed White/Black Caribbean"] = ["White/Black", "Mixed"]
+    ethnicity_hierarchy["Mixed other"] = ["Mixed", "Mixed"]
+    ethnicity_hierarchy["Arab"] = ["Asian", "Asian"]
+    ethnicity_hierarchy["White Irish"] = ["Irish", "European"]
+    ethnicity_hierarchy["Not stated"] = ["Not stated", "Not stated"]
+    ethnicity_hierarchy["White Gypsy/Traveller"] = ["White", "White"]
+    ethnicity_hierarchy["White British"] = ["British", "European"]
+    ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"]
+    ethnicity_hierarchy["White other"] = ["White", "White"]
+    ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"]
+    ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"]
+
+    return ethnicity_hierarchy
+
+
+if __name__ == '__main__':
+
+    # specify the columns to drop
+    drop_columns = MockSubjectsLoader.FEATURES_DROP_NAMES + ["preventative_treatment", "gender",
+                                                             "education", "mutation_status"]
+    MockSubjectsLoader.FEATURES_DROP_NAMES = drop_columns
+
+    # normalize the salary column
+    MockSubjectsLoader.NORMALIZED_COLUMNS = ["salary"]
+
+    # specify the column types to use
+    MockSubjectsLoader.COLUMNS_TYPES = {"ethnicity": str, "salary": float, "diagnosis": int}
+    ds = MockSubjectsLoader()
+
+    ethnicity_map = get_ethnicity_hierarchy()
+    # loop over the values of the ethnicity column
+    # and create the hierarchy file
+    filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
+    with open(filename, 'w') as fh:
+        writer = csv.writer(fh, delimiter=",")
+
+        ethnicity_column = ds.get_column(col_name="ethnicity").values
+
+        for val in ethnicity_column:
+
+            if val not in ethnicity_map:
+                raise ValueError("Value {0} not in ethnicity map".format(val))
+
+            row = [val]
+            row.extend(ethnicity_map[val])
+            writer.writerow(row)
+
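
The script above writes one hierarchy row per dataset record, with the raw value in the first column and progressively generalized levels to the right, which is the layout ARX expects when importing hierarchies. A hedged sketch of reading the file back for inspection (the shortened path is illustrative):

```python
import csv

# Each row is [raw_value, level_1, level_2]; duplicate rows occur because
# the script iterates over the full column rather than its unique values.
with open("ethnicity_hierarchy.csv") as fh:
    hierarchy = {row[0]: row[1:] for row in csv.reader(fh)}

print(hierarchy["Chinese"])  # ['Asian', 'Asian']
```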
["White", "White"] + ethnicity_hierarchy["White British"] = ["British", "European"] + ethnicity_hierarchy["Bangladeshi"] = ["Asian", "Asian"] + ethnicity_hierarchy["White other"] = ["White", "White"] + ethnicity_hierarchy["Black Caribbean"] = ["Black", "Caribbean"] + ethnicity_hierarchy["Pakistani"] = ["Asian", "Asian"] + + return ethnicity_hierarchy + + +if __name__ == '__main__': + + # specify the columns to drop + drop_columns = MockSubjectsLoader.FEATURES_DROP_NAMES + ["preventative_treatment", "gender", + "education", "mutation_status"] + MockSubjectsLoader.FEATURES_DROP_NAMES = drop_columns + + # do a salary normalization + MockSubjectsLoader.NORMALIZED_COLUMNS = ["salary"] + + # specify the columns to use + MockSubjectsLoader.COLUMNS_TYPES = {"ethnicity": str, "salary": float, "diagnosis": int} + ds = MockSubjectsLoader() + + ehnicity_map = get_ethnicity_hierarchy() + # get the ethincity column loop over + # the values and create the hierarchy file + filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv" + with open(filename, 'w') as fh: + writer = csv.writer(fh, delimiter=",") + + ethnicity_column = ds.get_column(col_name="ethnicity").values + + for val in ethnicity_column: + + if val not in ehnicity_map: + raise ValueError("Value {0} not in ethnicity map") + + row = [val] + row.extend(ehnicity_map[val]) + writer.writerow(row) + diff --git a/src/examples/qlearning_three_columns.py b/src/examples/qlearning_three_columns.py index b3bf642..0f6abb4 100644 --- a/src/examples/qlearning_three_columns.py +++ b/src/examples/qlearning_three_columns.py @@ -151,11 +151,10 @@ def get_ethinicity_hierarchy(): # create the environment env = DiscreteStateEnvironment(env_config=env_config) env.reset() - env.save_current_dataset(episode_index=-1) - # save the original dataset for comparison - env.save_current_dataset(episode_index=-1) - env.reset() + # save the data before distortion so that we can + # later load it on ARX + env.save_current_dataset(episode_index=-1, save_index=False) # configuration for the Q-learner algo_config = QLearnConfig() @@ -195,7 +194,8 @@ def get_ethinicity_hierarchy(): stop_criterion = IterationControl(n_itrs=10, min_dist=MIN_DISTORTION, max_dist=MAX_DISTORTION) agent.play(env=env, stop_criterion=stop_criterion) - env.save_current_dataset(episode_index=-2) - + env.save_current_dataset(episode_index=-2, save_index=False) + print("{0} Done....".format(INFO)) + print("=============================================") diff --git a/src/spaces/discrete_state_environment.py b/src/spaces/discrete_state_environment.py index a2eda09..2dfae51 100644 --- a/src/spaces/discrete_state_environment.py +++ b/src/spaces/discrete_state_environment.py @@ -128,14 +128,16 @@ def n_states(self) -> int: def get_action(self, aidx: int) -> ActionBase: return self.config.action_space[aidx] - def save_current_dataset(self, episode_index: int) -> None: + def save_current_dataset(self, episode_index: int, save_index: bool = False) -> None: """ Save the current distorted datase for the given episode index :param episode_index: + :param save_index: :return: """ self.distorted_data_set.save_to_csv( - filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index))) + filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)), + save_index=save_index) def create_bins(self) -> None: """ @@ -216,15 +218,14 @@ def apply_action(self, action: ActionBase): self.column_distances[action.column_name] = distance - def total_average_current_distortion(self) -> 
diff --git a/src/spaces/discrete_state_environment.py b/src/spaces/discrete_state_environment.py
index a2eda09..2dfae51 100644
--- a/src/spaces/discrete_state_environment.py
+++ b/src/spaces/discrete_state_environment.py
@@ -128,14 +128,16 @@ def n_states(self) -> int:
     def get_action(self, aidx: int) -> ActionBase:
         return self.config.action_space[aidx]
 
-    def save_current_dataset(self, episode_index: int) -> None:
+    def save_current_dataset(self, episode_index: int, save_index: bool = False) -> None:
         """
         Save the current distorted dataset for the given episode index
         :param episode_index:
+        :param save_index:
         :return:
         """
         self.distorted_data_set.save_to_csv(
-            filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)))
+            filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)),
+            save_index=save_index)
 
     def create_bins(self) -> None:
         """
@@ -216,15 +218,14 @@ def apply_action(self, action: ActionBase):
 
         self.column_distances[action.column_name] = distance
 
-    def total_average_current_distortion(self) -> float:
+    def total_current_distortion(self) -> float:
         """
-        Calculates the average total distortion of the dataset
-        by summing over the current computed distances for each column
+        Calculates the current total distortion of the dataset.
         :return:
         """
         return self.config.distortion_calculator.total_distortion(
-            list(self.column_distances.values()))  # float(np.mean(list(self.column_distances.values())))
+            list(self.column_distances.values()))
 
     def reset(self, **options) -> TimeStep:
         """
@@ -270,7 +271,7 @@ def step(self, action: ActionBase) -> TimeStep:
         self.apply_action(action=action)
 
         # calculate the distortion of the dataset
-        current_distortion = self.total_average_current_distortion()
+        current_distortion = self.total_current_distortion()
 
         # get the reward for the current distortion
         reward = self.config.reward_manager.get_reward_for_state(state=current_distortion, **{"action": action})
@@ -312,6 +313,7 @@ def step(self, action: ActionBase) -> TimeStep:
 
         # TODO: these modifications will cause the agent to always
         # move close to transition points
+        # TODO: Remove the magic constants
         if next_state is not None and self.current_time_step.observation is not None:
             if next_state < min_dist_bin <= self.current_time_step.observation:
                 # the agent chose to step into the chaos again
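
Regarding the new TODO about magic constants: `min_dist_bin` is a discrete state index produced by binning the continuous distortion, so `next_state < min_dist_bin` means the agent has fallen back below the minimum acceptable distortion. A minimal sketch of the binning idea, assuming `create_bins` discretizes distortion over [0, 1] with numpy (the actual implementation is not shown in this diff):

```python
import numpy as np

# Hypothetical reconstruction: split distortion in [0, 1] into equal-width bins.
bins = np.linspace(0.0, 1.0, num=10)

def state_index(total_distortion: float) -> int:
    """Map a continuous distortion value to a discrete state bin index."""
    return int(np.digitize(total_distortion, bins))

min_dist_bin = state_index(0.4)  # e.g. MIN_DISTORTION from the example script
```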