Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions scala_helpers/MeasureDataQuality.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* Investigate various output quality measures supplied by ARX
*
*/
package examples.example_3


import base.DefaultConfiguration
import org.deidentifier.arx.Data
import postprocessor.ResultPrinter.printHandleTop

//import scala.collection.JavaConversions._
//import collection.convert.ImplicitConversionsToScala.map AsScala
import collection.JavaConverters.* // asScala
import collection.convert.ImplicitConversions.*
import java.io.File
import java.nio.charset.Charset

object MeasureDataQuality extends App {

  // Paths of the data sets produced by the Q-learning example.
  // NOTE(review): hard-coded absolute paths — consider turning these into
  // program arguments so the helper is usable outside this machine.
  private val originalDataPath: String =
    "/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-1"
  private val distortedDataPath: String =
    "/home/alex/qi3/drl_anonymity/src/examples/q_learn_distorted_sets/distorted_set_-2"

  /**
   * Load the original and the distorted data sets from disk.
   *
   * Both files are read as comma-separated values using the platform
   * default charset. The two sets are required to have the same shape
   * (rows and columns); otherwise an IllegalArgumentException is thrown.
   *
   * @return the pair (original data, distorted data)
   */
  def loadData: (Data, Data) = {

    val dataOrg: Data =
      Data.create(new File(originalDataPath), Charset.defaultCharset, ',')
    val dataDist: Data =
      Data.create(new File(distortedDataPath), Charset.defaultCharset, ',')

    // the column-by-column comparison below is only meaningful
    // when both sets have exactly the same shape
    require(dataOrg.getHandle.getNumRows == dataDist.getHandle.getNumRows)
    require(dataOrg.getHandle.getNumColumns == dataDist.getHandle.getNumColumns)

    println(s"Number of rows ${dataOrg.getHandle.getNumRows}")
    println(s"Number of cols ${dataOrg.getHandle.getNumColumns}")

    printHandleTop(handle = dataOrg.getHandle, n = 5)
    println("Done...")

    (dataOrg, dataDist)
  }

  /**
   * Compare per-column summary statistics — number of distinct values,
   * mode and (when available) min/max — between the distorted and the
   * original data set, printing one section per column.
   */
  def experiment1: Unit = {

    val (dataOrg, dataDist) = loadData

    val summaryStatsDist = dataDist.getHandle.getStatistics().getSummaryStatistics(true)
    val summaryStatsOrg = dataOrg.getHandle.getStatistics().getSummaryStatistics(true)

    for ((key, distStats) <- summaryStatsDist) {
      println(s"Column: ${key}")
      println("-----------------------Distorted/Original")

      // `get` on the underlying Java map returns null when the key is
      // absent; guard with Option instead of risking an NPE
      Option(summaryStatsOrg.get(key)) match {
        case Some(orgStats) =>
          println(s"distinctNumberOfValues ${distStats.getNumberOfDistinctValuesAsString}/${orgStats.getNumberOfDistinctValuesAsString}")
          println(s"Mode ${distStats.getModeAsString}/${orgStats.getModeAsString}")
          if (distStats.isMaxAvailable) {
            // assumes min/max availability matches in the original set — TODO confirm
            println(s"Max ${distStats.getMaxAsString}/${orgStats.getMaxAsString}")
            println(s"Min ${distStats.getMinAsString}/${orgStats.getMinAsString}")
          }
        case None =>
          println(s"Column ${key} has no statistics in the original set")
      }
    }
  }

  /**
   * Run k-anonymity on the loaded data set.
   *
   * TODO: not implemented yet — the hierarchies for the ethnicity and
   * salary columns still have to be created. NOTE(review): the method
   * name carries a typo ("Anonimity"); kept as-is so any existing
   * callers do not break.
   */
  def runKAnonimity: Unit = {

    val data = loadData

    // create the hierarchies for the ethnicity and salary columns

  }

  // execute Experiment 1
  experiment1

}
8 changes: 8 additions & 0 deletions scala_helpers/build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// sbt build definition for the Scala helper tools that inspect
// the output quality of the anonymization pipeline
name := "data_anonymizer_scala"

version := "0.1"

// Scala 3 — required for the `import pkg.*` wildcard syntax used in the helpers
scalaVersion := "3.0.2"

// ScalaTest and its supporting Scalactic library (scalatest is test-scoped)
libraryDependencies += "org.scalactic" %% "scalactic" % "3.2.10"
libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.10" % "test"
4 changes: 2 additions & 2 deletions src/algorithms/q_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:

# set the q_table for the policy
self.config.policy.q_table = self.q_table
total_dist = env.total_average_current_distortion()
total_dist = env.total_current_distortion()
while stop_criterion.continue_itr(total_dist):

if stop_criterion.iteration_counter == 12:
Expand All @@ -87,7 +87,7 @@ def play(self, env: Env, stop_criterion: Criterion) -> None:
print("{0} At state={1} with distortion={2} select action={3}".format("INFO: ", state_idx, total_dist,
action.column_name + "-" + action.action_type.name))
env.step(action=action)
total_dist = env.total_average_current_distortion()
total_dist = env.total_current_distortion()

def train(self, env: Env, **options) -> tuple:

Expand Down
20 changes: 7 additions & 13 deletions src/datasets/dataset_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def read(self, filename: Path, **options) -> None:


class PandasDSWrapper(DSWrapper[pd.DataFrame]):

"""
Simple wrapper to a pandas DataFrame object.
Facilitates various actions on the original dataset
Expand Down Expand Up @@ -60,15 +59,15 @@ def n_columns(self) -> int:
def schema(self) -> dict:
return pd.io.json.build_table_schema(self.ds)

def save_to_csv(self, filename: Path) -> None:
def save_to_csv(self, filename: Path, save_index: bool) -> None:
"""
Save the underlying dataset in a csv format
:param filename:
:return:
"""
self.ds.to_csv(filename)
self.ds.to_csv(filename, index=save_index)

def read(self, filename: Path, **options) -> None:
def read(self, filename: Path, **options) -> None:
"""
Load a data set from a file
:param filename:
Expand Down Expand Up @@ -145,14 +144,14 @@ def get_column(self, col_name: str):
return self.ds.loc[:, col_name]

def get_column_unique_values(self, col_name: str):
"""
"""
Returns the unique values for the column
:param col_name:
:return:
"""
col = self.get_column(col_name=col_name)
vals = col.values.ravel()
return pd.unique(vals)
col = self.get_column(col_name=col_name)
vals = col.values.ravel()
return pd.unique(vals)

def get_columns_types(self):
return list(self.ds.dtypes)
Expand Down Expand Up @@ -181,8 +180,3 @@ def apply_column_transform(self, column_name: str, transform: Transform) -> None
column = self.get_column(col_name=column_name)
column = transform.act(**{"data": column.values})
self.ds[transform.column_name] = column





Empty file added src/examples/__init__.py
Empty file.
66 changes: 66 additions & 0 deletions src/examples/create_hierarchies_arx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
This example shows how to create hierarchies suitable to
be loaded in the ARX tool
"""
import csv
from src.datasets.datasets_loaders import MockSubjectsLoader


def get_ethnicity_hierarchy():
    """Build the two-level generalization hierarchy for the ethnicity column.

    Each key is a raw ethnicity value; the associated list holds the
    level-1 and level-2 generalizations that ARX will load from the
    hierarchy CSV file.

    :return: dict mapping ethnicity value -> [level_1, level_2]
    """
    # A dict literal instead of repeated item assignments: the table reads
    # as data and accidental duplicate keys are easier to spot.
    return {
        "Mixed White/Asian": ["White/Asian", "Mixed"],
        "Chinese": ["Asian", "Asian"],
        "Indian": ["Asian", "Asian"],
        "Mixed White/Black African": ["White/Black", "Mixed"],
        "Black African": ["Black", "African"],
        "Asian other": ["Asian", "Other"],
        "Black other": ["Black", "Other"],
        "Mixed White/Black Caribbean": ["White/Black", "Mixed"],
        # was ["Mixed", "Mixe"] — "Mixe" looks like a typo, fixed to "Mixed"
        "Mixed other": ["Mixed", "Mixed"],
        "Arab": ["Asian", "Asian"],
        "White Irish": ["Irish", "European"],
        "Not stated": ["Not stated", "Not stated"],
        "White Gypsy/Traveller": ["White", "White"],
        "White British": ["British", "European"],
        "Bangladeshi": ["Asian", "Asian"],
        "White other": ["White", "White"],
        "Black Caribbean": ["Black", "Caribbean"],
        "Pakistani": ["Asian", "Asian"],
    }


if __name__ == '__main__':

    # specify the columns to drop on top of the loader's defaults
    drop_columns = MockSubjectsLoader.FEATURES_DROP_NAMES + ["preventative_treatment", "gender",
                                                             "education", "mutation_status"]
    MockSubjectsLoader.FEATURES_DROP_NAMES = drop_columns

    # normalize the salary column
    MockSubjectsLoader.NORMALIZED_COLUMNS = ["salary"]

    # specify the columns to use and their types
    MockSubjectsLoader.COLUMNS_TYPES = {"ethnicity": str, "salary": float, "diagnosis": int}
    ds = MockSubjectsLoader()

    ethnicity_map = get_ethnicity_hierarchy()

    # loop over the values of the ethnicity column and write one
    # hierarchy row per record for ARX to load
    # NOTE(review): repeated values produce duplicate rows in the file —
    # confirm whether ARX expects one row per unique value instead
    filename = "/home/alex/qi3/drl_anonymity/data/hierarchies/ethnicity_hierarchy.csv"
    # newline='' is required by the csv module to avoid spurious blank rows
    with open(filename, 'w', newline='') as fh:
        writer = csv.writer(fh, delimiter=",")

        ethnicity_column = ds.get_column(col_name="ethnicity").values

        for val in ethnicity_column:

            if val not in ethnicity_map:
                # include the offending value in the message (the original
                # left the {0} placeholder unformatted)
                raise ValueError("Value {0} not in ethnicity map".format(val))

            row = [val]
            row.extend(ethnicity_map[val])
            writer.writerow(row)
12 changes: 6 additions & 6 deletions src/examples/qlearning_three_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,10 @@ def get_ethinicity_hierarchy():
# create the environment
env = DiscreteStateEnvironment(env_config=env_config)
env.reset()
env.save_current_dataset(episode_index=-1)

# save the original dataset for comparison
env.save_current_dataset(episode_index=-1)
env.reset()
# save the data before distortion so that we can
# later load it on ARX
env.save_current_dataset(episode_index=-1, save_index=False)

# configuration for the Q-learner
algo_config = QLearnConfig()
Expand Down Expand Up @@ -195,7 +194,8 @@ def get_ethinicity_hierarchy():

stop_criterion = IterationControl(n_itrs=10, min_dist=MIN_DISTORTION, max_dist=MAX_DISTORTION)
agent.play(env=env, stop_criterion=stop_criterion)
env.save_current_dataset(episode_index=-2)

env.save_current_dataset(episode_index=-2, save_index=False)
print("{0} Done....".format(INFO))
print("=============================================")


16 changes: 9 additions & 7 deletions src/spaces/discrete_state_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,16 @@ def n_states(self) -> int:
def get_action(self, aidx: int) -> ActionBase:
return self.config.action_space[aidx]

def save_current_dataset(self, episode_index: int) -> None:
def save_current_dataset(self, episode_index: int, save_index: bool = False) -> None:
"""
        Save the current distorted dataset for the given episode index
:param episode_index:
:param save_index:
:return:
"""
self.distorted_data_set.save_to_csv(
filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)))
filename=Path(str(self.config.distorted_set_path) + "_" + str(episode_index)),
save_index=save_index)

def create_bins(self) -> None:
"""
Expand Down Expand Up @@ -216,15 +218,14 @@ def apply_action(self, action: ActionBase):

self.column_distances[action.column_name] = distance

def total_average_current_distortion(self) -> float:
def total_current_distortion(self) -> float:
"""
Calculates the average total distortion of the dataset
by summing over the current computed distances for each column
Calculates the current total distortion of the dataset.
:return:
"""

return self.config.distortion_calculator.total_distortion(
list(self.column_distances.values())) # float(np.mean(list(self.column_distances.values())))
list(self.column_distances.values()))

def reset(self, **options) -> TimeStep:
"""
Expand Down Expand Up @@ -270,7 +271,7 @@ def step(self, action: ActionBase) -> TimeStep:
self.apply_action(action=action)

# calculate the distortion of the dataset
current_distortion = self.total_average_current_distortion()
current_distortion = self.total_current_distortion()

# get the reward for the current distortion
reward = self.config.reward_manager.get_reward_for_state(state=current_distortion, **{"action": action})
Expand Down Expand Up @@ -312,6 +313,7 @@ def step(self, action: ActionBase) -> TimeStep:

# TODO: these modifications will cause the agent to always
# move close to transition points
# TODO: Remove the magic constants
if next_state is not None and self.current_time_step.observation is not None:
if next_state < min_dist_bin <= self.current_time_step.observation:
# the agent chose to step into the chaos again
Expand Down