From 5b461b36d28663361fde7115454d9f02b1c09a49 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 25 Jan 2022 10:21:50 +0000 Subject: [PATCH] #23 Add column normalization --- src/datasets/dataset_wrapper.py | 19 +++++++++++----- src/datasets/datasets_loaders.py | 38 +++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/datasets/dataset_wrapper.py b/src/datasets/dataset_wrapper.py index f90ed6f..758f477 100644 --- a/src/datasets/dataset_wrapper.py +++ b/src/datasets/dataset_wrapper.py @@ -40,10 +40,6 @@ def __init__(self, columns: dir) -> None: self.columns: dir = columns - # map that holds the hierarchy to be applied - # on each column in the dataset - #self.column_hierarchy = {} - @property def n_rows(self) -> int: """ @@ -91,6 +87,11 @@ def read(self, filename: Path, **options) -> None: # try to cast to the data types self.ds = change_column_types(ds=self.ds, column_types=self.columns) + if "column_normalization" in options and \ + options["column_normalization"] is not None: + for col in options["column_normalization"]: + self.normalize_column(column_name=col) + def normalize_column(self, column_name) -> None: """ Normalizes the column with the given name using the following @@ -108,7 +109,15 @@ def normalize_column(self, column_name) -> None: if data_type is not int or data_type is not float: raise InvalidDataTypeException(param_name=column_name, param_types="[int, float]") - raise NotImplementedError("Function is not implemented") + col_vals = self.get_column(col_name=column_name).values + + min_val = np.min(col_vals) + max_val = np.max(col_vals) + + for i in range(len(col_vals)): + col_vals[i] = (col_vals[i] - min_val) / (max_val - min_val) + + self.ds[column_name] = col_vals def sample_column_name(self) -> str: """ diff --git a/src/datasets/datasets_loaders.py b/src/datasets/datasets_loaders.py index 9d2acf4..3e17de4 100644 --- a/src/datasets/datasets_loaders.py +++ b/src/datasets/datasets_loaders.py @@ -1,28 +1,50 @@ +""" +Utility class that allows to load the mocksubjects.csv +dataset and perform various transformations and cleaning +on it +""" + from pathlib import Path from src.datasets.dataset_wrapper import PandasDSWrapper class MockSubjectsLoader(PandasDSWrapper): + """ + The class MockSubjectsLoader. Loads the mocksubjects.csv + """ + + # Path to the dataset file + FILENAME = Path("../../data/mocksubjects.csv") - DEFAULT_COLUMNS = {"gender": str, "ethnicity": str, "education": int, + # the assumed column types. We use this map to cast + # the types of the columns + COLUMNS_TYPES = {"gender": str, "ethnicity": str, "education": int, "salary": int, "diagnosis": int, "preventative_treatment": str, "mutation_status": int, } - FILENAME = Path("../../data/mocksubjects.csv") - + # features to drop FEATURES_DROP_NAMES = ["NHSno", "given_name", "surname", "dob"] + # Names of the columns in the dataset NAMES = ["NHSno", "given_name", "surname", "gender", "dob", "ethnicity", "education", "salary", "mutation_status", "preventative_treatment", "diagnosis"] + # option to drop NaN DROP_NA = True + # Map that holds for each column the transformations + # we want to apply for each value CHANGE_COLS_VALS = {"diagnosis": [('N', 0)]} + # list of columns to be normalized + NORMALIZED_COLUMNS = [] + def __init__(self): - super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.DEFAULT_COLUMNS) - self.read(filename=MockSubjectsLoader.FILENAME, **{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES, - "names": MockSubjectsLoader.NAMES, - "drop_na": MockSubjectsLoader.DROP_NA, - "change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS}) \ No newline at end of file + super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.COLUMNS_TYPES) + self.read(filename=MockSubjectsLoader.FILENAME, + **{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES, + "names": MockSubjectsLoader.NAMES, + "drop_na": MockSubjectsLoader.DROP_NA, + "change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS, + "column_normalization": MockSubjectsLoader.NORMALIZED_COLUMNS})