Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions src/datasets/dataset_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,6 @@ def __init__(self, columns: dir) -> None:

self.columns: dir = columns

# map that holds the hierarchy to be applied
# on each column in the dataset
#self.column_hierarchy = {}

@property
def n_rows(self) -> int:
"""
Expand Down Expand Up @@ -91,6 +87,11 @@ def read(self, filename: Path, **options) -> None:
# try to cast to the data types
self.ds = change_column_types(ds=self.ds, column_types=self.columns)

if "column_normalization" in options and \
options["column_normalization"] is not None:
for col in options["column_normalization"]:
self.normalize_column(column_name=col)

def normalize_column(self, column_name) -> None:
"""
Normalizes the column with the given name using the following
Expand All @@ -108,7 +109,15 @@ def normalize_column(self, column_name) -> None:
# Only numeric columns can be normalized. Both identity checks must
# fail before rejecting the column; joining them with `or` is true for
# every type, which made this guard raise unconditionally.
if data_type is not int and data_type is not float:
    raise InvalidDataTypeException(param_name=column_name, param_types="[int, float]")

raise NotImplementedError("Function is not implemented")
# Min-max scale the column.
# Work on a float copy: writing fractions back into an integer-typed
# array would truncate every scaled value to 0 or 1.
# NOTE(review): assumes get_column(...).values is array-like with
# an astype method (e.g. a NumPy array) -- confirm.
col_vals = self.get_column(col_name=column_name).values.astype(float)

min_val = np.min(col_vals)
max_val = np.max(col_vals)
value_range = max_val - min_val

if value_range == 0:
    # Constant column: (x - min) / (max - min) would be 0 / 0,
    # so map every entry to 0.0 instead of producing NaN.
    col_vals = np.zeros_like(col_vals)
else:
    # Vectorized scaling; replaces the element-by-element loop.
    col_vals = (col_vals - min_val) / value_range

self.ds[column_name] = col_vals

def sample_column_name(self) -> str:
"""
Expand Down
38 changes: 30 additions & 8 deletions src/datasets/datasets_loaders.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,50 @@
"""
Utility class that loads the mocksubjects.csv
dataset and performs various transformations and cleaning
on it
"""

from pathlib import Path
from src.datasets.dataset_wrapper import PandasDSWrapper


class MockSubjectsLoader(PandasDSWrapper):
"""
The class MockSubjectsLoader. Loads the mocksubjects.csv dataset.
"""

# Path to the dataset file
FILENAME = Path("../../data/mocksubjects.csv")

DEFAULT_COLUMNS = {"gender": str, "ethnicity": str, "education": int,
# the assumed column types. We use this map to cast
# the types of the columns
COLUMNS_TYPES = {"gender": str, "ethnicity": str, "education": int,
"salary": int, "diagnosis": int, "preventative_treatment": str,
"mutation_status": int, }

FILENAME = Path("../../data/mocksubjects.csv")

# features to drop
FEATURES_DROP_NAMES = ["NHSno", "given_name", "surname", "dob"]

# Names of the columns in the dataset
NAMES = ["NHSno", "given_name", "surname", "gender",
"dob", "ethnicity", "education", "salary",
"mutation_status", "preventative_treatment", "diagnosis"]

# option to drop NaN
DROP_NA = True

# Map that holds for each column the transformations
# we want to apply for each value
CHANGE_COLS_VALS = {"diagnosis": [('N', 0)]}

# list of columns to be normalized
NORMALIZED_COLUMNS = []

def __init__(self):
super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.DEFAULT_COLUMNS)
self.read(filename=MockSubjectsLoader.FILENAME, **{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES,
"names": MockSubjectsLoader.NAMES,
"drop_na": MockSubjectsLoader.DROP_NA,
"change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS})
super(MockSubjectsLoader, self).__init__(columns=MockSubjectsLoader.COLUMNS_TYPES)
self.read(filename=MockSubjectsLoader.FILENAME,
**{"features_drop_names": MockSubjectsLoader.FEATURES_DROP_NAMES,
"names": MockSubjectsLoader.NAMES,
"drop_na": MockSubjectsLoader.DROP_NA,
"change_col_vals": MockSubjectsLoader.CHANGE_COLS_VALS,
"column_normalization": MockSubjectsLoader.NORMALIZED_COLUMNS})