Skip to content

Commit

Permalink
feat(clean): add clean_ml function
Browse files Browse the repository at this point in the history
  • Loading branch information
qidanrui committed Sep 20, 2021
1 parent 26f4f9e commit 909cd19
Show file tree
Hide file tree
Showing 27 changed files with 2,236 additions and 0 deletions.
581 changes: 581 additions & 0 deletions dataprep/clean/clean_ml.py

Large diffs are not rendered by default.

Binary file added dataprep/clean/components/.DS_Store
Binary file not shown.
17 changes: 17 additions & 0 deletions dataprep/clean/components/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Initialize component dictionary.
"""

from .cat_encoder import CatEncoder
from .cat_imputer import CatImputer
from .num_imputer import NumImputer
from .num_scaler import NumScaler
from .variance_thresholder import VarianceThresholder

# Registry mapping a pipeline-stage name (as referenced in pipeline
# configuration dicts such as `cat_pipe_info`) to the component class
# implementing that stage.
component_dic = {
    "cat_encoding": CatEncoder,
    "cat_imputation": CatImputer,
    "num_imputation": NumImputer,
    "num_scaling": NumScaler,
    "variance_threshold": VarianceThresholder,
}
80 changes: 80 additions & 0 deletions dataprep/clean/components/cat_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Implement categorical encoder component.
"""

from typing import Any, Tuple, Dict
import dask.dataframe as dd

from .cat_encoding import operator_dic


class CatEncoder:

    """Encode categorical columns with a configurable encoding strategy.
    Attributes:
        encode_type
            Name of the categorical encoder (set only when configured by name)
        encoder
            Encoder object that performs the actual encoding
    """

    def __init__(self, cat_pipe_info: Dict[str, Any]) -> None:
        """
        Initialize the categorical encoder.
        Parameters
        ----------
        cat_pipe_info
            Information of the pipeline managing categorical columns:
            the arrangement of components, names of operators and other
            settings (such as the filling value used for imputation).
        """

        encoding = cat_pipe_info["cat_encoding"]
        if isinstance(encoding, str):
            # Configured by name: look the encoder class up in the registry.
            self.encode_type = encoding
            self.encoder = operator_dic[encoding]()
        else:
            # Otherwise treat the entry as a user-provided encoder class
            # and instantiate it directly.
            self.encoder = encoding()

    def fit(self, col_df: dd.Series) -> Any:
        """
        Fit the underlying encoder on the provided column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        self.encoder.fit(col_df)
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Transform the provided data column with the previously fitted encoder.
        Parameters
        ----------
        col_df
            Provided data column.
        """
        return self.encoder.transform(col_df)

    def fit_transform(
        self, training_df: dd.Series, test_df: dd.Series
    ) -> Tuple[dd.Series, dd.Series]:
        """
        Fit the encoder on the training column, then transform both the
        training column and the test column with the fitted parameters.
        Parameters
        ----------
        training_df
            Training data column.
        test_df
            Test data column.
        """
        self.encoder.fit(training_df)
        encoded_train = self.encoder.transform(training_df)
        encoded_test = self.encoder.transform(test_df)
        return encoded_train, encoded_test
9 changes: 9 additions & 0 deletions dataprep/clean/components/cat_encoding/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Initialize dictionary of categorical encoders.
"""

from .one_hot_encoding import OneHotEncoder

# Registry mapping a categorical-encoder name to the class that implements it.
operator_dic = {
    "one_hot": OneHotEncoder,
}
83 changes: 83 additions & 0 deletions dataprep/clean/components/cat_encoding/one_hot_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Implement one-hot encoder.
"""

from typing import Any, List
import dask.dataframe as dd
import numpy as np


class OneHotEncoder:
    """One-hot encoder for encoding categorical values
    Attributes:
        name
            Name of encoder
        unique_list
            Unique categorical values in provided data columns
        unique_num
            Number of unique categorical values in provided data columns
    """

    def __init__(self) -> None:
        """
        This function initiates the one-hot encoder.
        """

        self.name = "OneHotEncoder"
        self.unique_list = np.zeros(1)
        self.unique_num = 0
        # Value -> position lookup used by `compute_val`. Initialized to
        # mirror the initial `unique_list` (a single 0.0) so the behavior
        # of an unfitted encoder is unchanged.
        self._index_of: Dict[Any, int] = {0.0: 0}

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Extract unique categorical values for one-hot encoder according to the provided column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        # Deduplicate once (the original called `drop_duplicates()` twice,
        # once for the values and once for the count).
        uniques = col_df.drop_duplicates()
        self.unique_list = uniques.values
        self.unique_num = uniques.count()
        # Cache an O(1) value -> index map so `compute_val` does not
        # re-materialize the list and linearly scan it for every cell.
        # NOTE(review): assumes `.values.tolist()` works here, exactly as the
        # original `compute_val` assumed — confirm for lazy (dask) inputs.
        self._index_of = {val: idx for idx, val in enumerate(self.unique_list.tolist())}
        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Transform the provided data column with the extracted unique values.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        result = col_df.map(self.compute_val)
        return result

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Extract unique categorical values for one-hot encoder according to the data column.
        Transform the data column with the extracted unique values.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def compute_val(self, val: str) -> List[float]:
        """
        Compute one-hot encoding of provided value.
        Parameters
        ----------
        val
            Value that should be transformed into its one-hot encoding.
        Raises
        ------
        ValueError
            If `val` was not among the values seen during `fit`
            (same failure mode as the original `list.index` lookup).
        """
        temp_result = np.zeros(len(self.unique_list))
        try:
            idx = self._index_of[val]
        except KeyError:
            raise ValueError(f"{val!r} is not in the fitted unique values") from None
        temp_result[idx] = 1
        result: List[float] = temp_result.tolist()
        return result
Binary file not shown.
13 changes: 13 additions & 0 deletions dataprep/clean/components/cat_imputation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
Initialize dictionary of categorical imputers.
"""

from .constant_imputer import ConstantImputer
from .most_frequent_imputer import MostFrequentImputer
from .drop_imputer import DropImputer

# Registry mapping a categorical-imputation strategy name to the class
# that implements it.
operator_dic = {
    "constant": ConstantImputer,
    "most_frequent": MostFrequentImputer,
    "drop": DropImputer,
}
90 changes: 90 additions & 0 deletions dataprep/clean/components/cat_imputation/constant_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""
Implement categorical constant imputer.
"""

# pylint: disable=unused-argument
from typing import Any, List, Optional
import dask.dataframe as dd


class ConstantImputer:
    """Constant imputer for imputing categorical values
    Attributes:
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values, the default value is "Missing"
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        This function initiates the constant imputer.
        Parameters
        ----------
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values. An empty string means
            "use the default", which is "Missing".
        """

        self.null_values = null_values
        # BUG FIX: the original tested `len(fill_value) == ""` — an int
        # compared to a str is never equal, so the documented default
        # "Missing" was never applied and an empty fill value leaked through.
        if fill_value == "":
            self.fill_value = "Missing"
        else:
            self.fill_value = fill_value

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Constant imputer doesn't need to fit any parameter.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Impute the provided data column with the fitted parameters.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        result = col_df.map(self.fillna)
        return result

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Impute the data column with constant value.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def fillna(self, val: str) -> str:
        """
        Return the constant fill value if `val` is a recognized null value,
        otherwise return `val` unchanged.
        Parameters
        ----------
        val
            Each value in dask's Series
        """

        if self.null_values is not None and val in self.null_values:
            return self.fill_value
        return val
96 changes: 96 additions & 0 deletions dataprep/clean/components/cat_imputation/drop_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Implement categorical drop imputer.
"""

from typing import Any, List, Optional
import dask.dataframe as dd
from dask.dataframe import from_pandas
import pandas as pd


class DropImputer:
    """Drop column with missing values
    Attributes:
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values (kept for interface
            compatibility; a drop imputer never fills values)
        isdrop
            Whether the fitted column should be dropped
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        This function initiates the drop imputer.
        Parameters
        ----------
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values.
        """

        self.null_values = null_values
        self.fill_value = fill_value
        self.isdrop = False

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Check if the provided column needs to be dropped: it does as soon
        as any of its values is a recognized null value.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        # `.any()` avoids materializing the whole `.values` array just to
        # run a membership test on it.
        self.isdrop = bool(col_df.map(self.check_isdrop).any())
        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Check the value of isdrop.
        If yes, then drop this column.
        If no, then return the original column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        if not self.isdrop:
            return col_df
        # A dropped column is represented by an empty series.
        # NOTE(review): `pd.Series([])` relies on pandas' default empty
        # dtype — confirm callers don't depend on a specific dtype.
        return from_pandas(pd.Series([]), npartitions=2)

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Check if the provided column needs to be dropped.
        If yes, then drop this column.
        If no, then return the original column.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def check_isdrop(self, val: str) -> bool:
        """
        Return True if `val` is one of the recognized null values
        (meaning the whole column should be dropped).
        Parameters
        ----------
        val
            Current value that needs to be checked.
        """

        return self.null_values is not None and val in self.null_values

1 comment on commit 909cd19

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

Benchmark suite Current: 909cd19 Previous: 26f4f9e Ratio
dataprep/tests/benchmarks/eda.py::test_create_report 0.15902619910557225 iter/sec (stddev: 0.11187225798616067) 0.1678442287550567 iter/sec (stddev: 0.1352877254045239) 1.06

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.