-
Notifications
You must be signed in to change notification settings - Fork 200
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
27 changed files
with
2,236 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
""" | ||
Initialize component dictionary. | ||
""" | ||
|
||
from .cat_encoder import CatEncoder | ||
from .cat_imputer import CatImputer | ||
from .num_imputer import NumImputer | ||
from .num_scaler import NumScaler | ||
from .variance_thresholder import VarianceThresholder | ||
|
||
# Registry mapping each pipeline component name to the class implementing it.
component_dic = {
    # Components applied to categorical columns.
    "cat_encoding": CatEncoder,
    "cat_imputation": CatImputer,
    # Components applied to numerical columns.
    "num_imputation": NumImputer,
    "num_scaling": NumScaler,
    # Component registered under the "variance_threshold" name.
    "variance_threshold": VarianceThresholder,
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
""" | ||
Implement categorical encoder component. | ||
""" | ||
|
||
from typing import Any, Tuple, Dict | ||
import dask.dataframe as dd | ||
|
||
from .cat_encoding import operator_dic | ||
|
||
|
||
class CatEncoder:
    """Categorical encoder for encoding categorical columns.

    Attributes
    ----------
    encode_type
        Name of categorical encoder.
    encoder
        Encoder object.
    """

    def __init__(self, cat_pipe_info: Dict[str, Any]) -> None:
        """
        Initialize the categorical encoder.

        Parameters
        ----------
        cat_pipe_info
            Information of pipeline managing categorical columns,
            including the arrangement of components, name of operators
            and other information such as the filling value for imputation.
            Only the ``"cat_encoding"`` entry is read here: either the name
            of a registered encoder (looked up in ``operator_dic``) or a
            callable (typically an encoder class) that builds the encoder
            when called without arguments.
        """
        encoding = cat_pipe_info["cat_encoding"]
        if isinstance(encoding, str):
            self.encode_type = encoding
            self.encoder = operator_dic[encoding]()
        else:
            # A custom encoder was supplied. Record a name for it as well so
            # that ``encode_type`` exists on every instance, not only on the
            # ones built from a registered name.
            self.encode_type = getattr(encoding, "__name__", repr(encoding))
            self.encoder = encoding()

    def fit(self, col_df: dd.Series) -> Any:
        """
        Fit the parameters for encoder according to the provided column.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        This ``CatEncoder`` instance, so calls can be chained.
        """
        self.encoder.fit(col_df)
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Transform the provided data column with the fitted parameters.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        Whatever the wrapped encoder's ``transform`` returns for ``col_df``.
        """
        return self.encoder.transform(col_df)

    def fit_transform(
        self, training_df: dd.Series, test_df: dd.Series
    ) -> Tuple[dd.Series, dd.Series]:
        """
        Fit the parameters for encoder according to the training data column.
        Transform training data column and test data column with fitted parameters.

        Parameters
        ----------
        training_df
            Training data column.
        test_df
            Test data column.

        Returns
        -------
        A ``(transformed_training, transformed_test)`` tuple.
        """
        # Fit on the training column only; the test column is encoded with
        # the parameters learned from the training column.
        self.fit(training_df)
        return self.transform(training_df), self.transform(test_df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
""" | ||
Initialize dictionary of categorical encoders. | ||
""" | ||
|
||
from .one_hot_encoding import OneHotEncoder | ||
|
||
# Registry mapping the name of each categorical encoder to its class.
operator_dic = {
    "one_hot": OneHotEncoder,
}
83 changes: 83 additions & 0 deletions
83
dataprep/clean/components/cat_encoding/one_hot_encoding.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
""" | ||
Implement one-hot encoder. | ||
""" | ||
|
||
from typing import Any, List | ||
import dask.dataframe as dd | ||
import numpy as np | ||
|
||
|
||
class OneHotEncoder:
    """One-hot encoder for encoding categorical values.

    Attributes
    ----------
    name
        Name of encoder.
    unique_list
        Unique categorical values in provided data columns.
    unique_num
        Number of unique categorical values in provided data columns.
    """

    def __init__(self) -> None:
        """
        Initialize the one-hot encoder in its unfitted state.
        """
        self.name = "OneHotEncoder"
        self.unique_list = np.zeros(1)
        self.unique_num = 0

    def fit(self, col_df: dd.Series) -> Any:
        """
        Extract unique categorical values for one-hot encoder according to the provided column.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        This encoder instance, so calls can be chained.
        """
        # De-duplicate once and derive both attributes from the same result.
        uniques = col_df.drop_duplicates()
        self.unique_list = uniques.values
        self.unique_num = uniques.count()
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Transform the provided data column with the extracted unique values.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        The column with every value mapped through ``compute_val``.
        """
        return col_df.map(self.compute_val)

    def fit_transform(self, col_df: dd.Series) -> dd.Series:
        """
        Extract unique categorical values for one-hot encoder according to the data column.
        Transform the data column with the extracted unique values.

        Parameters
        ----------
        col_df
            Data column.
        """
        return self.fit(col_df).transform(col_df)

    def compute_val(self, val: str) -> List[float]:
        """
        Compute one-hot encoding of provided value.

        Parameters
        ----------
        val
            Value to be converted to its one-hot encoding.

        Returns
        -------
        A list with one entry per value in ``unique_list``: 1.0 at the
        position of ``val`` and 0.0 everywhere else.

        Raises
        ------
        ValueError
            If ``val`` is not one of the values in ``unique_list``.
        """
        categories = self.unique_list.tolist()
        try:
            idx = categories.index(val)
        except ValueError as err:
            # Same exception type as the bare ``list.index`` failure, but
            # the message names the offending value.
            raise ValueError(
                f"Value {val!r} is not among the values seen when fitting the one-hot encoder."
            ) from err
        one_hot = np.zeros(len(categories))
        one_hot[idx] = 1
        result: List[float] = one_hot.tolist()
        return result
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
""" | ||
Initialize dictionary of categorical imputers. | ||
""" | ||
|
||
from .constant_imputer import ConstantImputer | ||
from .most_frequent_imputer import MostFrequentImputer | ||
from .drop_imputer import DropImputer | ||
|
||
# Registry mapping the name of each categorical imputation strategy to its class.
operator_dic = {
    "constant": ConstantImputer,
    "most_frequent": MostFrequentImputer,
    "drop": DropImputer,
}
90 changes: 90 additions & 0 deletions
90
dataprep/clean/components/cat_imputation/constant_imputer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
""" | ||
Implement categorical constant imputer. | ||
""" | ||
|
||
# pylint: disable=unused-argument | ||
from typing import Any, List, Optional | ||
import dask.dataframe as dd | ||
|
||
|
||
class ConstantImputer:
    """Constant imputer for imputing categorical values.

    Attributes
    ----------
    null_values
        Specified null values which should be recognized.
    fill_value
        Value used for imputing missing values, the default value is "Missing".
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        Initialize the constant imputer.

        Parameters
        ----------
        null_values
            Specified null values which should be recognized.
            ``None`` means no value is treated as null.
        fill_value
            Value used for imputing missing values. An empty string means
            "not provided" and is replaced by ``"Missing"``.
        """
        self.null_values = null_values
        # The previous check compared ``len(fill_value)`` with ``""``, which
        # is never true, so the documented default was never applied.
        self.fill_value = fill_value if fill_value else "Missing"

    def fit(self, col_df: dd.Series) -> Any:
        """
        Constant imputer doesn't need to fit any parameter.

        Parameters
        ----------
        col_df
            Provided data column (unused).

        Returns
        -------
        This imputer instance, so calls can be chained.
        """
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Impute the provided data column with the constant fill value.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        The column with every value mapped through ``fillna``.
        """
        return col_df.map(self.fillna)

    def fit_transform(self, col_df: dd.Series) -> dd.Series:
        """
        Impute the data column with constant value.

        Parameters
        ----------
        col_df
            Data column.
        """
        return self.fit(col_df).transform(col_df)

    def fillna(self, val: str) -> str:
        """
        Check if the value is in the list of null values.
        If yes, return the constant fill value.
        If no, just return the value.

        Parameters
        ----------
        val
            Each value in the Series being imputed.
        """
        if self.null_values is not None and val in self.null_values:
            return self.fill_value
        return val
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
""" | ||
Implement categorical drop imputer. | ||
""" | ||
|
||
from typing import Any, List, Optional | ||
import dask.dataframe as dd | ||
from dask.dataframe import from_pandas | ||
import pandas as pd | ||
|
||
|
||
class DropImputer:
    """Drop column with missing values.

    Attributes
    ----------
    null_values
        Specified null values which should be recognized.
    fill_value
        Value used for imputing missing values (stored but not used by this imputer).
    isdrop
        Whether the most recently fitted column contains a null value.
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        Initialize the drop imputer.

        Parameters
        ----------
        null_values
            Specified null values which should be recognized.
            ``None`` means no value is treated as null.
        fill_value
            Value used for imputing missing values.
        """
        self.null_values = null_values
        self.fill_value = fill_value
        self.isdrop = False

    def fit(self, col_df: dd.Series) -> Any:
        """
        Check if the provided column needs to be dropped.
        If any value of the column is one of the null values,
        then the column should be dropped.

        Parameters
        ----------
        col_df
            Provided data column.

        Returns
        -------
        This imputer instance, so calls can be chained.
        """
        if self.null_values is None:
            # No value can be recognized as null, so skip scanning the column.
            self.isdrop = False
        else:
            self.isdrop = True in col_df.map(self.check_isdrop).values
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Check the value of isdrop.
        If it is set, drop this column by returning an empty series.
        If not, return the original column.

        Parameters
        ----------
        col_df
            Provided data column.
        """
        if not self.isdrop:
            return col_df
        # Give the empty placeholder an explicit dtype: relying on the
        # implicit default for an empty Series is deprecated in pandas 1.x.
        return from_pandas(pd.Series([], dtype="object"), npartitions=2)

    def fit_transform(self, col_df: dd.Series) -> dd.Series:
        """
        Check if the provided column needs to be dropped.
        If yes, then drop this column.
        If no, then return the original column.

        Parameters
        ----------
        col_df
            Data column.
        """
        return self.fit(col_df).transform(col_df)

    def check_isdrop(self, val: str) -> bool:
        """
        Check if the value is one of the configured null values.

        Parameters
        ----------
        val
            Current value that needs to be checked.

        Returns
        -------
        True when ``null_values`` is set and contains ``val``, otherwise False.
        """
        return self.null_values is not None and val in self.null_values
Oops, something went wrong.
909cd19
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DataPrep.EDA Benchmarks
| dataprep/tests/benchmarks/eda.py::test_create_report | 0.15902619910557225 iter/sec (stddev: 0.11187225798616067) | 0.1678442287550567 iter/sec (stddev: 0.1352877254045239) | 1.06 |
This comment was automatically generated by workflow using github-action-benchmark.