Skip to content

Commit

Permalink
feat(clean): add clean_ml function
Browse files Browse the repository at this point in the history
  • Loading branch information
qidanrui committed Sep 20, 2021
1 parent 26f4f9e commit 909cd19
Show file tree
Hide file tree
Showing 27 changed files with 2,236 additions and 0 deletions.
581 changes: 581 additions & 0 deletions dataprep/clean/clean_ml.py

Large diffs are not rendered by default.

Binary file added dataprep/clean/components/.DS_Store
Binary file not shown.
17 changes: 17 additions & 0 deletions dataprep/clean/components/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Initialize component dictionary.
"""

from .cat_encoder import CatEncoder
from .cat_imputer import CatImputer
from .num_imputer import NumImputer
from .num_scaler import NumScaler
from .variance_thresholder import VarianceThresholder

# Registry mapping a pipeline-stage name (as referenced in pipeline
# configuration dicts such as `cat_pipe_info`) to the component class
# implementing that stage.
component_dic = {
    "cat_encoding": CatEncoder,
    "cat_imputation": CatImputer,
    "num_imputation": NumImputer,
    "num_scaling": NumScaler,
    "variance_threshold": VarianceThresholder,
}
80 changes: 80 additions & 0 deletions dataprep/clean/components/cat_encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Implement categorical encoder component.
"""

from typing import Any, Tuple, Dict
import dask.dataframe as dd

from .cat_encoding import operator_dic


class CatEncoder:

    """Encode categorical columns with a configurable encoding strategy.
    Attributes:
        encode_type
            Name of the categorical encoder (set only when configured by name)
        encoder
            Encoder object that performs the actual encoding
    """

    def __init__(self, cat_pipe_info: Dict[str, Any]) -> None:
        """
        Initialize the categorical encoder.
        Parameters
        ----------
        cat_pipe_info
            Information of the pipeline managing categorical columns:
            the arrangement of components, names of operators and other
            settings (such as the filling value used for imputation).
        """

        encoding = cat_pipe_info["cat_encoding"]
        if isinstance(encoding, str):
            # Configured by name: look the encoder class up in the registry.
            self.encode_type = encoding
            self.encoder = operator_dic[encoding]()
        else:
            # Otherwise treat the entry as a user-provided encoder class
            # and instantiate it directly.
            self.encoder = encoding()

    def fit(self, col_df: dd.Series) -> Any:
        """
        Fit the underlying encoder on the provided column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        self.encoder.fit(col_df)
        return self

    def transform(self, col_df: dd.Series) -> dd.Series:
        """
        Transform the provided data column with the previously fitted encoder.
        Parameters
        ----------
        col_df
            Provided data column.
        """
        return self.encoder.transform(col_df)

    def fit_transform(
        self, training_df: dd.Series, test_df: dd.Series
    ) -> Tuple[dd.Series, dd.Series]:
        """
        Fit the encoder on the training column, then transform both the
        training column and the test column with the fitted parameters.
        Parameters
        ----------
        training_df
            Training data column.
        test_df
            Test data column.
        """
        self.encoder.fit(training_df)
        encoded_train = self.encoder.transform(training_df)
        encoded_test = self.encoder.transform(test_df)
        return encoded_train, encoded_test
9 changes: 9 additions & 0 deletions dataprep/clean/components/cat_encoding/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Initialize dictionary of categorical encoders.
"""

from .one_hot_encoding import OneHotEncoder

# Registry mapping a categorical-encoder name to the class that implements it.
operator_dic = {
    "one_hot": OneHotEncoder,
}
83 changes: 83 additions & 0 deletions dataprep/clean/components/cat_encoding/one_hot_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Implement one-hot encoder.
"""

from typing import Any, List
import dask.dataframe as dd
import numpy as np


class OneHotEncoder:
    """One-hot encoder for encoding categorical values
    Attributes:
        name
            Name of encoder
        unique_list
            Unique categorical values in provided data columns
        unique_num
            Number of unique categorical values in provided data columns
    """

    def __init__(self) -> None:
        """
        This function initiates the one-hot encoder.
        """

        self.name = "OneHotEncoder"
        self.unique_list = np.zeros(1)
        self.unique_num = 0
        # Value -> position lookup used by `compute_val`. Initialized to
        # mirror the initial `unique_list` (a single 0.0) so the behavior
        # of an unfitted encoder is unchanged.
        self._index_of: Dict[Any, int] = {0.0: 0}

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Extract unique categorical values for one-hot encoder according to the provided column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        # Deduplicate once (the original called `drop_duplicates()` twice,
        # once for the values and once for the count).
        uniques = col_df.drop_duplicates()
        self.unique_list = uniques.values
        self.unique_num = uniques.count()
        # Cache an O(1) value -> index map so `compute_val` does not
        # re-materialize the list and linearly scan it for every cell.
        # NOTE(review): assumes `.values.tolist()` works here, exactly as the
        # original `compute_val` assumed — confirm for lazy (dask) inputs.
        self._index_of = {val: idx for idx, val in enumerate(self.unique_list.tolist())}
        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Transform the provided data column with the extracted unique values.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        result = col_df.map(self.compute_val)
        return result

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Extract unique categorical values for one-hot encoder according to the data column.
        Transform the data column with the extracted unique values.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def compute_val(self, val: str) -> List[float]:
        """
        Compute one-hot encoding of provided value.
        Parameters
        ----------
        val
            Value that should be transformed into its one-hot encoding.
        Raises
        ------
        ValueError
            If `val` was not among the values seen during `fit`
            (same failure mode as the original `list.index` lookup).
        """
        temp_result = np.zeros(len(self.unique_list))
        try:
            idx = self._index_of[val]
        except KeyError:
            raise ValueError(f"{val!r} is not in the fitted unique values") from None
        temp_result[idx] = 1
        result: List[float] = temp_result.tolist()
        return result
Binary file not shown.
13 changes: 13 additions & 0 deletions dataprep/clean/components/cat_imputation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
Initialize dictionary of categorical imputers.
"""

from .constant_imputer import ConstantImputer
from .most_frequent_imputer import MostFrequentImputer
from .drop_imputer import DropImputer

# Registry mapping a categorical-imputation strategy name to the class
# that implements it.
operator_dic = {
    "constant": ConstantImputer,
    "most_frequent": MostFrequentImputer,
    "drop": DropImputer,
}
90 changes: 90 additions & 0 deletions dataprep/clean/components/cat_imputation/constant_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""
Implement categorical constant imputer.
"""

# pylint: disable=unused-argument
from typing import Any, List, Optional
import dask.dataframe as dd


class ConstantImputer:
    """Constant imputer for imputing categorical values
    Attributes:
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values, the default value is "Missing"
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        This function initiates the constant imputer.
        Parameters
        ----------
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values. An empty string means
            "use the default", which is "Missing".
        """

        self.null_values = null_values
        # BUG FIX: the original tested `len(fill_value) == ""` — an int
        # compared to a str is never equal, so the documented default
        # "Missing" was never applied and an empty fill value leaked through.
        if fill_value == "":
            self.fill_value = "Missing"
        else:
            self.fill_value = fill_value

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Constant imputer doesn't need to fit any parameter.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Impute the provided data column with the fitted parameters.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        result = col_df.map(self.fillna)
        return result

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Impute the data column with constant value.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def fillna(self, val: str) -> str:
        """
        Return the constant fill value if `val` is a recognized null value,
        otherwise return `val` unchanged.
        Parameters
        ----------
        val
            Each value in dask's Series
        """

        if self.null_values is not None and val in self.null_values:
            return self.fill_value
        return val
96 changes: 96 additions & 0 deletions dataprep/clean/components/cat_imputation/drop_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Implement categorical drop imputer.
"""

from typing import Any, List, Optional
import dask.dataframe as dd
from dask.dataframe import from_pandas
import pandas as pd


class DropImputer:
    """Drop column with missing values
    Attributes:
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values (kept for interface
            compatibility; a drop imputer never fills values)
        isdrop
            Whether the fitted column should be dropped
    """

    def __init__(self, null_values: Optional[List[Any]], fill_value: str = "") -> None:
        """
        This function initiates the drop imputer.
        Parameters
        ----------
        null_values
            Specified null values which should be recognized
        fill_value
            Value used for imputing missing values.
        """

        self.null_values = null_values
        self.fill_value = fill_value
        self.isdrop = False

    def fit(self, col_df: "dd.Series") -> Any:
        """
        Check if the provided column needs to be dropped: it does as soon
        as any of its values is a recognized null value.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        # `.any()` avoids materializing the whole `.values` array just to
        # run a membership test on it.
        self.isdrop = bool(col_df.map(self.check_isdrop).any())
        return self

    def transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Check the value of isdrop.
        If yes, then drop this column.
        If no, then return the original column.
        Parameters
        ----------
        col_df
            Provided data column.
        """

        if not self.isdrop:
            return col_df
        # A dropped column is represented by an empty series.
        # NOTE(review): `pd.Series([])` relies on pandas' default empty
        # dtype — confirm callers don't depend on a specific dtype.
        return from_pandas(pd.Series([]), npartitions=2)

    def fit_transform(self, col_df: "dd.Series") -> "dd.Series":
        """
        Check if the provided column needs to be dropped.
        If yes, then drop this column.
        If no, then return the original column.
        Parameters
        ----------
        col_df
            Data column.
        """

        return self.fit(col_df).transform(col_df)

    def check_isdrop(self, val: str) -> bool:
        """
        Return True if `val` is one of the recognized null values
        (meaning the whole column should be dropped).
        Parameters
        ----------
        val
            Current value that needs to be checked.
        """

        return self.null_values is not None and val in self.null_values

1 comment on commit 909cd19

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DataPrep.EDA Benchmarks

Benchmark suite Current: 909cd19 Previous: 26f4f9e Ratio
dataprep/tests/benchmarks/eda.py::test_create_report 0.15902619910557225 iter/sec (stddev: 0.11187225798616067) 0.1678442287550567 iter/sec (stddev: 0.1352877254045239) 1.06

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.