From 9990a6c773b99e2ccac47441e39cda97db04bbca Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 29 May 2022 14:39:28 +0000 Subject: [PATCH 01/20] Classification docstring consistency --- pycaret/classification/functional.py | 116 ++------- pycaret/classification/oop.py | 222 +++++++++--------- .../pycaret_experiment/pycaret_experiment.py | 33 +++ .../supervised_experiment.py | 28 +++ .../pycaret_experiment/tabular_experiment.py | 109 ++++++++- 5 files changed, 302 insertions(+), 206 deletions(-) diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py index f4c81f409..ff2e5f8e8 100644 --- a/pycaret/classification/functional.py +++ b/pycaret/classification/functional.py @@ -620,7 +620,6 @@ def compare_models( experiment_custom_tags: Optional[Dict[str, Any]] = None, probability_threshold: Optional[float] = None, verbose: bool = True, - # parallel: Optional[ParallelBackend] = None, ) -> Union[Any, List[Any]]: """ @@ -731,18 +730,6 @@ def compare_models( - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False. """ - # params = dict(locals()) - parallel = None - if parallel is not None: - global _pycaret_setup_call - parallel.attach(_pycaret_setup_call["func"], _pycaret_setup_call["params"]) - if params.get("include", None) is None: - _models = models() - if turbo: - _models = _models[_models.Turbo] - params["include"] = _models.index.tolist() - del params["parallel"] - return parallel.compare_models(compare_models, params) return _CURRENT_EXPERIMENT.compare_models( include=include, @@ -1727,7 +1714,8 @@ def interpret_model( """ This function analyzes the predictions generated from a trained model. Most plots in this function are implemented based on the SHAP (SHapley Additive exPlanations). - For more info on this, please see https://shap.readthedocs.io/en/latest/ + For more info on this, please see https://shap.readthedocs.io/en/latest/. + For more info on Partial Dependence Plot see https://github.com/SauceCat/PDPbox. Example @@ -1834,7 +1822,10 @@ def calibrate_model( or logistic regression. The output of this function is a score grid with CV scores by fold. Metrics evaluated during CV can be accessed using the ``get_metrics`` function. Custom metrics can be added or removed using - ``add_metric`` and ``remove_metric`` function. + ``add_metric`` and ``remove_metric`` function. The ouput of the original estimator + and the calibrated estimator (created using this function) might not differ much. + In order to see the calibration differences, use 'calibration' plot in ``plot_model`` + to see the difference before and after. Example @@ -2544,7 +2535,7 @@ def add_metric( ) -> pd.Series: """ - Adds a custom metric to be used for CV. + Adds a custom metric to be used in the experiment. Example @@ -2609,7 +2600,7 @@ def add_metric( def remove_metric(name_or_id: str): """ - Removes a metric from CV. + Removes a metric from experiment. Example @@ -2670,48 +2661,17 @@ def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.Da def get_config(variable: str): """ - This function retrieves the global variables created when initializing the - ``setup`` function. Following variables are accessible: - - - dataset: Transformed dataset - - train: Transformed training set - - test: Transformed test set - - X: Transformed feature set - - y: Transformed target column - - X_train, X_test, y_train, y_test: Subsets of the train and test sets. 
- - seed: random state set through session_id - - pipeline: Transformation pipeline configured through setup - - fold_shuffle_param: shuffle parameter used in Kfolds - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment - - logging_param: log_experiment param - - log_plots_param: log_plots param - - USI: Unique session ID parameter - - fix_imbalance_param: fix_imbalance param - - fix_imbalance_method_param: fix_imbalance_method param - - data_before_preprocess: data before preprocessing - - target_param: name of target variable - - gpu_param: use_gpu param configured through setup - - fold_generator: CV splitter configured in fold_strategy - - fold_param: fold params defined in the setup - - fold_groups_param: fold groups defined in the setup - - stratify_param: stratify parameter defined in the setup - + This function is used to access global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> juice = get_data('juice') - >>> from pycaret.classification import * - >>> exp_name = setup(data = juice, target = 'Purchase') >>> X_train = get_config('X_train') + This will return X_train transformed dataset. - Returns: - Global variable + Returns + ------- + variable """ @@ -2722,50 +2682,15 @@ def get_config(variable: str): def set_config(variable: str, value): """ - This function resets the global variables. Following variables are - accessible: - - - X: Transformed dataset (X) - - y: Transformed dataset (y) - - X_train: Transformed train dataset (X) - - X_test: Transformed test/holdout dataset (X) - - y_train: Transformed train dataset (y) - - y_test: Transformed test/holdout dataset (y) - - seed: random state set through session_id - - prep_pipe: Transformation pipeline - - fold_shuffle_param: shuffle parameter used in Kfolds - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment - - logging_param: log_experiment param - - log_plots_param: log_plots param - - USI: Unique session ID parameter - - fix_imbalance_param: fix_imbalance param - - fix_imbalance_method_param: fix_imbalance_method param - - data_before_preprocess: data before preprocessing - - target_param: name of target variable - - gpu_param: use_gpu param configured through setup - - fold_generator: CV splitter configured in fold_strategy - - fold_param: fold params defined in the setup - - fold_groups_param: fold groups defined in the setup - - stratify_param: stratify parameter defined in the setup + This function is used to reset global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> juice = get_data('juice') - >>> from pycaret.classification import * - >>> exp_name = setup(data = juice, target = 'Purchase') >>> set_config('seed', 123) - - Returns: - None + This will set the global seed to '123'. 
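    A minimal sketch of a read-modify round trip, assuming ``setup``
    has already been run in the current session:

    >>> seed = get_config('seed')
    >>> set_config('seed', seed + 1)
    >>> get_config('seed')  # now returns the updated value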
""" - return _CURRENT_EXPERIMENT.set_config(variable=variable, value=value) @@ -3215,7 +3140,16 @@ def deep_check(estimator, check_kwargs: Optional[dict] = None) -> None: ) -def set_current_experiment(experiment: ClassificationExperiment): +def set_current_experiment(experiment: ClassificationExperiment) -> None: + """ + Set the current experiment to be used with the functional API. + + experiment: ClassificationExperiment + Experiment object to use. + + Returns: + None + """ global _CURRENT_EXPERIMENT if not isinstance(experiment, ClassificationExperiment): diff --git a/pycaret/classification/oop.py b/pycaret/classification/oop.py index 946f01573..b9e1c7511 100644 --- a/pycaret/classification/oop.py +++ b/pycaret/classification/oop.py @@ -168,7 +168,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -583,11 +582,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - Controls the confirmation input of data types when ``setup`` is executed. When - executing in completely automated mode or on a remote kernel, this must be True. - - verbose: bool, default = True When set to False, Information grid is not printed. @@ -609,7 +603,7 @@ def setup( Returns: - Global variables that can be changed using the ``set_config`` function. + ClassificationExperiment object. """ @@ -1008,6 +1002,11 @@ def compare_models( as the column name in the dataset containing group labels. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified + if not) passed to the mlflow.set_tags to add new custom tags for the experiment. + + probability_threshold: float, default = None Threshold for converting predicted probability to class label. It defaults to 0.5 for all classifiers unless explicitly defined @@ -1021,6 +1020,7 @@ def compare_models( Returns: Trained model or list of trained models, depending on the ``n_select`` param. + Warnings -------- - Changing turbo parameter to False may result in very high training times with @@ -1139,12 +1139,13 @@ def create_model( in this parameter. Only applicable for binary classification. - verbose: bool, default = True - Score grid is not printed when verbose is set to False. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified + if not) passed to the mlflow.set_tags to add new custom tags for the experiment. - **kwargs: - Additional keyword arguments to pass to the estimator. + verbose: bool, default = True + Score grid is not printed when verbose is set to False. return_train_score: bool, default = False @@ -1154,6 +1155,10 @@ def create_model( training score with a low corresponding CV validation score indicates overfitting. + **kwargs: + Additional keyword arguments to pass to the estimator. + + Returns: Trained Model @@ -1293,6 +1298,7 @@ def tune_model( - 'grid' : grid search - 'bayesian' : ``pip install scikit-optimize`` - 'hyperopt' : ``pip install hyperopt`` + - 'optuna' : ``pip install optuna`` - 'bohb' : ``pip install hpbandster ConfigSpace`` - 'optuna' possible values: @@ -1344,7 +1350,7 @@ def tune_model( tuner_verbose: bool or in, default = True If True or above 0, will print messages from the tuner. Higher values - print more messages. Ignored when ``verbose`` parameter is False. + print more messages. 
Ignored when ``verbose`` param is False. return_train_score: bool, default = False @@ -1703,7 +1709,7 @@ def stack_models( of the value from 'predict_proba', 'decision_function' or 'predict'. - restack: bool, default = False + restack: bool, default = True When set to False, only the predictions of estimators will be used as training data for the ``meta_model``. @@ -1949,6 +1955,10 @@ def evaluate_model( Dictionary of arguments passed to the fit method of the model. + plot_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the visualizer class. + + groups: str or array-like, with shape (n_samples,), default = None Optional group labels when GroupKFold is used for the cross validation. It takes an array with shape (n_samples, ) where n_samples is the number @@ -1994,16 +2004,10 @@ def interpret_model( ): """ - This function takes a trained model object and returns an interpretation plot - based on the test / hold-out set. It only supports tree based algorithms. - - This function is implemented based on the SHAP (SHapley Additive exPlanations), - which is a unified approach to explain the output of any machine learning model. - SHAP connects game theory with local explanations. - - For more information : https://shap.readthedocs.io/en/latest/ - - For Partial Dependence Plot : https://github.com/SauceCat/PDPbox + This function analyzes the predictions generated from a trained model. Most plots + in this function are implemented based on the SHAP (SHapley Additive exPlanations). + For more info on this, please see https://shap.readthedocs.io/en/latest/. + For more info on Partial Dependence Plot see https://github.com/SauceCat/PDPbox. Example @@ -2102,38 +2106,37 @@ def calibrate_model( groups: Optional[Union[str, Any]] = None, verbose: bool = True, return_train_score: bool = False, - display: Optional[CommonDisplay] = None, # added in pycaret==2.2.0 ) -> Any: """ - This function takes the input of trained estimator and performs probability - calibration with sigmoid or isotonic regression. The output prints a score - grid that shows Accuracy, AUC, Recall, Precision, F1, Kappa and MCC by fold - (default = 10 Fold). The ouput of the original estimator and the calibrated - estimator (created using this function) might not differ much. In order - to see the calibration differences, use 'calibration' plot in plot_model to - see the difference before and after. + This function calibrates the probability of a given estimator using isotonic + or logistic regression. The output of this function is a score grid with CV + scores by fold. Metrics evaluated during CV can be accessed using the + ``get_metrics`` function. Custom metrics can be added or removed using + ``add_metric`` and ``remove_metric`` function. The ouput of the original estimator + and the calibrated estimator (created using this function) might not differ much. + In order to see the calibration differences, use 'calibration' plot in ``plot_model`` + to see the difference before and after. - This function returns a trained model object. 
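    Under the hood, calibration of this kind is typically built on
    scikit-learn's ``CalibratedClassifierCV``; a rough, illustrative
    equivalent (assuming the scikit-learn API, not PyCaret internals) is:

    >>> from sklearn.calibration import CalibratedClassifierCV
    >>> calibrated = CalibratedClassifierCV(estimator, method = 'sigmoid', cv = 5)
    >>> calibrated.fit(X_train, y_train)

    Here 'sigmoid' corresponds to Platt scaling, while 'isotonic' is
    non-parametric and tends to overfit with few calibration samples.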
Example ------- >>> from pycaret.datasets import get_data >>> juice = get_data('juice') - >>> experiment_name = setup(data = juice, target = 'Purchase') - >>> dt_boosted = create_model('dt', ensemble = True, method = 'Boosting') - >>> calibrated_dt = calibrate_model(dt_boosted) + >>> from pycaret.classification import * + >>> exp_name = setup(data = juice, target = 'Purchase') + >>> dt = create_model('dt') + >>> calibrated_dt = calibrate_model(dt) - This will return Calibrated Boosted Decision Tree Model. - Parameters - ---------- - estimator : object + estimator: scikit-learn compatible object + Trained model object + + + method: str, default = 'sigmoid' + The method to use for calibration. Can be 'sigmoid' which corresponds to + Platt's method or 'isotonic' which is a non-parametric approach. - method : str, default = 'sigmoid' - The method to use for calibration. Can be 'sigmoid' which corresponds to Platt's - method or 'isotonic' which is a non-parametric approach. It is not advised to use - isotonic calibration with too few calibration samples calibrate_fold: integer or scikit-learn compatible CV generator, default = 5 Controls internal cross-validation. Can be an integer or a scikit-learn @@ -2141,51 +2144,49 @@ def calibrate_model( that many folds. See scikit-learn documentation on Stacking for more details. - fold: integer or scikit-learn compatible CV generator, default = None - Controls cross-validation. If None, will use the CV generator defined in setup(). - If integer, will use KFold CV with that many folds. - When cross_validation is False, this parameter is ignored. - round: integer, default = 4 + fold: int or scikit-learn compatible CV generator, default = None + Controls cross-validation. If None, the CV generator in the ``fold_strategy`` + parameter of the ``setup`` function is used. When an integer is passed, + it is interpreted as the 'n_splits' parameter of the CV generator in the + ``setup`` function. + + + round: int, default = 4 Number of decimal places the metrics in the score grid will be rounded to. + fit_kwargs: dict, default = {} (empty dict) Dictionary of arguments passed to the fit method of the model. + groups: str or array-like, with shape (n_samples,), default = None - Optional Group labels for the samples used while splitting the dataset into train/test set. - If string is passed, will use the data column with that name as the groups. - Only used if a group based cross-validation generator is used (eg. GroupKFold). - If None, will use the value set in fold_groups parameter in setup(). + Optional group labels when GroupKFold is used for the cross validation. + It takes an array with shape (n_samples, ) where n_samples is the number + of rows in training dataset. When string is passed, it is interpreted as + the column name in the dataset containing group labels. + verbose: bool, default = True Score grid is not printed when verbose is set to False. + return_train_score: bool, default = False If False, returns the CV Validation scores only. If True, returns the CV training scores along with the CV validation scores. This is useful when the user wants to do bias-variance tradeoff. A high CV training score with a low corresponding CV validation score indicates overfitting. - Returns - ------- - score_grid - A table containing the scores of the model across the kfolds. - Scoring metrics used are Accuracy, AUC, Recall, Precision, F1, - Kappa and MCC. Mean and standard deviation of the scores across - the folds are also returned. 
- model - trained and calibrated model object. + Returns: + Trained Model + Warnings -------- - - Avoid isotonic calibration with too few calibration samples (<1000) since it + - Avoid isotonic calibration with too few calibration samples (< 1000) since it tends to overfit. - - calibration plot not available for multiclass problems. - - """ function_params_str = ", ".join([f"{k}={v}" for k, v in locals().items()]) @@ -2241,28 +2242,27 @@ def calibrate_model( self.logger.info("Preparing display monitor") - if not display: - progress_args = {"max": 2 + 4} - timestampStr = datetime.datetime.now().strftime("%H:%M:%S") - monitor_rows = [ - ["Initiated", ". . . . . . . . . . . . . . . . . .", timestampStr], - [ - "Status", - ". . . . . . . . . . . . . . . . . .", - "Loading Dependencies", - ], - [ - "Estimator", - ". . . . . . . . . . . . . . . . . .", - "Compiling Library", - ], - ] - display = CommonDisplay( - verbose=verbose, - html_param=self.html_param, - progress_args=progress_args, - monitor_rows=monitor_rows, - ) + progress_args = {"max": 2 + 4} + timestampStr = datetime.datetime.now().strftime("%H:%M:%S") + monitor_rows = [ + ["Initiated", ". . . . . . . . . . . . . . . . . .", timestampStr], + [ + "Status", + ". . . . . . . . . . . . . . . . . .", + "Loading Dependencies", + ], + [ + "Estimator", + ". . . . . . . . . . . . . . . . . .", + "Compiling Library", + ], + ] + display = CommonDisplay( + verbose=verbose, + html_param=self.html_param, + progress_args=progress_args, + monitor_rows=monitor_rows, + ) np.random.seed(self.seed) @@ -2424,11 +2424,10 @@ def optimize_threshold( ------- Trained Model + Warnings -------- - - This function is not supported for multiclass problems. - - + - This function does not support multiclass classification problems. """ function_params_str = ", ".join([f"{k}={v}" for k, v in locals().items()]) @@ -2598,8 +2597,9 @@ def predict_model( probability_threshold: float, default = None Threshold for converting predicted probability to class label. - It defaults to 0.5 for all classifiers unless explicitly defined - in this parameter. + Unless this parameter is set, it will default to the value set + during model creation. If that wasn't set, the default will be 0.5 + for all classifiers. Only applicable for binary classification. encoded_labels: bool, default = False @@ -2610,6 +2610,11 @@ def predict_model( When set to True, scores for all labels will be returned. + drift_report: bool, default = False + When set to True, interactive drift report is generated on test set + with the evidently library. + + round: int, default = 4 Number of decimal places the metrics in the score grid will be rounded to. @@ -2687,6 +2692,10 @@ def finalize_model( transformations in Pipeline are ignored. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified + if not) passed to the mlflow.set_tags to add new custom tags for the experiment. + return_train_score: bool, default = False If False, returns the CV Validation scores only. If True, returns the CV training scores along with the CV validation scores. @@ -2841,6 +2850,10 @@ def save_model( Success message is not printed when verbose is set to False. + **kwargs: + Additional keyword arguments to pass to joblib.dump(). + + Returns: Tuple of the model object and the filename. @@ -2885,7 +2898,7 @@ def load_model( dictionary of applicable authentication tokens. 
when platform = 'aws': - {'bucket' : 'S3-bucket-name'} + {'bucket' : 'Name of Bucket on S3', 'path': (optional) folder name under the bucket} when platform = 'gcp': {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'} @@ -2970,24 +2983,6 @@ def automl( return_train_score=return_train_score, ) - def pull(self, pop: bool = False) -> pd.DataFrame: - - """ - Returns last printed score grid. Use ``pull`` function after - any training function to store the score grid in pandas.DataFrame. - - - pop: bool, default = False - If True, will pop (remove) the returned dataframe from the - display container. - - - Returns: - pandas.DataFrame - - """ - return super().pull(pop=pop) - def models( self, type: Optional[str] = None, @@ -3085,7 +3080,7 @@ def add_metric( ) -> pd.Series: """ - Adds a custom metric to be used for CV. + Adds a custom metric to be used in the experiment. Example @@ -3148,7 +3143,7 @@ def add_metric( def remove_metric(self, name_or_id: str): """ - Removes a metric from CV. + Removes a metric from experiment. Example @@ -3168,6 +3163,7 @@ def remove_metric(self, name_or_id: str): None """ + return super().remove_metric(name_or_id=name_or_id) def get_logs( diff --git a/pycaret/internal/pycaret_experiment/pycaret_experiment.py b/pycaret/internal/pycaret_experiment/pycaret_experiment.py index 706f95560..a9fb97a4a 100644 --- a/pycaret/internal/pycaret_experiment/pycaret_experiment.py +++ b/pycaret/internal/pycaret_experiment/pycaret_experiment.py @@ -316,6 +316,24 @@ def set_config( return def save_config(self, file_name: str) -> None: + """ + This function save all global variables to a pickle file, allowing to + later resume without rerunning the ``setup``. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> juice = get_data('juice') + >>> from pycaret.classification import * + >>> exp_name = setup(data = juice, target = 'Purchase') + >>> save_config('myvars.pkl') + + + Returns: + None + + """ function_params_str = ", ".join( [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"] ) @@ -346,6 +364,21 @@ def save_config(self, file_name: str) -> None: return def load_config(self, file_name: str) -> None: + """ + This function loads global variables from a pickle file into Python + environment. + + + Example + ------- + >>> from pycaret.classification import load_config + >>> load_config('myvars.pkl') + + + Returns: + Global variables + + """ function_params_str = ", ".join( [f"{k}={v}" for k, v in locals().items() if not k == "globals_d"] ) diff --git a/pycaret/internal/pycaret_experiment/supervised_experiment.py b/pycaret/internal/pycaret_experiment/supervised_experiment.py index c51a6d880..194a8e573 100644 --- a/pycaret/internal/pycaret_experiment/supervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/supervised_experiment.py @@ -5054,6 +5054,34 @@ def check_fairness( the approach known as group fairness, which asks: Which groups of individuals are at risk for experiencing harms. This function provides fairness-related metrics between different groups (also called subpopulation). 
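    As an illustration of the underlying idea, per-group metrics of this
    kind can be computed with fairlearn's ``MetricFrame`` (a hedged sketch;
    the exact internals of this function may differ):

    >>> from fairlearn.metrics import MetricFrame
    >>> from sklearn.metrics import accuracy_score
    >>> mf = MetricFrame(metrics = accuracy_score, y_true = y_test,
    ...                  y_pred = model.predict(X_test),
    ...                  sensitive_features = X_test['sex'])
    >>> mf.by_group  # per-group accuracy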
+ + + Example + ------- + >>> from pycaret.datasets import get_data + >>> income = get_data('income') + >>> from pycaret.classification import * + >>> exp_name = setup(data = income, target = 'income >50K') + >>> lr = create_model('lr') + >>> lr_fairness = check_fairness(lr, sensitive_features = ['sex', 'race']) + + + estimator: scikit-learn compatible object + Trained model object + + + sensitive_features: list + List of column names as present in the original dataset before any + transformations. + + + plot_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the matplotlib plot. + + + Returns: + pandas.DataFrame + """ _check_soft_dependencies("fairlearn", extra="analysis", severity="error") diff --git a/pycaret/internal/pycaret_experiment/tabular_experiment.py b/pycaret/internal/pycaret_experiment/tabular_experiment.py index 98e90e283..e2c3a5d42 100644 --- a/pycaret/internal/pycaret_experiment/tabular_experiment.py +++ b/pycaret/internal/pycaret_experiment/tabular_experiment.py @@ -2536,6 +2536,45 @@ def convert_model(estimator, language: str = "python") -> str: Ruby, F#). This functionality is very useful if you want to deploy models into environments where you can't install your normal Python stack to support model inference. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> juice = get_data('juice') + >>> from pycaret.classification import * + >>> exp_name = setup(data = juice, target = 'Purchase') + >>> lr = create_model('lr') + >>> lr_java = convert_model(lr, 'java') + + + estimator: scikit-learn compatible object + Trained model object + + + language: str, default = 'python' + Language in which inference script to be generated. Following + options are available: + + * 'python' + * 'java' + * 'javascript' + * 'c' + * 'c#' + * 'f#' + * 'go' + * 'haskell' + * 'php' + * 'powershell' + * 'r' + * 'ruby' + * 'vb' + * 'dart' + + + Returns: + str + """ _check_soft_dependencies("m2cgen", extra=None, severity="error") @@ -2578,7 +2617,40 @@ def convert_model(estimator, language: str = "python") -> str: def create_api(self, estimator, api_name, host="127.0.0.1", port=8000): """ - This function creates API and write it as a python file using FastAPI + This function takes an input ``estimator`` and creates a POST API for + inference. It only creates the API and doesn't run it automatically. + To run the API, you must run the Python file using ``!python``. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> juice = get_data('juice') + >>> from pycaret.classification import * + >>> exp_name = setup(data = juice, target = 'Purchase') + >>> lr = create_model('lr') + >>> create_api(lr, 'lr_api' + >>> !python lr_api.py + + + estimator: scikit-learn compatible object + Trained model object + + + api_name: scikit-learn compatible object + Trained model object + + + host: str, default = '127.0.0.1' + API host address. + + + port: int, default = 8000 + port for API. + + + Returns: + None """ _check_soft_dependencies("fastapi", extra="mlops", severity="error") @@ -2647,7 +2719,30 @@ def predict({INPUT_COLS}): def eda(self, display_format: str = "bokeh", **kwargs): """ - Function to generate EDA using AutoVIZ library. + This function generates AutoEDA using AutoVIZ library. You must + install Autoviz separately ``pip install autoviz`` to use this + function. 
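    For reference, a sketch of the direct AutoViz call this roughly
    corresponds to (assuming the AutoViz API; argument names may vary by
    version, and ``juice`` / ``'Purchase'`` follow the example below):

    >>> from autoviz.AutoViz_Class import AutoViz_Class
    >>> AV = AutoViz_Class()
    >>> AV.AutoViz(filename = '', dfte = juice, depVar = 'Purchase', chart_format = 'bokeh')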
+ + + Example + ------- + >>> from pycaret.datasets import get_data + >>> juice = get_data('juice') + >>> from pycaret.classification import * + >>> exp_name = setup(data = juice, target = 'Purchase') + >>> eda(display_format = 'bokeh') + + display_format: str, default = 'bokeh' + When set to 'bokeh' the plots are interactive. Other option is ``svg`` for static + plots that are generated using matplotlib and seaborn. + + + **kwargs: + Additional keyword arguments to pass to the AutoVIZ class. + + + Returns: + None """ _check_soft_dependencies("autoviz", extra="mlops", severity="error") @@ -2671,6 +2766,8 @@ def create_docker( """ This function creates a ``Dockerfile`` and ``requirements.txt`` for productionalizing API end-point. + + Example ------- >>> from pycaret.datasets import get_data @@ -2680,12 +2777,20 @@ def create_docker( >>> lr = create_model('lr') >>> create_api(lr, 'lr_api') >>> create_docker('lr_api') + + api_name: str Name of API. Must be saved as a .py file in the same folder. + + base_image: str, default = "python:3.8-slim" Name of the base image for Dockerfile. + + expose_port: int, default = 8000 port for expose for API in the Dockerfile. + + Returns: None """ From cc784f015c76c11b5fd74522f111f791835fafe7 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 29 May 2022 21:13:03 +0000 Subject: [PATCH 02/20] Remove 3.6 from other setups --- setup_nightly.py | 1 - setup_ts_alpha.py | 1 - 2 files changed, 2 deletions(-) diff --git a/setup_nightly.py b/setup_nightly.py index a703aeb90..159069ea7 100644 --- a/setup_nightly.py +++ b/setup_nightly.py @@ -37,7 +37,6 @@ def readme(): license="MIT", classifiers=[ "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", diff --git a/setup_ts_alpha.py b/setup_ts_alpha.py index 3dff58cd6..2a1136d99 100644 --- a/setup_ts_alpha.py +++ b/setup_ts_alpha.py @@ -35,7 +35,6 @@ def readme(): license="MIT", classifiers=[ "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From d30eb75fc10afc2fbb62f4a4e35939f3b6b9d8c3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 29 May 2022 21:28:22 +0000 Subject: [PATCH 03/20] Regression docstring consistency --- pycaret/anomaly/functional.py | 7 - pycaret/classification/functional.py | 45 ++-- pycaret/classification/oop.py | 20 +- pycaret/clustering/functional.py | 7 - .../unsupervised_experiment.py | 1 - pycaret/regression/functional.py | 237 ++++++------------ pycaret/regression/oop.py | 206 ++++++++------- 7 files changed, 225 insertions(+), 298 deletions(-) diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py index e7fd61f91..c7f4c4601 100644 --- a/pycaret/anomaly/functional.py +++ b/pycaret/anomaly/functional.py @@ -62,7 +62,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -350,11 +349,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - Controls the confirmation input of data types when ``setup`` is executed. When - executing in completely automated mode or on a remote kernel, this must be True. 
- - verbose: bool, default = True When set to False, Information grid is not printed. @@ -426,7 +420,6 @@ def setup( log_plots=log_plots, log_profile=log_profile, log_data=log_data, - silent=silent, verbose=verbose, memory=memory, profile=profile, diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py index ff2e5f8e8..43b04bf7d 100644 --- a/pycaret/classification/functional.py +++ b/pycaret/classification/functional.py @@ -86,7 +86,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -501,11 +500,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - Controls the confirmation input of data types when ``setup`` is executed. When - executing in completely automated mode or on a remote kernel, this must be True. - - verbose: bool, default = True When set to False, Information grid is not printed. @@ -1414,7 +1408,7 @@ def stack_models( of the value from 'predict_proba', 'decision_function' or 'predict'. - restack: bool, default = True + restack: bool, default = False When set to False, only the predictions of estimators will be used as training data for the ``meta_model``. @@ -1712,10 +1706,16 @@ def interpret_model( ): """ - This function analyzes the predictions generated from a trained model. Most plots - in this function are implemented based on the SHAP (SHapley Additive exPlanations). - For more info on this, please see https://shap.readthedocs.io/en/latest/. - For more info on Partial Dependence Plot see https://github.com/SauceCat/PDPbox. + This function takes a trained model object and returns an interpretation plot + based on the test / hold-out set. + + This function is implemented based on the SHAP (SHapley Additive exPlanations), + which is a unified approach to explain the output of any machine learning model. + SHAP connects game theory with local explanations. + + For more information: https://shap.readthedocs.io/en/latest/ + + For more information on Partial Dependence Plot: https://github.com/SauceCat/PDPbox Example @@ -2416,19 +2416,19 @@ def automl( @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def pull(pop: bool = False) -> pd.DataFrame: - """ - Returns last printed score grid. Use ``pull`` function after - any training function to store the score grid in pandas.DataFrame. + Returns the latest displayed table. - - pop: bool, default = False - If True, will pop (remove) the returned dataframe from the + Parameters + ---------- + pop : bool, default = False + If true, will pop (remove) the returned dataframe from the display container. - - Returns: - pandas.DataFrame + Returns + ------- + pandas.DataFrame + Equivalent to get_config('display_container')[-1] """ return _CURRENT_EXPERIMENT.pull(pop=pop) @@ -2485,7 +2485,7 @@ def get_metrics( ) -> pd.DataFrame: """ - Returns table of available metrics used for CV. + Returns table of available metrics used in the experiment. Example @@ -2600,7 +2600,7 @@ def add_metric( def remove_metric(name_or_id: str): """ - Removes a metric from experiment. + Removes a metric from the experiment. Example @@ -2691,6 +2691,7 @@ def set_config(variable: str, value): This will set the global seed to '123'. 
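    To persist and later restore the entire set of global variables across
    sessions, the companion ``save_config`` and ``load_config`` functions
    can be used:

    >>> save_config('myvars.pkl')   # after setup, in the current session
    >>> load_config('myvars.pkl')   # later, in a fresh session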
""" + return _CURRENT_EXPERIMENT.set_config(variable=variable, value=value) diff --git a/pycaret/classification/oop.py b/pycaret/classification/oop.py index b9e1c7511..78d3c116b 100644 --- a/pycaret/classification/oop.py +++ b/pycaret/classification/oop.py @@ -1709,7 +1709,7 @@ def stack_models( of the value from 'predict_proba', 'decision_function' or 'predict'. - restack: bool, default = True + restack: bool, default = False When set to False, only the predictions of estimators will be used as training data for the ``meta_model``. @@ -2004,10 +2004,16 @@ def interpret_model( ): """ - This function analyzes the predictions generated from a trained model. Most plots - in this function are implemented based on the SHAP (SHapley Additive exPlanations). - For more info on this, please see https://shap.readthedocs.io/en/latest/. - For more info on Partial Dependence Plot see https://github.com/SauceCat/PDPbox. + This function takes a trained model object and returns an interpretation plot + based on the test / hold-out set. + + This function is implemented based on the SHAP (SHapley Additive exPlanations), + which is a unified approach to explain the output of any machine learning model. + SHAP connects game theory with local explanations. + + For more information: https://shap.readthedocs.io/en/latest/ + + For more information on Partial Dependence Plot: https://github.com/SauceCat/PDPbox Example @@ -3031,7 +3037,7 @@ def get_metrics( ) -> pd.DataFrame: """ - Returns table of available metrics used for CV. + Returns table of available metrics used in the experiment. Example @@ -3143,7 +3149,7 @@ def add_metric( def remove_metric(self, name_or_id: str): """ - Removes a metric from experiment. + Removes a metric from the experiment. Example diff --git a/pycaret/clustering/functional.py b/pycaret/clustering/functional.py index 103338737..8f26c36a9 100644 --- a/pycaret/clustering/functional.py +++ b/pycaret/clustering/functional.py @@ -62,7 +62,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -352,11 +351,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - Controls the confirmation input of data types when ``setup`` is executed. When - executing in completely automated mode or on a remote kernel, this must be True. - - verbose: bool, default = True When set to False, Information grid is not printed. 
@@ -428,7 +422,6 @@ def setup( log_plots=log_plots, log_profile=log_profile, log_data=log_data, - silent=silent, verbose=verbose, memory=memory, profile=profile, diff --git a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py index 4eb4fda1c..00186da82 100644 --- a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py @@ -135,7 +135,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, diff --git a/pycaret/regression/functional.py b/pycaret/regression/functional.py index f5f273008..be58a77d4 100644 --- a/pycaret/regression/functional.py +++ b/pycaret/regression/functional.py @@ -87,7 +87,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -503,11 +502,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - When executing in completely automated mode or on a remote kernel, this must be True. - Leave False otherwise - - verbose: bool, default = True When set to False, Information grid is not printed. @@ -595,7 +589,6 @@ def setup( log_plots=log_plots, log_profile=log_profile, log_data=log_data, - silent=silent, verbose=verbose, memory=memory, profile=profile, @@ -619,7 +612,6 @@ def compare_models( groups: Optional[Union[str, Any]] = None, experiment_custom_tags: Optional[Dict[str, Any]] = None, verbose: bool = True, - # parallel: Optional[ParallelBackend] = None, ): """ @@ -724,19 +716,6 @@ def compare_models( - No models are logged in ``MLFlow`` when ``cross_validation`` parameter is False. """ - # params = dict(locals()) - parallel = None - if parallel is not None: - global _pycaret_setup_call - parallel.attach(_pycaret_setup_call["func"], _pycaret_setup_call["params"]) - if params.get("include", None) is None: - _models = models() - if turbo: - _models = _models[_models.Turbo] - params["include"] = _models.index.tolist() - del params["parallel"] - return parallel.compare_models(compare_models, params) - return _CURRENT_EXPERIMENT.compare_models( include=include, exclude=exclude, @@ -1123,79 +1102,79 @@ def ensemble_model( ) -> Any: """ - This function ensembles a given estimator. The output of this function is - a score grid with CV scores by fold. Metrics evaluated during CV can be - accessed using the ``get_metrics`` function. Custom metrics can be added - or removed using ``add_metric`` and ``remove_metric`` function. + This function ensembles a given estimator. The output of this function is + a score grid with CV scores by fold. Metrics evaluated during CV can be + accessed using the ``get_metrics`` function. Custom metrics can be added + or removed using ``add_metric`` and ``remove_metric`` function. 
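    For intuition, the two methods wrap the given estimator in the
    corresponding scikit-learn meta-estimators, roughly as follows (an
    illustrative sketch, not the exact internals):

    >>> from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
    >>> bagged = BaggingRegressor(estimator, n_estimators = 10)    # method = 'Bagging'
    >>> boosted = AdaBoostRegressor(estimator, n_estimators = 10)  # method = 'Boosting'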
- Example - -------- - >>> from pycaret.datasets import get_data - >>> boston = get_data('boston') - >>> from pycaret.regression import * - >>> exp_name = setup(data = boston, target = 'medv') - >>> dt = create_model('dt') - >>> bagged_dt = ensemble_model(dt, method = 'Bagging') + Example + -------- + >>> from pycaret.datasets import get_data + >>> boston = get_data('boston') + >>> from pycaret.regression import * + >>> exp_name = setup(data = boston, target = 'medv') + >>> dt = create_model('dt') + >>> bagged_dt = ensemble_model(dt, method = 'Bagging') estimator: scikit-learn compatible object - Trained model object + Trained model object - method: str, default = 'Bagging' - Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'. + method: str, default = 'Bagging' + Method for ensembling base estimator. It can be 'Bagging' or 'Boosting'. - fold: int or scikit-learn compatible CV generator, default = None - Controls cross-validation. If None, the CV generator in the ``fold_strategy`` - parameter of the ``setup`` function is used. When an integer is passed, - it is interpreted as the 'n_splits' parameter of the CV generator in the - ``setup`` function. + fold: int or scikit-learn compatible CV generator, default = None + Controls cross-validation. If None, the CV generator in the ``fold_strategy`` + parameter of the ``setup`` function is used. When an integer is passed, + it is interpreted as the 'n_splits' parameter of the CV generator in the + ``setup`` function. - n_estimators: int, default = 10 - The number of base estimators in the ensemble. In case of perfect fit, the - learning procedure is stopped early. + n_estimators: int, default = 10 + The number of base estimators in the ensemble. In case of perfect fit, the + learning procedure is stopped early. - round: int, default = 4 - Number of decimal places the metrics in the score grid will be rounded to. + round: int, default = 4 + Number of decimal places the metrics in the score grid will be rounded to. - choose_better: bool, default = False - When set to True, the returned object is always better performing. The - metric used for comparison is defined by the ``optimize`` parameter. + choose_better: bool, default = False + When set to True, the returned object is always better performing. The + metric used for comparison is defined by the ``optimize`` parameter. - optimize: str, default = 'R2' - Metric to compare for model selection when ``choose_better`` is True. + optimize: str, default = 'R2' + Metric to compare for model selection when ``choose_better`` is True. - fit_kwargs: dict, default = {} (empty dict) - Dictionary of arguments passed to the fit method of the model. + fit_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the fit method of the model. - groups: str or array-like, with shape (n_samples,), default = None - Optional group labels when GroupKFold is used for the cross validation. - It takes an array with shape (n_samples, ) where n_samples is the number - of rows in training dataset. When string is passed, it is interpreted as - the column name in the dataset containing group labels. + groups: str or array-like, with shape (n_samples,), default = None + Optional group labels when GroupKFold is used for the cross validation. + It takes an array with shape (n_samples, ) where n_samples is the number + of rows in training dataset. When string is passed, it is interpreted as + the column name in the dataset containing group labels. 
return_train_score: bool, default = False - If False, returns the CV Validation scores only. - If True, returns the CV training scores along with the CV validation scores. - This is useful when the user wants to do bias-variance tradeoff. A high CV - training score with a low corresponding CV validation score indicates overfitting. + If False, returns the CV Validation scores only. + If True, returns the CV training scores along with the CV validation scores. + This is useful when the user wants to do bias-variance tradeoff. A high CV + training score with a low corresponding CV validation score indicates overfitting. - verbose: bool, default = True - Score grid is not printed when verbose is set to False. + verbose: bool, default = True + Score grid is not printed when verbose is set to False. - Returns: - Trained Model + Returns: + Trained Model """ @@ -1379,7 +1358,7 @@ def stack_models( Number of decimal places the metrics in the score grid will be rounded to. - restack: bool, default = True + restack: bool, default = False When set to False, only the predictions of estimators will be used as training data for the ``meta_model``. @@ -1648,15 +1627,16 @@ def interpret_model( """ This function takes a trained model object and returns an interpretation plot - based on the test / hold-out set. It only supports tree based algorithms. + based on the test / hold-out set. This function is implemented based on the SHAP (SHapley Additive exPlanations), which is a unified approach to explain the output of any machine learning model. SHAP connects game theory with local explanations. - For more information : https://shap.readthedocs.io/en/latest/ + For more information: https://shap.readthedocs.io/en/latest/ + + For more information on Partial Dependence Plot: https://github.com/SauceCat/PDPbox - For Partial Dependence Plot : https://github.com/SauceCat/PDPbox Example -------- @@ -2162,17 +2142,18 @@ def automl( @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def pull(pop: bool = False) -> pd.DataFrame: """ - Returns last printed score grid. Use ``pull`` function after - any training function to store the score grid in pandas.DataFrame. + Returns the latest displayed table. - - pop: bool, default = False - If True, will pop (remove) the returned dataframe from the + Parameters + ---------- + pop : bool, default = False + If true, will pop (remove) the returned dataframe from the display container. - - Returns: - pandas.DataFrame + Returns + ------- + pandas.DataFrame + Equivalent to get_config('display_container')[-1] """ return _CURRENT_EXPERIMENT.pull(pop=pop) @@ -2184,7 +2165,6 @@ def models( internal: bool = False, raise_errors: bool = True, ) -> pd.DataFrame: - """ Returns table of models available in the model library. @@ -2216,6 +2196,7 @@ def models( pandas.DataFrame """ + return _CURRENT_EXPERIMENT.models( type=type, internal=internal, raise_errors=raise_errors ) @@ -2227,9 +2208,8 @@ def get_metrics( include_custom: bool = True, raise_errors: bool = True, ) -> pd.DataFrame: - """ - Returns table of available metrics used for CV. + Returns table of available metrics used in the experiment. Example @@ -2275,9 +2255,8 @@ def add_metric( greater_is_better: bool = True, **kwargs, ) -> pd.Series: - """ - Adds a custom metric to be used for CV. + Adds a custom metric to be used in the experiment. 
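    For instance, a percentage-error metric could be registered as follows
    (the metric name and scoring function here are purely illustrative):

    >>> import numpy as np
    >>> def mape(y_true, y_pred):
    ...     return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    >>> add_metric('mape', 'MAPE', mape, greater_is_better = False)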
Example @@ -2326,9 +2305,8 @@ def add_metric( @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def remove_metric(name_or_id: str): - """ - Removes a metric from CV. + Removes a metric from experiment. Example @@ -2348,12 +2326,12 @@ def remove_metric(name_or_id: str): None """ + return _CURRENT_EXPERIMENT.remove_metric(name_or_id=name_or_id) @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.DataFrame: - """ Returns a table of experiment logs. Only works when ``log_experiment`` is True when initializing the ``setup`` function. @@ -2389,51 +2367,17 @@ def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.Da def get_config(variable: str): """ - This function retrieves the global variables created when initializing the - ``setup`` function. Following variables are accessible: - - - dataset: Transformed dataset - - train: Transformed training set - - test: Transformed test set - - X: Transformed feature set - - y: Transformed target column - - X_train, X_test, y_train, y_test: Subsets of the train and test sets. - - seed: random state set through session_id - - pipeline: Transformation pipeline configured through setup - - fold_shuffle_param: shuffle parameter used in Kfolds - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment - - logging_param: log_experiment param - - log_plots_param: log_plots param - - USI: Unique session ID parameter - - fix_imbalance_param: fix_imbalance param - - fix_imbalance_method_param: fix_imbalance_method param - - data_before_preprocess: data before preprocessing - - target_param: name of target variable - - gpu_param: use_gpu param configured through setup - - fold_generator: CV splitter configured in fold_strategy - - fold_param: fold params defined in the setup - - fold_groups_param: fold groups defined in the setup - - stratify_param: stratify parameter defined in the setup - - transform_target_param: transform_target_param in setup - - transform_target_method_param: transform_target_method_param in setup - + This function is used to access global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> boston = get_data('boston') - >>> from pycaret.regression import * - >>> exp_name = setup(data = boston, target = 'medv') >>> X_train = get_config('X_train') + This will return X_train transformed dataset. - Returns: - Global variable - + Returns + ------- + variable """ @@ -2444,53 +2388,15 @@ def get_config(variable: str): def set_config(variable: str, value): """ - This function resets the global variables. 
Following variables are - accessible: - - - X: Transformed dataset (X) - - y: Transformed dataset (y) - - X_train: Transformed train dataset (X) - - X_test: Transformed test/holdout dataset (X) - - y_train: Transformed train dataset (y) - - y_test: Transformed test/holdout dataset (y) - - seed: random state set through session_id - - prep_pipe: Transformation pipeline - - fold_shuffle_param: shuffle parameter used in Kfolds - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment - - logging_param: log_experiment param - - log_plots_param: log_plots param - - USI: Unique session ID parameter - - fix_imbalance_param: fix_imbalance param - - fix_imbalance_method_param: fix_imbalance_method param - - data_before_preprocess: data before preprocessing - - target_param: name of target variable - - gpu_param: use_gpu param configured through setup - - fold_generator: CV splitter configured in fold_strategy - - fold_param: fold params defined in the setup - - fold_groups_param: fold groups defined in the setup - - stratify_param: stratify parameter defined in the setup - - transform_target_param: transform_target_param in setup - - transform_target_method_param: transform_target_method_param in setup - + This function is used to reset global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> boston = get_data('boston') - >>> from pycaret.regression import * - >>> exp_name = setup(data = boston, target = 'medv') >>> set_config('seed', 123) - - Returns: - None + This will set the global seed to '123'. """ - return _CURRENT_EXPERIMENT.set_config(variable=variable, value=value) @@ -2653,6 +2559,7 @@ def dashboard( Returns: ExplainerDashboard """ + return _CURRENT_EXPERIMENT.dashboard( estimator, display_format, dashboard_kwargs, run_kwargs, **kwargs ) diff --git a/pycaret/regression/oop.py b/pycaret/regression/oop.py index dacd7d6b5..f3a831b8a 100644 --- a/pycaret/regression/oop.py +++ b/pycaret/regression/oop.py @@ -140,7 +140,6 @@ def setup( log_plots: Union[bool, list] = False, log_profile: bool = False, log_data: bool = False, - silent: bool = False, verbose: bool = True, memory: Union[bool, str, Memory] = True, profile: bool = False, @@ -557,11 +556,6 @@ def setup( Ignored when ``log_experiment`` is False. - silent: bool, default = False - When executing in completely automated mode or on a remote kernel, this must be True. - Leave False otherwise - - verbose: bool, default = True When set to False, Information grid is not printed. @@ -1099,6 +1093,11 @@ def create_model( the column name in the dataset containing group labels. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified + if not) passed to the mlflow.set_tags to add new custom tags for the experiment. + + verbose: bool, default = True Score grid is not printed when verbose is set to False. @@ -1251,6 +1250,7 @@ def tune_model( - 'grid' : grid search - 'bayesian' : ``pip install scikit-optimize`` - 'hyperopt' : ``pip install hyperopt`` + - 'optuna' : ``pip install optuna`` - 'bohb' : ``pip install hpbandster ConfigSpace`` - 'optuna' possible values: @@ -1302,7 +1302,7 @@ def tune_model( tuner_verbose: bool or in, default = True If True or above 0, will print messages from the tuner. Higher values - print more messages. 
Ignored when ``verbose`` parameter is False. + print more messages. Ignored when ``verbose`` param is False. return_train_score: bool, default = False @@ -1368,24 +1368,24 @@ def ensemble_model( ) -> Any: """ - This function ensembles a given estimator. The output of this function is - a score grid with CV scores by fold. Metrics evaluated during CV can be - accessed using the ``get_metrics`` function. Custom metrics can be added - or removed using ``add_metric`` and ``remove_metric`` function. + This function ensembles a given estimator. The output of this function is + a score grid with CV scores by fold. Metrics evaluated during CV can be + accessed using the ``get_metrics`` function. Custom metrics can be added + or removed using ``add_metric`` and ``remove_metric`` function. - Example - -------- - >>> from pycaret.datasets import get_data - >>> boston = get_data('boston') - >>> from pycaret.regression import * - >>> exp_name = setup(data = boston, target = 'medv') - >>> dt = create_model('dt') - >>> bagged_dt = ensemble_model(dt, method = 'Bagging') + Example + -------- + >>> from pycaret.datasets import get_data + >>> boston = get_data('boston') + >>> from pycaret.regression import * + >>> exp_name = setup(data = boston, target = 'medv') + >>> dt = create_model('dt') + >>> bagged_dt = ensemble_model(dt, method = 'Bagging') estimator: scikit-learn compatible object - Trained model object + Trained model object method: str, default = 'Bagging' @@ -1428,19 +1428,19 @@ def ensemble_model( the column name in the dataset containing group labels. - verbose: bool, default = True - Score grid is not printed when verbose is set to False. + return_train_score: bool, default = False + If False, returns the CV Validation scores only. + If True, returns the CV training scores along with the CV validation scores. + This is useful when the user wants to do bias-variance tradeoff. A high CV + training score with a low corresponding CV validation score indicates overfitting. - return_train_score: bool, default = False - If False, returns the CV Validation scores only. - If True, returns the CV training scores along with the CV validation scores. - This is useful when the user wants to do bias-variance tradeoff. A high CV - training score with a low corresponding CV validation score indicates overfitting. + verbose: bool, default = True + Score grid is not printed when verbose is set to False. - Returns: - Trained Model + Returns: + Trained Model """ @@ -1696,59 +1696,64 @@ def plot_model( ) -> str: """ - This function analyzes the performance of a trained model on holdout set. - It may require re-training the model in certain cases. + This function analyzes the performance of a trained model on holdout set. + It may require re-training the model in certain cases. 
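    Any of the plots listed below can also be written to the current
    working directory instead of being displayed, e.g. (using the trained
    ``lr`` model from the example that follows):

    >>> plot_model(lr, plot = 'error', save = True)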
- Example - -------- - >>> from pycaret.datasets import get_data - >>> boston = get_data('boston') - >>> from pycaret.regression import * - >>> exp_name = setup(data = boston, target = 'medv') - >>> lr = create_model('lr') - >>> plot_model(lr, plot = 'residual') + Example + -------- + >>> from pycaret.datasets import get_data + >>> boston = get_data('boston') + >>> from pycaret.regression import * + >>> exp_name = setup(data = boston, target = 'medv') + >>> lr = create_model('lr') + >>> plot_model(lr, plot = 'residual') + + + estimator: scikit-learn compatible object + Trained model object - estimator: scikit-learn compatible object - Trained model object + plot: str, default = 'residual' + List of available plots (ID - Name): + * 'pipeline' - Schematic drawing of the preprocessing pipeline + * 'residuals_interactive' - Interactive Residual plots + * 'residuals' - Residuals Plot + * 'error' - Prediction Error Plot + * 'cooks' - Cooks Distance Plot + * 'rfe' - Recursive Feat. Selection + * 'learning' - Learning Curve + * 'vc' - Validation Curve + * 'manifold' - Manifold Learning + * 'feature' - Feature Importance + * 'feature_all' - Feature Importance (All) + * 'parameter' - Model Hyperparameter + * 'tree' - Decision Tree - plot: str, default = 'residual' - List of available plots (ID - Name): - * 'pipeline' - Schematic drawing of the preprocessing pipeline - * 'residuals_interactive' - Interactive Residual plots - * 'residuals' - Residuals Plot - * 'error' - Prediction Error Plot - * 'cooks' - Cooks Distance Plot - * 'rfe' - Recursive Feat. Selection - * 'learning' - Learning Curve - * 'vc' - Validation Curve - * 'manifold' - Manifold Learning - * 'feature' - Feature Importance - * 'feature_all' - Feature Importance (All) - * 'parameter' - Model Hyperparameter - * 'tree' - Decision Tree + scale: float, default = 1 + The resolution scale of the figure. - scale: float, default = 1 - The resolution scale of the figure. + save: bool, default = False + When set to True, plot is saved in the current working directory. - save: bool, default = False - When set to True, plot is saved in the current working directory. + fold: int or scikit-learn compatible CV generator, default = None + Controls cross-validation. If None, the CV generator in the ``fold_strategy`` + parameter of the ``setup`` function is used. When an integer is passed, + it is interpreted as the 'n_splits' parameter of the CV generator in the + ``setup`` function. - fold: int or scikit-learn compatible CV generator, default = None - Controls cross-validation. If None, the CV generator in the ``fold_strategy`` - parameter of the ``setup`` function is used. When an integer is passed, - it is interpreted as the 'n_splits' parameter of the CV generator in the - ``setup`` function. + fit_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the fit method of the model. - fit_kwargs: dict, default = {} (empty dict) - Dictionary of arguments passed to the fit method of the model. + plot_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the visualizer class. + - pipeline: fontsize -> int plot_kwargs: dict, default = {} (empty dict) @@ -1756,29 +1761,29 @@ def plot_model( - pipeline: fontsize -> int - groups: str or array-like, with shape (n_samples,), default = None - Optional group labels when GroupKFold is used for the cross validation. - It takes an array with shape (n_samples, ) where n_samples is the number - of rows in training dataset. 
When string is passed, it is interpreted as - the column name in the dataset containing group labels. + groups: str or array-like, with shape (n_samples,), default = None + Optional group labels when GroupKFold is used for the cross validation. + It takes an array with shape (n_samples, ) where n_samples is the number + of rows in training dataset. When string is passed, it is interpreted as + the column name in the dataset containing group labels. - use_train_data: bool, default = False - When set to true, train data will be used for plots, instead - of test data. + use_train_data: bool, default = False + When set to true, train data will be used for plots, instead + of test data. - verbose: bool, default = True - When set to False, progress bar is not displayed. + verbose: bool, default = True + When set to False, progress bar is not displayed. - display_format: str, default = None - To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. - Currently, not all plots are supported. + display_format: str, default = None + To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. + Currently, not all plots are supported. - Returns: - None + Returns: + None """ @@ -1835,6 +1840,10 @@ def evaluate_model( Dictionary of arguments passed to the fit method of the model. + plot_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the visualizer class. + + groups: str or array-like, with shape (n_samples,), default = None Optional group labels when GroupKFold is used for the cross validation. It takes an array with shape (n_samples, ) where n_samples is the number @@ -1881,15 +1890,16 @@ def interpret_model( """ This function takes a trained model object and returns an interpretation plot - based on the test / hold-out set. It only supports tree based algorithms. + based on the test / hold-out set. This function is implemented based on the SHAP (SHapley Additive exPlanations), which is a unified approach to explain the output of any machine learning model. SHAP connects game theory with local explanations. - For more information : https://shap.readthedocs.io/en/latest/ + For more information: https://shap.readthedocs.io/en/latest/ + + For more information on Partial Dependence Plot: https://github.com/SauceCat/PDPbox - For Partial Dependence Plot : https://github.com/SauceCat/PDPbox Example -------- @@ -2009,6 +2019,11 @@ def predict_model( must be available in the unseen dataset. + drift_report: bool, default = False + When set to True, interactive drift report is generated on test set + with the evidently library. + + round: int, default = 4 Number of decimal places to round predictions to. @@ -2085,6 +2100,9 @@ def finalize_model( When set to False, only model object is re-trained and all the transformations in Pipeline are ignored. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified if + not) passed to the mlflow.set_tags to add new custom tags for the experiment. return_train_score: bool, default = False If False, returns the CV Validation scores only. @@ -2095,6 +2113,8 @@ def finalize_model( Returns: Trained Model + + """ return super().finalize_model( @@ -2235,6 +2255,10 @@ def save_model( entire pipeline. + **kwargs: + Additional keyword arguments to pass to joblib.dump(). + + verbose: bool, default = True Success message is not printed when verbose is set to False. 
@@ -2282,7 +2306,7 @@ def load_model( dictionary of applicable authentication tokens. when platform = 'aws': - {'bucket' : 'S3-bucket-name'} + {'bucket' : 'Name of Bucket on S3', 'path': (optional) folder name under the bucket} when platform = 'gcp': {'project': 'gcp-project-name', 'bucket' : 'gcp-bucket-name'} @@ -2359,6 +2383,8 @@ def automl( Returns: Trained Model + + """ return super().automl( @@ -2406,6 +2432,7 @@ def models( pandas.DataFrame """ + return super().models(type=type, internal=internal, raise_errors=raise_errors) def get_metrics( @@ -2416,7 +2443,7 @@ def get_metrics( ) -> pd.DataFrame: """ - Returns table of available metrics used for CV. + Returns table of available metrics used in the experiment. Example @@ -2463,7 +2490,7 @@ def add_metric( ) -> pd.Series: """ - Adds a custom metric to be used for CV. + Adds a custom metric to be used in the experiment. Example @@ -2513,7 +2540,7 @@ def add_metric( def remove_metric(self, name_or_id: str): """ - Removes a metric from CV. + Removes a metric from experiment. Example @@ -2533,6 +2560,7 @@ def remove_metric(self, name_or_id: str): None """ + return super().remove_metric(name_or_id=name_or_id) def get_logs( From 3612d7029a0dd7c67ad281c59dc9cdcf14a18300 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Sun, 29 May 2022 21:31:48 +0000 Subject: [PATCH 04/20] set_current_experiment & *_config --- pycaret/anomaly/functional.py | 70 +++++------------- pycaret/clustering/functional.py | 71 +++++-------------- pycaret/regression/functional.py | 9 +++ pycaret/time_series/forecasting/functional.py | 9 +++ 4 files changed, 52 insertions(+), 107 deletions(-) diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py index c7f4c4601..170f6e8ef 100644 --- a/pycaret/anomaly/functional.py +++ b/pycaret/anomaly/functional.py @@ -1160,39 +1160,17 @@ def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.Da def get_config(variable: str): """ - This function retrieves the global variables created when initializing the - ``setup`` function. Following variables are accessible: - - - dataset: Transformed dataset - - train: Transformed training set - - test: Transformed test set - - X: Transformed feature set - - y: Transformed target column - - X_train, X_test, y_train, y_test: Subsets of the train and test sets. - - seed: random state set through session_id - - pipeline: Transformation pipeline configured through setup - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - gpu_param: use_gpu param configured through setup - + This function is used to access global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> anomaly = get_data('anomaly') - >>> from pycaret.anomaly import * - >>> exp_name = setup(data = anomaly) - >>> X = get_config('X') + >>> X_train = get_config('X_train') + This will return X_train transformed dataset. - Returns: - Global variable + Returns + ------- + variable """ @@ -1203,38 +1181,15 @@ def get_config(variable: str): def set_config(variable: str, value): """ - This function resets the global variables. 
Following variables are - accessible: - - - X: Transformed dataset (X) - - data_before_preprocess: data before preprocessing - - seed: random state set through session_id - - prep_pipe: Transformation pipeline configured through setup - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - gpu_param: use_gpu param configured through setup - + This function is used to reset global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> anomaly = get_data('anomaly') - >>> from pycaret.anomaly import * - >>> exp_name = setup(data = anomaly) >>> set_config('seed', 123) - - Returns: - None + This will set the global seed to '123'. """ - return _CURRENT_EXPERIMENT.set_config(variable=variable, value=value) @@ -1397,6 +1352,15 @@ def get_outliers( def set_current_experiment(experiment: AnomalyExperiment): + """ + Set the current experiment to be used with the functional API. + + experiment: AnomalyExperiment + Experiment object to use. + + Returns: + None + """ global _CURRENT_EXPERIMENT if not isinstance(experiment, AnomalyExperiment): diff --git a/pycaret/clustering/functional.py b/pycaret/clustering/functional.py index 8f26c36a9..bc88eb4bb 100644 --- a/pycaret/clustering/functional.py +++ b/pycaret/clustering/functional.py @@ -1344,40 +1344,17 @@ def get_logs(experiment_name: Optional[str] = None, save: bool = False) -> pd.Da def get_config(variable: str): """ - This function retrieves the global variables created when initializing the - ``setup`` function. Following variables are accessible: - - - dataset: Transformed dataset - - train: Transformed training set - - test: Transformed test set - - X: Transformed feature set - - y: Transformed target column - - X_train, X_test, y_train, y_test: Subsets of the train and test sets. - - seed: random state set through session_id - - pipeline: Transformation pipeline configured through setup - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - gpu_param: use_gpu param configured through setup - + This function is used to access global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> jewellery = get_data('jewellery') - >>> from pycaret.clustering import * - >>> exp_name = setup(data = jewellery) - >>> X = get_config('X') - + >>> X_train = get_config('X_train') - Returns: - Global variable + This will return X_train transformed dataset. + Returns + ------- + variable """ @@ -1388,38 +1365,15 @@ def get_config(variable: str): def set_config(variable: str, value): """ - This function resets the global variables. 
Following variables are - accessible: - - - X: Transformed dataset (X) - - data_before_preprocess: data before preprocessing - - seed: random state set through session_id - - prep_pipe: Transformation pipeline configured through setup - - n_jobs_param: n_jobs parameter used in model training - - html_param: html_param configured through setup - - master_model_container: model storage container - - display_container: results display container - - exp_name_log: Name of experiment set through setup - - logging_param: log_experiment param set through setup - - log_plots_param: log_plots param set through setup - - USI: Unique session ID parameter set through setup - - gpu_param: use_gpu param configured through setup - + This function is used to reset global environment variables. Example ------- - >>> from pycaret.datasets import get_data - >>> jewellery = get_data('jewellery') - >>> from pycaret.clustering import * - >>> exp_name = setup(data = jewellery) >>> set_config('seed', 123) - - Returns: - None + This will set the global seed to '123'. """ - return _CURRENT_EXPERIMENT.set_config(variable=variable, value=value) @@ -1586,6 +1540,15 @@ def get_clusters( def set_current_experiment(experiment: ClusteringExperiment): + """ + Set the current experiment to be used with the functional API. + + experiment: ClusteringExperiment + Experiment object to use. + + Returns: + None + """ global _CURRENT_EXPERIMENT if not isinstance(experiment, ClusteringExperiment): diff --git a/pycaret/regression/functional.py b/pycaret/regression/functional.py index be58a77d4..b5980b4cb 100644 --- a/pycaret/regression/functional.py +++ b/pycaret/regression/functional.py @@ -2851,6 +2851,15 @@ def deep_check(estimator, check_kwargs: Optional[dict] = None) -> None: def set_current_experiment(experiment: RegressionExperiment): + """ + Set the current experiment to be used with the functional API. + + experiment: RegressionExperiment + Experiment object to use. + + Returns: + None + """ global _CURRENT_EXPERIMENT if not isinstance(experiment, RegressionExperiment): diff --git a/pycaret/time_series/forecasting/functional.py b/pycaret/time_series/forecasting/functional.py index 5a8d294db..b1a67a824 100644 --- a/pycaret/time_series/forecasting/functional.py +++ b/pycaret/time_series/forecasting/functional.py @@ -1838,6 +1838,15 @@ def load_config(file_name: str): def set_current_experiment(experiment: TSForecastingExperiment): + """ + Set the current experiment to be used with the functional API. + + experiment: TSForecastingExperiment + Experiment object to use. 
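+
+    A minimal usage sketch (illustrative only; the ``airline`` dataset and the
+    ``TSForecastingExperiment`` import path are assumptions for this example):
+
+    >>> from pycaret.datasets import get_data
+    >>> from pycaret.time_series import TSForecastingExperiment
+    >>> exp = TSForecastingExperiment()
+    >>> exp.setup(data = get_data('airline'), fh = 12)
+    >>> set_current_experiment(exp)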
+ + Returns: + None + """ global _CURRENT_EXPERIMENT if not isinstance(experiment, TSForecastingExperiment): From f03c7e202bc871c2013e9fe52af1a9a857db2690 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 30 May 2022 17:04:01 +0000 Subject: [PATCH 05/20] Clustering docstrings --- pycaret/anomaly/__init__.py | 4 +- pycaret/anomaly/functional.py | 131 +--- pycaret/clustering/__init__.py | 2 - pycaret/clustering/functional.py | 133 +--- pycaret/clustering/oop.py | 193 +++++- .../unsupervised_experiment.py | 589 ++++++++++++++++-- 6 files changed, 738 insertions(+), 314 deletions(-) diff --git a/pycaret/anomaly/__init__.py b/pycaret/anomaly/__init__.py index 9e2b85f9d..f70d63804 100644 --- a/pycaret/anomaly/__init__.py +++ b/pycaret/anomaly/__init__.py @@ -5,12 +5,12 @@ evaluate_model, get_config, get_logs, - get_outliers, load_config, load_model, models, plot_model, predict_model, + pull, save_config, save_model, set_config, @@ -32,12 +32,12 @@ "deploy_model", "save_model", "load_model", + "pull", "models", "get_logs", "get_config", "set_config", "save_config", "load_config", - "get_outliers", "set_current_experiment", ] diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py index 170f6e8ef..f8db04537 100644 --- a/pycaret/anomaly/functional.py +++ b/pycaret/anomaly/functional.py @@ -1087,6 +1087,26 @@ def load_model( ) +@check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) +def pull(pop: bool = False) -> pd.DataFrame: + """ + Returns the latest displayed table. + + Parameters + ---------- + pop : bool, default = False + If true, will pop (remove) the returned dataframe from the + display container. + + Returns + ------- + pandas.DataFrame + Equivalent to get_config('display_container')[-1] + + """ + return _CURRENT_EXPERIMENT.pull(pop=pop) + + @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def models( internal: bool = False, @@ -1240,117 +1260,6 @@ def load_config(file_name: str): return _CURRENT_EXPERIMENT.load_config(file_name=file_name) -def get_outliers( - data, - model: Union[str, Any] = "knn", - fraction: float = 0.05, - fit_kwargs: Optional[dict] = None, - preprocess: bool = True, - imputation_type: str = "simple", - iterative_imputation_iters: int = 5, - categorical_features: Optional[List[str]] = None, - categorical_imputation: str = "mode", - categorical_iterative_imputer: Union[str, Any] = "lightgbm", - ordinal_features: Optional[Dict[str, list]] = None, - high_cardinality_features: Optional[List[str]] = None, - high_cardinality_method: str = "frequency", - numeric_features: Optional[List[str]] = None, - numeric_imputation: str = "mean", # method 'zero' added in pycaret==2.1 - numeric_iterative_imputer: Union[str, Any] = "lightgbm", - date_features: Optional[List[str]] = None, - ignore_features: Optional[List[str]] = None, - normalize: bool = False, - normalize_method: str = "zscore", - transformation: bool = False, - transformation_method: str = "yeo-johnson", - handle_unknown_categorical: bool = True, - unknown_categorical_method: str = "least_frequent", - pca: bool = False, - pca_method: str = "linear", - pca_components: Union[int, float] = 1.0, - low_variance_threshold: float = 0, - combine_rare_levels: bool = False, - rare_level_threshold: float = 0.10, - bin_numeric_features: Optional[List[str]] = None, - remove_multicollinearity: bool = False, - multicollinearity_threshold: float = 0.9, - remove_perfect_collinearity: bool = False, - group_features: Optional[List[str]] = None, - group_names: 
Optional[List[str]] = None, - n_jobs: Optional[int] = -1, - session_id: Optional[int] = None, - system_log: Union[bool, str, logging.Logger] = True, - log_experiment: Union[bool, str, BaseLogger, List[Union[str, BaseLogger]]] = False, - experiment_name: Optional[str] = None, - log_plots: Union[bool, list] = False, - log_profile: bool = False, - log_data: bool = False, - profile: bool = False, - **kwargs, -) -> pd.DataFrame: - - """ - Callable from any external environment without requiring setup initialization. - """ - exp = _EXPERIMENT_CLASS() - exp.setup( - data=data, - preprocess=preprocess, - imputation_type=imputation_type, - iterative_imputation_iters=iterative_imputation_iters, - categorical_features=categorical_features, - categorical_imputation=categorical_imputation, - categorical_iterative_imputer=categorical_iterative_imputer, - ordinal_features=ordinal_features, - high_cardinality_features=high_cardinality_features, - high_cardinality_method=high_cardinality_method, - numeric_features=numeric_features, - numeric_imputation=numeric_imputation, - numeric_iterative_imputer=numeric_iterative_imputer, - date_features=date_features, - ignore_features=ignore_features, - normalize=normalize, - normalize_method=normalize_method, - transformation=transformation, - transformation_method=transformation_method, - handle_unknown_categorical=handle_unknown_categorical, - unknown_categorical_method=unknown_categorical_method, - pca=pca, - pca_method=pca_method, - pca_components=pca_components, - low_variance_threshold=low_variance_threshold, - combine_rare_levels=combine_rare_levels, - rare_level_threshold=rare_level_threshold, - bin_numeric_features=bin_numeric_features, - remove_multicollinearity=remove_multicollinearity, - multicollinearity_threshold=multicollinearity_threshold, - remove_perfect_collinearity=remove_perfect_collinearity, - group_features=group_features, - group_names=group_names, - n_jobs=n_jobs, - html=False, - session_id=session_id, - system_log=system_log, - log_experiment=log_experiment, - experiment_name=experiment_name, - log_plots=log_plots, - log_profile=log_profile, - log_data=log_data, - silent=True, - verbose=False, - profile=profile, - ) - - c = exp.create_model( - model=model, - fraction=fraction, - fit_kwargs=fit_kwargs, - verbose=False, - **kwargs, - ) - return exp.assign_model(c, verbose=False) - - def set_current_experiment(experiment: AnomalyExperiment): """ Set the current experiment to be used with the functional API. diff --git a/pycaret/clustering/__init__.py b/pycaret/clustering/__init__.py index 02fb7c1b0..889558409 100644 --- a/pycaret/clustering/__init__.py +++ b/pycaret/clustering/__init__.py @@ -4,7 +4,6 @@ create_model, deploy_model, evaluate_model, - get_clusters, get_config, get_logs, get_metrics, @@ -46,6 +45,5 @@ "set_config", "save_config", "load_config", - "get_clusters", "set_current_experiment", ] diff --git a/pycaret/clustering/functional.py b/pycaret/clustering/functional.py index bc88eb4bb..6de8d94c5 100644 --- a/pycaret/clustering/functional.py +++ b/pycaret/clustering/functional.py @@ -1123,17 +1123,18 @@ def load_model( @check_if_global_is_not_none(globals(), _CURRENT_EXPERIMENT_DECORATOR_DICT) def pull(pop: bool = False) -> pd.DataFrame: """ - Returns last printed score grid. Use ``pull`` function after - any training function to store the score grid in pandas.DataFrame. + Returns the latest displayed table. 
- - pop: bool, default = False - If True, will pop (remove) the returned dataframe from the + Parameters + ---------- + pop : bool, default = False + If true, will pop (remove) the returned dataframe from the display container. - Returns: - pandas.DataFrame - + Returns + ------- + pandas.DataFrame + Equivalent to get_config('display_container')[-1] """ return _CURRENT_EXPERIMENT.pull(pop=pop) @@ -1303,6 +1304,7 @@ def remove_metric(name_or_id: str): None """ + return _CURRENT_EXPERIMENT.remove_metric(name_or_id=name_or_id) @@ -1424,121 +1426,6 @@ def load_config(file_name: str): return _CURRENT_EXPERIMENT.load_config(file_name=file_name) -def get_clusters( - data, - model: Union[str, Any] = "kmeans", - num_clusters: int = 4, - ground_truth: Optional[str] = None, - round: int = 4, - fit_kwargs: Optional[dict] = None, - preprocess: bool = True, - imputation_type: str = "simple", - iterative_imputation_iters: int = 5, - categorical_features: Optional[List[str]] = None, - categorical_imputation: str = "mode", - categorical_iterative_imputer: Union[str, Any] = "lightgbm", - ordinal_features: Optional[Dict[str, list]] = None, - high_cardinality_features: Optional[List[str]] = None, - high_cardinality_method: str = "frequency", - numeric_features: Optional[List[str]] = None, - numeric_imputation: str = "mean", # method 'zero' added in pycaret==2.1 - numeric_iterative_imputer: Union[str, Any] = "lightgbm", - date_features: Optional[List[str]] = None, - ignore_features: Optional[List[str]] = None, - normalize: bool = False, - normalize_method: str = "zscore", - transformation: bool = False, - transformation_method: str = "yeo-johnson", - handle_unknown_categorical: bool = True, - unknown_categorical_method: str = "least_frequent", - pca: bool = False, - pca_method: str = "linear", - pca_components: Union[int, float] = 1.0, - low_variance_threshold: float = 0, - combine_rare_levels: bool = False, - rare_level_threshold: float = 0.10, - bin_numeric_features: Optional[List[str]] = None, - remove_multicollinearity: bool = False, - multicollinearity_threshold: float = 0.9, - remove_perfect_collinearity: bool = False, - group_features: Optional[List[str]] = None, - group_names: Optional[List[str]] = None, - n_jobs: Optional[int] = -1, - session_id: Optional[int] = None, - system_log: Union[bool, str, logging.Logger] = True, - log_experiment: Union[bool, str, BaseLogger, List[Union[str, BaseLogger]]] = False, - experiment_name: Optional[str] = None, - log_plots: Union[bool, list] = False, - log_profile: bool = False, - log_data: bool = False, - profile: bool = False, - **kwargs, -) -> pd.DataFrame: - - """ - Callable from any external environment without requiring setup initialization. 
-    """
-    exp = _EXPERIMENT_CLASS()
-    exp.setup(
-        data=data,
-        preprocess=preprocess,
-        imputation_type=imputation_type,
-        iterative_imputation_iters=iterative_imputation_iters,
-        categorical_features=categorical_features,
-        categorical_imputation=categorical_imputation,
-        categorical_iterative_imputer=categorical_iterative_imputer,
-        ordinal_features=ordinal_features,
-        high_cardinality_features=high_cardinality_features,
-        high_cardinality_method=high_cardinality_method,
-        numeric_features=numeric_features,
-        numeric_imputation=numeric_imputation,
-        numeric_iterative_imputer=numeric_iterative_imputer,
-        date_features=date_features,
-        ignore_features=ignore_features,
-        normalize=normalize,
-        normalize_method=normalize_method,
-        transformation=transformation,
-        transformation_method=transformation_method,
-        handle_unknown_categorical=handle_unknown_categorical,
-        unknown_categorical_method=unknown_categorical_method,
-        pca=pca,
-        pca_method=pca_method,
-        pca_components=pca_components,
-        low_variance_threshold=low_variance_threshold,
-        combine_rare_levels=combine_rare_levels,
-        rare_level_threshold=rare_level_threshold,
-        bin_numeric_features=bin_numeric_features,
-        remove_multicollinearity=remove_multicollinearity,
-        multicollinearity_threshold=multicollinearity_threshold,
-        remove_perfect_collinearity=remove_perfect_collinearity,
-        group_features=group_features,
-        group_names=group_names,
-        n_jobs=n_jobs,
-        html=False,
-        session_id=session_id,
-        system_log=system_log,
-        log_experiment=log_experiment,
-        experiment_name=experiment_name,
-        log_plots=log_plots,
-        log_profile=log_profile,
-        log_data=log_data,
-        silent=True,
-        verbose=False,
-        profile=profile,
-    )
-
-    c = exp.create_model(
-        model=model,
-        num_clusters=num_clusters,
-        ground_truth=ground_truth,
-        round=round,
-        fit_kwargs=fit_kwargs,
-        verbose=False,
-        **kwargs,
-    )
-    return exp.assign_model(c, verbose=False)
-
-
 def set_current_experiment(experiment: ClusteringExperiment):
+    """
+    Set the current experiment to be used with the functional API.
+
+    experiment: ClusteringExperiment
+        Experiment object to use.
+
+    Returns:
+        None
+    """
     global _CURRENT_EXPERIMENT
     if not isinstance(experiment, ClusteringExperiment):
diff --git a/pycaret/clustering/oop.py b/pycaret/clustering/oop.py
index b3aae91ff..c5c94ab86 100644
--- a/pycaret/clustering/oop.py
+++ b/pycaret/clustering/oop.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import Any, List, Optional, Tuple, Union
 
 import numpy as np  # type: ignore
 import pandas as pd  # type ignore
@@ -56,6 +56,140 @@ def _get_metrics(self, raise_errors: bool = True) -> dict:
     def _get_default_plots_to_log(self) -> List[str]:
         return ["cluster", "distribution", "elbow"]
 
+    def predict_model(
+        self, estimator, data: pd.DataFrame, ml_usecase: Optional[MLUsecase] = None
+    ) -> pd.DataFrame:
+        """
+        This function generates cluster labels using a trained model.
+
+        Example
+        -------
+        >>> from pycaret.datasets import get_data
+        >>> jewellery = get_data('jewellery')
+        >>> from pycaret.clustering import *
+        >>> exp_name = setup(data = jewellery)
+        >>> kmeans = create_model('kmeans')
+        >>> kmeans_predictions = predict_model(model = kmeans, data = unseen_data)
+
+
+        model: scikit-learn compatible object
+            Trained Model Object.
+
+
+        data : pandas.DataFrame
+            Shape (n_samples, n_features) where n_samples is the number of samples and
+            n_features is the number of features.
+
+
+        Returns:
+            pandas.DataFrame
+
+
+        Warnings
+        --------
+        - Models that do not support the 'predict' method cannot be used with ``predict_model``.
+
+        - The behavior of the predict_model is changed in version 2.1 without backward compatibility.
+ As such, the pipelines trained using the version (<= 2.0), may not work for inference + with version >= 2.1. You can either retrain your models with a newer version or downgrade + the version for inference. + + + """ + + return super().predict_model(estimator, data, ml_usecase) + + def plot_model( + self, + estimator, + plot: str = "auc", + scale: float = 1, + save: Union[str, bool] = False, + fold: Optional[Union[int, Any]] = None, + fit_kwargs: Optional[dict] = None, + plot_kwargs: Optional[dict] = None, + groups: Optional[Union[str, Any]] = None, + feature_name: Optional[str] = None, + label: bool = False, + use_train_data: bool = False, + verbose: bool = True, + display_format: Optional[str] = None, + ) -> str: + """ + This function analyzes the performance of a trained model. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> jewellery = get_data('jewellery') + >>> from pycaret.clustering import * + >>> exp_name = setup(data = jewellery) + >>> kmeans = create_model('kmeans') + >>> plot_model(kmeans, plot = 'cluster') + + + model: scikit-learn compatible object + Trained Model Object + + + plot: str, default = 'cluster' + List of available plots (ID - Name): + + * 'cluster' - Cluster PCA Plot (2d) + * 'tsne' - Cluster t-SNE (3d) + * 'elbow' - Elbow Plot + * 'silhouette' - Silhouette Plot + * 'distance' - Distance Plot + * 'distribution' - Distribution Plot + + + feature: str, default = None + Feature to be evaluated when plot = 'distribution'. When ``plot`` type is + 'cluster' or 'tsne' feature column is used as a hoverover tooltip and/or + label when the ``label`` param is set to True. When the ``plot`` type is + 'cluster' or 'tsne' and feature is None, first column of the dataset is + used. + + + label: bool, default = False + Name of column to be used as data labels. Ignored when ``plot`` is not + 'cluster' or 'tsne'. + + + scale: float, default = 1 + The resolution scale of the figure. + + + save: bool, default = False + When set to True, plot is saved in the current working directory. + + + display_format: str, default = None + To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. + Currently, not all plots are supported. + + + Returns: + None + + """ + return super().plot_model( + estimator, + plot, + scale, + save, + fold, + fit_kwargs, + plot_kwargs, + groups, + feature_name, + label, + use_train_data, + verbose, + display_format, + ) + def get_metrics( self, reset: bool = False, @@ -65,6 +200,7 @@ def get_metrics( """ Returns table of metrics available. + Example ------- >>> from pycaret.datasets import get_data @@ -73,22 +209,22 @@ def get_metrics( >>> exp_name = setup(data = jewellery) >>> all_metrics = get_metrics() - This will return pandas dataframe with all available - metrics and their metadata. - Parameters - ---------- reset: bool, default = False If True, will reset all changes made using add_metric() and get_metric(). + + include_custom: bool, default = True Whether to include user added (custom) metrics or not. + + raise_errors: bool, default = True If False, will suppress all exceptions, ignoring models that couldn't be created. - Returns - ------- - pandas.DataFrame + + Returns: + pandas.DataFrame """ @@ -124,38 +260,42 @@ def add_metric( """ Adds a custom metric to be used in all functions. - Parameters - ---------- + id: str Unique id for the metric. + name: str Display name of the metric. + score_func: type - Score function (or loss function) with signature score_func(y, y_pred, **kwargs). 
+ Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``. + target: str, default = 'pred' The target of the score function. + - 'pred' for the prediction table - 'pred_proba' for pred_proba - 'threshold' for decision_function or predict_proba + greater_is_better: bool, default = True Whether score_func is a score function (default), meaning high is good, or a loss function, meaning low is good. In the latter case, the scorer object will sign-flip the outcome of the score_func. - needs_ground_truth: bool, default = False - Whether the metric needs ground truth to be calculated. + + multiclass: bool, default = True + Whether the metric supports multiclass problems. + **kwargs: Arguments to be passed to score function. - Returns - ------- - pandas.Series - The created row as Series. + Returns: + pandas.Series """ @@ -186,14 +326,27 @@ def add_metric( def remove_metric(self, name_or_id: str): """ - Removes a metric used in all functions. + Removes a metric used for evaluation. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> jewellery = get_data('jewellery') + >>> from pycaret.clustering import * + >>> exp_name = setup(data = jewellery) + >>> remove_metric('cs') + - Parameters - ---------- name_or_id: str Display name or ID of the metric. + + Returns: + None + """ + if not self._setup_ran: raise ValueError("setup() needs to be ran first.") diff --git a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py index 00186da82..d64126cd7 100644 --- a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py @@ -140,6 +140,314 @@ def setup( profile: bool = False, profile_kwargs: Dict[str, Any] = None, ): + """ + + This function initializes the training environment and creates the transformation + pipeline. Setup function must be called before executing any other function. It + takes one mandatory parameter: ``data``. All the other parameters are optional. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> jewellery = get_data('jewellery') + >>> from pycaret.clustering import * + >>> exp_name = setup(data = jewellery) + + + data: dataframe-like + Data set with shape (n_samples, n_features), where n_samples is the + number of samples and n_features is the number of features. If data + is not a pandas dataframe, it's converted to one using default column + names. + + + ordinal_features: dict, default = None + Categorical features to be encoded ordinally. For example, a categorical + feature with 'low', 'medium', 'high' values where low < medium < high can + be passed as ordinal_features = {'column_name' : ['low', 'medium', 'high']}. + + + numeric_features: list of str, default = None + If the inferred data types are not correct, the numeric_features param can + be used to define the data types. It takes a list of strings with column + names that are numeric. + + + categorical_features: list of str, default = None + If the inferred data types are not correct, the categorical_features param + can be used to define the data types. It takes a list of strings with column + names that are categorical. + + + date_features: list of str, default = None + If the inferred data types are not correct, the date_features param can be + used to overwrite the data types. It takes a list of strings with column + names that are DateTime. 
+
+        text_features: list of str, default = None
+            Column names that contain a text corpus. If None, no text features are
+            selected.
+
+
+        ignore_features: list of str, default = None
+            ignore_features param can be used to ignore features during preprocessing
+            and model training. It takes a list of strings with column names that are
+            to be ignored.
+
+
+        keep_features: list of str, default = None
+            keep_features param can be used to always keep specific features during
+            preprocessing, i.e. these features are never dropped by any kind of
+            feature selection. It takes a list of strings with column names that are
+            to be kept.
+
+
+        preprocess: bool, default = True
+            When set to False, no transformations are applied except for train_test_split
+            and custom transformations passed in ``custom_pipeline`` param. Data must be
+            ready for modeling (no missing values, no dates, categorical data encoding)
+            when preprocess is set to False.
+
+
+        imputation_type: str or None, default = 'simple'
+            The type of imputation to use. Can be either 'simple' or 'iterative'.
+            If None, no imputation of missing values is performed.
+
+
+        numeric_imputation: str, default = 'mean'
+            Missing values in numeric features are imputed with the 'mean' value of the
+            feature in the training dataset. The other available options are 'median'
+            and 'zero'.
+
+
+        categorical_imputation: str, default = 'constant'
+            Missing values in categorical features are imputed with a constant 'not_available'
+            value. The other available option is 'mode'.
+
+
+        text_features_method: str, default = "tf-idf"
+            Method with which to embed the text features in the dataset. Choose
+            between "bow" (Bag of Words - CountVectorizer) or "tf-idf" (TfidfVectorizer).
+            Be aware that the sparse matrix output of the transformer is converted
+            internally to its full array. This can cause memory issues for large
+            text embeddings.
+
+
+        max_encoding_ohe: int, default = 5
+            Categorical columns with `max_encoding_ohe` or fewer unique values are
+            encoded using OneHotEncoding. If more, the `encoding_method` estimator
+            is used. Note that columns with exactly two classes are always encoded
+            ordinally.
+
+
+        encoding_method: category-encoders estimator, default = None
+            A `category-encoders` estimator to encode the categorical columns
+            with more than `max_encoding_ohe` unique values. If None,
+            `category_encoders.leave_one_out.LeaveOneOutEncoder` is used.
+
+
+        polynomial_features: bool, default = False
+            When set to True, new features are derived using existing numeric features.
+
+
+        polynomial_degree: int, default = 2
+            Degree of polynomial features. For example, if an input sample is two dimensional
+            and of the form [a, b], the polynomial features with degree = 2 are:
+            [1, a, b, a^2, ab, b^2]. Ignored when ``polynomial_features`` is not True.
+
+
+        low_variance_threshold: float or None, default = 0
+            Remove features with a training-set variance lower than the provided
+            threshold. The default is to keep all features with non-zero variance,
+            i.e. remove the features that have the same value in all samples. If
+            None, skip this transformation step.
+
+
+        remove_multicollinearity: bool, default = False
+            When set to True, features with the inter-correlations higher than the defined
+            threshold are removed. When two features are highly correlated with each other,
+            the feature that is less correlated with the target variable is removed. Only
+            considers numeric features.
+
+        multicollinearity_threshold: float, default = 0.9
+            Threshold for correlated features. Ignored when ``remove_multicollinearity``
+            is not True.
+
+
+        bin_numeric_features: list of str, default = None
+            To convert numeric features into categorical, bin_numeric_features parameter can
+            be used. It takes a list of strings with column names to be discretized. It does
+            so by using 'sturges' rule to determine the number of clusters and then applying
+            the KMeans algorithm. Original values of the feature are then replaced by the
+            cluster label.
+
+
+        remove_outliers: bool, default = False
+            When set to True, outliers from the training data are removed using an
+            Isolation Forest.
+
+
+        outliers_method: str, default = "iforest"
+            Method with which to remove outliers. Possible values are:
+                - 'iforest': Uses sklearn's IsolationForest.
+                - 'ee': Uses sklearn's EllipticEnvelope.
+                - 'lof': Uses sklearn's LocalOutlierFactor.
+
+
+        outliers_threshold: float, default = 0.05
+            The percentage of outliers to be removed from the dataset. Ignored
+            when ``remove_outliers=False``.
+
+
+        transformation: bool, default = False
+            When set to True, it applies the power transform to make data more Gaussian-like.
+            Type of transformation is defined by the ``transformation_method`` parameter.
+
+
+        transformation_method: str, default = 'yeo-johnson'
+            Defines the method for transformation. By default, the transformation method is
+            set to 'yeo-johnson'. The other available option for transformation is 'quantile'.
+            Ignored when ``transformation`` is not True.
+
+
+        normalize: bool, default = False
+            When set to True, it transforms the features by scaling them to a given
+            range. Type of scaling is defined by the ``normalize_method`` parameter.
+
+
+        normalize_method: str, default = 'zscore'
+            Defines the method for scaling. By default, normalize method is set to 'zscore'.
+            The standard zscore is calculated as z = (x - u) / s. Ignored when ``normalize``
+            is not True. The other options are:
+
+            - minmax: scales and translates each feature individually such that it is in
+              the range of 0 - 1.
+            - maxabs: scales and translates each feature individually such that the
+              maximal absolute value of each feature will be 1.0. It does not
+              shift/center the data, and thus does not destroy any sparsity.
+            - robust: scales and translates each feature according to the Interquartile
+              range. When the dataset contains outliers, robust scaler often gives
+              better results.
+
+
+        pca: bool, default = False
+            When set to True, dimensionality reduction is applied to project the data into
+            a lower dimensional space using the method defined in ``pca_method`` parameter.
+
+
+        pca_method: str, default = 'linear'
+            Method with which to apply PCA. Possible values are:
+                - 'linear': Uses Singular Value Decomposition.
+                - 'kernel': Dimensionality reduction through the use of RBF kernel.
+                - 'incremental': Similar to 'linear', but more efficient for large datasets.
+
+
+        pca_components: int or float, default = 1.0
+            Number of components to keep. If >1, it selects that number of
+            components. If <= 1, it selects that fraction of components from
+            the original features. The value must be smaller than the number
+            of original features. This parameter is ignored when `pca=False`.
+            A combined usage sketch appears at the end of this docstring.
+
+
+        custom_pipeline: list of (str, transformer), dict or Pipeline, default = None
+            Additional custom transformers. If passed, they are applied to the
+            pipeline last, after all the built-in transformers.
+
+        n_jobs: int, default = -1
+            The number of jobs to run in parallel (for functions that support parallel
+            processing). -1 means using all processors. To run all functions on a single
+            processor, set n_jobs to None.
+
+
+        use_gpu: bool or str, default = False
+            When set to True, it will use GPU for training with algorithms that support it,
+            and fall back to CPU if they are unavailable. When set to 'force', it will only
+            use GPU-enabled algorithms and raise exceptions when they are unavailable. When
+            False, all algorithms are trained using CPU only.
+
+            GPU enabled algorithms:
+
+            - None at this moment.
+
+
+        html: bool, default = True
+            When set to False, prevents runtime display of the monitor. This must be set to False
+            when the environment does not support IPython. For example, command line terminal,
+            Databricks Notebook, Spyder and other similar IDEs.
+
+
+        session_id: int, default = None
+            Controls the randomness of the experiment. It is equivalent to 'random_state' in
+            scikit-learn. When None, a pseudo random number is generated. This can be used
+            for later reproducibility of the entire experiment.
+
+
+        system_log: bool or str or logging.Logger, default = True
+            Whether to save the system logging file (as logs.log). If the input
+            is a string, use that as the path to the logging file. If the input
+            already is a logger object, use that one instead.
+
+
+        log_experiment: bool, default = False
+            A (list of) PyCaret ``BaseLogger`` or str (one of 'mlflow', 'wandb')
+            corresponding to a logger to determine which experiment loggers to use.
+            Setting to True will use just MLFlow.
+            If ``wandb`` (Weights & Biases) is installed, will also log there.
+
+
+        experiment_name: str, default = None
+            Name of the experiment for logging. Ignored when ``log_experiment`` is False.
+
+
+        experiment_custom_tags: dict, default = None
+            Dictionary of tag_name: String -> value: (String, but will be string-ified
+            if not) passed to the mlflow.set_tags to add new custom tags for the experiment.
+
+
+        log_plots: bool or list, default = False
+            When set to True, certain plots are logged automatically in the ``MLFlow`` server.
+            To change the type of plots to be logged, pass a list containing plot IDs. Refer
+            to documentation of ``plot_model``. Ignored when ``log_experiment`` is False.
+
+
+        log_profile: bool, default = False
+            When set to True, data profile is logged on the ``MLflow`` server as an html file.
+            Ignored when ``log_experiment`` is False.
+
+
+        log_data: bool, default = False
+            When set to True, dataset is logged on the ``MLflow`` server as a csv file.
+            Ignored when ``log_experiment`` is False.
+
+
+        verbose: bool, default = True
+            When set to False, Information grid is not printed.
+
+
+        memory: str, bool or Memory, default=True
+            Used to cache the fitted transformers of the pipeline.
+            If False: No caching is performed.
+            If True: A default temp directory is used.
+            If str: Path to the caching directory.
+
+
+        profile: bool, default = False
+            When set to True, an interactive EDA report is displayed.
+
+
+        profile_kwargs: dict, default = {} (empty dict)
+            Dictionary of arguments passed to the ProfileReport method used
+            to create the EDA report. Ignored if ``profile`` is False.
+
+
+        Returns:
+            Global variables that can be changed using the ``set_config`` function.
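+
+
+        A slightly fuller call, as a sketch (the preprocessing choices below are
+        illustrative assumptions, not defaults):
+
+        >>> exp_name = setup(data = jewellery, normalize = True,
+        ...                  normalize_method = 'robust',
+        ...                  pca = True, pca_components = 0.8)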
+
+        """
+
         # Setup initialization ===================================== >>
 
         runtime_start = time.time()
@@ -373,6 +681,122 @@ def tune_model(
         verbose: bool = True,
         **kwargs,
     ):
+        """
+        This function tunes the ``num_clusters`` parameter of a given model.
+
+
+        Example
+        -------
+        >>> from pycaret.datasets import get_data
+        >>> juice = get_data('juice')
+        >>> from pycaret.clustering import *
+        >>> exp_name = setup(data = juice)
+        >>> tuned_kmeans = tune_model(model = 'kmeans', supervised_target = 'Purchase')
+
+
+        model: str
+            ID of a model available in the model library. Models that can be
+            tuned in this function (ID - Model):
+
+            * 'kmeans' - K-Means Clustering
+            * 'sc' - Spectral Clustering
+            * 'hclust' - Agglomerative Clustering
+            * 'birch' - Birch Clustering
+            * 'kmodes' - K-Modes Clustering
+
+
+        supervised_target: str
+            Name of the target column containing labels.
+
+
+        supervised_type: str, default = None
+            Type of task. 'classification' or 'regression'. Automatically inferred
+            when None.
+
+
+        supervised_estimator: str, default = None
+            Classification (ID - Name):
+                * 'lr' - Logistic Regression (Default)
+                * 'knn' - K Nearest Neighbour
+                * 'nb' - Naive Bayes
+                * 'dt' - Decision Tree Classifier
+                * 'svm' - SVM - Linear Kernel
+                * 'rbfsvm' - SVM - Radial Kernel
+                * 'gpc' - Gaussian Process Classifier
+                * 'mlp' - Multi Level Perceptron
+                * 'ridge' - Ridge Classifier
+                * 'rf' - Random Forest Classifier
+                * 'qda' - Quadratic Discriminant Analysis
+                * 'ada' - Ada Boost Classifier
+                * 'gbc' - Gradient Boosting Classifier
+                * 'lda' - Linear Discriminant Analysis
+                * 'et' - Extra Trees Classifier
+                * 'xgboost' - Extreme Gradient Boosting
+                * 'lightgbm' - Light Gradient Boosting
+                * 'catboost' - CatBoost Classifier
+
+            Regression (ID - Name):
+                * 'lr' - Linear Regression (Default)
+                * 'lasso' - Lasso Regression
+                * 'ridge' - Ridge Regression
+                * 'en' - Elastic Net
+                * 'lar' - Least Angle Regression
+                * 'llar' - Lasso Least Angle Regression
+                * 'omp' - Orthogonal Matching Pursuit
+                * 'br' - Bayesian Ridge
+                * 'ard' - Automatic Relevance Determ.
+                * 'par' - Passive Aggressive Regressor
+                * 'ransac' - Random Sample Consensus
+                * 'tr' - TheilSen Regressor
+                * 'huber' - Huber Regressor
+                * 'kr' - Kernel Ridge
+                * 'svm' - Support Vector Machine
+                * 'knn' - K Neighbors Regressor
+                * 'dt' - Decision Tree
+                * 'rf' - Random Forest
+                * 'et' - Extra Trees Regressor
+                * 'ada' - AdaBoost Regressor
+                * 'gbr' - Gradient Boosting
+                * 'mlp' - Multi Level Perceptron
+                * 'xgboost' - Extreme Gradient Boosting
+                * 'lightgbm' - Light Gradient Boosting
+                * 'catboost' - CatBoost Regressor
+
+
+        optimize: str, default = None
+            For Classification tasks:
+                Accuracy, AUC, Recall, Precision, F1, Kappa (default = 'Accuracy')
+
+            For Regression tasks:
+                MAE, MSE, RMSE, R2, RMSLE, MAPE (default = 'R2')
+
+
+        custom_grid: list, default = None
+            By default, a pre-defined number of clusters is iterated over to
+            optimize the supervised objective. To overwrite the default iterations,
+            pass a list of num_clusters to iterate over in the custom_grid param
+            (a short usage sketch appears after the Warnings section below).
+
+
+        fold: int, default = 10
+            Number of folds to be used in Kfold CV. Must be at least 2.
+
+
+        verbose: bool, default = True
+            Status update is not printed when verbose is set to False.
+
+
+        Returns:
+            Trained Model with optimized ``num_clusters`` parameter.
+
+
+        Warnings
+        --------
+        - Affinity Propagation, Mean shift, Density-Based Spatial Clustering
+          and OPTICS Clustering cannot be used in this function since they do not
+          support the ``num_clusters`` param.
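+
+
+        For instance, to search a custom grid of cluster counts (values here are
+        illustrative; assumes the ``juice`` data from the example above):
+
+        >>> tuned_kmeans = tune_model(
+        ...     model = 'kmeans', supervised_target = 'Purchase',
+        ...     custom_grid = [2, 4, 6, 8], optimize = 'AUC'
+        ... )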
+
+
+        """
 
         function_params_str = ", ".join([f"{k}={v}" for k, v in locals().items()])
@@ -734,37 +1158,34 @@ def assign_model(
     ) -> pd.DataFrame:
 
         """
-        This function assigns each of the data point in the dataset passed during setup
-        stage to one of the clusters using trained model object passed as model param.
-        create_model() function must be called before using assign_model().
+        This function assigns cluster labels to the dataset for a given model.
 
-        This function returns a pandas.DataFrame.
 
         Example
        -------
         >>> from pycaret.datasets import get_data
         >>> jewellery = get_data('jewellery')
-        >>> experiment_name = setup(data = jewellery, normalize = True)
+        >>> from pycaret.clustering import *
+        >>> exp_name = setup(data = jewellery)
         >>> kmeans = create_model('kmeans')
         >>> kmeans_df = assign_model(kmeans)
 
-        This will return a pandas.DataFrame with inferred clusters using trained model.
 
-        Parameters
-        ----------
-        model: trained model object, default = None
+
+        model: scikit-learn compatible object
+            Trained model object
+
 
         transformation: bool, default = False
-            When set to True, assigned clusters are returned on transformed dataset instead
-            of original dataset passed during setup().
+            Whether to apply cluster labels on the transformed dataset.
+
 
-        verbose: Boolean, default = True
+        verbose: bool, default = True
             Status update is not printed when verbose is set to False.
 
-        Returns
-        -------
-        pandas.DataFrame
-            Returns a DataFrame with assigned clusters using a trained model.
+
+        Returns:
+            pandas.DataFrame
 
         """
@@ -1183,29 +1604,26 @@ def create_model(
     ) -> Any:
 
         """
-        This function creates a model and scores it using Cross Validation.
-        The output prints a score grid that shows Accuracy, AUC, Recall, Precision,
-        F1, Kappa and MCC by fold (default = 10 Fold).
-
-        This function returns a trained model object.
+        This function trains and evaluates the performance of a given model.
+        Metrics evaluated can be accessed using the ``get_metrics`` function.
+        Custom metrics can be added or removed using the ``add_metric`` and
+        ``remove_metric`` function. All the available models can be accessed
+        using the ``models`` function.
 
-        setup() function must be called before using create_model()
 
         Example
         -------
         >>> from pycaret.datasets import get_data
-        >>> juice = get_data('juice')
-        >>> experiment_name = setup(data = juice, target = 'Purchase')
-        >>> lr = create_model('lr')
+        >>> jewellery = get_data('jewellery')
+        >>> from pycaret.clustering import *
+        >>> exp_name = setup(data = jewellery)
+        >>> kmeans = create_model('kmeans')
 
-        This will create a trained Logistic Regression model.
 
-        Parameters
-        ----------
-        model : string / object, default = None
-            Enter ID of the models available in model library or pass an untrained model
-            object consistent with fit / predict API to train and evaluate model. List of
-            models available in model library (ID - Model):
+        model: str or scikit-learn compatible object
+            ID of a model available in the model library or pass an untrained
+            model object consistent with scikit-learn API. Models available
+            in the model library (ID - Name):
 
             * 'kmeans' - K-Means Clustering
             * 'ap' - Affinity Propagation
             * 'meanshift' - Mean shift Clustering
             * 'sc' - Spectral Clustering
             * 'hclust' - Agglomerative Clustering
             * 'dbscan' - Density-Based Spatial Clustering
             * 'optics' - OPTICS Clustering
             * 'birch' - Birch Clustering
             * 'kmodes' - K-Modes Clustering
 
+
         num_clusters: int, default = 4
-            Number of clusters to be generated with the dataset.
+            The number of clusters to form.
+
+
+        ground_truth: str, default = None
+            Ground truth to be provided to evaluate metrics that require true labels.
+ When None, such metrics are returned as 0.0. All metrics evaluated can + be accessed using ``get_metrics`` function. - ground_truth: string, default = None - When ground_truth is provided, Homogeneity Score, Rand Index, and - Completeness Score is evaluated and printer along with other metrics. - round: integer, default = 4 + round: int, default = 4 Number of decimal places the metrics in the score grid will be rounded to. + fit_kwargs: dict, default = {} (empty dict) Dictionary of arguments passed to the fit method of the model. + verbose: bool, default = True - Score grid is not printed when verbose is set to False. + Status update is not printed when verbose is set to False. - system: bool, default = True - Must remain True all times. Only to be changed by internal functions. - If False, method will return a tuple of model and the model fit time. - add_to_model_list: bool, default = True - Whether to save model and results in master_model_container. + experiment_custom_tags: dict, default = None + Dictionary of tag_name: String -> value: (String, but will be string-ified + if not) passed to the mlflow.set_tags to add new custom tags for the experiment. + **kwargs: Additional keyword arguments to pass to the estimator. - Returns - ------- - score_grid - A table containing the Silhouette, Calinski-Harabasz, - Davies-Bouldin, Homogeneity Score, Rand Index, and - Completeness Score. Last 3 are only evaluated when - ground_truth parameter is provided. - model - trained model object + Returns: + Trained Model + Warnings -------- - - num_clusters not required for Affinity Propagation ('ap'), Mean shift - clustering ('meanshift'), Density-Based Spatial Clustering ('dbscan') - and OPTICS Clustering ('optics'). num_clusters parameter for these models - are automatically determined. + - ``num_clusters`` param not required for Affinity Propagation ('ap'), + Mean shift ('meanshift'), Density-Based Spatial Clustering ('dbscan') + and OPTICS Clustering ('optics'). - When fit doesn't converge in Affinity Propagation ('ap') model, all datapoints are labelled as -1. @@ -1270,6 +1686,7 @@ def create_model( - OPTICS ('optics') clustering may take longer training times on large datasets. + """ # TODO improve error message @@ -1296,3 +1713,63 @@ def create_model( verbose=verbose, **kwargs, ) + + def evaluate_model( + self, + estimator, + fold: Optional[Union[int, Any]] = None, + fit_kwargs: Optional[dict] = None, + plot_kwargs: Optional[dict] = None, + feature_name: Optional[str] = None, + groups: Optional[Union[str, Any]] = None, + use_train_data: bool = False, + ): + """ + This function displays a user interface for analyzing performance of a trained + model. It calls the ``plot_model`` function internally. + + Example + -------- + >>> from pycaret.datasets import get_data + >>> jewellery = get_data('jewellery') + >>> from pycaret.clustering import * + >>> exp_name = setup(data = jewellery) + >>> kmeans = create_model('kmeans') + >>> evaluate_model(kmeans) + + + model: scikit-learn compatible object + Trained model object + + + feature: str, default = None + Feature to be evaluated when plot = 'distribution'. When ``plot`` type is + 'cluster' or 'tsne' feature column is used as a hoverover tooltip and/or + label when the ``label`` param is set to True. When the ``plot`` type is + 'cluster' or 'tsne' and feature is None, first column of the dataset is + used. + + + fit_kwargs: dict, default = {} (empty dict) + Dictionary of arguments passed to the fit method of the model. 
+ + + Returns: + None + + + Warnings + -------- + - This function only works in IPython enabled Notebook. + + """ + + return super().evaluate_model( + estimator, + fold, + fit_kwargs, + plot_kwargs, + feature_name, + groups, + use_train_data, + ) From ac932d177d486cffcb164c03017e2518e1b90862 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 31 May 2022 14:41:25 +0000 Subject: [PATCH 06/20] Remove silent --- pycaret/classification/functional.py | 1 - pycaret/tests/test_anomaly.py | 1 - pycaret/tests/test_check_fairness.py | 3 --- pycaret/tests/test_classification.py | 5 ----- pycaret/tests/test_classification_plots.py | 1 - pycaret/tests/test_classification_tuning.py | 1 - pycaret/tests/test_clustering.py | 1 - pycaret/tests/test_convert_model.py | 2 -- pycaret/tests/test_create_api.py | 2 -- pycaret/tests/test_create_app.py | 2 -- pycaret/tests/test_create_docker.py | 2 -- pycaret/tests/test_dashboard.py | 1 - pycaret/tests/test_drift_report.py | 1 - pycaret/tests/test_eda.py | 1 - pycaret/tests/test_multiclass.py | 2 -- pycaret/tests/test_optimize_threshold.py | 1 - pycaret/tests/test_overflow.py | 1 - pycaret/tests/test_probability_threshold.py | 1 - pycaret/tests/test_regression.py | 5 ----- pycaret/tests/test_regression_plots.py | 1 - pycaret/tests/test_regression_tuning.py | 1 - pycaret/tests/test_utils.py | 2 -- 22 files changed, 38 deletions(-) diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py index 12c5e70b2..b44dfffc8 100644 --- a/pycaret/classification/functional.py +++ b/pycaret/classification/functional.py @@ -598,7 +598,6 @@ def setup( log_plots=log_plots, log_profile=log_profile, log_data=log_data, - silent=silent, verbose=verbose, memory=memory, profile=profile, diff --git a/pycaret/tests/test_anomaly.py b/pycaret/tests/test_anomaly.py index 11de0f05c..5d1bc9ee8 100644 --- a/pycaret/tests/test_anomaly.py +++ b/pycaret/tests/test_anomaly.py @@ -27,7 +27,6 @@ def test_anomaly(data): experiment_name=experiment_name, experiment_custom_tags={"tag": 1}, log_plots=True, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_check_fairness.py b/pycaret/tests/test_check_fairness.py index fcb7b025b..012ec560b 100644 --- a/pycaret/tests/test_check_fairness.py +++ b/pycaret/tests/test_check_fairness.py @@ -21,7 +21,6 @@ def test_check_fairness_binary_classification(): clf1 = pycaret.classification.setup( data, target="income >50K", - silent=True, html=False, n_jobs=1, ) @@ -43,7 +42,6 @@ def test_check_fairness_multiclass_classification(): clf1 = pycaret.classification.setup( data, target="species", - silent=True, html=False, n_jobs=1, ) @@ -67,7 +65,6 @@ def test_check_fairness_regression(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_classification.py b/pycaret/tests/test_classification.py index 7d32d9219..0a6d6bf80 100644 --- a/pycaret/tests/test_classification.py +++ b/pycaret/tests/test_classification.py @@ -37,7 +37,6 @@ def test_classification(juice_dataframe, return_train_score): remove_multicollinearity=True, multicollinearity_threshold=0.95, log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, @@ -142,7 +141,6 @@ def test_classification_predict_on_unseen(juice_dataframe): remove_multicollinearity=True, multicollinearity_threshold=0.95, log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, @@ -170,7 +168,6 @@ def test_classification_setup_fails_with_experiment_custom_tags( 
remove_multicollinearity=True, multicollinearity_threshold=0.95, log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, @@ -190,7 +187,6 @@ def test_classification_setup_fails_with_experiment_custom_multiples_inputs( remove_multicollinearity=True, multicollinearity_threshold=0.95, log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, @@ -209,7 +205,6 @@ def test_classification_models_with_experiment_custom_tags( remove_multicollinearity=True, multicollinearity_threshold=0.95, log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_classification_plots.py b/pycaret/tests/test_classification_plots.py index 8cefab74e..ff4092a3d 100644 --- a/pycaret/tests/test_classification_plots.py +++ b/pycaret/tests/test_classification_plots.py @@ -20,7 +20,6 @@ def test_plot(): target="Purchase", log_experiment=True, log_plots=True, - silent=True, html=False, session_id=123, fold=2, diff --git a/pycaret/tests/test_classification_tuning.py b/pycaret/tests/test_classification_tuning.py index c7bbc1a34..207b7d4cf 100644 --- a/pycaret/tests/test_classification_tuning.py +++ b/pycaret/tests/test_classification_tuning.py @@ -25,7 +25,6 @@ def test_classification_tuning(): target="Purchase", train_size=0.7, fold=2, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_clustering.py b/pycaret/tests/test_clustering.py index bcb64ce28..54c4fcf1b 100644 --- a/pycaret/tests/test_clustering.py +++ b/pycaret/tests/test_clustering.py @@ -27,7 +27,6 @@ def test_clustering(data): experiment_name=experiment_name, experiment_custom_tags={"tag": 1}, log_plots=True, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_convert_model.py b/pycaret/tests/test_convert_model.py index b63b9a5e7..28970568a 100644 --- a/pycaret/tests/test_convert_model.py +++ b/pycaret/tests/test_convert_model.py @@ -17,7 +17,6 @@ def test_classification_convert_model(): clf1 = pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) @@ -39,7 +38,6 @@ def test_regression_convert_model(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_create_api.py b/pycaret/tests/test_create_api.py index 7bdd9fb4e..f592c244e 100644 --- a/pycaret/tests/test_create_api.py +++ b/pycaret/tests/test_create_api.py @@ -21,7 +21,6 @@ def test_classification_create_api(): clf1 = pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) @@ -43,7 +42,6 @@ def test_regression_create_api(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_create_app.py b/pycaret/tests/test_create_app.py index a8ed5a643..de4d5023c 100644 --- a/pycaret/tests/test_create_app.py +++ b/pycaret/tests/test_create_app.py @@ -21,7 +21,6 @@ def test_classification_create_app(): clf1 = pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) @@ -43,7 +42,6 @@ def test_regression_create_app(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_create_docker.py b/pycaret/tests/test_create_docker.py index bd3e69fd8..b4a1c1ec2 100644 --- a/pycaret/tests/test_create_docker.py +++ b/pycaret/tests/test_create_docker.py @@ -21,7 +21,6 @@ def test_classification_create_docker(): clf1 = pycaret.classification.setup( data, target="Class", - 
silent=True, html=False, n_jobs=1, ) @@ -44,7 +43,6 @@ def test_regression_create_docker(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_dashboard.py b/pycaret/tests/test_dashboard.py index 4ff490a4c..9ea68c27e 100644 --- a/pycaret/tests/test_dashboard.py +++ b/pycaret/tests/test_dashboard.py @@ -23,7 +23,6 @@ def test_classification_dashboard(): # clf1 = pycaret.classification.setup( # data, # target="Class", - # silent=True, # html=False, # n_jobs=1, # ) diff --git a/pycaret/tests/test_drift_report.py b/pycaret/tests/test_drift_report.py index 5bd07dee2..75ea2ae5b 100644 --- a/pycaret/tests/test_drift_report.py +++ b/pycaret/tests/test_drift_report.py @@ -16,7 +16,6 @@ def test_drift_report(): clf1 = pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_eda.py b/pycaret/tests/test_eda.py index 184ac8943..c90b9cecc 100644 --- a/pycaret/tests/test_eda.py +++ b/pycaret/tests/test_eda.py @@ -19,7 +19,6 @@ def test_eda(): pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_multiclass.py b/pycaret/tests/test_multiclass.py index e4ef2b6a8..e6c619c26 100644 --- a/pycaret/tests/test_multiclass.py +++ b/pycaret/tests/test_multiclass.py @@ -26,7 +26,6 @@ def test_multiclass(iris_dataframe, return_train_score): iris_dataframe, target="species", log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, @@ -125,7 +124,6 @@ def test_multiclass_predict_on_unseen(iris_dataframe): iris_dataframe, target="species", log_experiment=True, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_optimize_threshold.py b/pycaret/tests/test_optimize_threshold.py index f1f75796b..3629041c2 100644 --- a/pycaret/tests/test_optimize_threshold.py +++ b/pycaret/tests/test_optimize_threshold.py @@ -20,7 +20,6 @@ def test_optimize_threshold(): clf1 = pycaret.classification.setup( data, target="Class", - silent=True, html=False, n_jobs=1, ) diff --git a/pycaret/tests/test_overflow.py b/pycaret/tests/test_overflow.py index b1fd48bdb..8e5ec11fe 100644 --- a/pycaret/tests/test_overflow.py +++ b/pycaret/tests/test_overflow.py @@ -13,7 +13,6 @@ def test_overflow(): s = setup( data, target="medv", - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_probability_threshold.py b/pycaret/tests/test_probability_threshold.py index 869ce8a6e..a9878a83d 100644 --- a/pycaret/tests/test_probability_threshold.py +++ b/pycaret/tests/test_probability_threshold.py @@ -20,7 +20,6 @@ def test_probability_threshold(): clf1 = pycaret.classification.setup( data, target="Purchase", - silent=True, log_experiment=True, html=False, session_id=123, diff --git a/pycaret/tests/test_regression.py b/pycaret/tests/test_regression.py index d69228f50..702cd872e 100644 --- a/pycaret/tests/test_regression.py +++ b/pycaret/tests/test_regression.py @@ -34,7 +34,6 @@ def test_regression(boston_dataframe, return_train_score): target="medv", remove_multicollinearity=True, multicollinearity_threshold=0.95, - silent=True, log_experiment=True, html=False, session_id=123, @@ -140,7 +139,6 @@ def test_regression_predict_on_unseen(boston_dataframe): target="medv", remove_multicollinearity=True, multicollinearity_threshold=0.95, - silent=True, log_experiment=True, html=False, session_id=123, @@ -165,7 +163,6 @@ def test_regression_setup_fails_with_experiment_custom_tags(self, 
boston_datafra _ = pycaret.regression.setup( boston_dataframe, target="medv", - silent=True, log_experiment=True, html=False, session_id=123, @@ -183,7 +180,6 @@ def test_regression_setup_fails_with_experiment_custom_multiples_inputs( _ = pycaret.regression.setup( pycaret.datasets.get_data("boston"), target="medv", - silent=True, log_experiment=True, html=False, session_id=123, @@ -200,7 +196,6 @@ def test_regression_models_with_experiment_custom_tags( _ = pycaret.regression.setup( boston_dataframe, target="medv", - silent=True, log_experiment=True, html=False, session_id=123, diff --git a/pycaret/tests/test_regression_plots.py b/pycaret/tests/test_regression_plots.py index 1bb94b796..a04fb1d8e 100644 --- a/pycaret/tests/test_regression_plots.py +++ b/pycaret/tests/test_regression_plots.py @@ -20,7 +20,6 @@ def test_plot(): target="medv", log_experiment=True, log_plots=True, - silent=True, html=False, session_id=123, fold=2, diff --git a/pycaret/tests/test_regression_tuning.py b/pycaret/tests/test_regression_tuning.py index 8efb55b73..2c5412510 100644 --- a/pycaret/tests/test_regression_tuning.py +++ b/pycaret/tests/test_regression_tuning.py @@ -23,7 +23,6 @@ def test_regression_tuning(): target="medv", train_size=0.99, fold=2, - silent=True, html=False, session_id=123, n_jobs=1, diff --git a/pycaret/tests/test_utils.py b/pycaret/tests/test_utils.py index 733c89220..dd4635b67 100644 --- a/pycaret/tests/test_utils.py +++ b/pycaret/tests/test_utils.py @@ -33,7 +33,6 @@ def test_utils(): clf1 = pycaret.classification.setup( train, target="Purchase", - silent=True, html=False, session_id=123, n_jobs=1, @@ -92,7 +91,6 @@ def test_utils(): reg1 = pycaret.regression.setup( data, target="medv", - silent=True, html=False, session_id=123, n_jobs=1, From 59cc719a2fbb2812835860da83e16e00c8f3f9ef Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Tue, 31 May 2022 15:42:46 +0000 Subject: [PATCH 07/20] Fix --- pycaret/clustering/oop.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pycaret/clustering/oop.py b/pycaret/clustering/oop.py index c5c94ab86..12d6fc0e9 100644 --- a/pycaret/clustering/oop.py +++ b/pycaret/clustering/oop.py @@ -1,5 +1,4 @@ -from ctypes import Union -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union import numpy as np # type: ignore import pandas as pd # type ignore From 83d7cb74947add23040848d24cb2812936f4d3d9 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 20:00:14 +0000 Subject: [PATCH 08/20] WIP --- pycaret/anomaly/functional.py | 4 +- pycaret/anomaly/oop.py | 129 +++++++++++++++++- pycaret/classification/functional.py | 4 +- pycaret/classification/oop.py | 4 +- pycaret/clustering/functional.py | 4 +- pycaret/clustering/oop.py | 4 +- .../pycaret_experiment/tabular_experiment.py | 2 +- pycaret/regression/functional.py | 4 +- pycaret/regression/oop.py | 4 +- pycaret/time_series/forecasting/functional.py | 6 +- pycaret/time_series/forecasting/oop.py | 4 +- 11 files changed, 148 insertions(+), 21 deletions(-) diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py index f8db04537..90c06ef5f 100644 --- a/pycaret/anomaly/functional.py +++ b/pycaret/anomaly/functional.py @@ -561,7 +561,7 @@ def plot_model( scale: float = 1, save: bool = False, display_format: Optional[str] = None, -): +) -> Optional[str]: """ This function analyzes the performance of a trained model. @@ -612,7 +612,7 @@ def plot_model( Returns: - None + Path to saved file, if any. 
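+        A path is only returned when the plot is saved (``save = True``);
+        otherwise None is returned.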
""" return _CURRENT_EXPERIMENT.plot_model( diff --git a/pycaret/anomaly/oop.py b/pycaret/anomaly/oop.py index bb6d98103..f54fecfe4 100644 --- a/pycaret/anomaly/oop.py +++ b/pycaret/anomaly/oop.py @@ -1,4 +1,6 @@ -from typing import List, Tuple +from typing import Any, List, Optional, Tuple, Union + +import pandas as pd import pycaret.containers.metrics.anomaly import pycaret.containers.models.anomaly @@ -48,3 +50,128 @@ def _get_metrics(self, raise_errors: bool = True) -> dict: def _get_default_plots_to_log(self) -> List[str]: return ["tsne"] + + def predict_model( + self, estimator, data: pd.DataFrame, ml_usecase: Optional[MLUsecase] = None + ) -> pd.DataFrame: + """ + This function generates anomaly labels on using a trained model. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> anomaly = get_data('anomaly') + >>> from pycaret.anomaly import * + >>> exp_name = setup(data = anomaly) + >>> knn = create_model('knn') + >>> knn_predictions = predict_model(model = knn, data = unseen_data) + + + model: scikit-learn compatible object + Trained Model Object. + + + data : pandas.DataFrame + Shape (n_samples, n_features) where n_samples is the number of samples and + n_features is the number of features. + + + Returns: + pandas.DataFrame + + + Warnings + -------- + - The behavior of the predict_model is changed in version 2.1 without backward compatibility. + As such, the pipelines trained using the version (<= 2.0), may not work for inference + with version >= 2.1. You can either retrain your models with a newer version or downgrade + the version for inference. + + + """ + return super().predict_model(estimator, data, ml_usecase) + + def plot_model( + self, + estimator, + plot: str = "auc", + scale: float = 1, + save: Union[str, bool] = False, + fold: Optional[Union[int, Any]] = None, + fit_kwargs: Optional[dict] = None, + plot_kwargs: Optional[dict] = None, + groups: Optional[Union[str, Any]] = None, + feature_name: Optional[str] = None, + label: bool = False, + use_train_data: bool = False, + verbose: bool = True, + display_format: Optional[str] = None, + ) -> Optional[str]: + """ + This function analyzes the performance of a trained model. + + + Example + ------- + >>> from pycaret.datasets import get_data + >>> anomaly = get_data('anomaly') + >>> from pycaret.anomaly import * + >>> exp_name = setup(data = anomaly) + >>> knn = create_model('knn') + >>> plot_model(knn, plot = 'tsne') + + + model: scikit-learn compatible object + Trained Model Object + + + plot: str, default = 'tsne' + List of available plots (ID - Name): + + * 'tsne' - t-SNE (3d) Dimension Plot + * 'umap' - UMAP Dimensionality Plot + + + feature: str, default = None + Feature to be used as a hoverover tooltip and/or label when the ``label`` + param is set to True. When feature is None, first column of the dataset + is used. + + + label: bool, default = False + Name of column to be used as data labels. + + + scale: float, default = 1 + The resolution scale of the figure. + + + save: bool, default = False + When set to True, plot is saved in the current working directory. + + + display_format: str, default = None + To display plots in Streamlit (https://www.streamlit.io/), set this to 'streamlit'. + Currently, not all plots are supported. + + + Returns: + Path to saved file, if any. 
+ + """ + return super().plot_model( + estimator, + plot, + scale, + save, + fold, + fit_kwargs, + plot_kwargs, + groups, + feature_name, + label, + use_train_data, + verbose, + display_format, + ) diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py index dba30e0cc..0fb035d77 100644 --- a/pycaret/classification/functional.py +++ b/pycaret/classification/functional.py @@ -1509,7 +1509,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, -) -> str: +) -> Optional[str]: """ This function analyzes the performance of a trained model on holdout set. @@ -1601,7 +1601,7 @@ def plot_model( Returns: - None + Path to saved file, if any. Warnings diff --git a/pycaret/classification/oop.py b/pycaret/classification/oop.py index a421b059e..9fbde0832 100644 --- a/pycaret/classification/oop.py +++ b/pycaret/classification/oop.py @@ -1838,7 +1838,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, - ) -> str: + ) -> Optional[str]: """ This function analyzes the performance of a trained model on holdout set. @@ -1930,7 +1930,7 @@ def plot_model( Returns: - None + Path to saved file, if any. Warnings diff --git a/pycaret/clustering/functional.py b/pycaret/clustering/functional.py index 6de8d94c5..e4570b028 100644 --- a/pycaret/clustering/functional.py +++ b/pycaret/clustering/functional.py @@ -590,7 +590,7 @@ def plot_model( scale: float = 1, save: bool = False, display_format: Optional[str] = None, -): +) -> Optional[str]: """ This function analyzes the performance of a trained model. @@ -648,7 +648,7 @@ def plot_model( Returns: - None + Path to saved file, if any. """ return _CURRENT_EXPERIMENT.plot_model( diff --git a/pycaret/clustering/oop.py b/pycaret/clustering/oop.py index 12d6fc0e9..8d7113a86 100644 --- a/pycaret/clustering/oop.py +++ b/pycaret/clustering/oop.py @@ -114,7 +114,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, - ) -> str: + ) -> Optional[str]: """ This function analyzes the performance of a trained model. @@ -171,7 +171,7 @@ def plot_model( Returns: - None + Path to saved file, if any. """ return super().plot_model( diff --git a/pycaret/internal/pycaret_experiment/tabular_experiment.py b/pycaret/internal/pycaret_experiment/tabular_experiment.py index 293da87a0..6795537c4 100644 --- a/pycaret/internal/pycaret_experiment/tabular_experiment.py +++ b/pycaret/internal/pycaret_experiment/tabular_experiment.py @@ -2040,7 +2040,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, - ) -> str: + ) -> Optional[str]: """ This function takes a trained model object and returns a plot based on the diff --git a/pycaret/regression/functional.py b/pycaret/regression/functional.py index 899a7965c..82c6912e4 100644 --- a/pycaret/regression/functional.py +++ b/pycaret/regression/functional.py @@ -1445,7 +1445,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, -) -> str: +) -> Optional[str]: """ This function analyzes the performance of a trained model on holdout set. @@ -1535,7 +1535,7 @@ def plot_model( Returns: - None + Path to saved file, if any. 
""" diff --git a/pycaret/regression/oop.py b/pycaret/regression/oop.py index 19c9bf14e..f9dd74c48 100644 --- a/pycaret/regression/oop.py +++ b/pycaret/regression/oop.py @@ -1733,7 +1733,7 @@ def plot_model( use_train_data: bool = False, verbose: bool = True, display_format: Optional[str] = None, - ) -> str: + ) -> Optional[str]: """ This function analyzes the performance of a trained model on holdout set. @@ -1823,7 +1823,7 @@ def plot_model( Returns: - None + Path to saved file, if any. """ diff --git a/pycaret/time_series/forecasting/functional.py b/pycaret/time_series/forecasting/functional.py index 2822b6eda..9595098d4 100644 --- a/pycaret/time_series/forecasting/functional.py +++ b/pycaret/time_series/forecasting/functional.py @@ -947,7 +947,7 @@ def plot_model( data_kwargs: Optional[Dict] = None, fig_kwargs: Optional[Dict] = None, save: Union[str, bool] = False, -) -> Optional[Tuple[str, Any]]: +) -> Optional[Tuple[str, list]]: """ This function analyzes the performance of a trained model on holdout set. @@ -999,7 +999,7 @@ def plot_model( return_fig: : bool, default = False - When set to True, it returns the figure used for plotting. + When set to True, it returns the figure used for plotting. return_data: bool, default = False @@ -1148,7 +1148,7 @@ def plot_model( Returns: - Optional[Tuple[str, Any]] + Path to saved file and list containing figure and data, if any. """ diff --git a/pycaret/time_series/forecasting/oop.py b/pycaret/time_series/forecasting/oop.py index 7284263c9..68e53b50f 100644 --- a/pycaret/time_series/forecasting/oop.py +++ b/pycaret/time_series/forecasting/oop.py @@ -3392,7 +3392,7 @@ def plot_model( data_kwargs: Optional[Dict] = None, fig_kwargs: Optional[Dict] = None, save: Union[str, bool] = False, - ) -> Optional[Tuple[str, Any]]: + ) -> Optional[Tuple[str, list]]: """ This function analyzes the performance of a trained model on holdout set. @@ -3593,7 +3593,7 @@ def plot_model( Returns: - Optional[Tuple[str, Any]] + Path to saved file and list containing figure and data, if any. 
""" From c9857fb5d986ca911b257a82ce398fbd00647840 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:05:12 +0000 Subject: [PATCH 09/20] Display tweaks --- pycaret/internal/display/display_backend.py | 8 ++++++-- pycaret/internal/plots/yellowbrick.py | 6 ------ .../pycaret_experiment/tabular_experiment.py | 18 ------------------ 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/pycaret/internal/display/display_backend.py b/pycaret/internal/display/display_backend.py index 2db25abe4..51f8d0090 100644 --- a/pycaret/internal/display/display_backend.py +++ b/pycaret/internal/display/display_backend.py @@ -110,7 +110,6 @@ class JupyterBackend(DisplayBackend): can_update_rich: bool = True def __init__(self) -> None: - _enable_matplotlib_inline() self._display_ref: Optional[DisplayHandle] = None def display(self, obj: Any, *, final_display: bool = True) -> None: @@ -144,7 +143,6 @@ class ColabBackend(JupyterBackend): id: str = "colab" def __init__(self) -> None: - _enable_colab() super().__init__() def _handle_input(self, obj: Any) -> Any: @@ -186,6 +184,7 @@ def detect_backend( class_name = "" if IN_DATABRICKS: + _enable_matplotlib_inline() return DatabricksBackend() try: @@ -199,7 +198,9 @@ def detect_backend( if not is_notebook: return CLIBackend() if "google.colab" in class_name: + _enable_colab() return ColabBackend() + _enable_matplotlib_inline() return JupyterBackend() if isinstance(backend, str): @@ -217,3 +218,6 @@ def detect_backend( raise TypeError( f"Wrong backend type. Expected None, str or DisplayBackend, got {type(backend)}." ) + + +detect_backend() diff --git a/pycaret/internal/plots/yellowbrick.py b/pycaret/internal/plots/yellowbrick.py index e527382b6..08510402f 100644 --- a/pycaret/internal/plots/yellowbrick.py +++ b/pycaret/internal/plots/yellowbrick.py @@ -68,7 +68,6 @@ def show_yellowbrick_plot( save: bool = False, fit_kwargs: Optional[dict] = None, groups: Optional[Any] = None, - display: Optional[CommonDisplay] = None, display_format: Optional[str] = None, **kwargs, ): @@ -96,8 +95,6 @@ def show_yellowbrick_plot( logger.info("Scoring train set") visualizer.score(X_train, y_train, **kwargs) - display.move_progress() - if handle_test == "draw": visualizer.draw(X_test, y_test) elif handle_test == "fit": @@ -108,9 +105,6 @@ def show_yellowbrick_plot( logger.info("Scoring test/hold-out set") visualizer.score(X_test, y_test) - display.move_progress() - # display.clear_output() - plot_filename = f"{name}.png" if save: if not isinstance(save, bool): diff --git a/pycaret/internal/pycaret_experiment/tabular_experiment.py b/pycaret/internal/pycaret_experiment/tabular_experiment.py index a9d39436f..bd168352d 100644 --- a/pycaret/internal/pycaret_experiment/tabular_experiment.py +++ b/pycaret/internal/pycaret_experiment/tabular_experiment.py @@ -602,7 +602,6 @@ def residuals_interactive(): x_test=self.X_test_transformed, y_test=self.y_test_transformed, model=estimator, - display=display, ) # display.clear_output() @@ -1069,7 +1068,6 @@ def elbow(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1097,7 +1095,6 @@ def silhouette(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) except: @@ -1122,7 +1119,6 @@ def distance(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) except: @@ -1146,7 +1142,6 @@ def residuals(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, 
display_format=display_format, ) @@ -1166,7 +1161,6 @@ def auc(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1188,7 +1182,6 @@ def threshold(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1210,7 +1203,6 @@ def pr(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1235,7 +1227,6 @@ def confusion_matrix(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1266,7 +1257,6 @@ def error(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1287,7 +1277,6 @@ def cooks(): fit_kwargs=fit_kwargs, handle_test="", groups=groups, - display=display, display_format=display_format, ) @@ -1309,7 +1298,6 @@ def class_report(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1350,7 +1338,6 @@ def boundary(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, features=["Feature One", "Feature Two"], classes=["A", "B"], display_format=display_format, @@ -1373,7 +1360,6 @@ def rfe(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1401,7 +1387,6 @@ def learning(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1478,7 +1463,6 @@ def manifold(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1822,7 +1806,6 @@ def vc(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) @@ -1864,7 +1847,6 @@ def dimension(): save=save, fit_kwargs=fit_kwargs, groups=groups, - display=display, display_format=display_format, ) From 7b8ffc931bc3a583af7f44240c368d383e11a986 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:06:11 +0000 Subject: [PATCH 10/20] Change version number --- pycaret/utils/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pycaret/utils/__init__.py b/pycaret/utils/__init__.py index bbf27805b..7847a20c8 100644 --- a/pycaret/utils/__init__.py +++ b/pycaret/utils/__init__.py @@ -5,7 +5,7 @@ import pandas as pd from sklearn.metrics._scorer import _PredictScorer, get_scorer # type: ignore -version_ = "3.0.0" +version_ = "3.0.0.rc1" nightly_version_ = "3.0.0" __version__ = version_ diff --git a/setup.py b/setup.py index 07e5cbac3..7e1ba31bf 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def readme(): setup( name="pycaret", - version="3.0.0", + version="3.0.0.rc1", description="PyCaret - An open source, low-code machine learning library in Python.", long_description=readme(), long_description_content_type="text/markdown", From 779bcf573d520903e85836f97a2607cea19d0180 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:12:34 +0000 Subject: [PATCH 11/20] Fixes --- pycaret/internal/display/display_backend.py | 1 + pycaret/internal/plots/residual_plots.py | 15 +++++---------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pycaret/internal/display/display_backend.py b/pycaret/internal/display/display_backend.py index 51f8d0090..2aedb4f7e 100644 --- a/pycaret/internal/display/display_backend.py +++ b/pycaret/internal/display/display_backend.py @@ -198,6 +198,7 @@ def detect_backend( if not is_notebook: return CLIBackend() if "google.colab" in 
class_name: + _enable_matplotlib_inline() _enable_colab() return ColabBackend() _enable_matplotlib_inline() diff --git a/pycaret/internal/plots/residual_plots.py b/pycaret/internal/plots/residual_plots.py index 5cc3e51f7..93577ca75 100644 --- a/pycaret/internal/plots/residual_plots.py +++ b/pycaret/internal/plots/residual_plots.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional import numpy as np import pandas as pd @@ -604,20 +604,18 @@ class InteractiveResidualsPlot: def __init__( self, - display: CommonDisplay, model, x: np.ndarray, y: np.ndarray, x_test: np.ndarray = None, y_test: np.ndarray = None, + display: Optional[CommonDisplay] = None, ): """ Instantiates the interactive residual plots for the given data Parameters ---------- - display: CommonDisplay - this object is required to show the plots and move the progressbar model describes the regression model which is to be evaluated x: np.ndarray @@ -628,10 +626,12 @@ def __init__( optional, some test data (requires y_test) y_test: np.ndarray optional, the labels to the provided test data (requires x_test) + display: CommonDisplay + this object is required to show the plots """ self.figures: List[BaseFigureWidget] = [] - self.display: CommonDisplay = display + self.display: CommonDisplay = display or CommonDisplay() self.plot = self.__create_resplots(model, x, y, x_test, y_test) def show(self): @@ -703,21 +703,18 @@ def __create_resplots( split_origin = None logger.info("Calculated model residuals") - self.display.move_progress() tukey_anscombe_widget = TukeyAnscombeWidget( predictions, residuals, split_origin=split_origin ) logger.info("Calculated Tunkey-Anscombe Plot") self.figures.append(tukey_anscombe_widget) - self.display.move_progress() qq_plot_widget = QQPlotWidget( predictions, y, split_origin=split_origin, featuresize=x.shape[1] ) logger.info("Calculated Normal QQ Plot") self.figures.append(qq_plot_widget) - self.display.move_progress() standardized_residuals = helper.calculate_standardized_residual( predictions, y, None @@ -728,7 +725,6 @@ def __create_resplots( ) logger.info("Calculated Scale-Location Plot") self.figures.append(scale_location_widget) - self.display.move_progress() leverage = helper.leverage_statistic(np.array(x)) @@ -745,7 +741,6 @@ def __create_resplots( ) logger.info("Calculated Residual vs Leverage Plot inc. 
Cook's distance") self.figures.append(cooks_distance_widget) - self.display.move_progress() items_layout = Layout(width="1000px") h0 = widgets.HBox(self.figures[:2], layout=items_layout) From 9523cc36f35f6ded15af32aabfb4b2d5a01a4d2d Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:21:01 +0000 Subject: [PATCH 12/20] Fix --- pycaret/internal/display/display_backend.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pycaret/internal/display/display_backend.py b/pycaret/internal/display/display_backend.py index 2aedb4f7e..68b0739d2 100644 --- a/pycaret/internal/display/display_backend.py +++ b/pycaret/internal/display/display_backend.py @@ -17,15 +17,11 @@ except ImportError: IN_DATABRICKS = False -MATPLOTLIB_INLINE_ENABLED = False COLAB_ENABLED = False def _enable_matplotlib_inline(): - global MATPLOTLIB_INLINE_ENABLED - if not MATPLOTLIB_INLINE_ENABLED: - get_ipython().run_line_magic("matplotlib", "inline") - MATPLOTLIB_INLINE_ENABLED = True + get_ipython().run_line_magic("matplotlib", "inline") def _enable_colab(): @@ -110,6 +106,7 @@ class JupyterBackend(DisplayBackend): can_update_rich: bool = True def __init__(self) -> None: + _enable_matplotlib_inline() self._display_ref: Optional[DisplayHandle] = None def display(self, obj: Any, *, final_display: bool = True) -> None: @@ -143,13 +140,9 @@ class ColabBackend(JupyterBackend): id: str = "colab" def __init__(self) -> None: + _enable_colab() super().__init__() - def _handle_input(self, obj: Any) -> Any: - if isinstance(obj, Styler): - return HTML(obj.to_html()) - return obj - class DatabricksBackend(JupyterBackend): id: str = "databricks" @@ -184,13 +177,12 @@ def detect_backend( class_name = "" if IN_DATABRICKS: - _enable_matplotlib_inline() return DatabricksBackend() try: ipython = get_ipython() assert ipython - class_name = ipython.__class__.__name__ + class_name = str(ipython.__class__) is_notebook = True if "Terminal" not in class_name else False except Exception: is_notebook = False @@ -198,10 +190,7 @@ def detect_backend( if not is_notebook: return CLIBackend() if "google.colab" in class_name: - _enable_matplotlib_inline() - _enable_colab() return ColabBackend() - _enable_matplotlib_inline() return JupyterBackend() if isinstance(backend, str): From 9453b18772396e134d1f01b663b275397faa94a3 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:44:41 +0000 Subject: [PATCH 13/20] Fixes --- pycaret/internal/display/display_backend.py | 6 ++++-- pycaret/internal/display/progress_bar.py | 17 ++++++++++++++++- .../pycaret_experiment/supervised_experiment.py | 11 ++++++----- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/pycaret/internal/display/display_backend.py b/pycaret/internal/display/display_backend.py index 68b0739d2..c779f427b 100644 --- a/pycaret/internal/display/display_backend.py +++ b/pycaret/internal/display/display_backend.py @@ -3,8 +3,10 @@ from typing import Any, Optional, Union import pandas as pd -from IPython import get_ipython -from IPython.display import HTML, DisplayHandle, clear_output + +if not "get_ipython" in globals(): + from IPython import get_ipython +from IPython.display import DisplayHandle, clear_output from IPython.display import display as ipython_display from pandas.io.formats.style import Styler diff --git a/pycaret/internal/display/progress_bar.py b/pycaret/internal/display/progress_bar.py index 43355d54f..bb7f8818f 100644 --- a/pycaret/internal/display/progress_bar.py +++ b/pycaret/internal/display/progress_bar.py @@ 
-74,9 +74,24 @@ def display( self.display_backend.display(self.container) self.displayed = True - if close and pbar.bar_style != "danger": + if close: + try: + self.container.close() + except AttributeError: + self.container.visible = False self.display_backend.clear_display() + def close(self): + if self.disable: + return + super().close() + # Try to detect if there was an error or KeyboardInterrupt + # in manual mode: if n < total, things probably got wrong + if self.leave: + self.disp(bar_style="success", check_delay=False) + else: + self.disp(close=True, check_delay=False) + class JupyterProgressBarBackend(ProgressBarBackend): def open(self): diff --git a/pycaret/internal/pycaret_experiment/supervised_experiment.py b/pycaret/internal/pycaret_experiment/supervised_experiment.py index 1cc64f9e3..bc6a118b4 100644 --- a/pycaret/internal/pycaret_experiment/supervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/supervised_experiment.py @@ -531,7 +531,7 @@ def compare_models( elif exclude: len_mod -= len(exclude) - progress_args = {"max": (4 * len_mod) + 4 + len_mod} + progress_args = {"max": (4 * len_mod) + 4 + min(len_mod, abs(n_select))} master_display_columns = ( ["Model"] + [v.display_name for k, v in self._all_metrics.items()] @@ -662,8 +662,6 @@ def compare_models( break total_runtime_start = runtime_start - display.move_progress() - """ MONITOR UPDATE STARTS """ @@ -834,12 +832,13 @@ def highlight_cols(s): sorted_models = [] if master_display is not None: + clamped_n_select = min(len(master_display), abs(n_select)) if n_select < 0: n_select_range = range( - len(master_display) - n_select, len(master_display) + len(master_display) - clamped_n_select, len(master_display) ) else: - n_select_range = range(0, n_select) + n_select_range = range(0, clamped_n_select) if self.logging_param: self.logging_param.log_model_comparison( @@ -897,7 +896,9 @@ def highlight_cols(s): ) self.logger.error(traceback.format_exc()) model = None + display.move_progress() continue + display.move_progress() full_logging = True if self.logging_param and cross_validation and model is not None: From cfde09e0c1d9fac29cb99e08fa6d2473fa8f5433 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 22:50:32 +0000 Subject: [PATCH 14/20] Fix colab detection --- pycaret/internal/display/display_backend.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pycaret/internal/display/display_backend.py b/pycaret/internal/display/display_backend.py index c779f427b..70c1bc6e3 100644 --- a/pycaret/internal/display/display_backend.py +++ b/pycaret/internal/display/display_backend.py @@ -10,8 +10,6 @@ from IPython.display import display as ipython_display from pandas.io.formats.style import Styler -from pycaret.utils import enable_colab - try: import dbruntime.display @@ -19,6 +17,13 @@ except ImportError: IN_DATABRICKS = False +try: + import google.colab + + IN_COLAB = True +except ImportError: + IN_COLAB = False + COLAB_ENABLED = False @@ -29,7 +34,9 @@ def _enable_matplotlib_inline(): def _enable_colab(): global COLAB_ENABLED if not COLAB_ENABLED: - enable_colab() + from google.colab import output + + output.enable_custom_widget_manager() COLAB_ENABLED = True @@ -181,6 +188,9 @@ def detect_backend( if IN_DATABRICKS: return DatabricksBackend() + if IN_COLAB: + return ColabBackend() + try: ipython = get_ipython() assert ipython @@ -191,8 +201,6 @@ def detect_backend( if not is_notebook: return CLIBackend() - if "google.colab" in class_name: - return ColabBackend() return 
JupyterBackend() if isinstance(backend, str): From 425024da1f461aa5f184f7b1afafd235b40cfe66 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 23:11:23 +0000 Subject: [PATCH 15/20] Disable interactive residuals for colab --- pycaret/internal/plots/residual_plots.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pycaret/internal/plots/residual_plots.py b/pycaret/internal/plots/residual_plots.py index 93577ca75..5a20241d2 100644 --- a/pycaret/internal/plots/residual_plots.py +++ b/pycaret/internal/plots/residual_plots.py @@ -10,6 +10,7 @@ import pycaret.internal.plots.helper as helper from pycaret.internal.display import CommonDisplay +from pycaret.internal.display.display_backend import ColabBackend, DatabricksBackend from pycaret.internal.logging import get_logger from pycaret.internal.validation import fit_if_not_fitted @@ -632,6 +633,10 @@ def __init__( self.figures: List[BaseFigureWidget] = [] self.display: CommonDisplay = display or CommonDisplay() + if isinstance(self.display._general_display, (ColabBackend, DatabricksBackend)): + raise ValueError( + f"residuals_interactive plot is not supported on Google Colab or Databricks." + ) self.plot = self.__create_resplots(model, x, y, x_test, y_test) def show(self): From c9608fe9b2ba6e5150f5f7e6c4189f9a58c97234 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 1 Jun 2022 23:17:13 +0000 Subject: [PATCH 16/20] Fix --- pycaret/internal/plots/residual_plots.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pycaret/internal/plots/residual_plots.py b/pycaret/internal/plots/residual_plots.py index 5a20241d2..16745c6c5 100644 --- a/pycaret/internal/plots/residual_plots.py +++ b/pycaret/internal/plots/residual_plots.py @@ -209,13 +209,13 @@ def __scale_location_plot(fitted, sqrt_abs_standardized_residuals, split_origin) { "Predictions": fitted, "Split": split_origin, - "$\sqrt{|Standardized Residuals|}$": sqrt_abs_standardized_residuals, + "Standardized Residuals^1/2": sqrt_abs_standardized_residuals, } ) fig = px.scatter( dataframe, x="Predictions", - y="$\sqrt{|Standardized Residuals|}$", + y="Standardized Residuals^1/2", trendline="lowess", color="Split", color_discrete_sequence=["blue", "green"], @@ -228,13 +228,13 @@ def __scale_location_plot(fitted, sqrt_abs_standardized_residuals, split_origin) dataframe = pd.DataFrame( { "Predictions": fitted, - "$\sqrt{|Standardized Residuals|}$": sqrt_abs_standardized_residuals, + "Standardized Residuals^1/2": sqrt_abs_standardized_residuals, } ) fig = px.scatter( dataframe, x="Predictions", - y="$\sqrt{|Standardized Residuals|}$", + y="Standardized Residuals^1/2", trendline="lowess", title="Scale-Location Plot", opacity=0.3, @@ -246,7 +246,7 @@ def __scale_location_plot(fitted, sqrt_abs_standardized_residuals, split_origin) fig.add_annotation( x=fitted[i], y=sqrt_abs_standardized_residuals[i], - text=f"$\sqrt{{|\\tilde r_{{{i}}}|}}$", + text=f"~r_{i}^1/2", ) fig.update_annotations( dict(xref="x", yref="y", showarrow=True, arrowhead=7, ax=0, ay=-40) @@ -384,7 +384,7 @@ def __cooks_distance_plot( fig.add_annotation( x=model_leverage[i], y=standardized_residuals[i], - text=f"$\\tilde r_{{{i}}}$", + text=f"~r_{i}", ) fig.update_annotations( @@ -556,7 +556,7 @@ def __tukey_anscombe_plot(predictions, residuals, split_origin): abs_resid = model_abs_resid.sort_values(ascending=False) abs_resid_top_3 = abs_resid[:3] for i in abs_resid_top_3.index: - fig.add_annotation(x=predictions[i], y=residuals[i], text=f"$r_{{{i}}}$") + 
fig.add_annotation(x=predictions[i], y=residuals[i], text=f"r_{i}") fig.update_annotations( dict(xref="x", yref="y", showarrow=True, arrowhead=7, ax=0, ay=-40) ) From a203144f3c2cd59dd7382c228479109b56f1f2a4 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 2 Jun 2022 13:54:32 +0000 Subject: [PATCH 17/20] Fix encoding for unsupervised --- pycaret/anomaly/functional.py | 6 +++--- pycaret/classification/functional.py | 2 +- pycaret/classification/oop.py | 2 +- pycaret/clustering/functional.py | 6 +++--- pycaret/internal/preprocess/preprocessor.py | 2 +- pycaret/internal/preprocess/transformers.py | 4 ++++ .../internal/pycaret_experiment/unsupervised_experiment.py | 6 +++--- pycaret/regression/functional.py | 2 +- pycaret/regression/oop.py | 2 +- 9 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pycaret/anomaly/functional.py b/pycaret/anomaly/functional.py index 90c06ef5f..08cc910d7 100644 --- a/pycaret/anomaly/functional.py +++ b/pycaret/anomaly/functional.py @@ -32,7 +32,7 @@ def setup( numeric_imputation: str = "mean", categorical_imputation: str = "constant", text_features_method: str = "tf-idf", - max_encoding_ohe: int = 5, + max_encoding_ohe: int = -1, encoding_method: Optional[Any] = None, polynomial_features: bool = False, polynomial_degree: int = 2, @@ -161,11 +161,11 @@ def setup( text embeddings. - max_encoding_ohe: int, default = 5 + max_encoding_ohe: int, default = -1 Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/classification/functional.py b/pycaret/classification/functional.py index 0fb035d77..0c76cf64c 100644 --- a/pycaret/classification/functional.py +++ b/pycaret/classification/functional.py @@ -238,7 +238,7 @@ def setup( Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/classification/oop.py b/pycaret/classification/oop.py index 9fbde0832..e21dfb13b 100644 --- a/pycaret/classification/oop.py +++ b/pycaret/classification/oop.py @@ -332,7 +332,7 @@ def setup( Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/clustering/functional.py b/pycaret/clustering/functional.py index e4570b028..8033e69ba 100644 --- a/pycaret/clustering/functional.py +++ b/pycaret/clustering/functional.py @@ -32,7 +32,7 @@ def setup( numeric_imputation: str = "mean", categorical_imputation: str = "constant", text_features_method: str = "tf-idf", - max_encoding_ohe: int = 5, + max_encoding_ohe: int = -1, encoding_method: Optional[Any] = None, polynomial_features: bool = False, polynomial_degree: int = 2, @@ -162,11 +162,11 @@ def setup( text embeddings. 
- max_encoding_ohe: int, default = 5 + max_encoding_ohe: int, default = -1 Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/internal/preprocess/preprocessor.py b/pycaret/internal/preprocess/preprocessor.py index 8a87973e2..7532dfdf8 100644 --- a/pycaret/internal/preprocess/preprocessor.py +++ b/pycaret/internal/preprocess/preprocessor.py @@ -507,7 +507,7 @@ def _encoding(self, max_encoding_ohe, encoding_method): n_unique = self.X[col].nunique(dropna=False) if n_unique == 2: self._fxs["Ordinal"][col] = list(sorted(self.X[col].unique())) - elif n_unique <= max_encoding_ohe: + elif max_encoding_ohe < 0 or n_unique <= max_encoding_ohe: one_hot_cols.append(col) else: rest_cols.append(col) diff --git a/pycaret/internal/preprocess/transformers.py b/pycaret/internal/preprocess/transformers.py index de8c83a7c..3f6225b6c 100644 --- a/pycaret/internal/preprocess/transformers.py +++ b/pycaret/internal/preprocess/transformers.py @@ -179,6 +179,10 @@ def fit(self, X=None, y=None, **fit_params): if "y" in transformer_params and y is not None: args.append(y) + print(X, y) + print(transformer_params) + print(self.transformer) + print(args) self.transformer.fit(*args, **fit_params) return self diff --git a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py index f09510dac..c3a922361 100644 --- a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py @@ -98,7 +98,7 @@ def setup( numeric_imputation: str = "mean", categorical_imputation: str = "constant", text_features_method: str = "tf-idf", - max_encoding_ohe: int = 5, + max_encoding_ohe: int = -1, encoding_method: Optional[Any] = None, polynomial_features: bool = False, polynomial_degree: int = 2, @@ -229,11 +229,11 @@ def setup( text embeddings. - max_encoding_ohe: int, default = 5 + max_encoding_ohe: int, default = -1 Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/regression/functional.py b/pycaret/regression/functional.py index 82c6912e4..4ac03d9b5 100644 --- a/pycaret/regression/functional.py +++ b/pycaret/regression/functional.py @@ -238,7 +238,7 @@ def setup( Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None diff --git a/pycaret/regression/oop.py b/pycaret/regression/oop.py index f9dd74c48..3b141efd3 100644 --- a/pycaret/regression/oop.py +++ b/pycaret/regression/oop.py @@ -297,7 +297,7 @@ def setup( Categorical columns with `max_encoding_ohe` or less unique values are encoded using OneHotEncoding. If more, the `encoding_method` estimator is used. 
Note that columns with exactly two classes are always encoded - ordinally. + ordinally. Set to below 0 to always use OneHotEncoding. encoding_method: category-encoders estimator, default = None From b826c4f355c237e1a25dcac6fefdf3cb9449ebff Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 2 Jun 2022 14:00:54 +0000 Subject: [PATCH 18/20] Remove debug --- pycaret/internal/preprocess/transformers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pycaret/internal/preprocess/transformers.py b/pycaret/internal/preprocess/transformers.py index 3f6225b6c..de8c83a7c 100644 --- a/pycaret/internal/preprocess/transformers.py +++ b/pycaret/internal/preprocess/transformers.py @@ -179,10 +179,6 @@ def fit(self, X=None, y=None, **fit_params): if "y" in transformer_params and y is not None: args.append(y) - print(X, y) - print(transformer_params) - print(self.transformer) - print(args) self.transformer.fit(*args, **fit_params) return self From 94cf4954dcb3d7e03690459901a2636f311edf19 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 2 Jun 2022 15:40:09 +0000 Subject: [PATCH 19/20] Fix clustering metrics --- .../pycaret_experiment/unsupervised_experiment.py | 8 +++++++- pycaret/internal/utils.py | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py index c3a922361..b8caecc31 100644 --- a/pycaret/internal/pycaret_experiment/unsupervised_experiment.py +++ b/pycaret/internal/pycaret_experiment/unsupervised_experiment.py @@ -1421,6 +1421,9 @@ def _create_model( # Storing X_train and y_train in data_X and data_y parameter data_X = self.X if X_data is None else X_data + transformed_data = ( + self.X_transformed if X_data is None else self.pipeline.transform(X_data) + ) """ MONITOR UPDATE STARTS @@ -1523,7 +1526,10 @@ def _create_model( gt = None if self._ml_usecase == MLUsecase.CLUSTERING: - metrics = self._calculate_metrics(data_X, model.labels_, ground_truth=gt) + with redirect_output(self.logger): + metrics = self._calculate_metrics( + transformed_data, model.labels_, ground_truth=gt + ) else: metrics = {} diff --git a/pycaret/internal/utils.py b/pycaret/internal/utils.py index 3add5e0fd..2609a8e16 100644 --- a/pycaret/internal/utils.py +++ b/pycaret/internal/utils.py @@ -487,9 +487,11 @@ def _calculate_unsupervised_metric( if not score_func: return None target = ground_truth if container.needs_ground_truth else X + if target is None: + return 0 try: calculated_metric = score_func(target, labels, **container.args) - except: + except Exception: calculated_metric = 0 return (display_name, calculated_metric) From 2b9c55cdba5b9db3b5ae18c84f28084f9a417772 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Thu, 2 Jun 2022 17:23:59 +0000 Subject: [PATCH 20/20] Fix --- pycaret/internal/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pycaret/internal/utils.py b/pycaret/internal/utils.py index 2609a8e16..c277f47dc 100644 --- a/pycaret/internal/utils.py +++ b/pycaret/internal/utils.py @@ -488,11 +488,12 @@ def _calculate_unsupervised_metric( return None target = ground_truth if container.needs_ground_truth else X if target is None: - return 0 - try: - calculated_metric = score_func(target, labels, **container.args) - except Exception: calculated_metric = 0 + else: + try: + calculated_metric = score_func(target, labels, **container.args) + except Exception: + calculated_metric = 0 return (display_name, calculated_metric)
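
For reference, a minimal standalone sketch of the guard that the last two patches converge on in `_calculate_unsupervised_metric` — assuming a scikit-learn style score function taking `(target, labels)`; the helper name below is illustrative, not part of pycaret's API:

    from typing import Any, Callable, Optional

    def safe_unsupervised_metric(
        score_func: Callable[..., float],
        X: Any,
        labels: Any,
        ground_truth: Optional[Any] = None,
        needs_ground_truth: bool = False,
        **kwargs,
    ) -> float:
        # Supervised-style metrics (e.g. Rand Index) compare against ground
        # truth labels; the rest (e.g. Silhouette) score against the features.
        target = ground_truth if needs_ground_truth else X
        # A missing target or a failing scorer both degrade to 0 instead of
        # raising, matching the docstring note "such metrics are returned as 0.0".
        if target is None:
            return 0
        try:
            return score_func(target, labels, **kwargs)
        except Exception:
            return 0

With `needs_ground_truth=True` and no `ground_truth` supplied (the usual case for clustering without labels), this returns 0 rather than erroring out mid-`create_model`.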