From e5d12a76020f8a5bac87695ab87011166a56097c Mon Sep 17 00:00:00 2001 From: Donato Riccio Date: Sun, 4 Dec 2022 07:56:19 +0100 Subject: [PATCH] v0.3.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added camels 🐫 - Added more metrics: models are now evaluated on MSE, MAE, RMSE for regression AUC, F1, Precision, Recall, Accuracy, Log loss for classification - Reworked stacking - Added docstrings - Other minor changes --- .gitignore | 2 +- README.md | 2 +- notebooks/regression_example/regression.ipynb | 1452 ++++++++++++----- pycaML/experiments.py | 413 +++-- pycaML/models.py | 111 +- setup.py | 4 +- 6 files changed, 1347 insertions(+), 637 deletions(-) diff --git a/.gitignore b/.gitignore index aa35f9a..b5d7096 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ __pycache__/ *.py[cod] *$py.class - +dist/ # C extensions *.so install.bat diff --git a/README.md b/README.md index cbc9ba6..9ec0201 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![image](logo.png) -### Python Comparative Analysis for Machine Learning +### 🐫 Python Comparative Analysis for Machine Learning 🐫 pycaML is an easy machine learning model comparison tool with optimization. It allows to generate a table comparing multiple machine learning models, to see which one is best for your data. The unique feature of pycaML is built-in hyperparameters tuning using Bayesian Optimization. It also supports meta-models like Stacking and Voting ensembles. You can setup and optimize 25 models with one line of code. diff --git a/notebooks/regression_example/regression.ipynb b/notebooks/regression_example/regression.ipynb index b3cde5c..c76da9f 100644 --- a/notebooks/regression_example/regression.ipynb +++ b/notebooks/regression_example/regression.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "![image](logo.png)\n", - "### Python Comparative Analysis for Machine Learning\n" + "### 🐫 Python Comparative Analysis for Machine Learning 🐫\n" ] }, { @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -84,230 +84,476 @@ " \n", " \n", " \n", + " CV mean_squared_error\n", + " CV mean_absolute_error\n", + " CV root_mean_squared_error\n", + " Test mean_squared_error\n", + " Test mean_absolute_error\n", + " Test root_mean_squared_error\n", + " STD mean_squared_error\n", + " STD mean_absolute_error\n", + " STD root_mean_squared_error\n", + " \n", + " \n", " Model\n", - " RMSE_test\n", - " RMSE_cv\n", - " RMSE_std\n", - " Time\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 10\n", - " EBM\n", - " 25.559451\n", - " 26.538906\n", - " 2.928745\n", - " 1.310682\n", - " \n", - " \n", - " 2\n", - " Random Forest\n", - " 26.527714\n", - " 27.335807\n", - " 3.334061\n", - " 0.198201\n", + " Ridge\n", + " 684.0937\n", + " 19.7132\n", + " 25.9922\n", + " 704.9065\n", + " 20.6151\n", + " 26.5501\n", + " 149.3266\n", + " 2.0916\n", + " 2.9153\n", " \n", " \n", - " 15\n", - " Least Angle Regression\n", - " 26.545738\n", - " 26.156182\n", - " 2.910194\n", - " 0.001802\n", + " Least Angle Regression\n", + " 684.1458\n", + " 19.7181\n", + " 25.9938\n", + " 704.6762\n", + " 20.6074\n", + " 26.5457\n", + " 149.0704\n", + " 2.0922\n", + " 2.9102\n", " \n", " \n", - " 13\n", - " Ridge\n", - " 26.550075\n", - " 26.155185\n", - " 2.915300\n", - " 0.000800\n", + " Bayesian Ridge\n", + " 685.2642\n", + " 19.7011\n", + " 26.0095\n", + " 707.7618\n", + " 20.6916\n", + " 
26.6038\n", + " 151.7873\n", + " 2.0711\n", + " 2.9613\n", " \n", " \n", - " 19\n", - " Bayesian Ridge\n", - " 26.603792\n", - " 26.177551\n", - " 2.961254\n", - " 0.001001\n", + " Lasso\n", + " 687.2510\n", + " 19.6490\n", + " 26.0409\n", + " 710.1292\n", + " 20.7857\n", + " 26.6482\n", + " 155.7772\n", + " 2.0639\n", + " 3.0206\n", " \n", " \n", - " 12\n", - " Lasso\n", - " 26.648249\n", - " 26.215473\n", - " 3.020647\n", - " 0.000601\n", + " TheilSen\n", + " 711.0897\n", + " 19.8694\n", + " 26.4725\n", + " 736.8144\n", + " 20.6382\n", + " 27.1443\n", + " 167.3953\n", + " 2.1714\n", + " 3.2089\n", " \n", " \n", - " 21\n", - " TheilSen\n", - " 27.144326\n", - " 26.666264\n", - " 3.208906\n", - " 0.557400\n", + " Huber\n", + " 728.2750\n", + " 20.4693\n", + " 26.8886\n", + " 796.3084\n", + " 21.8721\n", + " 28.2189\n", + " 123.0590\n", + " 1.6999\n", + " 2.2976\n", " \n", " \n", - " 16\n", - " Orthogonal Matching Pursuit\n", - " 27.353908\n", - " 28.552806\n", - " 2.754056\n", - " 0.001002\n", + " ExtraTrees\n", + " 744.8984\n", + " 20.0806\n", + " 27.1331\n", + " 769.2363\n", + " 21.5691\n", + " 27.7351\n", + " 162.8124\n", + " 1.7093\n", + " 2.9489\n", " \n", " \n", - " 0\n", - " Bagging\n", - " 27.691749\n", - " 28.809562\n", - " 3.732251\n", - " 0.021000\n", + " CatBoost\n", + " 746.5364\n", + " 19.9194\n", + " 27.1227\n", + " 810.2917\n", + " 22.4741\n", + " 28.4656\n", + " 187.3362\n", + " 2.3122\n", + " 3.3004\n", " \n", " \n", - " 7\n", - " ExtraTrees\n", - " 27.735110\n", - " 27.292827\n", - " 2.948893\n", - " 0.107601\n", + " Random Forest\n", + " 747.2463\n", + " 20.3745\n", + " 27.1317\n", + " 703.7196\n", + " 20.8853\n", + " 26.5277\n", + " 183.1579\n", + " 2.2459\n", + " 3.3341\n", " \n", " \n", - " 11\n", - " Elastic Net\n", - " 28.204873\n", - " 27.343020\n", - " 3.136312\n", - " 0.000801\n", + " Elastic Net\n", + " 747.6407\n", + " 20.5127\n", + " 27.1626\n", + " 795.5149\n", + " 22.1152\n", + " 28.2049\n", + " 168.0972\n", + " 1.9605\n", + " 3.1363\n", " \n", " \n", - " 20\n", - " Huber\n", - " 28.218937\n", - " 26.986570\n", - " 2.297568\n", - " 0.022125\n", + " Gradient Boost\n", + " 753.4367\n", + " 20.7405\n", + " 27.2509\n", + " 841.0631\n", + " 22.8157\n", + " 29.0011\n", + " 184.1624\n", + " 2.0780\n", + " 3.2898\n", " \n", " \n", - " 6\n", - " CatBoost\n", - " 28.465623\n", - " 27.322818\n", - " 3.300429\n", - " 1.320558\n", + " LightGBM\n", + " 758.7599\n", + " 20.7362\n", + " 27.3093\n", + " 859.4543\n", + " 23.2591\n", + " 29.3165\n", + " 203.9306\n", + " 2.5687\n", + " 3.6003\n", " \n", " \n", - " 14\n", - " Support Vector Machine\n", - " 28.748077\n", - " 29.775458\n", - " 3.801151\n", - " 0.006601\n", + " AdaBoost\n", + " 818.5568\n", + " 22.2063\n", + " 28.4734\n", + " 862.9119\n", + " 23.6130\n", + " 29.3754\n", + " 157.5894\n", + " 1.4543\n", + " 2.7965\n", " \n", " \n", - " 8\n", - " KNN\n", - " 28.888397\n", - " 29.704169\n", - " 3.227931\n", - " 0.000800\n", + " Bagging\n", + " 829.9909\n", + " 21.5680\n", + " 28.5668\n", + " 766.8330\n", + " 22.2857\n", + " 27.6917\n", + " 217.1026\n", + " 2.4416\n", + " 3.7323\n", " \n", " \n", - " 5\n", - " Gradient Boost\n", - " 29.001088\n", - " 27.448801\n", - " 3.289781\n", - " 0.131603\n", + " Multi-layer Perceptron\n", + " 854.7047\n", + " 22.1185\n", + " 29.0971\n", + " 907.1321\n", + " 23.0873\n", + " 30.1186\n", + " 166.6640\n", + " 2.1374\n", + " 2.8392\n", " \n", " \n", - " 4\n", - " LightGBM\n", - " 29.294440\n", - " 27.541973\n", - " 3.603834\n", - " 1.590878\n", + " XGBoost\n", + " 857.2271\n", + " 
21.9636\n", + " 29.1338\n", + " 901.9107\n", + " 22.7981\n", + " 30.0318\n", + " 172.2244\n", + " 1.8080\n", + " 2.9071\n", " \n", " \n", - " 3\n", - " XGBoost\n", - " 29.375231\n", - " 29.392813\n", - " 2.936202\n", - " 4.173599\n", + " KNN\n", + " 882.3376\n", + " 22.2571\n", + " 29.5283\n", + " 834.5395\n", + " 21.7883\n", + " 28.8884\n", + " 186.7553\n", + " 2.3731\n", + " 3.2279\n", " \n", " \n", - " 9\n", - " AdaBoost\n", - " 29.375362\n", - " 28.610432\n", - " 2.796502\n", - " 0.047202\n", + " Support Vector Machine\n", + " 886.5779\n", + " 22.3483\n", + " 29.5318\n", + " 826.4519\n", + " 23.1545\n", + " 28.7481\n", + " 231.1391\n", + " 2.2385\n", + " 3.8012\n", " \n", " \n", - " 17\n", - " Multi-layer Perceptron\n", - " 30.118634\n", - " 29.235333\n", - " 2.839160\n", - " 0.215202\n", + " Orthogonal Matching Pursuit\n", + " 917.6524\n", + " 23.2431\n", + " 30.0840\n", + " 911.6742\n", + " 24.4009\n", + " 30.1939\n", + " 214.8147\n", + " 1.9912\n", + " 3.5504\n", " \n", " \n", - " 22\n", - " RANSAC\n", - " 30.749452\n", - " 30.541906\n", - " 3.006828\n", - " 0.052602\n", + " RANSAC\n", + " 932.8080\n", + " 23.3761\n", + " 30.3935\n", + " 945.5288\n", + " 25.2823\n", + " 30.7495\n", + " 186.4959\n", + " 1.7930\n", + " 3.0068\n", " \n", " \n", - " 1\n", - " Decision Tree\n", - " 37.105501\n", - " 38.796203\n", - " 3.965597\n", - " 0.002303\n", + " Decision Tree\n", + " 1505.1454\n", + " 29.0987\n", + " 38.5930\n", + " 1376.8182\n", + " 28.4416\n", + " 37.1055\n", + " 305.2433\n", + " 2.9050\n", + " 3.9656\n", " \n", " \n", - " 18\n", - " Passive Aggressive\n", - " 38.768380\n", - " 39.939330\n", - " 2.664298\n", - " 0.000800\n", + " Passive Aggressive\n", + " 1595.1501\n", + " 30.3474\n", + " 39.8504\n", + " 1502.9873\n", + " 31.3098\n", + " 38.7684\n", + " 213.7343\n", + " 2.2203\n", + " 2.6643\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Model RMSE_test RMSE_cv RMSE_std Time\n", - "10 EBM 25.559451 26.538906 2.928745 1.310682\n", - "2 Random Forest 26.527714 27.335807 3.334061 0.198201\n", - "15 Least Angle Regression 26.545738 26.156182 2.910194 0.001802\n", - "13 Ridge 26.550075 26.155185 2.915300 0.000800\n", - "19 Bayesian Ridge 26.603792 26.177551 2.961254 0.001001\n", - "12 Lasso 26.648249 26.215473 3.020647 0.000601\n", - "21 TheilSen 27.144326 26.666264 3.208906 0.557400\n", - "16 Orthogonal Matching Pursuit 27.353908 28.552806 2.754056 0.001002\n", - "0 Bagging 27.691749 28.809562 3.732251 0.021000\n", - "7 ExtraTrees 27.735110 27.292827 2.948893 0.107601\n", - "11 Elastic Net 28.204873 27.343020 3.136312 0.000801\n", - "20 Huber 28.218937 26.986570 2.297568 0.022125\n", - "6 CatBoost 28.465623 27.322818 3.300429 1.320558\n", - "14 Support Vector Machine 28.748077 29.775458 3.801151 0.006601\n", - "8 KNN 28.888397 29.704169 3.227931 0.000800\n", - "5 Gradient Boost 29.001088 27.448801 3.289781 0.131603\n", - "4 LightGBM 29.294440 27.541973 3.603834 1.590878\n", - "3 XGBoost 29.375231 29.392813 2.936202 4.173599\n", - "9 AdaBoost 29.375362 28.610432 2.796502 0.047202\n", - "17 Multi-layer Perceptron 30.118634 29.235333 2.839160 0.215202\n", - "22 RANSAC 30.749452 30.541906 3.006828 0.052602\n", - "1 Decision Tree 37.105501 38.796203 3.965597 0.002303\n", - "18 Passive Aggressive 38.768380 39.939330 2.664298 0.000800" + " CV mean_squared_error CV mean_absolute_error \\\n", + "Model \n", + "Ridge 684.0937 19.7132 \n", + "Least Angle Regression 684.1458 19.7181 \n", + "Bayesian Ridge 685.2642 19.7011 \n", + "Lasso 687.2510 19.6490 \n", + "TheilSen 711.0897 19.8694 \n", + 
"Huber 728.2750 20.4693 \n", + "ExtraTrees 744.8984 20.0806 \n", + "CatBoost 746.5364 19.9194 \n", + "Random Forest 747.2463 20.3745 \n", + "Elastic Net 747.6407 20.5127 \n", + "Gradient Boost 753.4367 20.7405 \n", + "LightGBM 758.7599 20.7362 \n", + "AdaBoost 818.5568 22.2063 \n", + "Bagging 829.9909 21.5680 \n", + "Multi-layer Perceptron 854.7047 22.1185 \n", + "XGBoost 857.2271 21.9636 \n", + "KNN 882.3376 22.2571 \n", + "Support Vector Machine 886.5779 22.3483 \n", + "Orthogonal Matching Pursuit 917.6524 23.2431 \n", + "RANSAC 932.8080 23.3761 \n", + "Decision Tree 1505.1454 29.0987 \n", + "Passive Aggressive 1595.1501 30.3474 \n", + "\n", + " CV root_mean_squared_error \\\n", + "Model \n", + "Ridge 25.9922 \n", + "Least Angle Regression 25.9938 \n", + "Bayesian Ridge 26.0095 \n", + "Lasso 26.0409 \n", + "TheilSen 26.4725 \n", + "Huber 26.8886 \n", + "ExtraTrees 27.1331 \n", + "CatBoost 27.1227 \n", + "Random Forest 27.1317 \n", + "Elastic Net 27.1626 \n", + "Gradient Boost 27.2509 \n", + "LightGBM 27.3093 \n", + "AdaBoost 28.4734 \n", + "Bagging 28.5668 \n", + "Multi-layer Perceptron 29.0971 \n", + "XGBoost 29.1338 \n", + "KNN 29.5283 \n", + "Support Vector Machine 29.5318 \n", + "Orthogonal Matching Pursuit 30.0840 \n", + "RANSAC 30.3935 \n", + "Decision Tree 38.5930 \n", + "Passive Aggressive 39.8504 \n", + "\n", + " Test mean_squared_error \\\n", + "Model \n", + "Ridge 704.9065 \n", + "Least Angle Regression 704.6762 \n", + "Bayesian Ridge 707.7618 \n", + "Lasso 710.1292 \n", + "TheilSen 736.8144 \n", + "Huber 796.3084 \n", + "ExtraTrees 769.2363 \n", + "CatBoost 810.2917 \n", + "Random Forest 703.7196 \n", + "Elastic Net 795.5149 \n", + "Gradient Boost 841.0631 \n", + "LightGBM 859.4543 \n", + "AdaBoost 862.9119 \n", + "Bagging 766.8330 \n", + "Multi-layer Perceptron 907.1321 \n", + "XGBoost 901.9107 \n", + "KNN 834.5395 \n", + "Support Vector Machine 826.4519 \n", + "Orthogonal Matching Pursuit 911.6742 \n", + "RANSAC 945.5288 \n", + "Decision Tree 1376.8182 \n", + "Passive Aggressive 1502.9873 \n", + "\n", + " Test mean_absolute_error \\\n", + "Model \n", + "Ridge 20.6151 \n", + "Least Angle Regression 20.6074 \n", + "Bayesian Ridge 20.6916 \n", + "Lasso 20.7857 \n", + "TheilSen 20.6382 \n", + "Huber 21.8721 \n", + "ExtraTrees 21.5691 \n", + "CatBoost 22.4741 \n", + "Random Forest 20.8853 \n", + "Elastic Net 22.1152 \n", + "Gradient Boost 22.8157 \n", + "LightGBM 23.2591 \n", + "AdaBoost 23.6130 \n", + "Bagging 22.2857 \n", + "Multi-layer Perceptron 23.0873 \n", + "XGBoost 22.7981 \n", + "KNN 21.7883 \n", + "Support Vector Machine 23.1545 \n", + "Orthogonal Matching Pursuit 24.4009 \n", + "RANSAC 25.2823 \n", + "Decision Tree 28.4416 \n", + "Passive Aggressive 31.3098 \n", + "\n", + " Test root_mean_squared_error \\\n", + "Model \n", + "Ridge 26.5501 \n", + "Least Angle Regression 26.5457 \n", + "Bayesian Ridge 26.6038 \n", + "Lasso 26.6482 \n", + "TheilSen 27.1443 \n", + "Huber 28.2189 \n", + "ExtraTrees 27.7351 \n", + "CatBoost 28.4656 \n", + "Random Forest 26.5277 \n", + "Elastic Net 28.2049 \n", + "Gradient Boost 29.0011 \n", + "LightGBM 29.3165 \n", + "AdaBoost 29.3754 \n", + "Bagging 27.6917 \n", + "Multi-layer Perceptron 30.1186 \n", + "XGBoost 30.0318 \n", + "KNN 28.8884 \n", + "Support Vector Machine 28.7481 \n", + "Orthogonal Matching Pursuit 30.1939 \n", + "RANSAC 30.7495 \n", + "Decision Tree 37.1055 \n", + "Passive Aggressive 38.7684 \n", + "\n", + " STD mean_squared_error STD mean_absolute_error \\\n", + "Model \n", + "Ridge 149.3266 2.0916 \n", + "Least Angle 
Regression 149.0704 2.0922 \n", + "Bayesian Ridge 151.7873 2.0711 \n", + "Lasso 155.7772 2.0639 \n", + "TheilSen 167.3953 2.1714 \n", + "Huber 123.0590 1.6999 \n", + "ExtraTrees 162.8124 1.7093 \n", + "CatBoost 187.3362 2.3122 \n", + "Random Forest 183.1579 2.2459 \n", + "Elastic Net 168.0972 1.9605 \n", + "Gradient Boost 184.1624 2.0780 \n", + "LightGBM 203.9306 2.5687 \n", + "AdaBoost 157.5894 1.4543 \n", + "Bagging 217.1026 2.4416 \n", + "Multi-layer Perceptron 166.6640 2.1374 \n", + "XGBoost 172.2244 1.8080 \n", + "KNN 186.7553 2.3731 \n", + "Support Vector Machine 231.1391 2.2385 \n", + "Orthogonal Matching Pursuit 214.8147 1.9912 \n", + "RANSAC 186.4959 1.7930 \n", + "Decision Tree 305.2433 2.9050 \n", + "Passive Aggressive 213.7343 2.2203 \n", + "\n", + " STD root_mean_squared_error \n", + "Model \n", + "Ridge 2.9153 \n", + "Least Angle Regression 2.9102 \n", + "Bayesian Ridge 2.9613 \n", + "Lasso 3.0206 \n", + "TheilSen 3.2089 \n", + "Huber 2.2976 \n", + "ExtraTrees 2.9489 \n", + "CatBoost 3.3004 \n", + "Random Forest 3.3341 \n", + "Elastic Net 3.1363 \n", + "Gradient Boost 3.2898 \n", + "LightGBM 3.6003 \n", + "AdaBoost 2.7965 \n", + "Bagging 3.7323 \n", + "Multi-layer Perceptron 2.8392 \n", + "XGBoost 2.9071 \n", + "KNN 3.2279 \n", + "Support Vector Machine 3.8012 \n", + "Orthogonal Matching Pursuit 3.5504 \n", + "RANSAC 3.0068 \n", + "Decision Tree 3.9656 \n", + "Passive Aggressive 2.6643 " ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -346,22 +592,17 @@ "metadata": {}, "source": [ "The above experiment trained all the available models with default parameters.\n", - "To perform model stacking, let's create another experiment. There is no need to run `load_data` if the data has already been copied in the data folder.\n" + "To perform model stacking, let's create another experiment. There is no need to run `load_data` if the data has already been copied in the data folder.\n", + "Stacking is started with the `start` function. 
There are two parameters:\n", + "`n_estimators` decides how many models to include in the final estimator\n", + "`estimators` can be `best`, `random` or `all`\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result loaded\n", - "Data loaded\n" - ] - }, { "data": { "text/html": [ @@ -383,75 +624,90 @@ " \n", " \n", " \n", + " Test mean_squared_error\n", + " Test mean_absolute_error\n", + " Test root_mean_squared_error\n", + " Estimators\n", + " N_estimators\n", + " \n", + " \n", " Model\n", - " RMSE_test\n", - " RMSE_cv\n", - " RMSE_std\n", - " Time\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " Voting (all)\n", - " 25.959632\n", - " 25.956527\n", - " 3.301492\n", - " 8.595250\n", + " Stacking\n", + " 697.102287\n", + " 20.604786\n", + " 26.402695\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 6\n", " \n", " \n", - " 1\n", - " Stacking (diverse)\n", - " 25.996212\n", - " 26.696336\n", - " 2.807487\n", - " 7.218289\n", + " Voting\n", + " 705.505953\n", + " 20.588648\n", + " 26.561362\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 6\n", " \n", " \n", - " 2\n", - " Voting (diverse)\n", - " 26.310165\n", - " 27.016540\n", - " 3.538600\n", - " 1.429399\n", + " Stacking\n", + " 680.004624\n", + " 20.567459\n", + " 26.076898\n", + " [Ridge, Least Angle Regression, Bayesian Ridge...\n", + " 10\n", " \n", " \n", - " 3\n", - " Stacking (all)\n", - " 27.066082\n", - " 31.819227\n", - " 3.693124\n", - " 48.291322\n", + " Voting\n", + " 691.613451\n", + " 20.495616\n", + " 26.298545\n", + " [Ridge, Least Angle Regression, Bayesian Ridge...\n", + " 10\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Model RMSE_test RMSE_cv RMSE_std Time\n", - "0 Voting (all) 25.959632 25.956527 3.301492 8.595250\n", - "1 Stacking (diverse) 25.996212 26.696336 2.807487 7.218289\n", - "2 Voting (diverse) 26.310165 27.016540 3.538600 1.429399\n", - "3 Stacking (all) 27.066082 31.819227 3.693124 48.291322" + " Test mean_squared_error Test mean_absolute_error \\\n", + "Model \n", + "Stacking 697.102287 20.604786 \n", + "Voting 705.505953 20.588648 \n", + "Stacking 680.004624 20.567459 \n", + "Voting 691.613451 20.495616 \n", + "\n", + " Test root_mean_squared_error \\\n", + "Model \n", + "Stacking 26.402695 \n", + "Voting 26.561362 \n", + "Stacking 26.076898 \n", + "Voting 26.298545 \n", + "\n", + " Estimators N_estimators \n", + "Model \n", + "Stacking ['Ridge', 'Least Angle Regression', 'Bayesian ... 6 \n", + "Voting ['Ridge', 'Least Angle Regression', 'Bayesian ... 6 \n", + "Stacking [Ridge, Least Angle Regression, Bayesian Ridge... 10 \n", + "Voting [Ridge, Least Angle Regression, Bayesian Ridge... 10 " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "diabetes_stacking = RegressionExperiment(name = 'diabetes', stacking = True)\n", - "diabetes_stacking.start()\n", - "diabetes_stacking.result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Stacking all uses every other model as level 0 estimators.\n", - "Stacking and voting diverse only take the best model from boosting, the best for parallel models (RF, ET) and simple models." 
+ "diabetes_stacking = RegressionExperiment(name = 'diabetes')\n", + "diabetes_stacking.stack(n_estimators=10, estimators='best')\n", + "diabetes_stacking.stack_result" ] }, { @@ -459,12 +715,12 @@ "metadata": {}, "source": [ "# Hyperparameter tuning\n", - "Hyperparameters tuning is as simple as creating another experiment with the flag `tuning = True`. You can also stack tuned models by combining the parameters." + "Hyperparameters tuning is as simple as creating another experiment with the flag `tuning`. You can choose which metric to optimize. You can also stack tuned models by starting `stack` after the tuning process saved the parameters." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -496,254 +752,491 @@ " \n", " \n", " \n", + " CV mean_squared_error\n", + " CV mean_absolute_error\n", + " CV root_mean_squared_error\n", + " Test mean_squared_error\n", + " Test mean_absolute_error\n", + " Test root_mean_squared_error\n", + " STD mean_squared_error\n", + " STD mean_absolute_error\n", + " STD root_mean_squared_error\n", + " \n", + " \n", " Model\n", - " RMSE_test\n", - " RMSE_cv\n", - " RMSE_std\n", - " Time\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " Gradient Boost\n", - " 25.391383\n", - " 26.304448\n", - " 3.162734\n", - " 0.036801\n", - " \n", - " \n", - " 1\n", - " CatBoost\n", - " 25.699729\n", - " 26.004154\n", - " 2.967139\n", - " 0.390401\n", + " CatBoost\n", + " 676.6180\n", + " 19.3714\n", + " 25.8387\n", + " 660.4535\n", + " 20.3281\n", + " 25.6993\n", + " 156.7793\n", + " 2.1262\n", + " 2.9969\n", " \n", " \n", - " 2\n", - " XGBoost\n", - " 25.729658\n", - " 26.294288\n", - " 3.030478\n", - " 0.971916\n", + " Bagging\n", + " 683.8560\n", + " 19.7232\n", + " 25.9889\n", + " 693.9194\n", + " 20.4580\n", + " 26.3423\n", + " 149.6330\n", + " 2.0606\n", + " 2.9039\n", " \n", " \n", - " 3\n", - " Random Forest\n", - " 25.737857\n", - " 26.269705\n", - " 3.185983\n", - " 0.109799\n", + " Bayesian Ridge\n", + " 684.0655\n", + " 19.7309\n", + " 25.9896\n", + " 701.3845\n", + " 20.5940\n", + " 26.4837\n", + " 150.4353\n", + " 2.0754\n", + " 2.9333\n", " \n", " \n", - " 4\n", - " EBM\n", - " 25.850996\n", - " 26.266700\n", - " 3.019551\n", - " 2.336289\n", + " Ridge\n", + " 684.0680\n", + " 19.7091\n", + " 25.9910\n", + " 705.2452\n", + " 20.6255\n", + " 26.5565\n", + " 149.6693\n", + " 2.0888\n", + " 2.9220\n", " \n", " \n", - " 5\n", - " ExtraTrees\n", - " 25.932177\n", - " 26.308251\n", - " 3.110662\n", - " 0.131400\n", + " Elastic Net\n", + " 684.0738\n", + " 19.7088\n", + " 25.9910\n", + " 705.4142\n", + " 20.6309\n", + " 26.5596\n", + " 149.7063\n", + " 2.0886\n", + " 2.9227\n", " \n", " \n", - " 6\n", - " AdaBoost\n", - " 26.138107\n", - " 26.551887\n", - " 3.302247\n", - " 0.060601\n", + " Least Angle Regression\n", + " 684.1458\n", + " 19.7181\n", + " 25.9938\n", + " 704.6762\n", + " 20.6074\n", + " 26.5457\n", + " 149.0704\n", + " 2.0922\n", + " 2.9102\n", " \n", " \n", - " 7\n", - " Huber\n", - " 26.446251\n", - " 26.183322\n", - " 3.107596\n", - " 0.049999\n", + " Orthogonal Matching Pursuit\n", + " 684.1458\n", + " 19.7181\n", + " 25.9938\n", + " 704.6762\n", + " 20.6074\n", + " 26.5457\n", + " 149.0704\n", + " 2.0922\n", + " 2.9102\n", " \n", " \n", - " 8\n", - " Bayesian Ridge\n", - " 26.483656\n", - " 26.154648\n", - " 2.933320\n", - " 0.001399\n", + " Lasso\n", + " 684.1503\n", + " 19.7177\n", + " 25.9938\n", + " 
704.6500\n", + " 20.6076\n", + " 26.5452\n", + " 149.0870\n", + " 2.0924\n", + " 2.9105\n", " \n", " \n", - " 9\n", - " Lasso\n", - " 26.544964\n", - " 26.156317\n", - " 2.910634\n", - " 0.000401\n", + " XGBoost\n", + " 684.5187\n", + " 19.5088\n", + " 25.9646\n", + " 651.5563\n", + " 20.1063\n", + " 25.5256\n", + " 171.0888\n", + " 2.2113\n", + " 3.2187\n", " \n", " \n", - " 10\n", - " Orthogonal Matching Pursuit\n", - " 26.545738\n", - " 26.156182\n", - " 2.910194\n", - " 0.001001\n", + " Huber\n", + " 685.6506\n", + " 19.6731\n", + " 26.0001\n", + " 699.0842\n", + " 20.5891\n", + " 26.4402\n", + " 159.0710\n", + " 2.2705\n", + " 3.1058\n", " \n", " \n", - " 11\n", - " Least Angle Regression\n", - " 26.545738\n", - " 26.156182\n", - " 2.910194\n", - " 0.001201\n", + " Gradient Boost\n", + " 692.6052\n", + " 19.7358\n", + " 26.1287\n", + " 645.2124\n", + " 20.0168\n", + " 25.4010\n", + " 166.2710\n", + " 2.0416\n", + " 3.1459\n", " \n", " \n", - " 12\n", - " Elastic Net\n", - " 26.550968\n", - " 26.155314\n", - " 2.916315\n", - " 0.000800\n", + " ExtraTrees\n", + " 693.0793\n", + " 19.6231\n", + " 26.1426\n", + " 685.0125\n", + " 20.5549\n", + " 26.1727\n", + " 164.5836\n", + " 2.1053\n", + " 3.1059\n", " \n", " \n", - " 13\n", - " Ridge\n", - " 26.556609\n", - " 26.154694\n", - " 2.922195\n", - " 0.000400\n", + " Random Forest\n", + " 694.1602\n", + " 19.7976\n", + " 26.1586\n", + " 672.0296\n", + " 20.7556\n", + " 25.9235\n", + " 167.4893\n", + " 2.0814\n", + " 3.1443\n", " \n", " \n", - " 14\n", - " LightGBM\n", - " 26.653051\n", - " 26.337168\n", - " 3.164260\n", - " 0.721503\n", + " LightGBM\n", + " 695.5160\n", + " 19.7107\n", + " 26.1474\n", + " 684.3669\n", + " 20.5817\n", + " 26.1604\n", + " 183.2713\n", + " 2.2417\n", + " 3.4391\n", " \n", " \n", - " 15\n", - " Decision Tree\n", - " 26.679701\n", - " 27.293106\n", - " 3.269204\n", - " 0.000800\n", + " AdaBoost\n", + " 709.2120\n", + " 20.0572\n", + " 26.4378\n", + " 709.9564\n", + " 20.7782\n", + " 26.6450\n", + " 172.0645\n", + " 1.9621\n", + " 3.2023\n", " \n", " \n", - " 16\n", - " Multi-layer Perceptron\n", - " 26.732545\n", - " 26.081884\n", - " 2.988715\n", - " 0.710802\n", + " Multi-layer Perceptron\n", + " 709.3580\n", + " 19.9573\n", + " 26.4480\n", + " 919.7696\n", + " 24.0812\n", + " 30.3277\n", + " 163.8074\n", + " 2.0761\n", + " 3.1401\n", " \n", " \n", - " 17\n", - " Bagging\n", - " 26.760420\n", - " 26.137304\n", - " 2.980012\n", - " 0.002000\n", + " TheilSen\n", + " 711.0895\n", + " 19.8694\n", + " 26.4725\n", + " 736.8149\n", + " 20.6382\n", + " 27.1443\n", + " 167.3952\n", + " 2.1714\n", + " 3.2089\n", " \n", " \n", - " 18\n", - " Support Vector Machine\n", - " 26.922687\n", - " 27.404745\n", - " 3.383549\n", - " 0.017600\n", + " Passive Aggressive\n", + " 721.8817\n", + " 20.1606\n", + " 26.6849\n", + " 717.7113\n", + " 21.2244\n", + " 26.7901\n", + " 165.8263\n", + " 2.3752\n", + " 3.1302\n", " \n", " \n", - " 19\n", - " Passive Aggressive\n", - " 26.945734\n", - " 27.069196\n", - " 3.166522\n", - " 0.020298\n", + " Decision Tree\n", + " 744.9136\n", + " 20.8393\n", + " 27.0966\n", + " 711.8065\n", + " 20.8870\n", + " 26.6797\n", + " 180.4933\n", + " 1.7543\n", + " 3.2692\n", " \n", " \n", - " 20\n", - " KNN\n", - " 27.114459\n", - " 27.710887\n", - " 3.630735\n", - " 0.000599\n", + " Support Vector Machine\n", + " 751.2402\n", + " 20.1239\n", + " 27.2016\n", + " 720.4212\n", + " 20.7776\n", + " 26.8407\n", + " 183.0611\n", + " 2.3151\n", + " 3.3632\n", " \n", " \n", - " 21\n", - " TheilSen\n", - " 
27.144333\n", - " 26.666261\n", - " 3.208905\n", - " 0.608800\n", + " KNN\n", + " 765.2230\n", + " 20.3443\n", + " 27.4184\n", + " 737.9767\n", + " 20.8905\n", + " 27.1657\n", + " 203.4487\n", + " 2.2831\n", + " 3.6681\n", " \n", " \n", - " 22\n", - " RANSAC\n", - " 121.321709\n", - " 412.312345\n", - " 311.184799\n", - " 0.001601\n", + " RANSAC\n", + " 170001.4696\n", + " 192.1917\n", + " 270.4912\n", + " 14718.9571\n", + " 91.2385\n", + " 121.3217\n", + " 308687.3858\n", + " 213.4380\n", + " 311.1848\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Model RMSE_test RMSE_cv RMSE_std Time\n", - "0 Gradient Boost 25.391383 26.304448 3.162734 0.036801\n", - "1 CatBoost 25.699729 26.004154 2.967139 0.390401\n", - "2 XGBoost 25.729658 26.294288 3.030478 0.971916\n", - "3 Random Forest 25.737857 26.269705 3.185983 0.109799\n", - "4 EBM 25.850996 26.266700 3.019551 2.336289\n", - "5 ExtraTrees 25.932177 26.308251 3.110662 0.131400\n", - "6 AdaBoost 26.138107 26.551887 3.302247 0.060601\n", - "7 Huber 26.446251 26.183322 3.107596 0.049999\n", - "8 Bayesian Ridge 26.483656 26.154648 2.933320 0.001399\n", - "9 Lasso 26.544964 26.156317 2.910634 0.000401\n", - "10 Orthogonal Matching Pursuit 26.545738 26.156182 2.910194 0.001001\n", - "11 Least Angle Regression 26.545738 26.156182 2.910194 0.001201\n", - "12 Elastic Net 26.550968 26.155314 2.916315 0.000800\n", - "13 Ridge 26.556609 26.154694 2.922195 0.000400\n", - "14 LightGBM 26.653051 26.337168 3.164260 0.721503\n", - "15 Decision Tree 26.679701 27.293106 3.269204 0.000800\n", - "16 Multi-layer Perceptron 26.732545 26.081884 2.988715 0.710802\n", - "17 Bagging 26.760420 26.137304 2.980012 0.002000\n", - "18 Support Vector Machine 26.922687 27.404745 3.383549 0.017600\n", - "19 Passive Aggressive 26.945734 27.069196 3.166522 0.020298\n", - "20 KNN 27.114459 27.710887 3.630735 0.000599\n", - "21 TheilSen 27.144333 26.666261 3.208905 0.608800\n", - "22 RANSAC 121.321709 412.312345 311.184799 0.001601" + " CV mean_squared_error CV mean_absolute_error \\\n", + "Model \n", + "CatBoost 676.6180 19.3714 \n", + "Bagging 683.8560 19.7232 \n", + "Bayesian Ridge 684.0655 19.7309 \n", + "Ridge 684.0680 19.7091 \n", + "Elastic Net 684.0738 19.7088 \n", + "Least Angle Regression 684.1458 19.7181 \n", + "Orthogonal Matching Pursuit 684.1458 19.7181 \n", + "Lasso 684.1503 19.7177 \n", + "XGBoost 684.5187 19.5088 \n", + "Huber 685.6506 19.6731 \n", + "Gradient Boost 692.6052 19.7358 \n", + "ExtraTrees 693.0793 19.6231 \n", + "Random Forest 694.1602 19.7976 \n", + "LightGBM 695.5160 19.7107 \n", + "AdaBoost 709.2120 20.0572 \n", + "Multi-layer Perceptron 709.3580 19.9573 \n", + "TheilSen 711.0895 19.8694 \n", + "Passive Aggressive 721.8817 20.1606 \n", + "Decision Tree 744.9136 20.8393 \n", + "Support Vector Machine 751.2402 20.1239 \n", + "KNN 765.2230 20.3443 \n", + "RANSAC 170001.4696 192.1917 \n", + "\n", + " CV root_mean_squared_error \\\n", + "Model \n", + "CatBoost 25.8387 \n", + "Bagging 25.9889 \n", + "Bayesian Ridge 25.9896 \n", + "Ridge 25.9910 \n", + "Elastic Net 25.9910 \n", + "Least Angle Regression 25.9938 \n", + "Orthogonal Matching Pursuit 25.9938 \n", + "Lasso 25.9938 \n", + "XGBoost 25.9646 \n", + "Huber 26.0001 \n", + "Gradient Boost 26.1287 \n", + "ExtraTrees 26.1426 \n", + "Random Forest 26.1586 \n", + "LightGBM 26.1474 \n", + "AdaBoost 26.4378 \n", + "Multi-layer Perceptron 26.4480 \n", + "TheilSen 26.4725 \n", + "Passive Aggressive 26.6849 \n", + "Decision Tree 27.0966 \n", + "Support Vector Machine 27.2016 \n", + "KNN 27.4184 \n", + 
"RANSAC 270.4912 \n", + "\n", + " Test mean_squared_error \\\n", + "Model \n", + "CatBoost 660.4535 \n", + "Bagging 693.9194 \n", + "Bayesian Ridge 701.3845 \n", + "Ridge 705.2452 \n", + "Elastic Net 705.4142 \n", + "Least Angle Regression 704.6762 \n", + "Orthogonal Matching Pursuit 704.6762 \n", + "Lasso 704.6500 \n", + "XGBoost 651.5563 \n", + "Huber 699.0842 \n", + "Gradient Boost 645.2124 \n", + "ExtraTrees 685.0125 \n", + "Random Forest 672.0296 \n", + "LightGBM 684.3669 \n", + "AdaBoost 709.9564 \n", + "Multi-layer Perceptron 919.7696 \n", + "TheilSen 736.8149 \n", + "Passive Aggressive 717.7113 \n", + "Decision Tree 711.8065 \n", + "Support Vector Machine 720.4212 \n", + "KNN 737.9767 \n", + "RANSAC 14718.9571 \n", + "\n", + " Test mean_absolute_error \\\n", + "Model \n", + "CatBoost 20.3281 \n", + "Bagging 20.4580 \n", + "Bayesian Ridge 20.5940 \n", + "Ridge 20.6255 \n", + "Elastic Net 20.6309 \n", + "Least Angle Regression 20.6074 \n", + "Orthogonal Matching Pursuit 20.6074 \n", + "Lasso 20.6076 \n", + "XGBoost 20.1063 \n", + "Huber 20.5891 \n", + "Gradient Boost 20.0168 \n", + "ExtraTrees 20.5549 \n", + "Random Forest 20.7556 \n", + "LightGBM 20.5817 \n", + "AdaBoost 20.7782 \n", + "Multi-layer Perceptron 24.0812 \n", + "TheilSen 20.6382 \n", + "Passive Aggressive 21.2244 \n", + "Decision Tree 20.8870 \n", + "Support Vector Machine 20.7776 \n", + "KNN 20.8905 \n", + "RANSAC 91.2385 \n", + "\n", + " Test root_mean_squared_error \\\n", + "Model \n", + "CatBoost 25.6993 \n", + "Bagging 26.3423 \n", + "Bayesian Ridge 26.4837 \n", + "Ridge 26.5565 \n", + "Elastic Net 26.5596 \n", + "Least Angle Regression 26.5457 \n", + "Orthogonal Matching Pursuit 26.5457 \n", + "Lasso 26.5452 \n", + "XGBoost 25.5256 \n", + "Huber 26.4402 \n", + "Gradient Boost 25.4010 \n", + "ExtraTrees 26.1727 \n", + "Random Forest 25.9235 \n", + "LightGBM 26.1604 \n", + "AdaBoost 26.6450 \n", + "Multi-layer Perceptron 30.3277 \n", + "TheilSen 27.1443 \n", + "Passive Aggressive 26.7901 \n", + "Decision Tree 26.6797 \n", + "Support Vector Machine 26.8407 \n", + "KNN 27.1657 \n", + "RANSAC 121.3217 \n", + "\n", + " STD mean_squared_error STD mean_absolute_error \\\n", + "Model \n", + "CatBoost 156.7793 2.1262 \n", + "Bagging 149.6330 2.0606 \n", + "Bayesian Ridge 150.4353 2.0754 \n", + "Ridge 149.6693 2.0888 \n", + "Elastic Net 149.7063 2.0886 \n", + "Least Angle Regression 149.0704 2.0922 \n", + "Orthogonal Matching Pursuit 149.0704 2.0922 \n", + "Lasso 149.0870 2.0924 \n", + "XGBoost 171.0888 2.2113 \n", + "Huber 159.0710 2.2705 \n", + "Gradient Boost 166.2710 2.0416 \n", + "ExtraTrees 164.5836 2.1053 \n", + "Random Forest 167.4893 2.0814 \n", + "LightGBM 183.2713 2.2417 \n", + "AdaBoost 172.0645 1.9621 \n", + "Multi-layer Perceptron 163.8074 2.0761 \n", + "TheilSen 167.3952 2.1714 \n", + "Passive Aggressive 165.8263 2.3752 \n", + "Decision Tree 180.4933 1.7543 \n", + "Support Vector Machine 183.0611 2.3151 \n", + "KNN 203.4487 2.2831 \n", + "RANSAC 308687.3858 213.4380 \n", + "\n", + " STD root_mean_squared_error \n", + "Model \n", + "CatBoost 2.9969 \n", + "Bagging 2.9039 \n", + "Bayesian Ridge 2.9333 \n", + "Ridge 2.9220 \n", + "Elastic Net 2.9227 \n", + "Least Angle Regression 2.9102 \n", + "Orthogonal Matching Pursuit 2.9102 \n", + "Lasso 2.9105 \n", + "XGBoost 3.2187 \n", + "Huber 3.1058 \n", + "Gradient Boost 3.1459 \n", + "ExtraTrees 3.1059 \n", + "Random Forest 3.1443 \n", + "LightGBM 3.4391 \n", + "AdaBoost 3.2023 \n", + "Multi-layer Perceptron 3.1401 \n", + "TheilSen 3.2089 \n", + "Passive Aggressive 
3.1302 \n", + "Decision Tree 3.2692 \n", + "Support Vector Machine 3.3632 \n", + "KNN 3.6681 \n", + "RANSAC 311.1848 " ] }, - "execution_count": 7, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "diabetes_tuned = RegressionExperiment(name = 'diabetes', tuning = True)\n", - "#n_eval optional paramerer, default = 100\n", - "diabetes_tuned.start(n_eval = 100)\n", + "diabetes_tuned = RegressionExperiment(name = 'diabetes', tuning = 'mean_squared_error')\n", + "diabetes_tuned.start()\n", "diabetes_tuned.result" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result loaded\n", - "Data loaded\n" - ] - }, { "data": { "text/html": [ @@ -765,67 +1258,134 @@ " \n", " \n", " \n", + " Test mean_squared_error\n", + " Test mean_absolute_error\n", + " Test root_mean_squared_error\n", + " Estimators\n", + " N_estimators\n", + " \n", + " \n", " Model\n", - " RMSE_test\n", - " RMSE_cv\n", - " RMSE_std\n", - " Time\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " Voting (diverse)\n", - " 25.501409\n", - " 25.962292\n", - " 3.271162\n", - " 2.733740\n", + " Stacking\n", + " 697.102287\n", + " 20.604786\n", + " 26.402695\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 6\n", + " \n", + " \n", + " Voting\n", + " 705.505953\n", + " 20.588648\n", + " 26.561362\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 6\n", " \n", " \n", - " 1\n", - " Stacking (diverse)\n", - " 25.948605\n", - " 26.168364\n", - " 3.114270\n", - " 14.864656\n", + " Stacking\n", + " 680.004624\n", + " 20.567459\n", + " 26.076898\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 10\n", " \n", " \n", - " 2\n", - " Voting (all)\n", - " 26.771776\n", - " 31.761112\n", - " 10.639915\n", - " 6.386552\n", + " Voting\n", + " 691.613451\n", + " 20.495616\n", + " 26.298545\n", + " ['Ridge', 'Least Angle Regression', 'Bayesian ...\n", + " 10\n", " \n", " \n", - " 3\n", - " Stacking (all)\n", - " 626.631348\n", - " 172.005079\n", - " 60.608150\n", - " 36.860789\n", + " Stacking\n", + " 665.105399\n", + " 20.219939\n", + " 25.789637\n", + " ['CatBoost', 'Bagging', 'Bayesian Ridge', 'Rid...\n", + " 10\n", + " \n", + " \n", + " Voting\n", + " 692.044351\n", + " 20.483104\n", + " 26.306736\n", + " ['CatBoost', 'Bagging', 'Bayesian Ridge', 'Rid...\n", + " 10\n", + " \n", + " \n", + " Stacking\n", + " 665.105399\n", + " 20.219939\n", + " 25.789637\n", + " [CatBoost, Bagging, Bayesian Ridge, Ridge, Ela...\n", + " 10\n", + " \n", + " \n", + " Voting\n", + " 692.044351\n", + " 20.483104\n", + " 26.306736\n", + " [CatBoost, Bagging, Bayesian Ridge, Ridge, Ela...\n", + " 10\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Model RMSE_test RMSE_cv RMSE_std Time\n", - "0 Voting (diverse) 25.501409 25.962292 3.271162 2.733740\n", - "1 Stacking (diverse) 25.948605 26.168364 3.114270 14.864656\n", - "2 Voting (all) 26.771776 31.761112 10.639915 6.386552\n", - "3 Stacking (all) 626.631348 172.005079 60.608150 36.860789" + " Test mean_squared_error Test mean_absolute_error \\\n", + "Model \n", + "Stacking 697.102287 20.604786 \n", + "Voting 705.505953 20.588648 \n", + "Stacking 680.004624 20.567459 \n", + "Voting 691.613451 20.495616 \n", + "Stacking 665.105399 20.219939 \n", + "Voting 692.044351 20.483104 \n", + "Stacking 665.105399 20.219939 \n", + "Voting 692.044351 20.483104 \n", + "\n", + " Test 
root_mean_squared_error \\\n", + "Model \n", + "Stacking 26.402695 \n", + "Voting 26.561362 \n", + "Stacking 26.076898 \n", + "Voting 26.298545 \n", + "Stacking 25.789637 \n", + "Voting 26.306736 \n", + "Stacking 25.789637 \n", + "Voting 26.306736 \n", + "\n", + " Estimators N_estimators \n", + "Model \n", + "Stacking ['Ridge', 'Least Angle Regression', 'Bayesian ... 6 \n", + "Voting ['Ridge', 'Least Angle Regression', 'Bayesian ... 6 \n", + "Stacking ['Ridge', 'Least Angle Regression', 'Bayesian ... 10 \n", + "Voting ['Ridge', 'Least Angle Regression', 'Bayesian ... 10 \n", + "Stacking ['CatBoost', 'Bagging', 'Bayesian Ridge', 'Rid... 10 \n", + "Voting ['CatBoost', 'Bagging', 'Bayesian Ridge', 'Rid... 10 \n", + "Stacking [CatBoost, Bagging, Bayesian Ridge, Ridge, Ela... 10 \n", + "Voting [CatBoost, Bagging, Bayesian Ridge, Ridge, Ela... 10 " ] }, - "execution_count": 8, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "diabetes_stacking_tuned = RegressionExperiment(name = 'diabetes', tuning = True, stacking = True)\n", - "diabetes_stacking_tuned.start()\n", - "diabetes_stacking_tuned.result" + "diabetes_stacking_tuned = RegressionExperiment(name = 'diabetes', tuning = 'mean_squared_error')\n", + "diabetes_stacking_tuned.stack()\n", + "diabetes_stacking_tuned.stack_result" ] }, { diff --git a/pycaML/experiments.py b/pycaML/experiments.py index 401eb01..ad1b958 100644 --- a/pycaML/experiments.py +++ b/pycaML/experiments.py @@ -1,20 +1,30 @@ + +from IPython.display import clear_output import pandas as pd import numpy as np +from hyperopt import hp from sklearn.model_selection import train_test_split +from sklearn.utils._testing import ignore_warnings import os import pickle from os.path import exists from sklearn import metrics +from sklearn.metrics import * from sklearn.model_selection import cross_validate +#import convergencewarning +from sklearn.exceptions import ConvergenceWarning from datetime import datetime from statistics import mean from hyperopt import STATUS_OK, Trials, fmin, space_eval, tpe from sklearn import model_selection from .models import models_class, models_reg, models_stacking_class, models_stacking_reg -from .models import bagging_models, boosting_models, simple_models_reg, simple_models_class + + +#check if model has predict_proba method + +class Experiment: -class Experiment(): result = None names = [] params = [] @@ -30,13 +40,46 @@ class Experiment(): n_j = -2 def __init__(self, name, stacking=False, tuning=False, array=True): - # name: name of the experiment - # stacking: True or False + """ + Base class for experiments. Every experiment is a folder in the experiments folder. + It creates four subfolders: data, params, results and trials. + The data folder contains the train and test sets. + The params folder contains the optimized hyperparameters. + The results folder contains the results of the experiments. + The trials folder contains the trials of the hyperparameter optimization. + + Args: + name (str): name of the experiment + stacking (bool, optional): Whether it's a stacking experiment. Defaults to False. + tuning (bool / str, optional): Specifies the metric to optimize in + tuning experiments. Supported values are + 'accuracy', 'f1', 'precision', 'recall', 'roc_auc', 'neg_mean_squared_error', + 'neg_mean_absolute_error', 'neg_median_absolute_error', 'r2'. Defaults to False. + array (bool, optional): Whether to convert the data to numpy arrays. Defaults to True. 
+ + + Raises: + ValueError: If tuning is not a valid metric. + """ + + if tuning not in ['accuracy', 'f1', 'precision', 'recall', 'roc_auc', + 'mean_squared_error', 'mean_absolute_error', 'root_mean_squared_error']: + raise ValueError('Tuning must be one of: accuracy, f1, precision, recall, roc_auc, mean_squared_error, mean_absolute_error, root_mean_squared_error', 'root_mean_squared_error') + self.sort_by = tuning + neg_scoring = ['mean_absolute_error', 'mean_squared_error', 'mean_squared_error', + 'log_loss'] + + self.tuning = tuning if tuning not in neg_scoring else 'neg_' + tuning - self.tuning = tuning self.stacking = stacking self.name = name + if self.tuning == 'neg_log_loss': + #delete models that don't have predict_proba method + for model in list(models_class.keys()): + if not hasattr(models_class[model]['algo'], 'predict_proba'): + self.models.pop(model) + dirs = ['params', 'trials', 'tables', 'data'] for i in dirs: path = f'experiments/{self.name}/{i}' @@ -44,18 +87,19 @@ def __init__(self, name, stacking=False, tuning=False, array=True): os.makedirs(path) print(f'Directory {path} created!') - if tuning is True and stacking is True: + if tuning != False and stacking is True: self.exp_type = 'stacking_tuning' - elif tuning is True and stacking is False: + elif tuning != False and stacking is False: self.exp_type = 'tuning' elif tuning is False and stacking is True: self.exp_type = 'stacking' else: self.exp_type = 'default' - file = f'experiments/{self.name}/tables/{self.name}_{self.exp_type}.csv' - if exists(file): - self.result = pd.read_csv(file) + #if the results file exists, load it + self.result_path = f'experiments/{self.name}/tables/{self.name}_{self.exp_type}_{tuning if tuning else ""}.csv' + if exists(self.result_path): + self.result = pd.read_csv(self.result_path, index_col = 0) print('Result loaded') for i in self.models.keys(): @@ -65,7 +109,6 @@ def __init__(self, name, stacking=False, tuning=False, array=True): with open(file, 'rb') as f: self.models[i]['opt_params'] = pickle.load(f) # print(f'Parameters for {i} loaded') - # print(self.models[i]['opt_params'] ) self.dataset = [ f'experiments/{self.name}/data/X_train.csv', @@ -73,7 +116,7 @@ def __init__(self, name, stacking=False, tuning=False, array=True): f'experiments/{self.name}/data/X_test.csv', f'experiments/{self.name}/data/y_test.csv' ] - + #if the dataset exists, load it if all(exists(i) for i in self.dataset): self.X_train = pd.read_csv(self.dataset[0]).to_numpy() if array else pd.read_csv(self.dataset[0]) self.y_train = pd.read_csv(self.dataset[1]).to_numpy().ravel() if array else pd.read_csv(self.dataset[1]) @@ -81,7 +124,18 @@ def __init__(self, name, stacking=False, tuning=False, array=True): self.y_test = pd.read_csv(self.dataset[3]).to_numpy().ravel() if array else pd.read_csv(self.dataset[3]) print('Data loaded') + def load_data(self, path, target, split=0.2, array=True): + """ + Loads data from a csv file, splits it into train and test sets, + and copies it to the experiment/data folder. + + Args: + path (str): Path to the csv file. + target (str): Name of the target column. + split (float, optional): Ratio between test and train set. Defaults to 0.2. + array (bool, optional): Whether to convert the data to numpy arrays. Defaults to True. 
+ """ data = pd.read_csv(path) self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( @@ -103,47 +157,82 @@ def load_data(self, path, target, split=0.2, array=True): self.y_train = self.y_train.to_numpy().ravel() self.y_test = self.y_test.to_numpy().ravel() - # trains the models and saves the results - def start(self, cv=5, raise_errors=False, n_eval=100): + @ignore_warnings(category=ConvergenceWarning) + def start(self, cv=5, raise_errors=False, n_eval=100, gpu = False, warnings=False): + """Start the experiment. At the end of the experiment, the results are saved in the experiment/tables folder. + + Args: + cv (int, optional): Number of folds for cross validation. Defaults to 5. + raise_errors (bool, optional): Whether to raise errors. Defaults to False. + n_eval (int, optional): Number of evaluations for hyperparameter tuning. Defaults to 100. + gpu (bool, optional): Whether to use GPU. Only supported for XGBoost, CatBoost and LightGBM. + Defaults to False. + warnings (bool, optional): Whether to show warnings. Defaults to False. + """ + #warnings off + if warnings is False: + import warnings + warnings.filterwarnings("ignore") + + if gpu: + self.models['XGBoost']['def_params']['tree_method'] = 'gpu_hist' + self.models['XGBoost']['space']['tree_method'] = hp.choice('tree_method', ['gpu_hist']) + + # del self.models['XGBoost']['opt_params']['tree_method'] + self.models['LightGBM']['def_params']['device'] = 'gpu' + self.models['LightGBM']['space']['device'] = hp.choice('device', ['gpu']) + # del self.models['LightGBM']['opt_params']['device'] + # del self.models['CatBoost']['def_params']['task_type'] + # del self.models['CatBoost']['opt_params']['task_type'] + result = { + 'Model': self.models.keys(), + } + + if self.result is not None: return + scores_reg = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'] + scores_multi = ['f1_weighted', 'neg_log_loss', 'accuracy'] + scores_bin = ['neg_log_loss', 'roc_auc', 'accuracy', 'precision', 'recall'] + if isinstance(self, RegressionExperiment): - self.scoring = ['neg_mean_squared_error'] - mean_mse = [] - std_mse = [] - mse_test = [] - duration_time = [] + self.scoring = scores_reg else: - mean_F1 = [] - std_F1 = [] - F1_test = [] - duration_time = [] - precision = [] - recall = [] self.n_classes = len(np.unique(self.y_train)) - self.scoring = ['f1_weighted'] if self.n_classes > 2 else ['f1', 'precision', 'recall'] + self.scoring = scores_multi if self.n_classes > 2 else scores_bin + if self.tuning == 'f1' and self.n_classes > 2: + self.tuning = 'f1_weighted' + self.sort_by = 'f1_weighted' + + for score_type in ['CV', 'Test', 'STD']: + for score in self.scoring: + result[f'{score_type} {score.replace("neg_", "")}'] = [] if self.stacking: self.build_stack() self.models = self.stacking_models.copy() for (k, model_name) in enumerate(self.models.keys()): - now = datetime.now() - current_time = now.strftime("%H:%M:%S") + + if isinstance(self, ClassificationExperiment) and 'neg_log_loss' not in self.scoring: + self.scoring.append('neg_log_loss') + + current_time = datetime.now().strftime("%H:%M:%S") if self.stacking is True: model = self.models[model_name] if self.exp_type == 'tuning': + model = self.models[model_name] - print(current_time, ': 📘 Optimizing', model_name, '📘') file = f'experiments/{self.name}/params/{self.name}_{model_name}.pkl' if exists(file): - print('📘', model_name, 'already tuned 📘') + #print('🐫', model_name, 'already tuned 🐫') with open(file, 'rb') as f: params = pickle.load(f) 
else: params = self.optimize_model(model_name=model_name, n_eval=n_eval) + clear_output(wait=True) with open(file, 'wb+') as f: pickle.dump(params, f) print(file, 'saved') @@ -154,80 +243,98 @@ def start(self, cv=5, raise_errors=False, n_eval=100): else: model = self.models[model_name]['algo'](**self.models[model_name]['def_params']) - - print(f'{current_time}: Training {model_name} {k+1}/{len(self.models)+1} ', end='\r') - scores = cross_validate(model, - self.X_train, - self.y_train, - scoring=self.scoring, - cv=cv, - n_jobs=self.n_j, - error_score="raise" - ) - + clear_output(wait=True) + print(f'{current_time}: 🐫 {k+1}/{len(self.models)+1} 🐫 Training {model_name} 🐫 ', end='\r') + + model.fit(self.X_train, self.y_train) y_pred = model.predict(self.X_test) self.y_pred = y_pred - if isinstance(self, RegressionExperiment): - mean_mse.append(np.sqrt(-mean(scores['test_neg_mean_squared_error']))) - mse_test.append(np.sqrt(metrics.mean_squared_error(self.y_test, y_pred))) - duration_time.append(mean(scores['fit_time'])) - std_mse.append(np.std(np.sqrt(-scores['test_neg_mean_squared_error']))) - else: - if self.n_classes > 2: - mean_F1.append(mean(scores['test_f1_weighted'])) - std_F1.append(np.std(scores['test_f1_weighted'])) - F1_test.append(metrics.f1_score(self.y_test, y_pred, average='weighted')) - duration_time.append(mean(scores['fit_time'])) - else: - mean_F1.append(mean(scores['test_f1'])) - std_F1.append(np.std(scores['test_f1'])) - F1_test.append(metrics.f1_score(self.y_test, y_pred)) - duration_time.append(mean(scores['fit_time'])) - precision.append(metrics.precision_score(self.y_test, y_pred)) - recall.append(metrics.recall_score(self.y_test, y_pred)) - # append to the results dataframe the results of the model - - if isinstance(self, RegressionExperiment): - result = { - 'Model': self.models.keys(), - 'RMSE_test': mse_test, - 'RMSE_cv': mean_mse, - 'RMSE_std': std_mse, - 'Time': duration_time, - } - - result = pd.DataFrame(result).sort_values(by='RMSE_test', ascending=True) - - else: - result = { - 'Model': self.models.keys(), - 'F1_test': F1_test, - 'F1_cv': mean_F1, - 'F1_Std': std_F1, - 'Time': duration_time, - } + if isinstance(self, ClassificationExperiment): + try: + y_pred_proba = model.predict_proba(self.X_test) + except: + y_pred_proba = None + self.scoring = [x for x in self.scoring if x != 'neg_log_loss'] + result[f'CV log_loss'].append(np.nan) + result[f'STD log_loss'].append(np.nan) + + + if cv: + scores = cross_validate(model, + self.X_train, + self.y_train, + scoring = self.scoring, + cv=cv, + n_jobs=self.n_j, + error_score= 'raise' + ) + + for score in self.scoring: + result[f'CV {score.replace("neg_","")}'].append(scores[f'test_{score}'].mean()) + result[f'STD {score.replace("neg_","")}'].append(scores[f'test_{score}'].std()) + + + self.result = result + if self.X_test is not None: + if isinstance(self, RegressionExperiment): + result['Test mean_squared_error'].append(mean_squared_error(self.y_test, y_pred)) + result['Test root_mean_squared_error'].append(np.sqrt(mean_squared_error(self.y_test, y_pred))) + result['Test mean_absolute_error'].append(mean_absolute_error(self.y_test, y_pred)) + elif self.n_classes > 2: + result['Test f1_weighted'].append(metrics.f1_score(self.y_test, y_pred, average='weighted')) + result['Test accuracy'].append(metrics.accuracy_score(self.y_test, y_pred)) + result['Test log_loss'].append(log_loss(self.y_test, y_pred_proba) if 'neg_log_loss' in self.scoring else np.nan) + elif self.n_classes == 2: + result['Test 
roc_auc'].append(metrics.roc_auc_score(self.y_test, y_pred)) + result['Test accuracy'].append(metrics.accuracy_score(self.y_test, y_pred)) + result['Test precision'].append(metrics.precision_score(self.y_test, y_pred)) + result['Test recall'].append(metrics.recall_score(self.y_test, y_pred)) + result['Test log_loss'].append(log_loss(self.y_test, y_pred_proba) if 'neg_log_loss' in self.scoring else np.nan) + + result = pd.DataFrame(result) + result_ind = result['Model'] + result.drop('Model', axis=1, inplace=True) + result = result.apply(np.abs).apply(lambda x: round(x, 4)) + result.set_index(result_ind, inplace=True) + asc = True if isinstance(self, RegressionExperiment) or self.sort_by == 'log_loss' else False + sort_by = "CV " + self.sort_by if self.exp_type == "tuning" else result.columns[0] + result = result.sort_values(by=sort_by, ascending=asc) + + self.result = result - if self.n_classes == 2: - result['Precision_test'] = precision - result['Recall_test'] = recall + result.to_csv(self.result_path) + clear_output(wait=True) - result = pd.DataFrame(result).sort_values(by='F1_test', ascending=False) + def optimize_model(self, model_name, n_eval): + """ + Function used to optimize the hyperparameters of a model using Hyperopt. - self.result = result - result.to_csv(f'experiments/{self.name}/tables/{self.name}_{self.exp_type}.csv', index=False) + Args: + model_name (_type_): model to optimize. + n_eval (_type_): number of evaluations. - def optimize_model(self, model_name, n_eval): + Returns: + _type_: _description_ + """ + @ignore_warnings(category=ConvergenceWarning) def objective(space, model=self.models[model_name]['algo']): model = model(**space) + now = datetime.now().strftime("%H:%M:%S") + + print(f'{now}: 🐫 Optimizing {model_name} 🐫 ', end='\r') + losses = model_selection.cross_val_score( model, self.X_train, self.y_train, - scoring=self.scoring[0], + scoring=self.tuning, n_jobs=self.n_j) + clear_output(wait=True) + + return {'status': STATUS_OK, 'loss': -mean(losses), 'loss_variance': np.var(losses, ddof=1)} @@ -245,48 +352,118 @@ def objective(space, model=self.models[model_name]['algo']): pickle.dump(trial, f) return space_eval(self.models[model_name]['space'], best) - # method used for passing the parameters to the model. 
- # Stacking diverse models takes the best model from boosting and the best model from bagging - def build_stack(self): - if self.tuning: - base_result = pd.read_csv(f'experiments/{self.name}/tables/{self.name}_tuning.csv') - estimators_stacking_all = [(i, j['algo'](**self.models[i]['opt_params'])) for i, j in self.models.items()] - else: - base_result = pd.read_csv(f'experiments/{self.name}/tables/{self.name}_default.csv') - estimators_stacking_all = [(i, j['algo'](**self.models[i]['def_params'])) for i, j in self.models.items()] - - best_boosting = base_result.loc[base_result['Model'].isin(boosting_models)].iloc[0, 0] - best_bagging = base_result.loc[base_result['Model'].isin(bagging_models)].iloc[0, 0] - self.stacking_estimators_names_diverse.append(best_bagging) - self.stacking_estimators_names_diverse.append(best_boosting) - - self.stacking_models['Stacking (all)']['def_params']['estimators'] = estimators_stacking_all - voting_estimators = [i for i in estimators_stacking_all if i[0] != 'Ridge Classifier'] - self.stacking_models['Voting (all)']['def_params']['estimators'] = voting_estimators - self.stacking_models_diverse = {m: self.models[m] for m in self.stacking_estimators_names_diverse} - - stack_dict = self.stacking_models_diverse.items() - if self.tuning is False: - estimators_stacking_diverse = [(i, j['algo'](**self.models[i]['def_params'])) for i, j in stack_dict] + def stack(self, n_estimators = 10, estimators = 'best'): + """Start stacking process. This function trains a stacking and + + Args: + n_estimators (int, optional): _description_. Defaults to 10. + estimators (str, optional): One of "best", "all", "random". + Best takes the n best models, all takes all models, random takes n random models. + + Returns: + _type_: _description_ + """ + from .models import models_stacking_class, models_stacking_reg + if isinstance(self, ClassificationExperiment): + self.n_classes = len(np.unique(self.y_train)) + self.stacking_models = models_stacking_class else: - estimators_stacking_diverse = [(i, j['algo'](**self.models[i]['opt_params'])) for i, j in stack_dict] + self.stacking_models = models_stacking_reg - self.stacking_models['Stacking (diverse)']['def_params']['estimators'] = estimators_stacking_diverse - self.stacking_models['Voting (diverse)']['def_params']['estimators'] = estimators_stacking_diverse + self.stack_result_path = f'experiments/{self.name}/tables/{self.name}_stacking.csv' + if exists(self.stack_result_path): + self.stack_result = pd.read_csv(self.stack_result_path, index_col=0) + self.stack_existing = pd.read_csv(self.stack_result_path, index_col=0) + else: + self.stack_result = pd.read_csv(self.result_path, nrows = 0 ) + self.stack_result = self.stack_result[[x for x in self.stack_result.columns if 'Test' in x]] + + if estimators == 'best': + self.stack_estimators = self.result.index[:n_estimators].to_list() + elif estimators == 'all': + self.stack_estimators = self.result.index.to_list() + elif estimators == 'random': + self.stack_estimators = np.random.choice(self.result.index.to_list(), n_estimators, replace = False) + else: + self.stack_estimators = estimators + + self.stack = [] + for model_name in self.stack_estimators: + if model_name != 'CatBoost': + try: + self.stack.append((model_name, self.models[model_name]['algo'](**self.models[model_name]['opt_params']))) + except: + self.stack.append((model_name, self.models[model_name]['algo'](**self.models[model_name]['def_params']))) + + + + stacknames = [] + stackest = [] + self.predictions = [] + 
+
+        stacknames = []
+        stackest = []
+        self.predictions = []
+        self.predictions_proba = []
+        for est in self.models.values():
+            clear_output()
+            modello = est['algo'](**est['def_params']).fit(self.X_train, self.y_train)
+            self.predictions.append(modello.predict(self.X_test))
+            if hasattr(modello, 'predict_proba'):
+                self.predictions_proba.append(modello.predict_proba(self.X_test))
+
+        self.stack_result = self.stack_result.to_dict()
+        for i in self.stack_result:
+            self.stack_result[i] = []
+
+        for i in self.stacking_models.keys():
+            self.stacking_models[i].estimators = self.stack_base_estimators
+            if 'soft' in i:
+                # soft voting needs predict_proba, so keep only base models that expose it
+                self.soft_models = [(name, est) for name, est in self.stack_base_estimators if hasattr(est, 'predict_proba')]
+                self.stacking_models[i].estimators = self.soft_models
+
+            model = self.stacking_models[i].fit(self.X_train, self.y_train)
+            y_pred = model.predict(self.X_test)
+            try:
+                y_pred_proba = model.predict_proba(self.X_test)
+            except AttributeError:
+                # e.g. hard voting exposes no predict_proba
+                y_pred_proba = False
+            stacknames.append(i)
+            stackest.append(self.stack_estimators)
+            if isinstance(self, RegressionExperiment):
+                self.stack_result['Test mean_squared_error'].append(mean_squared_error(self.y_test, y_pred))
+                self.stack_result['Test root_mean_squared_error'].append(np.sqrt(mean_squared_error(self.y_test, y_pred)))
+                self.stack_result['Test mean_absolute_error'].append(mean_absolute_error(self.y_test, y_pred))
+            elif self.n_classes > 2:
+                self.stack_result['Test f1_weighted'].append(metrics.f1_score(self.y_test, y_pred, average='weighted'))
+                self.stack_result['Test accuracy'].append(metrics.accuracy_score(self.y_test, y_pred))
+                self.stack_result['Test log_loss'].append(log_loss(self.y_test, y_pred_proba) if y_pred_proba is not False else np.nan)
+            elif self.n_classes == 2:
+                self.stack_result['Test roc_auc'].append(metrics.roc_auc_score(self.y_test, y_pred))
+                self.stack_result['Test accuracy'].append(metrics.accuracy_score(self.y_test, y_pred))
+                self.stack_result['Test precision'].append(metrics.precision_score(self.y_test, y_pred))
+                self.stack_result['Test recall'].append(metrics.recall_score(self.y_test, y_pred))
+                self.stack_result['Test log_loss'].append(log_loss(self.y_test, y_pred_proba) if y_pred_proba is not False else np.nan)
+
+        self.stack_result['Model'] = stacknames
+        self.stack_result['Estimators'] = stackest
+        self.stack_result['N_estimators'] = [len(i) for i in stackest]
+        self.stack_result = pd.DataFrame(self.stack_result)
+        self.stack_result.set_index('Model', inplace=True)
+        # append to any previously saved stacking results
+        if exists(self.stack_result_path):
+            self.stack_result = pd.concat([self.stack_existing, self.stack_result])
+
+        self.stack_result.to_csv(self.stack_result_path)
+        return self.stack_result


 class RegressionExperiment(Experiment):
     def __init__(self, name, tuning=False, stacking=False):
         self.stacking_models = models_stacking_reg
         self.models = models_reg
-        self.scoring = ['neg_mean_squared_error']
-        self.stacking_estimators_names_diverse = simple_models_reg
         super().__init__(name=name, tuning=tuning, stacking=stacking)


 class ClassificationExperiment(Experiment):
     def __init__(self, name, tuning=False, stacking=False):
-        self.models = models_class
         self.stacking_models = models_stacking_class
-        self.stacking_estimators_names_diverse = simple_models_class
+        self.models = models_class
         super().__init__(name=name, tuning=tuning, stacking=stacking)
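End to end, the reworked stacking is driven from an experiment object roughly as follows. This is a hedged sketch: it assumes the class is imported from pycaML.experiments and that the experiment has already been run so exp.result holds the per-model table stack() reads; the dataset name and estimator count are invented:

    from pycaML.experiments import RegressionExperiment

    exp = RegressionExperiment('my_dataset')  # tuning=False -> default hyperparameters
    # ... load data and evaluate the base models here so exp.result is populated ...
    stack_table = exp.stack(n_estimators=5, estimators='best')  # 'best', 'all', 'random', or a list of names
    print(stack_table['Test root_mean_squared_error'])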

diff --git a/pycaML/models.py b/pycaML/models.py
index e56d9ee..6dcd0ad 100644
--- a/pycaML/models.py
+++ b/pycaML/models.py
@@ -1,4 +1,6 @@
-# this file is used for storing search spaces and models
+"""
+This module is used to store the models and search spaces used in pycaML.
+"""
 from sklearn import linear_model
 from hyperopt import hp
@@ -111,7 +113,7 @@
     'alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]),
     'lambda': hp.choice('lambda', [0, hp.loguniform('lambda_positive', -16, 2)]),
     'gamma': hp.choice('gamma', [0, hp.loguniform('gamma_positive', -16, 2)]),
-    'tree_method': hp.choice('tree_method', ['gpu_hist'])
+    # 'tree_method': hp.choice('tree_method', ['gpu_hist'])
 }

 space_lgbm = {
@@ -122,8 +124,7 @@
     'max_depth': hp.choice('max_depth', [-1, scope.int(hp.uniform('max_depth2', 1, 8))]),
     'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
     'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]),
-    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]),
-    'device': hp.choice('device', ['gpu'])
+    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)])
 }

 space_adab = {
@@ -249,9 +250,8 @@
     'n_nonzero_coefs': scope.int(hp.uniform('n_nonzero_coefs', 1, 100)),
     'tol': hp.loguniform('tol', -10, -1),
     'precompute': hp.choice('precompute', ['auto', True, False]),
-    'normalize': hp.choice('normalize', [True, False]),
+    'normalize': hp.choice('normalize', [False]),
     'fit_intercept': hp.choice('fit_intercept', [True, False]),
-
 }

 space_knn = {
@@ -263,7 +263,7 @@
 space_lars = {
     'fit_intercept': hp.choice('fit_intercept', [True, False]),
-    'normalize': hp.choice('normalize', [True, False]),
+    'normalize': hp.choice('normalize', [False]),
     'n_nonzero_coefs': scope.int(hp.uniform('n_nonzero_coefs', 1, 100)),
     'eps': hp.loguniform('eps', -10, -1),
     'random_state': 322,
@@ -281,8 +281,8 @@
     'cache_size': hp.loguniform('cache_size', 1, 4),
     'class_weight': hp.choice('class_weight', [None, 'balanced']),
     'max_iter': scope.int(hp.uniform('max_iter', -1, 10000)),
-    'decision_function_shape': hp.choice('decision_function_shape', ['ovo', 'ovr']),
-    'break_ties': hp.choice('break_ties', [False, True]),
+    'decision_function_shape': hp.choice('decision_function_shape', ['ovr']),
+    # 'break_ties': hp.choice('break_ties', [False]),
     'random_state': 322
 }
@@ -387,7 +387,7 @@
     PassiveAggressiveClassifier(), Perceptron(),
     PassiveAggressiveClassifier(),
-    SVC(), SVR(), MLPClassifier(),
+    MLPClassifier(),
     DecisionTreeClassifier(), KNeighborsClassifier(),
     GaussianNB()
@@ -416,42 +416,14 @@
 }

 models_stacking_reg = {
-    'Stacking (all)': {
-        'algo': StackingRegressor,
-        'def_params': {'final_estimator': linear_model.LinearRegression()},
-    },
-    'Voting (all)': {
-        'algo': VotingRegressor,
-        'def_params': {},
-    },
-
-    'Stacking (diverse)': {
-        'algo': StackingRegressor,
-        'def_params': {'final_estimator': linear_model.LinearRegression()},
-    },
-    'Voting (diverse)': {
-        'algo': VotingRegressor,
-        'def_params': {},
-    },
+    'Stacking': StackingRegressor(estimators = []),
+    'Voting': VotingRegressor(estimators = []),
 }
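One pattern worth calling out in the search spaces above: regularisation terms such as 'alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)]) let the optimizer either switch the penalty off entirely or sample it log-uniformly. A small self-contained sketch of how such a nested space behaves when sampled (the space and draw count are illustrative only):

    from hyperopt import hp
    from hyperopt.pyll import stochastic

    space = {'alpha': hp.choice('alpha', [0, hp.loguniform('alpha_positive', -16, 2)])}
    for _ in range(5):
        # roughly half of the draws are exactly 0, the rest span about 1e-7 to 7
        print(stochastic.sample(space))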

 models_stacking_class = {
-    'Stacking (all)': {
-        'algo': StackingClassifier,
-        'def_params': {'final_estimator': linear_model.LogisticRegression()},
-    },
-    'Voting (all)': {
-        'algo': VotingClassifier,
-        'def_params': {'voting': 'hard'},
-    },
-    'Stacking (diverse)': {
-        'algo': StackingClassifier,
-        'def_params': {'final_estimator': linear_model.LogisticRegression()},
-    },
-    'Voting (diverse)': {
-        'algo': VotingClassifier,
-        'def_params': {'voting': 'hard'},
-    },
+    'Stacking': StackingClassifier(estimators = []),
+    'Voting (hard)': VotingClassifier(estimators = [], voting = 'hard'),
+    'Voting (soft)': VotingClassifier(estimators = [], voting = 'soft'),
 }
@@ -478,13 +450,13 @@
         'algo': XGBRegressor,
         'space': space_xgb,
         'opt_params': {},
-        'def_params': {'tree_method': 'gpu_hist', 'random_state': 322}
+        'def_params': {'random_state': 322}
     },
     'LightGBM': {
         'algo': LGBMRegressor,
         'space': space_lgbm,
         'opt_params': {},
-        'def_params': {'device': 'gpu', 'random_state': 322}
+        'def_params': {'random_state': 322}
     },
     'Gradient Boost': {
         'algo': HistGradientBoostingRegressor,
@@ -516,12 +488,12 @@
         'opt_params': {},
         'def_params': {'random_state': 322}
     },
-    'EBM': {
-        'algo': ExplainableBoostingRegressor,
-        'space': space_ebm,
-        'opt_params': {},
-        'def_params': {'random_state': 322}
-    },
+    # 'EBM': {
+    #     'algo': ExplainableBoostingRegressor,
+    #     'space': space_ebm,
+    #     'opt_params': {},
+    #     'def_params': {'random_state': 322}
+    # },
     'Elastic Net': {
         'algo': ElasticNet,
         'space': space_en,
@@ -550,13 +522,13 @@
         'algo': Lars,
         'space': space_lars,
         'opt_params': {},
-        'def_params': {'random_state': 322}
+        'def_params': {'normalize': False, 'random_state': 322}
     },
     'Orthogonal Matching Pursuit': {
         'algo': OrthogonalMatchingPursuit,
         'space': space_omp,
         'opt_params': {},
-        'def_params': {}
+        'def_params': {'normalize': False},
     },
     'Multi-layer Perceptron': {
         'algo': MLPRegressor,
@@ -593,8 +565,9 @@
         'space': space_ransac,
         'opt_params': {},
         'def_params': {'random_state': 322}
-    },
-}
+    }
+
+    }

 models_class = {
     'Bagging': {
@@ -649,13 +622,13 @@
         'algo': XGBClassifier,
         'space': space_xgb,
         'opt_params': {},
-        'def_params': {'tree_method': 'gpu_hist', 'random_state': 322}
+        'def_params': {'random_state': 322}
     },
     'LightGBM': {
         'algo': LGBMClassifier,
         'space': space_lgbm,
         'opt_params': {},
-        'def_params': {'device': 'gpu', 'random_state': 322}
+        'def_params': {'random_state': 322}
     },
     'Gradient Boost': {
         'algo': HistGradientBoostingClassifier,
@@ -687,12 +660,12 @@
         'opt_params': {},
         'def_params': {'random_state': 322}
     },
-    'EBM': {
-        'algo': ExplainableBoostingClassifier,
-        'space': space_ebm,
-        'opt_params': {},
-        'def_params': {'random_state': 322}
-    },
+    # 'EBM': {
+    #     'algo': ExplainableBoostingClassifier,
+    #     'space': space_ebm,
+    #     'opt_params': {},
+    #     'def_params': {'random_state': 322}
+    # },
     'Logistic Regression': {
         'algo': LogisticRegression,
         'space': space_LOGReg,
@@ -717,10 +690,10 @@
         'opt_params': {},
         'def_params': {'random_state': 322}
     },
-    'Stochastic Gradient Descent': {
-        'algo': SGDClassifier,
-        'space': space_sgd,
-        'opt_params': {},
-        'def_params': {'random_state': 322}
-    },
+    # 'Stochastic Gradient Descent': {
+    #     'algo': SGDClassifier,
+    #     'space': space_sgd,
+    #     'opt_params': {},
+    #     'def_params': {'random_state': 322}
+    # },
 }
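The models_reg and models_class dictionaries above act as a simple registry: each entry is instantiated as entry['algo'](**entry['def_params']) (or **opt_params once tuning has filled it in), which is why dropping the GPU-only defaults makes every model runnable on a CPU-only machine. A hedged, abridged sketch of that lookup pattern:

    from xgboost import XGBRegressor

    models_reg = {
        'XGBoost': {'algo': XGBRegressor, 'opt_params': {}, 'def_params': {'random_state': 322}},
    }

    entry = models_reg['XGBoost']
    model = entry['algo'](**(entry['opt_params'] or entry['def_params']))  # fall back to defaults when not tuned
    # model.fit(X_train, y_train)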

diff --git a/setup.py b/setup.py
index 3900c43..a9e3315 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,10 @@
 from setuptools import setup, find_packages
 setup(
     name='pycaML',
-    version='0.2.0',
+    version='0.3.0',
     author='Donato Riccio',
     description='Python Comparative Analysis for Machine Learning',
-    long_description='pycaML is an easy machine learning model comparison tool with optimization. It allows to generate a table comparing multiple machine learning models, to see which one is best for your data. The unique feature of pycaML is built-in hyperparameters tuning using Bayesian Optimization. It also supports meta-models like Stacking and Voting ensembles. You can setup and optimize 16 models with one line of code.',
+    long_description='pycaML is an easy machine learning model comparison tool with optimization. It allows you to generate a table comparing multiple machine learning models, to see which one is best for your data. The unique feature of pycaML is built-in hyperparameter tuning using Bayesian Optimization. It also supports meta-models like Stacking and Voting ensembles. You can set up and optimize 25 models with one line of code.',
     url='https://github.com/reese3222/pycaML',
     keywords='machine learning, optimization, stacking',
     python_requires='>=3.7, <4',