|
1855 | 1855 | fields: |
1856 | 1856 | notebook: 4 |
1857 | 1857 | cell_type: 1 |
1858 | | - contents: "\U0001F6A7Work in Progress\U0001F6A7\n=====================\n\nDocumentation |
1859 | | - coming soon! Give it a try for now anyway." |
1860 | | - rendering: "<article class=\"markdown-body\"><h1>\U0001F6A7Work in Progress\U0001F6A7</h1>\n<p>Documentation |
1861 | | - coming soon! Give it a try for now anyway.</p></article>" |
| 1858 | + contents: 'So far we''ve focused on Classification tasks which divide the world |
| 1859 | + into discrete groups. Sometimes we need to take a more nuanced view when issues |
| 1860 | + are not black and white. Sometimes there are no hard boundaries between options, |
| 1861 | + or sometimes one sort of classification error might be much more painful than |
| 1862 | + another. There are many algorithms that can produce a raw score rather than |
| 1863 | + a discrete class for us. These are "Regression" tasks instead of "Classification". |
| 1864 | +
|
| 1865 | +
|
| 1866 | + For this example, we''ll look at several medical indicators that correlate with |
| 1867 | + the progression of diabetes one year later. Let''s load up the data and take |
| 1868 | + a look.'
| 1869 | + rendering: '<article class="markdown-body"><p>So far we''ve focused on Classification |
| 1870 | + tasks which divide the world into discrete groups. Sometimes we need to take |
| 1871 | + a more nuanced view when issues are not black and white. Sometimes there are |
| 1872 | + no hard boundaries between options, or sometimes one sort of classification |
| 1873 | + error might be much more painful than another. There are many algorithms that |
| 1874 | + can produce a raw score rather than a discrete class for us. These are "Regression" |
| 1875 | + tasks instead of "Classification".</p> |
| 1876 | +
|
| 1877 | + <p>For this example, we''ll look at several medical indicators that correlate |
| 1878 | + with the progression of diabetes one year later. Let''s load up the data and |
| 1879 | + take a look.</p></article>'
1862 | 1880 | execution_time: null |
1863 | 1881 | cell_number: 1 |
1864 | 1882 | version: 1 |
|
1868 | 1886 | fields: |
1869 | 1887 | notebook: 4 |
1870 | 1888 | cell_type: 3 |
1871 | | - contents: "-- This example trains models on the sklean diabetes dataset\n-- Source |
1872 | | - URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n-- For more |
1873 | | - information see:\n-- Bradley Efron, Trevor Hastie, Iain Johnstone and Robert |
1874 | | - Tibshirani (2004)\n-- \"Least Angle Regression,\" Annals of Statistics (with |
1875 | | - discussion), 407-499\n-- https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf\n\n--\n-- |
1876 | | - This demonstrates using a table with individual columns as features\n-- for |
1877 | | - regression.\n\nSELECT pgml.load_dataset('diabetes');\n\n-- view the dataset\nSELECT |
1878 | | - * FROM pgml.diabetes LIMIT 10;\n\n-- train a simple model on the data\nSELECT |
1879 | | - * FROM pgml.train('Diabetes Progression', 'regression', 'pgml.diabetes', 'target');\n\n-- |
1880 | | - check out the predictions\nSELECT target, pgml.predict('Diabetes Progression', |
1881 | | - ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) AS prediction\nFROM pgml.diabetes |
1882 | | - \nLIMIT 10;\n\n-- Check predictions against a specific model id\nSELECT model_id, |
1883 | | - target, pgml.model_predict(model_id, ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, |
1884 | | - s5, s6]) AS prediction\nFROM pgml.diabetes\nCROSS JOIN LATERAL (\n SELECT |
1885 | | - pgml.models.id AS model_id FROM pgml.models\n INNER JOIN pgml.projects\n |
1886 | | - \ ON pgml.models.project_id = pgml.projects.id\n WHERE pgml.projects.name |
1887 | | - = 'Diabetes Progression'\n LIMIT 1\n) models\nLIMIT 10;\n\n--\n-- After a |
1888 | | - project has been trained, ommited parameters will be reused from previous training |
1889 | | - runs\n-- In these examples we'll reuse the training data snapshots from the |
1890 | | - initial call.\n--\n\n-- linear models\nSELECT * FROM pgml.train('Diabetes Progression', |
1891 | | - algorithm => 'ridge');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm |
1892 | | - => 'lasso');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm => |
1893 | | - 'elastic_net');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm |
1894 | | - => 'least_angle');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm |
1895 | | - => 'lasso_least_angle');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm |
1896 | | - => 'orthoganl_matching_pursuit');\nSELECT * FROM pgml.train('Diabetes Progression', |
1897 | | - algorithm => 'bayesian_ridge');\nSELECT * FROM pgml.train('Diabetes Progression', |
1898 | | - algorithm => 'automatic_relevance_determination');\nSELECT * FROM pgml.train('Diabetes |
1899 | | - Progression', algorithm => 'stochastic_gradient_descent');\nSELECT * FROM pgml.train('Diabetes |
1900 | | - Progression', algorithm => 'passive_aggressive');\nSELECT * FROM pgml.train('Diabetes |
1901 | | - Progression', algorithm => 'ransac');\nSELECT * FROM pgml.train('Diabetes Progression', |
1902 | | - algorithm => 'theil_sen', hyperparams => '{\"max_iter\": 10, \"max_subpopulation\": |
1903 | | - 100}');\nSELECT * FROM pgml.train('Diabetes Progression', algorithm => 'huber');\n-- |
1904 | | - Quantile Regression too expensive for normal tests on even a toy dataset\n-- |
1905 | | - SELECT * FROM pgml.train('Diabetes Progression', algorithm => 'quantile');\n\n-- |
1906 | | - support vector machines\nSELECT * FROM pgml.train('Diabetes Progression', algorithm |
1907 | | - => 'svm', hyperparams => '{\"max_iter\": 100}');\nSELECT * FROM pgml.train('Diabetes |
1908 | | - Progression', algorithm => 'nu_svm', hyperparams => '{\"max_iter\": 10}');\nSELECT |
1909 | | - * FROM pgml.train('Diabetes Progression', algorithm => 'linear_svm', hyperparams |
1910 | | - => '{\"max_iter\": 100}');\n\n-- ensembles\nSELECT * FROM pgml.train('Diabetes |
1911 | | - Progression', algorithm => 'ada_boost', hyperparams => '{\"n_estimators\": 5}');\nSELECT |
1912 | | - * FROM pgml.train('Diabetes Progression', algorithm => 'bagging', hyperparams |
1913 | | - => '{\"n_estimators\": 5}');\nSELECT * FROM pgml.train('Diabetes Progression', |
1914 | | - algorithm => 'extra_trees', hyperparams => '{\"n_estimators\": 5}');\nSELECT |
1915 | | - * FROM pgml.train('Diabetes Progression', algorithm => 'gradient_boosting_trees', |
1916 | | - hyperparams => '{\"n_estimators\": 5}');\nSELECT * FROM pgml.train('Diabetes |
1917 | | - Progression', algorithm => 'random_forest', hyperparams => '{\"n_estimators\": |
1918 | | - 5}');\n\n-- other\n-- Kernel Ridge is too expensive for normal tests on even |
1919 | | - a toy dataset\n-- SELECT * FROM pgml.train('Diabetes Progression', algorithm |
1920 | | - => 'kernel_ridge');\n-- Gaussian Process is too expensive for normal tests on |
1921 | | - even a toy dataset\n-- SELECT * FROM pgml.train('Diabetes Progression', algorithm |
1922 | | - => 'gaussian_process');\n\n-- gradient boosting\nSELECT * FROM pgml.train('Diabetes |
1923 | | - Progression', algorithm => 'xgboost', hyperparams => '{\"n_estimators\": 10}');\nSELECT |
1924 | | - * FROM pgml.train('Diabetes Progression', algorithm => 'xgboost_random_forest', |
1925 | | - hyperparams => '{\"n_estimators\": 10}');\nSELECT * FROM pgml.train('Diabetes |
1926 | | - Progression', algorithm => 'lightgbm', hyperparams => '{\"n_estimators\": 1}');\n-- |
1927 | | - Histogram Gradient Boosting is too expensive for normal tests on even a toy |
1928 | | - dataset\n-- SELECT * FROM pgml.train('Diabetes Progression', algorithm => 'hist_gradient_boosting', |
1929 | | - hyperparams => '{\"max_iter\": 10}');\n\n\n-- check out all that hard work\nSELECT |
1930 | | - trained_models.* FROM pgml.trained_models \nJOIN pgml.models on models.id = |
1931 | | - trained_models.id\nORDER BY models.metrics->>'mean_squared_error' DESC LIMIT |
1932 | | - 5;\n\n-- deploy the random_forest model for prediction use\nSELECT * FROM pgml.deploy('Diabetes |
1933 | | - Progression', 'most_recent', 'random_forest');\n-- check out that throughput\nSELECT |
1934 | | - * FROM pgml.deployed_models ORDER BY deployed_at DESC LIMIT 5;\n\n-- do a hyperparam |
1935 | | - search on your favorite algorithm\nSELECT pgml.train(\n 'Diabetes Progression', |
1936 | | - \n algorithm => 'xgboost', \n search => 'grid', \n search_params => |
1937 | | - '{\n \"max_depth\": [1, 2], \n \"n_estimators\": [20, 40],\n \"learning_rate\": |
1938 | | - [0.1, 0.2]\n }'\n);\n\n-- deploy the \"best\" model for prediction use\nSELECT |
1939 | | - * FROM pgml.deploy('Diabetes Progression', 'best_score');\nSELECT * FROM pgml.deploy('Diabetes |
1940 | | - Progression', 'most_recent');\nSELECT * FROM pgml.deploy('Diabetes Progression', |
1941 | | - 'rollback');\nSELECT * FROM pgml.deploy('Diabetes Progression', 'best_score', |
1942 | | - 'svm');\n\n-- check out the improved predictions\nSELECT target, pgml.predict('Diabetes |
1943 | | - Progression', ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) AS prediction\nFROM |
1944 | | - pgml.diabetes \nLIMIT 10;" |
| 1889 | + contents: "-- load the scikit-learn diabetes dataset into the pgml.diabetes table\nSELECT pgml.load_dataset('diabetes');"
1945 | 1890 | rendering: null |
1946 | 1891 | execution_time: null |
1947 | 1892 | cell_number: 2 |
|
2622 | 2567 | cell_number: 19 |
2623 | 2568 | version: 1 |
2624 | 2569 | deleted_at: null |
| 2570 | +- model: app.notebookcell |
| 2571 | + pk: 120 |
| 2572 | + fields: |
| 2573 | + notebook: 4 |
| 2574 | + cell_type: 3 |
| 2575 | + contents: "SELECT * \nFROM pgml.diabetes \nLIMIT 10;" |
| 2576 | + rendering: null |
| 2577 | + execution_time: null |
| 2578 | + cell_number: 3 |
| 2579 | + version: 1 |
| 2580 | + deleted_at: null |
| 2581 | +- model: app.notebookcell |
| 2582 | + pk: 121 |
| 2583 | + fields: |
| 2584 | + notebook: 4 |
| 2585 | + cell_type: 1 |
| 2586 | + contents: In this case, the `target` is a number that represents the severity |
| 2587 | + of the disease progression one year later, with larger values indicating worse |
| 2588 | + outcomes. Building a Regression model uses the same PostgresML API as Classification, |
| 2589 | + just with a different task. You're going to start breezing through these tutorials |
| 2590 | + faster and faster. |
| 2591 | + rendering: <article class="markdown-body"><p>In this case, the <code>target</code> |
| 2592 | + is a number that represents the severity of the disease progression one year |
| 2593 | + later, with larger values indicating worse outcomes. Building a Regression model |
| 2594 | + uses the same PostgresML API as Classification, just with a different task. |
| 2595 | + You're going to start breezing through these tutorials faster and faster.</p></article> |
| 2596 | + execution_time: null |
| 2597 | + cell_number: 4 |
| 2598 | + version: 1 |
| 2599 | + deleted_at: null |
| 2600 | +- model: app.notebookcell |
| 2601 | + pk: 122 |
| 2602 | + fields: |
| 2603 | + notebook: 4 |
| 2604 | + cell_type: 3 |
| 2605 | + contents: "SELECT * FROM pgml.train(\n project_name => 'Diabetes Progression', |
| 2606 | + \n task => 'regression', \n relation_name => 'pgml.diabetes', \n y_column_name |
| 2607 | + => 'target'\n);" |
| 2608 | + rendering: null |
| 2609 | + execution_time: null |
| 2610 | + cell_number: 5 |
| 2611 | + version: 1 |
| 2612 | + deleted_at: null |
| 2613 | +- model: app.notebookcell |
| 2614 | + pk: 123 |
| 2615 | + fields: |
| 2616 | + notebook: 4 |
| 2617 | + cell_type: 1 |
| 2618 | + contents: With our baseline model automatically deployed, we can sample some of |
| 2619 | + the predictions.
| 2620 | + rendering: <article class="markdown-body"><p>With our baseline model automatically |
| 2621 | + deployed, we can sample some of the predictions.</p></article>
| 2622 | + execution_time: null |
| 2623 | + cell_number: 6 |
| 2624 | + version: 1 |
| 2625 | + deleted_at: null |
| 2626 | +- model: app.notebookcell |
| 2627 | + pk: 124 |
| 2628 | + fields: |
| 2629 | + notebook: 4 |
| 2630 | + cell_type: 3 |
| 2631 | + contents: "SELECT target, pgml.predict('Diabetes Progression', ARRAY[age, sex, |
| 2632 | + bmi, bp, s1, s2, s3, s4, s5, s6]) AS prediction\nFROM pgml.diabetes \nLIMIT |
| 2633 | + 10;" |
| 2634 | + rendering: null |
| 2635 | + execution_time: null |
| 2636 | + cell_number: 7 |
| 2637 | + version: 1 |
| 2638 | + deleted_at: null |
| 2639 | +- model: app.notebookcell |
| 2640 | + pk: 125 |
| 2641 | + fields: |
| 2642 | + notebook: 4 |
| 2643 | + cell_type: 1 |
| 2644 | + contents: To get an objective measure of just how far off every single prediction |
| 2645 | + is from the target, we can look at the key metrics recorded during training. |
| 2646 | + rendering: <article class="markdown-body"><p>To get an objective measure of just |
| 2647 | + how far off every single prediction is from the target, we can look at the key |
| 2648 | + metrics recorded during training.</p></article> |
| 2649 | + execution_time: null |
| 2650 | + cell_number: 8 |
| 2651 | + version: 1 |
| 2652 | + deleted_at: null |
| 2653 | +- model: app.notebookcell |
| 2654 | + pk: 126 |
| 2655 | + fields: |
| 2656 | + notebook: 4 |
| 2657 | + cell_type: 3 |
| 2658 | + contents: "SELECT \n projects.name,\n models.algorithm_name,\n round((models.metrics->>'r2')::numeric, |
| 2659 | + 4) AS r2_score\nFROM pgml.models\nJOIN pgml.projects on projects.id = models.project_id\n |
| 2660 | + \ AND projects.name = 'Diabetes Progression'\nORDER BY models.created_at DESC |
| 2661 | + LIMIT 5;" |
| 2662 | + rendering: null |
| 2663 | + execution_time: null |
| 2664 | + cell_number: 9 |
| 2665 | + version: 1 |
| 2666 | + deleted_at: null |
| 2667 | +- model: app.notebookcell |
| 2668 | + pk: 127 |
| 2669 | + fields: |
| 2670 | + notebook: 4 |
| 2671 | + cell_type: 1 |
| 2672 | + contents: I like to look at the R2 score. Since it is normalized, with 1.0 being |
| 2673 | + a perfect fit, it makes it easy to compare the performance of different algorithms |
| 2674 | + on our data. Let's throw our bag of tricks at the problem and see what sticks. |
| 2675 | + rendering: <article class="markdown-body"><p>I like to look at the R2 score. |
| 2676 | + Since it is normalized, with 1.0 being a perfect fit, it makes it easy to compare |
| 2677 | + the performance of different algorithms on our data. Let's throw our bag of |
| 2678 | + tricks at the problem and see what sticks.</p></article> |
| 2679 | + execution_time: null |
| 2680 | + cell_number: 10 |
| 2681 | + version: 1 |
| 2682 | + deleted_at: null |
| 2683 | +- model: app.notebookcell |
| 2684 | + pk: 128 |
| 2685 | + fields: |
| 2686 | + notebook: 4 |
| 2687 | + cell_type: 3 |
| 2688 | + contents: '-- linear models |
| 2689 | +
|
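| | + -- Omitted parameters (task, relation_name, y_column_name) are reused from previous |
| | + training runs of this project, so only the algorithm changes below. |
| | +
|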
| 2690 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''ridge''); |
| 2691 | +
|
| 2692 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''lasso''); |
| 2693 | +
|
| 2694 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''elastic_net''); |
| 2695 | +
|
| 2696 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''least_angle''); |
| 2697 | +
|
| 2698 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''lasso_least_angle''); |
| 2699 | +
|
| 2700 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''orthogonal_matching_pursuit'');
| 2701 | +
|
| 2702 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''bayesian_ridge''); |
| 2703 | +
|
| 2704 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''automatic_relevance_determination''); |
| 2705 | +
|
| 2706 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''stochastic_gradient_descent''); |
| 2707 | +
|
| 2708 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''passive_aggressive''); |
| 2709 | +
|
| 2710 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''ransac''); |
| 2711 | +
|
| 2712 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''theil_sen'', |
| 2713 | + hyperparams => ''{"max_iter": 10, "max_subpopulation": 100}''); |
| 2714 | +
|
| 2715 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''huber''); |
| 2716 | +
|
| 2717 | +
|
| 2718 | + -- support vector machines |
| 2719 | +
|
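| | + -- max_iter is capped here so these examples finish quickly on a toy dataset. |
| | +
|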
| 2720 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''svm'', hyperparams |
| 2721 | + => ''{"max_iter": 100}''); |
| 2722 | +
|
| 2723 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''nu_svm'', |
| 2724 | + hyperparams => ''{"max_iter": 10}''); |
| 2725 | +
|
| 2726 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''linear_svm'', |
| 2727 | + hyperparams => ''{"max_iter": 100}''); |
| 2728 | +
|
| 2729 | +
|
| 2730 | + -- ensembles |
| 2731 | +
|
| 2732 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''ada_boost'', |
| 2733 | + hyperparams => ''{"n_estimators": 5}''); |
| 2734 | +
|
| 2735 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''bagging'', |
| 2736 | + hyperparams => ''{"n_estimators": 5}''); |
| 2737 | +
|
| 2738 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''extra_trees'', |
| 2739 | + hyperparams => ''{"n_estimators": 5}''); |
| 2740 | +
|
| 2741 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''gradient_boosting_trees'', |
| 2742 | + hyperparams => ''{"n_estimators": 5}''); |
| 2743 | +
|
| 2744 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''random_forest'', |
| 2745 | + hyperparams => ''{"n_estimators": 5}''); |
| 2746 | +
|
| 2747 | +
|
| 2748 | + -- gradient boosting |
| 2749 | +
|
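| | + -- n_estimators is kept small for speed; more trees usually score better. |
| | +
|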
| 2750 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''xgboost'', |
| 2751 | + hyperparams => ''{"n_estimators": 10}''); |
| 2752 | +
|
| 2753 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''xgboost_random_forest'', |
| 2754 | + hyperparams => ''{"n_estimators": 10}''); |
| 2755 | +
|
| 2756 | + SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''lightgbm'', |
| 2757 | + hyperparams => ''{"n_estimators": 1}'');' |
| 2758 | + rendering: null |
| 2759 | + execution_time: null |
| 2760 | + cell_number: 11 |
| 2761 | + version: 1 |
| 2762 | + deleted_at: null |
| 2763 | +- model: app.notebookcell |
| 2764 | + pk: 129 |
| 2765 | + fields: |
| 2766 | + notebook: 4 |
| 2767 | + cell_type: 1 |
| 2768 | + contents: It's that easy, and that fast, to test all the algorithms in our toolkit |
| 2769 | + to see which fares best, and the best one has automatically been deployed. Once |
| 2770 | + we've homed in on a few good candidate algorithms, we can check the docs for |
| 2771 | + their hyperparams, and then do another brute-force search across all combinations |
| 2772 | + to find the best set. |
| 2773 | + rendering: <article class="markdown-body"><p>It's that easy, and that fast, to |
| 2774 | + test all the algorithms in our toolkit to see which fares best, and the best |
| 2775 | + one has automatically been deployed. Once we've homed in on a few good candidate |
| 2776 | + algorithms, we can check the docs for their hyperparams, and then do another |
| 2777 | + brute-force search across all combinations to find the best set.</p></article> |
| 2778 | + execution_time: null |
| 2779 | + cell_number: 12 |
| 2780 | + version: 1 |
| 2781 | + deleted_at: null |
| 2782 | +- model: app.notebookcell |
| 2783 | + pk: 130 |
| 2784 | + fields: |
| 2785 | + notebook: 4 |
| 2786 | + cell_type: 3 |
| 2787 | + contents: "SELECT * FROM pgml.train(\n 'Diabetes Progression', \n algorithm |
| 2788 | + => 'xgboost', \n search => 'grid', \n search_params => '{\n \"max_depth\": |
| 2789 | + [1, 2], \n \"n_estimators\": [20, 40],\n \"learning_rate\": [0.1, |
| 2790 | + 0.2]\n }'\n);" |
| 2791 | + rendering: null |
| 2792 | + execution_time: null |
| 2793 | + cell_number: 13 |
| 2794 | + version: 1 |
| 2795 | + deleted_at: null |
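| | +- model: app.notebookcell |
| | + pk: 131 |
| | + fields: |
| | + notebook: 4 |
| | + cell_type: 1 |
| | + contents: The grid search trains a model for every combination, and the best one |
| | + is deployed automatically. If we ever want to be explicit about it, we can promote |
| | + the best scoring model for the project by hand. |
| | + rendering: <article class="markdown-body"><p>The grid search trains a model for |
| | + every combination, and the best one is deployed automatically. If we ever want |
| | + to be explicit about it, we can promote the best scoring model for the project |
| | + by hand.</p></article> |
| | + execution_time: null |
| | + cell_number: 14 |
| | + version: 1 |
| | + deleted_at: null |
| | +- model: app.notebookcell |
| | + pk: 132 |
| | + fields: |
| | + notebook: 4 |
| | + cell_type: 3 |
| | + contents: "-- deploy the best scoring model for prediction use\nSELECT * FROM pgml.deploy('Diabetes Progression', 'best_score');" |
| | + rendering: null |
| | + execution_time: null |
| | + cell_number: 15 |
| | + version: 1 |
| | + deleted_at: null |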