Skip to content

Commit

Permalink
add quantile regression, change description and documentation #7
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed May 7, 2018
1 parent 8c80b53 commit 1fccc7e
Show file tree
Hide file tree
Showing 13 changed files with 691 additions and 12 deletions.
5 changes: 4 additions & 1 deletion HISTORY.rst
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@

.. _l-HISTORY:

=======
History
=======

current - 2018-04-14 - 0.00Mb
current - 2018-05-07 - 0.00Mb
=============================

* `7`: add quantile regression (2018-05-07)
* `5`: replace flake8 by code style (2018-04-14)
* `1`: change background for cells in notebooks converted into rst then in html, highlight-ipython3 (2018-01-05)

Expand Down
297 changes: 297 additions & 0 deletions _doc/notebooks/quantile_regression.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion _doc/sphinxdoc/source/blog/2017/2017-10-18_first_day.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
:categories: blog

Machine learned models are black boxes.
The modules tries to implements some functions
The module tries to implement some functions
to get insights on machine learned models.
18 changes: 18 additions & 0 deletions _doc/sphinxdoc/source/blog/2018/2018-05-07_quantile_regression.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

.. blogpost::
:title: Quantile regression with scikit-learn.
:keywords: scikit-learn, quantile regression
:date: 2018-05-07
:categories: machine learning

:epkg:`scikit-learn` does not have any quantile regression.
:epkg:`statsmodels` does have one
`QuantReg <http://www.statsmodels.org/dev/generated/statsmodels.regression.quantile_regression.QuantReg.html>`_
but I wanted to try something I did for my teachings
`Régression Quantile <http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx3/notebooks/td_note_2017_2.html?highlight=mediane>`_
based on `Iteratively reweighted least squares <https://en.wikipedia.org/wiki/Iteratively_reweighted_least_squares>`_.
I thought it was a good case study to turn a simple algorithm into
a learner :epkg:`scikit-learn` can reuse in a pipeline.
The notebook :ref:`quantileregressionrst` demonstrates it
and it is implemented in
:class:`QuantileLinearRegression <mlinsights.mlmodel.quantile_regression.QuantileLinearRegression>`.
15 changes: 13 additions & 2 deletions _doc/sphinxdoc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import datetime
import re
import sphinx_redactor_theme
import sphinx_rtd_theme


sys.path.insert(0, os.path.abspath(os.path.join(os.path.split(__file__)[0])))
Expand All @@ -24,7 +24,8 @@

from pyquickhelper.helpgen.default_conf import set_sphinx_variables, get_default_stylesheet
set_sphinx_variables(__file__, "mlinsights", "Xavier Dupré", 2018,
"sphinx_redactor_theme", sphinx_redactor_theme.get_html_theme_path(),
"sphinx_rtd_theme", [
sphinx_rtd_theme.get_html_theme_path()],
locals(), extlinks=dict(
issue=('https://github.com/sdpython/mlinsights/issues/%s', 'issue')),
title="mlinsights", book=True)
Expand All @@ -43,6 +44,16 @@

mathdef_link_only = True

epkg_dictionary['keras'] = 'https://keras.io/'
epkg_dictionary['Iris'] = 'http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html'
epkg_dictionary['RandomForestRegressor'] = 'http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html'
epkg_dictionary['REST API'] = "https://en.wikipedia.org/wiki/Representational_state_transfer"

epkg_dictionary.update({
'pandas': ('http://pandas.pydata.org/pandas-docs/stable/',
('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html', 1),
('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html', 2)),
'sklearn': ('http://scikit-learn.org/stable/',
('http://scikit-learn.org/stable/modules/generated/{0}.html', 1),
('http://scikit-learn.org/stable/modules/generated/{0}.{1}.html', 2)),
})
65 changes: 65 additions & 0 deletions _unittests/ut_documentation/test_nb_quantile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""
@brief test log(time=13s)
"""

import sys
import os
import unittest


try:
import pyquickhelper as skip_
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..",
"..",
"pyquickhelper",
"src")))
if path not in sys.path:
sys.path.append(path)
import pyquickhelper as skip_

try:
import src
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..")))
if path not in sys.path:
sys.path.append(path)
import src

from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import add_missing_development_version
from pyquickhelper.ipythonhelper import test_notebook_execution_coverage
import src.mlinsights


class TestNotebookQuantile(unittest.TestCase):
    """
    Unit test around the notebooks stored in ``_doc/notebooks``
    and selected with the name ``quantile``.
    """

    def setUp(self):
        """
        Calls ``add_missing_development_version`` for *jyquickhelper*
        before each test.
        """
        add_missing_development_version(["jyquickhelper"], __file__, hide=True)

    def test_notebook_quantile(self):
        """
        Hands the notebook folder to ``test_notebook_execution_coverage``
        with the name ``quantile`` and the module name ``mlinsights``.
        """
        # Logging is only printed when the file is run as a script.
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        # Makes sure the package under test was imported.
        self.assertTrue(src.mlinsights is not None)

        # Notebooks live two levels above this test file, in _doc/notebooks.
        this_dir = os.path.dirname(__file__)
        folder = os.path.join(this_dir, "..", "..", "_doc", "notebooks")
        test_notebook_execution_coverage(
            __file__, "quantile", folder, 'mlinsights', fLOG=fLOG)


if __name__ == "__main__":
unittest.main()
136 changes: 136 additions & 0 deletions _unittests/ut_mlmodel/test_quantile_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""
@brief test log(time=2s)
"""

import sys
import os
import unittest
import numpy
import pandas
from sklearn.linear_model import LinearRegression


try:
import pyquickhelper as skip_
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..",
"..",
"pyquickhelper",
"src")))
if path not in sys.path:
sys.path.append(path)
import pyquickhelper as skip_


try:
import src
except ImportError:
path = os.path.normpath(
os.path.abspath(
os.path.join(
os.path.split(__file__)[0],
"..",
"..")))
if path not in sys.path:
sys.path.append(path)
import src

from pyquickhelper.pycode import ExtTestCase, get_temp_folder
from src.mlinsights.mlmodel import QuantileLinearRegression


class TestQuantileRegression(ExtTestCase):
    """
    Unit tests for *QuantileLinearRegression*, mostly comparing the
    fitted model with scikit-learn's *LinearRegression*.
    """

    def test_quantile_regression_no_intercept(self):
        """
        Without intercept, on two points, both models must return
        the same coefficients and a null intercept.
        """
        X = numpy.array([[0.1, 0.2], [0.2, 0.3]])
        Y = numpy.array([1., 1.1])
        clr = LinearRegression(fit_intercept=False)
        clr.fit(X, Y)
        clq = QuantileLinearRegression(fit_intercept=False)
        clq.fit(X, Y)
        self.assertEqual(clr.intercept_, 0)
        self.assertEqualArray(clr.coef_, clq.coef_)
        self.assertEqual(clq.intercept_, 0)
        self.assertEqualArray(clr.intercept_, clq.intercept_)

    def test_quantile_regression_intercept(self):
        """
        With an intercept, on three points, both models must return
        the same coefficients and the same non null intercept.
        """
        X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]])
        Y = numpy.array([1., 1.1, 1.2])
        clr = LinearRegression(fit_intercept=True)
        clr.fit(X, Y)
        clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
        clq.fit(X, Y)
        self.assertNotEqual(clr.intercept_, 0)
        self.assertNotEqual(clq.intercept_, 0)
        self.assertEqualArray(clr.intercept_, clq.intercept_)
        self.assertEqualArray(clr.coef_, clq.coef_)

    def test_quantile_regression_diff(self):
        """
        With one target far from the others (10 against ~1), both models
        must differ and the quantile regression must stop in less than
        10 iterations.
        """
        X = numpy.array([[0.1], [0.2], [0.3], [0.4]])
        Y = numpy.array([1., 1.1, 1.2, 10])
        clr = LinearRegression(fit_intercept=True)
        clr.fit(X, Y)
        clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
        clq.fit(X, Y)
        self.assertNotEqual(clr.intercept_, 0)
        self.assertNotEqual(clq.intercept_, 0)
        self.assertNotEqualArray(clr.coef_, clq.coef_)
        self.assertNotEqualArray(clr.intercept_, clq.intercept_)
        self.assertLesser(clq.n_iter_, 10)

    def test_quantile_regression_pandas(self):
        """
        Same as the test without intercept but the features are given
        as a *pandas.DataFrame*.
        """
        X = pandas.DataFrame(numpy.array([[0.1, 0.2], [0.2, 0.3]]))
        Y = numpy.array([1., 1.1])
        clr = LinearRegression(fit_intercept=False)
        clr.fit(X, Y)
        clq = QuantileLinearRegression(fit_intercept=False)
        clq.fit(X, Y)
        self.assertEqual(clr.intercept_, 0)
        self.assertEqualArray(clr.coef_, clq.coef_)
        self.assertEqual(clq.intercept_, 0)
        self.assertEqualArray(clr.intercept_, clq.intercept_)

    def test_quantile_regression_list(self):
        """
        Features given as a plain list of lists are expected
        to raise a *TypeError*.
        """
        X = [[0.1, 0.2], [0.2, 0.3]]
        Y = numpy.array([1., 1.1])
        clq = QuantileLinearRegression(fit_intercept=False)
        self.assertRaise(lambda: clq.fit(X, Y), TypeError)

    # This test used to be named ``test_quantile_regression_list`` as well:
    # the second definition silently replaced the first one in the class
    # namespace and the test above never ran. It needs its own name.
    def test_quantile_regression_list2(self):
        """
        On 1000 random points with asymmetric noise, a badly shaped target
        must raise a *ValueError*, then both models are fitted on a proper
        target and must differ while their predictions share the same shape.
        """
        X = numpy.random.random(1000)
        # 900 small centered errors followed by 100 large positive ones.
        eps1 = (numpy.random.random(900) - 0.5) * 0.1
        eps2 = numpy.random.random(100) * 2
        eps = numpy.hstack([eps1, eps2])
        X = X.reshape((1000, 1))
        # X is (1000, 1) and eps is (1000,): the sum broadcasts
        # to a (1000, 1000) target which fit is expected to reject.
        Y = X * 3.4 + 5.6 + eps

        clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
        self.assertRaise(lambda: clq.fit(X, Y), ValueError)

        # One dimensional target this time.
        Y = X.ravel() * 3.4 + 5.6 + eps

        clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
        clq.fit(X, Y)

        clr = LinearRegression(fit_intercept=True)
        clr.fit(X, Y)

        self.assertNotEqual(clr.intercept_, 0)
        self.assertNotEqual(clq.intercept_, 0)
        self.assertNotEqualArray(clr.coef_, clq.coef_)
        self.assertNotEqualArray(clr.intercept_, clq.intercept_)
        self.assertLesser(clq.n_iter_, 10)

        pr = clr.predict(X)
        pq = clq.predict(X)
        self.assertEqual(pr.shape, pq.shape)


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ install:
- "%PYTHON%\\Scripts\\pymy_install3 --set=pyquickhelper"
- "%PYTHON%\\Scripts\\pymy_install3 --task=tool --source=zip graphviz"
- "%PYTHON%\\Scripts\\pip install pyquickhelper --no-deps"
- "%PYTHON%\\Scripts\\pip install wcwidth guzzle_sphinx_theme backcall absl-py"
- "%PYTHON%\\Scripts\\pip install wcwidth sphinx_rtd_theme backcall absl-py"
- "%PYTHON%\\Scripts\\pymy_install3 scikit-learn scipy tensorflow protobuf h5py"
- "%PYTHON%\\Scripts\\pip install keras pandas_streaming kiwisolver"
- "set PATH=%PATH%;C:\\projects\\jyquickhelper\\build\\update_modules\\Graphviz\\bin"
Expand Down
4 changes: 1 addition & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,5 @@ guzzle_sphinx_theme
jyquickhelper
pandas_streaming
pyquickhelper>=1.7.2539
sphinx-redactor-theme
sphinxjp.themes.basicstrap
sphinxjp.themes.sphinxjp
sphinx_rtd_theme
wheel
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
requirements = None

KEYWORDS = project_var_name + ', Xavier Dupré'
DESCRIPTION = """Look for insights about machine learned models"""
DESCRIPTION = """Extends scikit-learn with a couple of new models, transform, metrics, plotting."""
CLASSIFIERS = [
'Programming Language :: Python :: 3',
'Intended Audience :: Developers',
Expand Down Expand Up @@ -215,7 +215,6 @@ def write_version():
package_data=package_data,
# data_files=data_files,
install_requires=['scikit-learn', 'pandas',
'pillow', 'matplotlib', 'h5py',
'pandas_streaming'],
'matplotlib', 'pandas_streaming'],
# include_package_data=True,
)
2 changes: 1 addition & 1 deletion src/mlinsights/featurizers/ml_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def wrap_predict_sklearn(X, fct, many):

def model_featurizer_lr(model):
"""
Builds a featurizer from a :epkg:`scikit-learn:linear_model:LogisticRegresion`.
Builds a featurizer from a :epkg:`scikit-learn:linear_model:LogisticRegression`.
It returns a function which returns ``model.decision_function(X)``.
@param model model to use to featurize a vector
Expand Down
5 changes: 5 additions & 0 deletions src/mlinsights/mlmodel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
@file
@brief Shortcuts to *mlmodel*.
"""
from .quantile_regression import QuantileLinearRegression
Loading

0 comments on commit 1fccc7e

Please sign in to comment.