add recommandation

sdpython · Feb 11, 2018 · 3fbd6e1 · 3fbd6e1
1 parent eee6537
commit 3fbd6e1
Show file tree

Hide file tree

Showing 14 changed files with 426 additions and 28 deletions.
diff --git a/_doc/notebooks/lectures/movielens_fm.ipynb b/_doc/notebooks/lectures/movielens_fm.ipynb
@@ -0,0 +1,229 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Factorisation de matrice et recommandations\n",
+        "\n",
+        "Le notebook utilise la factorisation de matrice pour calculer des recommandations \u00e0 partir de la base de donn\u00e9es [movielens](https://grouplens.org/datasets/movielens/). On utilise le jeu de donn\u00e9es [ml-latest-small.zip](http://files.grouplens.org/datasets/movielens/ml-latest-small.zip)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "['links', 'movies', 'ratings', 'tags']"
+            ]
+          },
+          "execution_count": 3,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "from papierstat.datasets import load_movielens_dataset\n",
+        "data = load_movielens_dataset()\n",
+        "list(sorted(data))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>movieId</th>\n",
+              "      <th>title</th>\n",
+              "      <th>genres</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>9123</th>\n",
+              "      <td>164977</td>\n",
+              "      <td>The Gay Desperado (1936)</td>\n",
+              "      <td>Comedy</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>9124</th>\n",
+              "      <td>164979</td>\n",
+              "      <td>Women of '69, Unboxed</td>\n",
+              "      <td>Documentary</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "      movieId                     title       genres\n",
+              "9123   164977  The Gay Desperado (1936)       Comedy\n",
+              "9124   164979     Women of '69, Unboxed  Documentary"
+            ]
+          },
+          "execution_count": 4,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "data['movies'].tail(n=2)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>userId</th>\n",
+              "      <th>movieId</th>\n",
+              "      <th>rating</th>\n",
+              "      <th>timestamp</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>100002</th>\n",
+              "      <td>671</td>\n",
+              "      <td>6385</td>\n",
+              "      <td>2.5</td>\n",
+              "      <td>1070979663</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>100003</th>\n",
+              "      <td>671</td>\n",
+              "      <td>6565</td>\n",
+              "      <td>3.5</td>\n",
+              "      <td>1074784724</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "        userId  movieId  rating   timestamp\n",
+              "100002     671     6385     2.5  1070979663\n",
+              "100003     671     6565     3.5  1074784724"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "rate = data[\"ratings\"]\n",
+        "rate.tail(n=2)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "(671, 9066)"
+            ]
+          },
+          "execution_count": 6,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "len(set(rate['userId'])), len(set(rate['movieId']))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "671 utilisateurs et 9066 films. C'est petit mais suffisant pour observer la factorisation et mesurer le temps de calcul qu'elle demande."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {},
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.4"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
diff --git a/_doc/sphinxdoc/source/blog/2018/2018-02-08_sessions2.rst b/_doc/sphinxdoc/source/blog/2018/2018-02-08_sessions2.rst
@@ -15,18 +15,7 @@
       `Ridge <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html>`_,
       `Lasso <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso>`_,
       `ElasticNet <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet>`_
-    * `clustering <http://scikit-learn.org/stable/modules/clustering.html#clustering>`_,
-      `k-means <http://www.xavierdupre.fr/app/mlstatpy/helpsphinx/c_clus/kmeans.html>`_
-      un exemple avec
-      les `vélos à Chicago <http://www.xavierdupre.fr/app/ensae_projects/helpsphinx/notebooks/city_bike_challenge.html>`_
-      et l'utilisation du clustering pour trouver les
-      `profils de cyclistes à Chicago <http://www.xavierdupre.fr/app/ensae_projects/helpsphinx/notebooks/city_bike_solution_cluster_start.html>`_
-    * `ranking <https://github.com/dmlc/xgboost/tree/master/demo/rank>`_
-    * recommandations,
-      `Factorisation de matrices non-négatives <http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html>`_,
-      `Liens entre factorisation de matrices, ACP, k-means <http://www.xavierdupre.fr/app/mlstatpy/helpsphinx/c_ml/missing_values_mf.html>`_
     * :ref:`l-cheatsheet-ml`
-    * résumé de l'`interface scikit-learn <http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx3/notebooks/02_basic_of_machine_learning_with_scikit-learn.html#a-recap-on-scikit-learn-s-estimator-interface>`_
 
     Exercices :
 

diff --git a/_doc/sphinxdoc/source/blog/2018/2018-02-11_sessions3.rst b/_doc/sphinxdoc/source/blog/2018/2018-02-11_sessions3.rst
@@ -0,0 +1,32 @@
+
+.. blogpost::
+    :title: Session 3
+    :keywords: session 3
+    :date: 2018-02-11
+    :categories: session
+
+    Suite et fin :
+
+    * :ref:`l-regclass`
+    * formalisation de la :ref:`régression <l-regression-f>`,
+      de la :ref:`classification <l-classification-f>`
+    * :ref:`classification multi-classe <l-multiclass>`
+
+    Prétraitements :
+
+    * :ref:`l-preprocessing`
+
+    * `clustering <http://scikit-learn.org/stable/modules/clustering.html#clustering>`_,
+      `k-means <http://www.xavierdupre.fr/app/mlstatpy/helpsphinx/c_clus/kmeans.html>`_,
+      un exemple avec
+      les `vélos à Chicago <http://www.xavierdupre.fr/app/ensae_projects/helpsphinx/notebooks/city_bike_challenge.html>`_
+      et l'utilisation du clustering pour trouver les
+      `profils de cyclistes à Chicago <http://www.xavierdupre.fr/app/ensae_projects/helpsphinx/notebooks/city_bike_solution_cluster_start.html>`_
+    * `ranking <https://github.com/dmlc/xgboost/tree/master/demo/rank>`_
+    * :ref:`recommandations <l-recsys-section>`
+    * :ref:`l-cheatsheet-ml`
+    * résumé de l'`interface scikit-learn <http://www.xavierdupre.fr/app/ensae_teaching_cs/helpsphinx3/notebooks/02_basic_of_machine_learning_with_scikit-learn.html#a-recap-on-scikit-learn-s-estimator-interface>`_
+
+    Exercices :
+
+    * :ref:`Utiliser deux learners dans un pipeline scikit-learn <ex-pipe2learner>`
diff --git a/_doc/sphinxdoc/source/lectures/images/pondrec.png b/_doc/sphinxdoc/source/lectures/images/pondrec.png
diff --git a/_doc/sphinxdoc/source/lectures/images/rec.png b/_doc/sphinxdoc/source/lectures/images/rec.png
diff --git a/_doc/sphinxdoc/source/lectures/index.rst b/_doc/sphinxdoc/source/lectures/index.rst
@@ -1,6 +1,6 @@
-========
-Lectures
-========
+================================
+Lectures sur le machine learning
+================================
 
 Le :epkg:`machine learning` répond à de plus en plus de
 problématiques, beaucoup d'entre elles sont citées
@@ -15,13 +15,13 @@ pour explorer quelques points récurrents ou techniques.
     :maxdepth: 1
 
     regclass
-    pipeline
-    gradienttree
-    seriestemp
     nonsupervise
+    otherml
+    preprocessing
+    gradienttree
     textembed
+    seriestemp
     imagedeep
-    reinforce
 
 Le machine learning cache bien des choses mais
 au final il s'agit de calculer une prédiction

diff --git a/_doc/sphinxdoc/source/lectures/nonsupervise.rst b/_doc/sphinxdoc/source/lectures/nonsupervise.rst
@@ -10,6 +10,8 @@ Hors texte.
 Réduction de dimension
 ++++++++++++++++++++++
 
+* `Analyse en Composantes Principales <https://fr.wikipedia.org/wiki/Analyse_en_composantes_principales>`_
+
 Clustering
 ++++++++++
 

diff --git a/_doc/sphinxdoc/source/lectures/nuage.png b/_doc/sphinxdoc/source/lectures/nuage.png