update slim doc and add sklearn compt doc page

scikit-mine · May 20, 2020 · 090f27f · 090f27f
1 parent 806d2ee
commit 090f27f
Show file tree

Hide file tree

Showing 4 changed files with 437 additions and 51 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -21,3 +21,5 @@ Welcome to scikit-mine's documentation!
 .. toctree::
    :maxdepth: 1
    :caption: About
+
+   sklearn_compat.rst
diff --git a/docs/sklearn_compat.rst b/docs/sklearn_compat.rst
@@ -0,0 +1,49 @@
+.. highlight:: shell
+
+===============================
+Compatibility with Scikit-Learn
+===============================
+`scikit-learn <https://scikit-learn.org/stable/>`_ is the gold standard for general
+purpose machine learning. As a rule of thumb, we follow scikit-learn functional definitions.
+
+-----------------
+
+*scikit-learn* is a library for statistical learning, or **machine-learning**.
+
+*scikit-mine*, on its side, is a library for (yet statistical) **pattern mining**.
+
+So what does this change ?
+*scikit-mine* gives more attention to discrete values, because **it looks for co-occurring symbols in the data**.
+To this purpose, we sometimes need to extend scikit-learn capabilities to tightly integrate the notion
+of symbols in our learning processes.
+
+
+Preprocessing
+-------------
+The Preprocessing module implements a set of Transformers/Encoders
+to get you from raw data to a more advanced, structured kind of data : 
+the kind of data that is easily manageable and prone to give you the best performance.
+
+Sometimes *scikit-learn* provides exactly the tools we need, sometimes not.
+**Scikit-mine addresses data ingestion by implementing its own preprocessing blocks,
+in a fully scikit-learn compatible way**.
+
+The *preprocessing* module is designed to take all of the burden off you, and manage ingestion
+in a smooth way : use it !!
+
+
+Pipelines
+---------
+scikit-mine models are designed for possible integration in `scikit-learn pipelines <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_.
+This makes it possible to build "symbolic classifiers", using scikit-mine pattern encoding schemes
+to serve predictions. See the tutorials sections.
+
+
+Other implementation details
+----------------------------
+We use `joblib <https://joblib.readthedocs.io/en/latest/>`_ as default to parallelise our code.
+We also set the *prefer* parameter when instantiating `joblib.Parallel <https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html>`_, 
+so users don't have to manually choose between threads and processes for optimal execution.
+
+
+Finally, we also leverage `Cython <https://cython.org/>`_ code where performance matters.
diff --git a/docs/tutorials/itemsets/SLIM.ipynb b/docs/tutorials/itemsets/SLIM.ipynb
@@ -20,7 +20,8 @@
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "from skmine.itemsets import SLIM"
+    "from skmine.itemsets import SLIM\n",
+    "from skmine.preprocessing import TransactionEncoder"
    ]
   },
   {
@@ -39,13 +40,84 @@
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>bananas</th>\n",
+       "      <th>butter</th>\n",
+       "      <th>cookies</th>\n",
+       "      <th>milk</th>\n",
+       "      <th>tea</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "0             [bananas, milk]\n",
-       "1    [milk, bananas, cookies]\n",
-       "2      [cookies, butter, tea]\n",
-       "3                       [tea]\n",
-       "4        [milk, bananas, tea]\n",
-       "dtype: object"
+       "   bananas  butter  cookies  milk  tea\n",
+       "0        1       0        0     1    0\n",
+       "1        1       0        1     1    0\n",
+       "2        0       1        1     0    1\n",
+       "3        0       0        0     0    1\n",
+       "4        1       0        0     1    1"
       ]
      },
      "execution_count": 2,
@@ -54,13 +126,14 @@
     }
    ],
    "source": [
-    "D = pd.Series([  # SLIM takes a pd.Series as input\n",
+    "transactions = [\n",
     "    ['bananas', 'milk'],\n",
     "    ['milk', 'bananas', 'cookies'],\n",
     "    ['cookies', 'butter', 'tea'],\n",
     "    ['tea'], \n",
     "    ['milk', 'bananas', 'tea'],\n",
-    "])\n",
+    "]\n",
+    "D = TransactionEncoder().fit_transform(transactions)\n",
     "D"
    ]
   },
@@ -72,10 +145,10 @@
     {
      "data": {
       "text/plain": [
-       "(milk, bananas, cookies)       [1]\n",
-       "(butter, tea, cookies)         [2]\n",
-       "(milk, bananas)             [0, 4]\n",
-       "(tea)                       [3, 4]\n",
+       "(milk, bananas)    [0, 1, 4]\n",
+       "(tea)              [2, 3, 4]\n",
+       "(cookies)             [1, 2]\n",
+       "(butter)                 [2]\n",
        "dtype: object"
       ]
      },
@@ -105,14 +178,100 @@
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>bananas</th>\n",
+       "      <th>butter</th>\n",
+       "      <th>cookies</th>\n",
+       "      <th>jelly</th>\n",
+       "      <th>milk</th>\n",
+       "      <th>tea</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "0              [bananas, milk]\n",
-       "1     [milk, bananas, cookies]\n",
-       "2       [cookies, butter, tea]\n",
-       "3                        [tea]\n",
-       "4         [milk, bananas, tea]\n",
-       "5    [jelly, bananas, cookies]\n",
-       "dtype: object"
+       "   bananas  butter  cookies  jelly  milk  tea\n",
+       "0        1       0        0      0     1    0\n",
+       "1        1       0        1      0     1    0\n",
+       "2        0       1        1      0     0    1\n",
+       "3        0       0        0      0     0    1\n",
+       "4        1       0        0      0     1    1\n",
+       "5        1       0        1      1     0    0"
       ]
      },
      "execution_count": 4,
@@ -121,7 +280,8 @@
     }
    ],
    "source": [
-    "D[len(D)] = ['jelly', 'bananas', 'cookies']\n",
+    "transactions.append(['jelly', 'bananas', 'cookies'])\n",
+    "D = TransactionEncoder().fit_transform(transactions)\n",
     "D"
    ]
   },
@@ -140,11 +300,11 @@
     {
      "data": {
       "text/plain": [
-       "(milk, bananas)              [0, 1, 4]\n",
-       "(cookies)                          [1]\n",
-       "(butter, tea, cookies)             [2]\n",
-       "(tea)                           [3, 4]\n",
-       "(jelly, bananas, cookies)          [5]\n",
+       "(jelly, bananas)          [5]\n",
+       "(tea, butter)             [2]\n",
+       "(milk, bananas)     [0, 1, 4]\n",
+       "(cookies)           [1, 2, 5]\n",
+       "(tea)                  [3, 4]\n",
        "dtype: object"
       ]
      },