diff --git a/examples/classification_with_NODE.ipynb b/examples/classification_with_NODE.ipynb
index cccf343e..6b8ef3fb 100644
--- a/examples/classification_with_NODE.ipynb
+++ b/examples/classification_with_NODE.ipynb
@@ -3,31 +3,10 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "metadata": {
-    "Collapsed": "false"
-   },
-   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir(\"..\")\n",
    "from sklearn.datasets import fetch_covtype\n",
-    "import random\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import lightgbm as lgb\n",
-    "from sklearn.metrics import accuracy_score, f1_score\n",
-    "%load_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "Collapsed": "false"
-   },
-   "outputs": [],
-   "source": [
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "import random\n",
@@ -35,29 +14,50 @@
    "import pandas as pd\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import accuracy_score, f1_score\n",
-    "import os\n",
-    "os.chdir(\"..\")\n",
+    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "Collapsed": "false"
-   },
   "source": [
    "# Utility Functions"
-   ]
+   ],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
-   "metadata": {
-    "Collapsed": "false"
-   },
-   "outputs": [],
   "source": [
+    "\n",
+    "def make_mixed_classification(n_samples, n_features, n_categories):\n",
+    "    X,y = make_classification(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5)\n",
+    "    cat_cols = random.choices(list(range(X.shape[-1])),k=n_categories)\n",
+    "    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]\n",
+    "    for col in cat_cols:\n",
+    "        X[:,col] = pd.qcut(X[:,col], q=4).codes.astype(int)\n",
+    "    col_names = [] \n",
+    "    num_col_names=[]\n",
+    "    cat_col_names=[]\n",
+    "    for i in range(X.shape[-1]):\n",
+    "        if i in cat_cols:\n",
+    "            col_names.append(f\"cat_col_{i}\")\n",
+    "            cat_col_names.append(f\"cat_col_{i}\")\n",
+    "        if i in num_cols:\n",
+    "            col_names.append(f\"num_col_{i}\")\n",
+    "            num_col_names.append(f\"num_col_{i}\")\n",
+    "    X = pd.DataFrame(X, columns=col_names)\n",
+    "    y = pd.Series(y, name=\"target\")\n",
+    "    data = X.join(y)\n",
+    "    return data, cat_col_names, num_col_names\n",
+    "    \n",
    "def load_classification_data():\n",
    "    dataset = fetch_covtype(data_home=\"data\")\n",
    "    data = np.hstack([dataset.data, dataset.target.reshape(-1, 1)])\n",
@@ -83,104 +83,106 @@
    "    val_acc = accuracy_score(y_true, y_pred)\n",
    "    val_f1 = f1_score(y_true, y_pred)\n",
    "    print(f\"{tag} Acc: {val_acc} | {tag} F1: {val_f1}\")"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "Collapsed": "false"
-   },
   "source": [
    "# Generate Synthetic Data\n",
    "\n",
    "First, let's create a synthetic dataset that is a mix of numerical and categorical features."
-   ]
+   ],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
-   "metadata": {
-    "Collapsed": "false"
-   },
-   "outputs": [],
   "source": [
    "data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)\n",
    "train, test = train_test_split(data, random_state=42)\n",
    "train, val = train_test_split(train, random_state=42)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
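
An aside on the `make_mixed_classification` helper added above: its categorical columns are manufactured by quantile-binning continuous features. A minimal, self-contained sketch of that step (plain numpy/pandas; nothing here depends on the notebook's state):

```python
import numpy as np
import pandas as pd

# pd.qcut cuts a continuous column into q equal-frequency bins; .codes turns
# the resulting Categorical into integer labels 0..q-1. This is exactly how
# the helper above converts selected continuous features into categoricals.
rng = np.random.default_rng(42)
x = rng.normal(size=1_000)

codes = pd.qcut(x, q=4).codes.astype(int)
print(sorted(pd.unique(codes)))          # [0, 1, 2, 3]
print(pd.Series(codes).value_counts())   # roughly 250 rows per bin
```

One caveat worth noting: `random.choices` samples column indices with replacement, so the helper can end up with fewer than `n_categories` distinct categorical columns; `random.sample` would guarantee exactly `n_categories`.
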
"metadata": { - "Collapsed": "false" - }, "source": [ "## Baseline\n", "\n", "Let's use the default LightGBM model as a baseline." - ] + ], + "metadata": { + "Collapsed": "false" + } }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "Collapsed": "false" - }, + "source": [ + "clf = lgb.LGBMClassifier(random_state=42)\n", + "clf.fit(train.drop(columns='target'), train['target'], categorical_feature=cat_col_names)\n", + "val_pred = clf.predict(val.drop(columns='target'))\n", + "print_metrics(val['target'], val_pred, \"Validation\")\n", + "test_pred = clf.predict(test.drop(columns='target'))\n", + "print_metrics(test['target'], test_pred, \"Holdout\")" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stderr", "text": [ - "Validation Acc: 0.9290666666666667 | Validation F1: 0.9285330467490596\n", - "Holdout Acc: 0.9344 | Holdout F1: 0.9346613545816733\n" + "/home/fonnesbeck/anaconda3/envs/pitch_effect/lib/python3.9/site-packages/lightgbm/basic.py:1702: UserWarning: Using categorical_feature in Dataset.\n", + " _log_warning('Using categorical_feature in Dataset.')\n" ] }, { - "name": "stderr", "output_type": "stream", + "name": "stdout", "text": [ - "D:\\miniconda3\\envs\\df_encoder\\lib\\site-packages\\lightgbm\\basic.py:1551: UserWarning: Using categorical_feature in Dataset.\n", - " warnings.warn('Using categorical_feature in Dataset.')\n" + "Validation Acc: 0.9328 | Validation F1: 0.9322580645161291\n", + "Holdout Acc: 0.9328 | Holdout F1: 0.9330677290836654\n" ] } ], - "source": [ - "clf = lgb.LGBMClassifier(random_state=42)\n", - "clf.fit(train.drop(columns='target'), train['target'], categorical_feature=cat_col_names)\n", - "val_pred = clf.predict(val.drop(columns='target'))\n", - "print_metrics(val['target'], val_pred, \"Validation\")\n", - "test_pred = clf.predict(test.drop(columns='target'))\n", - "print_metrics(test['target'], test_pred, \"Holdout\")" - ] + "metadata": { + "Collapsed": "false" + } }, { "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, "source": [ "# Importing the Library" - ] + ], + "metadata": { + "Collapsed": "false" + } }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "Collapsed": "false" - }, - "outputs": [], + "execution_count": 18, "source": [ "from pytorch_tabular import TabularModel\n", "from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig\n", "from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig\n", - "from pytorch_tabular.category_encoders import CategoricalEmbeddingTransformer" - ] + "from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer\n", + "\n" + ], + "outputs": [], + "metadata": { + "Collapsed": "false" + } }, { "cell_type": "markdown", - "metadata": { - "Collapsed": "false" - }, "source": [ "## Define the Configs\n", "\n", @@ -191,15 +193,14 @@ "* TrainerConfig - This let's you configure the training process by setting things like batch_size, epochs, early stopping, etc. The vast majority of parameters are directly borrowed from PyTorch Lightning and is passed to the underlying Trainer object during training\n", "* OptimizerConfig - This let's you define and use different Optimizers and LearningRate Schedulers. Standard PyTorch Optimizers and Learning RateSchedulers are supported. For custom optimizers, you can use the parameter in the fit method to overwrite this. 
  {
   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "Collapsed": "false"
-   },
-   "outputs": [],
+   "execution_count": 11,
   "source": [
    "data_config = DataConfig(\n",
    "    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented\n",
@@ -212,7 +213,8 @@
    "    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate\n",
    "    batch_size=1024,\n",
    "    max_epochs=1000,\n",
-    "    gpus=1, #index of the GPU to use. 0, means CPU\n",
+    "    auto_select_gpus=False,\n",
+    "    gpus=0, # number of GPUs to use; 0 means train on CPU\n",
    ")\n",
    "optimizer_config = OptimizerConfig()\n",
    "model_config = CategoryEmbeddingModelConfig(\n",
@@ -228,276 +230,182 @@
    "    optimizer_config=optimizer_config,\n",
    "    trainer_config=trainer_config,\n",
    ")"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {
+    "Collapsed": "false"
+   }
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "Collapsed": "true"
-   },
   "source": [
    "## Training the Model\n",
    "Now that we have defined the configs and the TabularModel, we just need to call the `fit` method and pass the train and test dataframes. We can also pass in a validation dataframe; if omitted, TabularModel will set aside 20% of the data for validation."
-   ]
+   ],
+   "metadata": {
+    "Collapsed": "true"
+   }
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "Collapsed": "false",
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
-    "tags": []
-   },
+   "execution_count": 12,
+   "source": [
+    "tabular_model.fit(train=train, test=test)"
+   ],
   "outputs": [
    {
-     "name": "stderr",
     "output_type": "stream",
+     "name": "stderr",
     "text": [
-      "D:\\miniconda3\\envs\\df_encoder\\lib\\site-packages\\pytorch_lightning\\utilities\\distributed.py:45: UserWarning: Checkpoint directory saved_models exists and is not empty. With save_top_k=1, all files in this directory will be deleted when a checkpoint is saved!\n",
-      "  warnings.warn(*args, **kwargs)\n",
-      "GPU available: True, used: False\n",
-      "GPU available: True, used: False\n",
-      "TPU available: False, using: 0 TPU cores\n",
+      "GPU available: False, used: False\n",
      "TPU available: False, using: 0 TPU cores\n",
-      "D:\\miniconda3\\envs\\df_encoder\\lib\\site-packages\\pytorch_lightning\\utilities\\distributed.py:45: UserWarning: GPU available but not used. Set the --gpus flag when calling the script.\n",
-      "  warnings.warn(*args, **kwargs)\n",
-      "GPU available: True, used: True\n",
-      "GPU available: True, used: True\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "TPU available: False, using: 0 TPU cores\n",
-      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
-      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
-      "\n",
-      "  | Name                   | Type             | Params\n",
-      "------------------------------------------------------------\n",
-      "0 | embedding_layers       | ModuleList       | 60    \n",
-      "1 | normalizing_batch_norm | BatchNorm1d      | 32    \n",
-      "2 | linear_layers          | Sequential       | 19.0 M\n",
-      "3 | loss                   | CrossEntropyLoss | 0     \n",
-      "\n",
-      "  | Name                   | Type             | Params\n",
-      "------------------------------------------------------------\n",
-      "0 | embedding_layers       | ModuleList       | 60    \n",
-      "1 | normalizing_batch_norm | BatchNorm1d      | 32    \n",
-      "2 | linear_layers          | Sequential       | 19.0 M\n",
-      "3 | loss                   | CrossEntropyLoss | 0     \n",
-      "\n",
-      "Finding best initial lr:   0%|          | 0/100 [00:00
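
The diff excerpt cuts off above in the middle of the learning-rate-finder output. For orientation, here is a minimal sketch of the steps that typically follow `fit` in this example: evaluating the trained model and reusing its learned embeddings via the `CategoricalEmbeddingTransformer` imported earlier. Method names follow the pytorch_tabular API as I understand it; treat this as a sketch rather than the notebook's exact remaining cells:

```python
# Evaluate on the holdout set and generate predictions. `evaluate` returns the
# configured metrics; `predict` returns a copy of the input dataframe with
# prediction columns appended.
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)

# CategoricalEmbeddingTransformer replaces raw category codes with the
# embeddings the trained model learned for them, producing a purely numeric
# dataframe that a GBM such as LightGBM can consume directly.
transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transformed = transformer.fit_transform(train)
test_transformed = transformer.transform(test)

clf = lgb.LGBMClassifier(random_state=42)
clf.fit(train_transformed.drop(columns="target"), train_transformed["target"])
test_pred = clf.predict(test_transformed.drop(columns="target"))
print_metrics(test_transformed["target"], test_pred, "Holdout")
```
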