scoutbee · jeffrey-hsu · Apr 4, 2019 · Apr 4, 2019
diff --git a/1_BoW_text_classification.ipynb b/1_BoW_text_classification.ipynb
@@ -7,42 +7,65 @@
     "id": "KthJSHkGQR7Z"
    },
    "source": [
-    "<center><h1>Bag of Words Text Classification</h1></center>\n",
+    "# Bag of Words Text Classification\n",
     "\n",
     "In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n",
     "\n",
     "\n",
-    "<h4>\n",
-    "The concepts covered in this tutorial are: \n",
-    "<br>\n",
-    "<br> 1. NLP text <i><b>pre-processing</b></i>\n",
-    "<br>\n",
-    "<br> 2. Split of <i><b>training, validation and testing datasets</b></i>\n",
-    "<br>\n",
-    "<br> 3. How to build a simple <i><b>feed-forward neural net classifier</b></i> using PyTorch \n",
-    "<br>\n",
-    "<br> 4. Training the model and the balance of <i><b>Under-fitting v.s. Over-fitting</b></i> \n",
-    "<br>\n",
-    "<br> 5. <i><b>BoW</b></i> and  <i><b>TF-IDF</b></i> text classifier \n",
-    "</h4>"
+    "## Concepts covered in this tutorial\n",
+    "1. NLP text pre-processing\n",
+    "2. Splitting the data into training, validation and testing datasets\n",
+    "3. How to build a simple feed-forward neural net classifier using PyTorch \n",
+    "4. Training the model and the balance of Under-fitting vs. Over-fitting\n",
+    "5. BoW and TF-IDF text classifiers"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
-    "colab": {},
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 102
+    },
     "colab_type": "code",
-    "id": "P4HGMiy0QR7b"
+    "executionInfo": {
+     "elapsed": 704,
+     "status": "ok",
+     "timestamp": 1553183711589,
+     "user": {
+      "displayName": "",
+      "photoUrl": "",
+      "userId": ""
+     },
+     "user_tz": -60
+    },
+    "id": "ZniLdSpeQR7l",
+    "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "!pip install pypeln -q"
+    "import nltk\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')\n",
+    "\n",
+    "!pip install googledrivedownloader -q"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -67,13 +90,11 @@
     {
      "data": {
       "text/plain": [
-       "device(type='cuda')"
+       "device(type='cpu')"
       ]
      },
-     "execution_count": 3,
-     "metadata": {
-      "tags": []
-     },
+     "execution_count": 6,
+     "metadata": {},
      "output_type": "execute_result"
     }
    ],
@@ -89,7 +110,6 @@
     "import pandas as pd\n",
     "from google_drive_downloader import GoogleDriveDownloader as gdd\n",
     "from IPython.core.display import display, HTML\n",
-    "from pypeln import process as pr # multi-processing\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n",
     "from sklearn.metrics import classification_report\n",
     "from tqdm import tqdm, tqdm_notebook # show progress bar\n",
@@ -119,65 +139,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 102
-    },
+    "colab": {},
     "colab_type": "code",
-    "executionInfo": {
-     "elapsed": 704,
-     "status": "ok",
-     "timestamp": 1553183711589,
-     "user": {
-      "displayName": "",
-      "photoUrl": "",
-      "userId": ""
-     },
-     "user_tz": -60
-    },
-    "id": "ZniLdSpeQR7l",
-    "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
+    "id": "j8-WlORVQR7n"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
-      "[nltk_data]   Package stopwords is already up-to-date!\n",
-      "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
-      "[nltk_data]   Package wordnet is already up-to-date!\n"
+      "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... "
      ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {
-      "tags": []
-     },
-     "output_type": "execute_result"
     }
    ],
-   "source": [
-    "nltk.download('stopwords')\n",
-    "nltk.download('wordnet')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "j8-WlORVQR7n"
-   },
-   "outputs": [],
    "source": [
     "DATA_PATH = 'data/imdb_reviews.csv'\n",
     "if not Path(DATA_PATH).is_file():\n",
@@ -187,24 +163,6 @@
     "    )"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {},
-    "colab_type": "code",
-    "id": "SNv1gn0yQR7p"
-   },
-   "outputs": [],
-   "source": [
-    "## To run locally\n",
-    "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n",
-    "#df = pd.read_csv(\n",
-    "#    DATA_PATH,\n",
-    "#    encoding='ISO-8859-1',\n",
-    "#)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -217,7 +175,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -238,63 +196,10 @@
     "id": "FnsKvqrXQR7t",
     "outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17"
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>review</th>\n",
-       "      <th>label</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>55</th>\n",
-       "      <td>Seeing this film for the first time twenty yea...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12361</th>\n",
-       "      <td>I went and saw this movie last night after bei...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                  review  label\n",
-       "55     Seeing this film for the first time twenty yea...      0\n",
-       "12361  I went and saw this movie last night after bei...      1"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {
-      "tags": []
-     },
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "df = pd.read_csv(DATA_PATH)\n",
-    "df.loc[[55, 12361], :]"
+    "df.sample(5)"
    ]
   },
   {
@@ -7262,7 +7167,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.8"
+   "version": "3.7.1"
   }
  },
  "nbformat": 4,