diff --git a/1_BoW_text_classification.ipynb b/1_BoW_text_classification.ipynb
index 4067b3e..88fb23c 100644
--- a/1_BoW_text_classification.ipynb
+++ b/1_BoW_text_classification.ipynb
@@ -7,42 +7,65 @@
"id": "KthJSHkGQR7Z"
},
"source": [
- "
Bag of Words Text Classification
\n",
+ "# Bag of Words Text Classification\n",
"\n",
"In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n",
"\n",
"\n",
- "\n",
- "The concepts covered in this tutorial are: \n",
- "
\n",
- "
1. NLP text pre-processing\n",
- "
\n",
- "
2. Split of training, validation and testing datasets\n",
- "
\n",
- "
3. How to build a simple feed-forward neural net classifier using PyTorch \n",
- "
\n",
- "
4. Training the model and the balance of Under-fitting v.s. Over-fitting \n",
- "
\n",
- "
5. BoW and TF-IDF text classifier \n",
- "
"
+ "## Concepts covered in this tutorial\n",
+ "1. NLP text pre-processing\n",
+ "2. Split of training, validation and testing datasets\n",
+ "3. How to build a simple feed-forward neural net classifier using PyTorch \n",
+    "4. Training the model and the balance of Under-fitting vs. Over-fitting\n",
+ "5. BoW and TF-IDF text classifier "
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
- "colab": {},
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 102
+ },
"colab_type": "code",
- "id": "P4HGMiy0QR7b"
+ "executionInfo": {
+ "elapsed": 704,
+ "status": "ok",
+ "timestamp": 1553183711589,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": -60
+ },
+ "id": "ZniLdSpeQR7l",
+ "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ }
+ ],
"source": [
- "!pip install pypeln -q"
+ "import nltk\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')\n",
+ "\n",
+ "!pip install googledrivedownloader -q"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -67,13 +90,11 @@
{
"data": {
"text/plain": [
- "device(type='cuda')"
+ "device(type='cpu')"
]
},
- "execution_count": 3,
- "metadata": {
- "tags": []
- },
+ "execution_count": 6,
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -89,7 +110,6 @@
"import pandas as pd\n",
"from google_drive_downloader import GoogleDriveDownloader as gdd\n",
"from IPython.core.display import display, HTML\n",
- "from pypeln import process as pr # multi-processing\n",
"from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n",
"from sklearn.metrics import classification_report\n",
"from tqdm import tqdm, tqdm_notebook # show progress bar\n",
@@ -119,65 +139,21 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 102
- },
+ "colab": {},
"colab_type": "code",
- "executionInfo": {
- "elapsed": 704,
- "status": "ok",
- "timestamp": 1553183711589,
- "user": {
- "displayName": "",
- "photoUrl": "",
- "userId": ""
- },
- "user_tz": -60
- },
- "id": "ZniLdSpeQR7l",
- "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
+ "id": "j8-WlORVQR7n"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
- "[nltk_data] Package stopwords is already up-to-date!\n",
- "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n"
+ "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... "
]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 2,
- "metadata": {
- "tags": []
- },
- "output_type": "execute_result"
}
],
- "source": [
- "nltk.download('stopwords')\n",
- "nltk.download('wordnet')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "j8-WlORVQR7n"
- },
- "outputs": [],
"source": [
"DATA_PATH = 'data/imdb_reviews.csv'\n",
"if not Path(DATA_PATH).is_file():\n",
@@ -187,24 +163,6 @@
" )"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "SNv1gn0yQR7p"
- },
- "outputs": [],
- "source": [
- "## To run locally\n",
- "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n",
- "#df = pd.read_csv(\n",
- "# DATA_PATH,\n",
- "# encoding='ISO-8859-1',\n",
- "#)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {
@@ -217,7 +175,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -238,63 +196,10 @@
"id": "FnsKvqrXQR7t",
"outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17"
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " review | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 55 | \n",
- " Seeing this film for the first time twenty yea... | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | 12361 | \n",
- " I went and saw this movie last night after bei... | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " review label\n",
- "55 Seeing this film for the first time twenty yea... 0\n",
- "12361 I went and saw this movie last night after bei... 1"
- ]
- },
- "execution_count": 5,
- "metadata": {
- "tags": []
- },
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df = pd.read_csv(DATA_PATH)\n",
- "df.loc[[55, 12361], :]"
+ "df.sample(5)"
]
},
{
@@ -7262,7 +7167,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.7.1"
}
},
"nbformat": 4,