diff --git a/1_BoW_text_classification.ipynb b/1_BoW_text_classification.ipynb index 4067b3e..88fb23c 100644 --- a/1_BoW_text_classification.ipynb +++ b/1_BoW_text_classification.ipynb @@ -7,42 +7,65 @@ "id": "KthJSHkGQR7Z" }, "source": [ - "

Bag of Words Text Classification

\n", + "# Bag of Words Text Classification\n", "\n", "In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on IMDB movie reviews dataset. \n", "\n", "\n", - "

\n", - "The concepts covered in this tutorial are: \n", - "
\n", - "
1. NLP text pre-processing\n", - "
\n", - "
2. Split of training, validation and testing datasets\n", - "
\n", - "
3. How to build a simple feed-forward neural net classifier using PyTorch \n", - "
\n", - "
4. Training the model and the balance of Under-fitting v.s. Over-fitting \n", - "
\n", - "
5. BoW and TF-IDF text classifier \n", - "

" + "## Concepts covered in this tutorial\n", + "1. NLP text pre-processing\n", + "2. Split of training, validation and testing datasets\n", + "3. How to build a simple feed-forward neural net classifier using PyTorch \n", + "4. Training the model and the balance of Under-fitting v.s. Over-fitting\n", + "5. BoW and TF-IDF text classifier " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { - "colab": {}, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 102 + }, "colab_type": "code", - "id": "P4HGMiy0QR7b" + "executionInfo": { + "elapsed": 704, + "status": "ok", + "timestamp": 1553183711589, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": -60 + }, + "id": "ZniLdSpeQR7l", + "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], "source": [ - "!pip install pypeln -q" + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')\n", + "\n", + "!pip install googledrivedownloader -q" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -67,13 +90,11 @@ { "data": { "text/plain": [ - "device(type='cuda')" + "device(type='cpu')" ] }, - "execution_count": 3, - "metadata": { - "tags": [] - }, + "execution_count": 6, + "metadata": {}, "output_type": "execute_result" } ], @@ -89,7 +110,6 @@ "import pandas as pd\n", "from google_drive_downloader import GoogleDriveDownloader as gdd\n", "from IPython.core.display import display, HTML\n", - "from pypeln import process as pr # multi-processing\n", "from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n", "from sklearn.metrics import classification_report\n", "from tqdm import tqdm, tqdm_notebook # show progress bar\n", @@ -119,65 +139,21 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 704, - "status": "ok", - "timestamp": 1553183711589, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": -60 - }, - "id": "ZniLdSpeQR7l", - "outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772" + "id": "j8-WlORVQR7n" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n" + "Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... " ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 2, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" } ], - "source": [ - "nltk.download('stopwords')\n", - "nltk.download('wordnet')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "j8-WlORVQR7n" - }, - "outputs": [], "source": [ "DATA_PATH = 'data/imdb_reviews.csv'\n", "if not Path(DATA_PATH).is_file():\n", @@ -187,24 +163,6 @@ " )" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "SNv1gn0yQR7p" - }, - "outputs": [], - "source": [ - "## To run locally\n", - "#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n", - "#df = pd.read_csv(\n", - "# DATA_PATH,\n", - "# encoding='ISO-8859-1',\n", - "#)" - ] - }, { "cell_type": "markdown", "metadata": { @@ -217,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -238,63 +196,10 @@ "id": "FnsKvqrXQR7t", "outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17" }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
reviewlabel
55Seeing this film for the first time twenty yea...0
12361I went and saw this movie last night after bei...1
\n", - "
" - ], - "text/plain": [ - " review label\n", - "55 Seeing this film for the first time twenty yea... 0\n", - "12361 I went and saw this movie last night after bei... 1" - ] - }, - "execution_count": 5, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df = pd.read_csv(DATA_PATH)\n", - "df.loc[[55, 12361], :]" + "df.sample(5)" ] }, { @@ -7262,7 +7167,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.1" } }, "nbformat": 4,