Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 54 additions & 149 deletions 1_BoW_text_classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,42 +7,65 @@
"id": "KthJSHkGQR7Z"
},
"source": [
"<center><h1>Bag of Words Text Classification</h1></center>\n",
"# Bag of Words Text Classification\n",
"\n",
"In this tutorial we will show how to build a simple Bag of Words (BoW) text classifier using PyTorch. The classifier is trained on the IMDB movie reviews dataset. \n",
"\n",
"\n",
"<h4>\n",
"The concepts covered in this tutorial are: \n",
"<br>\n",
"<br> 1. NLP text <i><b>pre-processing</b></i>\n",
"<br>\n",
"<br> 2. Split of <i><b>training, validation and testing datasets</b></i>\n",
"<br>\n",
"<br> 3. How to build a simple <i><b>feed-forward neural net classifier</b></i> using PyTorch \n",
"<br>\n",
"<br> 4. Training the model and the balance of <i><b>Under-fitting v.s. Over-fitting</b></i> \n",
"<br>\n",
"<br> 5. <i><b>BoW</b></i> and <i><b>TF-IDF</b></i> text classifier \n",
"</h4>"
"## Concepts covered in this tutorial\n",
"1. NLP text pre-processing\n",
"2. Splitting the data into training, validation and testing datasets\n",
"3. How to build a simple feed-forward neural net classifier using PyTorch \n",
"4. Training the model and the balance of Under-fitting vs. Over-fitting\n",
"5. BoW and TF-IDF text classifier "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"colab": {},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"colab_type": "code",
"id": "P4HGMiy0QR7b"
"executionInfo": {
"elapsed": 704,
"status": "ok",
"timestamp": 1553183711589,
"user": {
"displayName": "",
"photoUrl": "",
"userId": ""
},
"user_tz": -60
},
"id": "ZniLdSpeQR7l",
"outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /home/suzil/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/suzil/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"!pip install pypeln -q"
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"!pip install googledrivedownloader -q"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
Expand All @@ -67,13 +90,11 @@
{
"data": {
"text/plain": [
"device(type='cuda')"
"device(type='cpu')"
]
},
"execution_count": 3,
"metadata": {
"tags": []
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
Expand All @@ -89,7 +110,6 @@
"import pandas as pd\n",
"from google_drive_downloader import GoogleDriveDownloader as gdd\n",
"from IPython.core.display import display, HTML\n",
"from pypeln import process as pr # multi-processing\n",
"from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF\n",
"from sklearn.metrics import classification_report\n",
"from tqdm import tqdm, tqdm_notebook # show progress bar\n",
Expand Down Expand Up @@ -119,65 +139,21 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 704,
"status": "ok",
"timestamp": 1553183711589,
"user": {
"displayName": "",
"photoUrl": "",
"userId": ""
},
"user_tz": -60
},
"id": "ZniLdSpeQR7l",
"outputId": "dc5cd6e2-401f-43da-c732-dffa7a7e8772"
"id": "j8-WlORVQR7n"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
"Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... "
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"nltk.download('stopwords')\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "j8-WlORVQR7n"
},
"outputs": [],
"source": [
"DATA_PATH = 'data/imdb_reviews.csv'\n",
"if not Path(DATA_PATH).is_file():\n",
Expand All @@ -187,24 +163,6 @@
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SNv1gn0yQR7p"
},
"outputs": [],
"source": [
"## To run locally\n",
"#DATA_PATH = '{path_to_file}/imdb_reviews.csv'\n",
"#df = pd.read_csv(\n",
"# DATA_PATH,\n",
"# encoding='ISO-8859-1',\n",
"#)"
]
},
{
"cell_type": "markdown",
"metadata": {
Expand All @@ -217,7 +175,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
Expand All @@ -238,63 +196,10 @@
"id": "FnsKvqrXQR7t",
"outputId": "c8e9a905-160f-42a5-da52-816cb2db6f17"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>review</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>Seeing this film for the first time twenty yea...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12361</th>\n",
" <td>I went and saw this movie last night after bei...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" review label\n",
"55 Seeing this film for the first time twenty yea... 0\n",
"12361 I went and saw this movie last night after bei... 1"
]
},
"execution_count": 5,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"df = pd.read_csv(DATA_PATH)\n",
"df.loc[[55, 12361], :]"
"df.sample(5)"
]
},
{
Expand Down Expand Up @@ -7262,7 +7167,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.7.1"
}
},
"nbformat": 4,
Expand Down