diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb
index 8a98d80..b02811f 100644
--- a/Ch4/04_FastText_Example.ipynb
+++ b/Ch4/04_FastText_Example.ipynb
@@ -1,542 +1,522 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6FIToZHAhz2O"
+ },
+ "source": [
+ "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n",
+ "**Note**: This notebook uses an older version of fasttext."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
"colab": {
- "name": "FastText_Example.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
+ "base_uri": "https://localhost:8080/"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.10"
+ "id": "xC9f1uA-OX8J",
+ "outputId": "8596b211-ad7c-4efd-8fc1-bbbc4babd2e2"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: fasttext==0.9.2 in /usr/local/lib/python3.7/dist-packages (0.9.2)\n",
+ "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (56.1.0)\n",
+ "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (2.6.2)\n",
+ "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (1.19.5)\n"
+ ]
}
+ ],
+ "source": [
+ "!pip install fasttext==0.9.2"
+ ]
},
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "6FIToZHAhz2O"
- },
- "source": [
- "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n",
- "**Note**: This notebook uses an older version of fasttext."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "xC9f1uA-OX8J",
- "colab_type": "code",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "outputId": "02dfd2ba-06ee-403a-dde6-a2b3ae61014c"
- },
- "source": [
- "!pip install fasttext==0.9.2"
- ],
- "execution_count": 15,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Collecting fasttext==0.9.2\n",
- "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)\n",
- "\r\u001b[K |████▊ | 10kB 17.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 20kB 1.7MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |███████████████████ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████▋ | 61kB 2.2MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 71kB 1.9MB/s \n",
- "\u001b[?25hRequirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (2.5.0)\n",
- "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (49.2.0)\n",
- "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (1.18.5)\n",
- "Building wheels for collected packages: fasttext\n",
- " Building wheel for fasttext (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3020301 sha256=b65b73f96edcad8906e88913fc050bb28348ae2d57360e7df66ab675a73822e0\n",
- " Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592\n",
- "Successfully built fasttext\n",
- "Installing collected packages: fasttext\n",
- "Successfully installed fasttext-0.9.2\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "YKgZXvTGb61z",
- "colab": {}
- },
- "source": [
- "#necessary imports\n",
- "import pandas as pd"
- ],
- "execution_count": 1,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "l6CfW7C3L4EB",
- "colab_type": "code",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 588
- },
- "outputId": "f9708301-c566-4094-ad24-ecee460052db"
- },
- "source": [
- "# downloading the data\n",
- "!wget -P DATAPATH https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
- "\n",
- "# untaring the reuqired file\n",
- "!tar -xvf DATAPATH/dbpedia_csv.tar.gz -C DATAPATH\n",
- "\n",
- "# sneek peek in the folder structure\n",
- "!ls -lah DATAPATH\n"
- ],
- "execution_count": 2,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "--2020-08-10 15:31:17-- https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
- "Resolving github.com (github.com)... 140.82.114.4\n",
- "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n",
- "HTTP request sent, awaiting response... 301 Moved Permanently\n",
- "Location: https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz [following]\n",
- "--2020-08-10 15:31:17-- https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
- "Reusing existing connection to github.com:443.\n",
- "HTTP request sent, awaiting response... 302 Found\n",
- "Location: https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz [following]\n",
- "--2020-08-10 15:31:17-- https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... failed: Connection timed out.\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.64.133|:443... connected.\n",
- "HTTP request sent, awaiting response... 200 OK\n",
- "Length: 68431223 (65M) [application/octet-stream]\n",
- "Saving to: ‘DATAPATH/dbpedia_csv.tar.gz.1’\n",
- "\n",
- "dbpedia_csv.tar.gz. 100%[===================>] 65.26M 81.3MB/s in 0.8s \n",
- "\n",
- "2020-08-10 15:31:50 (81.3 MB/s) - ‘DATAPATH/dbpedia_csv.tar.gz.1’ saved [68431223/68431223]\n",
- "\n",
- "dbpedia_csv/\n",
- "dbpedia_csv/test.csv\n",
- "dbpedia_csv/classes.txt\n",
- "dbpedia_csv/train.csv\n",
- "dbpedia_csv/readme.txt\n",
- "total 328M\n",
- "drwxr-xr-x 3 root root 4.0K Aug 10 15:31 .\n",
- "drwxr-xr-x 1 root root 4.0K Aug 10 12:14 ..\n",
- "drwxrwxr-x 2 1000 1000 4.0K Mar 29 2015 dbpedia_csv\n",
- "-rw-r--r-- 1 root root 66M Aug 10 12:14 dbpedia_csv.tar.gz\n",
- "-rw-r--r-- 1 root root 66M Aug 10 15:31 dbpedia_csv.tar.gz.1\n",
- "-rw-r--r-- 1 root root 22M Aug 10 12:15 dbpedia_test.csv\n",
- "-rw-r--r-- 1 root root 175M Aug 10 12:15 dbpedia_train.csv\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "lMoRw3oQb62I",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 34
- },
- "outputId": "4a9015af-de57-41f3-d932-fa255e48063f"
- },
- "source": [
- "data_path = 'DATAPATH'\n",
- "# Loading train data\n",
- "train_file = data_path + '/dbpedia_csv/train.csv'\n",
- "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n",
- "# Loading test data\n",
- "test_file = data_path + '/dbpedia_csv/test.csv'\n",
- "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n",
- "# Data we have\n",
- "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n"
- ],
- "execution_count": 3,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Train:(560000, 3) Test:(70000, 3)\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "gaz226vXb62W",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 195
- },
- "outputId": "9136af68-a5d9-4041-d13c-52d26c38a59c"
- },
- "source": [
- "# Since we have no clue about the classes lets build one\n",
- "# Mapping from class number to class name\n",
- "class_dict={\n",
- " 1:'Company',\n",
- " 2:'EducationalInstitution',\n",
- " 3:'Artist',\n",
- " 4:'Athlete',\n",
- " 5:'OfficeHolder',\n",
- " 6:'MeanOfTransportation',\n",
- " 7:'Building',\n",
- " 8:'NaturalPlace',\n",
- " 9:'Village',\n",
- " 10:'Animal',\n",
- " 11:'Plant',\n",
- " 12:'Album',\n",
- " 13:'Film',\n",
- " 14:'WrittenWork'\n",
- " }\n",
- "\n",
- "# Mapping the classes\n",
- "df['class_name'] = df['class'].map(class_dict)\n",
- "df.head()"
- ],
- "execution_count": 4,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " class | \n",
- " name | \n",
- " description | \n",
- " class_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " E. D. Abbott Ltd | \n",
- " Abbott of Farnham E D Abbott Limited was a Br... | \n",
- " Company | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1 | \n",
- " Schwan-Stabilo | \n",
- " Schwan-STABILO is a German maker of pens for ... | \n",
- " Company | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 1 | \n",
- " Q-workshop | \n",
- " Q-workshop is a Polish company located in Poz... | \n",
- " Company | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 1 | \n",
- " Marvell Software Solutions Israel | \n",
- " Marvell Software Solutions Israel known as RA... | \n",
- " Company | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 1 | \n",
- " Bergan Mercy Medical Center | \n",
- " Bergan Mercy Medical Center is a hospital loc... | \n",
- " Company | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " class ... class_name\n",
- "0 1 ... Company\n",
- "1 1 ... Company\n",
- "2 1 ... Company\n",
- "3 1 ... Company\n",
- "4 1 ... Company\n",
- "\n",
- "[5 rows x 4 columns]"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 4
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "si7VC_Rub62a",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 269
- },
- "outputId": "8c8e6220-8028-460c-8042-95b9fa25152b"
- },
- "source": [
- "df[\"class_name\"].value_counts()"
- ],
- "execution_count": 5,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "Athlete 40000\n",
- "Animal 40000\n",
- "MeanOfTransportation 40000\n",
- "Artist 40000\n",
- "OfficeHolder 40000\n",
- "Building 40000\n",
- "Plant 40000\n",
- "WrittenWork 40000\n",
- "EducationalInstitution 40000\n",
- "Village 40000\n",
- "NaturalPlace 40000\n",
- "Company 40000\n",
- "Film 40000\n",
- "Album 40000\n",
- "Name: class_name, dtype: int64"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 5
- }
- ]
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "YKgZXvTGb61z"
+ },
+ "outputs": [],
+ "source": [
+ "#necessary imports\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import wget\n",
+ "import tarfile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "l6CfW7C3L4EB",
+ "outputId": "53b9c39f-41fb-4a51-af80-9abc1deb89a8"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "Sn-3kIqMb62d",
- "colab": {}
- },
- "source": [
- "# Lets do some cleaning of this text\n",
- "def clean_it(text,normalize=True):\n",
- " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n",
- " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n",
- " replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()\n",
- " \n",
- " # normalizing / encoding the text\n",
- " if normalize:\n",
- " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n",
- " \n",
- " return s\n",
- "\n",
- "# Now lets define a small function where we can use above cleaning on datasets\n",
- "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n",
- " # Defining the new data\n",
- " df = data[['name','description']].copy(deep=True)\n",
- " df['class'] = label_prefix + data['class'].astype(str) + ' '\n",
- " \n",
- " # cleaning it\n",
- " if cleanit:\n",
- " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n",
- " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n",
- " \n",
- " # shuffling it\n",
- " if shuffleit:\n",
- " df.sample(frac=1).reset_index(drop=True)\n",
- " \n",
- " return df"
- ],
- "execution_count": 6,
- "outputs": []
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2021-05-31 06:44:37-- https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
+ "Resolving github.com (github.com)... 192.30.255.113\n",
+ "Connecting to github.com (github.com)|192.30.255.113|:443... connected.\n",
+ "HTTP request sent, awaiting response... 301 Moved Permanently\n",
+ "Location: https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz [following]\n",
+ "--2021-05-31 06:44:37-- https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
+ "Reusing existing connection to github.com:443.\n",
+ "HTTP request sent, awaiting response... 302 Found\n",
+ "Location: https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz [following]\n",
+ "--2021-05-31 06:44:37-- https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 68431223 (65M) [application/octet-stream]\n",
+ "Saving to: ‘DATAPATH/dbpedia_csv.tar.gz.1’\n",
+ "\n",
+ "dbpedia_csv.tar.gz. 100%[===================>] 65.26M 124MB/s in 0.5s \n",
+ "\n",
+ "2021-05-31 06:44:38 (124 MB/s) - ‘DATAPATH/dbpedia_csv.tar.gz.1’ saved [68431223/68431223]\n",
+ "\n",
+ "dbpedia_csv/\n",
+ "dbpedia_csv/test.csv\n",
+ "dbpedia_csv/classes.txt\n",
+ "dbpedia_csv/train.csv\n",
+ "dbpedia_csv/readme.txt\n",
+ "total 328M\n",
+ "drwxr-xr-x 3 root root 4.0K May 31 06:44 .\n",
+ "drwxr-xr-x 1 root root 4.0K May 31 06:40 ..\n",
+ "drwxrwxr-x 2 1000 1000 4.0K Mar 29 2015 dbpedia_csv\n",
+ "-rw-r--r-- 1 root root 66M May 31 06:40 dbpedia_csv.tar.gz\n",
+ "-rw-r--r-- 1 root root 66M May 31 06:44 dbpedia_csv.tar.gz.1\n",
+ "-rw-r--r-- 1 root root 22M May 31 06:41 dbpedia_test.csv\n",
+ "-rw-r--r-- 1 root root 175M May 31 06:41 dbpedia_train.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "try :\n",
+ " \n",
+ " from google.colab import files\n",
+ " \n",
+ " # downloading the data\n",
+ " !wget -P DATAPATH https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n",
+ "\n",
+ " # untaring the required file\n",
+ " !tar -xvf DATAPATH/dbpedia_csv.tar.gz -C DATAPATH\n",
+ "\n",
+ " # sneek peek in the folder structure\n",
+ " !ls -lah DATAPATH\n",
+ " \n",
+ " # specifying the data_path\n",
+ " data_path = 'DATAPATH'\n",
+ " \n",
+ "except ModuleNotFoundError:\n",
+ " \n",
+ " if not os.path.exists(os.getcwd()+'\\\\Data\\\\dbpedia_csv') :\n",
+ " # downloading the data\n",
+ " url=\"https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\"\n",
+ " path=os.getcwd()+'\\Data'\n",
+ " wget.download(url,path)\n",
+ "\n",
+ " # untaring the required file\n",
+ " temp=path+'\\dbpedia_csv.tar.gz'\n",
+ " tar = tarfile.open(temp, \"r:gz\")\n",
+ " tar.extractall(path) \n",
+ " tar.close()\n",
+ " \n",
+ " # specifying the data_path\n",
+ " data_path='Data'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "lMoRw3oQb62I",
+ "outputId": "3a263fa1-1940-42d3-d1e6-aa59c5fb6f09"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "r_DRvdFcb62m",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 50
- },
- "outputId": "140ec680-99f8-4f2d-c00f-ebbb1cd5e9ae"
- },
- "source": [
- "%%time\n",
- "# Transform the datasets using the above clean functions\n",
- "df_train_cleaned = clean_df(df, True, True)\n",
- "df_test_cleaned = clean_df(df_test, True, True)"
- ],
- "execution_count": 7,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "CPU times: user 9 s, sys: 1.16 s, total: 10.2 s\n",
- "Wall time: 10.5 s\n"
- ],
- "name": "stdout"
- }
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train:(560000, 3) Test:(70000, 3)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Loading train data\n",
+ "train_file = data_path + '/dbpedia_csv/train.csv'\n",
+ "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n",
+ "# Loading test data\n",
+ "test_file = data_path + '/dbpedia_csv/test.csv'\n",
+ "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n",
+ "# Data we have\n",
+ "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 195
},
+ "id": "gaz226vXb62W",
+ "outputId": "d1877377-b282-4038-9f87-3589f6621597"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "imMZ9-Bkb62t",
- "colab": {}
- },
- "source": [
- "# Write files to disk as fastText classifier API reads files from disk.\n",
- "train_file = data_path + '/dbpedia_train.csv'\n",
- "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n",
- "\n",
- "test_file = data_path + '/dbpedia_test.csv'\n",
- "df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )\n"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " class | \n",
+ " name | \n",
+ " description | \n",
+ " class_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " E. D. Abbott Ltd | \n",
+ " Abbott of Farnham E D Abbott Limited was a Br... | \n",
+ " Company | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " Schwan-Stabilo | \n",
+ " Schwan-STABILO is a German maker of pens for ... | \n",
+ " Company | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1 | \n",
+ " Q-workshop | \n",
+ " Q-workshop is a Polish company located in Poz... | \n",
+ " Company | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1 | \n",
+ " Marvell Software Solutions Israel | \n",
+ " Marvell Software Solutions Israel known as RA... | \n",
+ " Company | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1 | \n",
+ " Bergan Mercy Medical Center | \n",
+ " Bergan Mercy Medical Center is a hospital loc... | \n",
+ " Company | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "execution_count": 8,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "bWZTSzd9b62x"
- },
- "source": [
- "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!"
+ "text/plain": [
+ " class ... class_name\n",
+ "0 1 ... Company\n",
+ "1 1 ... Company\n",
+ "2 1 ... Company\n",
+ "3 1 ... Company\n",
+ "4 1 ... Company\n",
+ "\n",
+ "[5 rows x 4 columns]"
]
+ },
+ "execution_count": 5,
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Since we have no clue about the classes lets build one\n",
+ "# Mapping from class number to class name\n",
+ "class_dict={\n",
+ " 1:'Company',\n",
+ " 2:'EducationalInstitution',\n",
+ " 3:'Artist',\n",
+ " 4:'Athlete',\n",
+ " 5:'OfficeHolder',\n",
+ " 6:'MeanOfTransportation',\n",
+ " 7:'Building',\n",
+ " 8:'NaturalPlace',\n",
+ " 9:'Village',\n",
+ " 10:'Animal',\n",
+ " 11:'Plant',\n",
+ " 12:'Album',\n",
+ " 13:'Film',\n",
+ " 14:'WrittenWork'\n",
+ " }\n",
+ "\n",
+ "# Mapping the classes\n",
+ "df['class_name'] = df['class'].map(class_dict)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "si7VC_Rub62a",
+ "outputId": "9acc53e2-c417-478f-e0a2-61b9ee57cadc"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "a-H1wouCb62x",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 52
- },
- "outputId": "c4f88ae8-0314-480e-e201-6a905a0b3fdb"
- },
- "source": [
- "%%time\n",
- "## Using fastText for feature extraction and training\n",
- "from fasttext import train_supervised \n",
- "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n",
- "label_prefix refers to the prefix before label string in the dataset.\n",
- "default is __label__. In our dataset, it is __class__. \n",
- "There are several other parameters which can be seen in: \n",
- "https://pypi.org/project/fasttext/\n",
- "\"\"\"\n",
- "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)"
- ],
- "execution_count": 18,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "CPU times: user 1h 30min 19s, sys: 25 s, total: 1h 30min 44s\n",
- "Wall time: 46min 33s\n"
- ],
- "name": "stdout"
- }
+ "data": {
+ "text/plain": [
+ "Building 40000\n",
+ "Animal 40000\n",
+ "Village 40000\n",
+ "OfficeHolder 40000\n",
+ "MeanOfTransportation 40000\n",
+ "Company 40000\n",
+ "Athlete 40000\n",
+ "NaturalPlace 40000\n",
+ "Plant 40000\n",
+ "EducationalInstitution 40000\n",
+ "WrittenWork 40000\n",
+ "Album 40000\n",
+ "Artist 40000\n",
+ "Film 40000\n",
+ "Name: class_name, dtype: int64"
]
+ },
+ "execution_count": 6,
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"class_name\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "Sn-3kIqMb62d"
+ },
+ "outputs": [],
+ "source": [
+ "# Lets do some cleaning of this text\n",
+ "def clean_it(text,normalize=True):\n",
+ " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n",
+ " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n",
+ " replace(')',' ) ').replace('!',' ! ').replace('?',' ? ').replace(':',' ').replace(';',' ').lower()\n",
+ " \n",
+ " # normalizing / encoding the text\n",
+ " if normalize:\n",
+ " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n",
+ " \n",
+ " return s\n",
+ "\n",
+ "# Now lets define a small function where we can use above cleaning on datasets\n",
+ "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n",
+ " # Defining the new data\n",
+ " df = data[['name','description']].copy(deep=True)\n",
+ " df['class'] = label_prefix + data['class'].astype(str) + ' '\n",
+ " \n",
+ " # cleaning it\n",
+ " if cleanit:\n",
+ " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n",
+ " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n",
+ " \n",
+ " # shuffling it\n",
+ " if shuffleit:\n",
+ " df.sample(frac=1).reset_index(drop=True)\n",
+ " \n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "r_DRvdFcb62m",
+ "outputId": "59a687ed-359a-4c95-b26f-77fcc084ad8b"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "sAyN3ZDbQFq-",
- "colab_type": "code",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 106
- },
- "outputId": "7b121c71-6605-4e10-86e9-753a24ae02a1"
- },
- "source": [
- "for k in range(1,6):\n",
- " results = model.test(test_file,k=k)\n",
- " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")"
- ],
- "execution_count": 19,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Test Samples: 70000 Precision@1 : 93.2914 Recall@1 : 93.2914\n",
- "Test Samples: 70000 Precision@2 : 48.7243 Recall@2 : 97.4486\n",
- "Test Samples: 70000 Precision@3 : 32.6376 Recall@3 : 97.9129\n",
- "Test Samples: 70000 Precision@4 : 24.6793 Recall@4 : 98.7171\n",
- "Test Samples: 70000 Precision@5 : 19.8411 Recall@5 : 99.2057\n"
- ],
- "name": "stdout"
- }
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 4.31 s, sys: 196 ms, total: 4.5 s\n",
+ "Wall time: 4.5 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Transform the datasets using the above clean functions\n",
+ "df_train_cleaned = clean_df(df, True, True)\n",
+ "df_test_cleaned = clean_df(df_test, True, True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "imMZ9-Bkb62t"
+ },
+ "outputs": [],
+ "source": [
+ "# Write files to disk as fastText classifier API reads files from disk.\n",
+ "train_file = data_path + '/dbpedia_train.csv'\n",
+ "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n",
+ "\n",
+ "test_file = data_path + '/dbpedia_test.csv'\n",
+ "df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bWZTSzd9b62x"
+ },
+ "source": [
+ "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "a-H1wouCb62x",
+ "outputId": "1d4d5272-adc8-4ed9-e6a1-002e79b4d147"
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "nrxSYRs3b621"
- },
- "source": [
- "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1h 4s, sys: 13.6 s, total: 1h 18s\n",
+ "Wall time: 30min 41s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "## Using fastText for feature extraction and training\n",
+ "from fasttext import train_supervised \n",
+ "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n",
+ "label_prefix refers to the prefix before label string in the dataset.\n",
+ "default is __label__. In our dataset, it is __class__. \n",
+ "There are several other parameters which can be seen in: \n",
+ "https://pypi.org/project/fasttext/\n",
+ "\"\"\"\n",
+ "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
+ "id": "sAyN3ZDbQFq-",
+ "outputId": "6f861f1f-a174-495c-97eb-db149fa73766"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "Bp9w8RScruz7",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- ""
- ],
- "execution_count": null,
- "outputs": []
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test Samples: 70000 Precision@1 : 94.0343 Recall@1 : 94.0343\n",
+ "Test Samples: 70000 Precision@2 : 48.4336 Recall@2 : 96.8671\n",
+ "Test Samples: 70000 Precision@3 : 32.3905 Recall@3 : 97.1714\n",
+ "Test Samples: 70000 Precision@4 : 24.6318 Recall@4 : 98.5271\n",
+ "Test Samples: 70000 Precision@5 : 19.8137 Recall@5 : 99.0686\n"
+ ]
}
- ]
-}
\ No newline at end of file
+ ],
+ "source": [
+ "for k in range(1,6):\n",
+ " results = model.test(test_file,k=k)\n",
+ " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nrxSYRs3b621"
+ },
+ "source": [
+ "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!"
+ ]
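+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A rough sketch of the LogisticRegression baseline suggested above, assuming scikit-learn is available\n",
+ "# and that df_train_cleaned / df_test_cleaned from the earlier cells are still in memory.\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Bag-of-words TF-IDF features over the cleaned name + description text\n",
+ "vectorizer = TfidfVectorizer(max_features=50000)\n",
+ "X_train = vectorizer.fit_transform(df_train_cleaned['name'] + ' ' + df_train_cleaned['description'])\n",
+ "X_test = vectorizer.transform(df_test_cleaned['name'] + ' ' + df_test_cleaned['description'])\n",
+ "\n",
+ "logreg = LogisticRegression(max_iter=1000)\n",
+ "logreg.fit(X_train, df_train_cleaned['class'])\n",
+ "print('LogisticRegression accuracy:', accuracy_score(df_test_cleaned['class'], logreg.predict(X_test)))"
+ ]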
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "name": "04_FastText_Example.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/Ch4/05_DeepNN_Example.ipynb b/Ch4/05_DeepNN_Example.ipynb
index a93c032..b771228 100644
--- a/Ch4/05_DeepNN_Example.ipynb
+++ b/Ch4/05_DeepNN_Example.ipynb
@@ -1,513 +1,557 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.5"
- },
- "colab": {
- "name": "DeepNN_Example.ipynb",
- "provenance": []
- }
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "aLNg_Puse6EX"
+ },
+ "source": [
+ "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. "
+ ]
},
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "aLNg_Puse6EX",
- "colab_type": "text"
- },
- "source": [
- "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "xqUcb7NBb5--",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "#Make the necessary imports\n",
- "import os\n",
- "import sys\n",
- "import numpy as np\n",
- "from keras.preprocessing.text import Tokenizer\n",
- "from keras.preprocessing.sequence import pad_sequences\n",
- "from keras.utils import to_categorical\n",
- "from keras.layers import Dense, Input, GlobalMaxPooling1D\n",
- "from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n",
- "from keras.models import Model, Sequential\n",
- "from keras.initializers import Constant"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "0MqW5vWwfiCP",
- "colab_type": "text"
- },
- "source": [
- "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "k6SeU1jIb5_E",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "#Declaring some of the constants we will use\n",
- "BASE_DIR = 'DATAPATH' #change this to your local folder with these below datasets\n",
- "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')#source: https://nlp.stanford.edu/projects/glove/\n",
- "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train') #source: http://ai.stanford.edu/~amaas/data/sentiment/\n",
- "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test') \n",
- "\n",
- "#Within these, I only have a pos/ and a neg/ folder containing text files \n",
- "MAX_SEQUENCE_LENGTH = 1000\n",
- "MAX_NUM_WORDS = 20000 \n",
- "EMBEDDING_DIM = 100 \n",
- "VALIDATION_SPLIT = 0.2\n",
- "\n",
- "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n",
- "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EmifkoA8b5_N",
- "colab_type": "text"
- },
- "source": [
- "### Loading and Preprocessing\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "WI4O1usEb5_O",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "#Function to load the data from the dataset into the notebook. Will be called twice - for train and test.\n",
- "def get_data(data_dir):\n",
- " texts = [] # list of text samples\n",
- " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n",
- " labels = [] # list of label ids\n",
- " for name in sorted(os.listdir(data_dir)):\n",
- " path = os.path.join(data_dir, name)\n",
- " if os.path.isdir(path):\n",
- " label_id = labels_index[name]\n",
- " for fname in sorted(os.listdir(path)):\n",
- " fpath = os.path.join(path, fname)\n",
- " text = open(fpath).read()\n",
- " texts.append(text)\n",
- " labels.append(label_id)\n",
- " return texts, labels\n",
- "\n",
- "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n",
- "test_texts, test_labels = get_data(TEST_DATA_DIR)\n",
- "labels_index = {'pos':1, 'neg':0} \n",
- "\n",
- "#Just to see how the data looks like. \n",
- "#print(train_texts[0])\n",
- "#print(train_labels[0])\n",
- "#print(test_texts[24999])\n",
- "#print(test_labels[24999])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "TJy6mEqub5_X",
- "colab_type": "code",
- "colab": {},
- "outputId": "6f30eb12-b0e2-47d0-968c-bd13fb806dbc"
- },
- "source": [
- "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer\n",
- "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.\n",
- "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)\n",
- "tokenizer.fit_on_texts(train_texts)\n",
- "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes\n",
- "test_sequences = tokenizer.texts_to_sequences(test_texts)\n",
- "word_index = tokenizer.word_index\n",
- "print('Found %s unique tokens.' % len(word_index))\n"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Found 88582 unique tokens.\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "_e0V1-bBb5_d",
- "colab_type": "code",
- "colab": {},
- "outputId": "deb3c74b-30b8-451a-cc37-49df30520da1"
- },
- "source": [
- "#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier\n",
- " #initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n",
- "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
- "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
- "trainvalid_labels = to_categorical(np.asarray(train_labels))\n",
- "test_labels = to_categorical(np.asarray(test_labels))\n",
- "\n",
- "# split the training data into a training set and a validation set\n",
- "indices = np.arange(trainvalid_data.shape[0])\n",
- "np.random.shuffle(indices)\n",
- "trainvalid_data = trainvalid_data[indices]\n",
- "trainvalid_labels = trainvalid_labels[indices]\n",
- "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n",
- "x_train = trainvalid_data[:-num_validation_samples]\n",
- "y_train = trainvalid_labels[:-num_validation_samples]\n",
- "x_val = trainvalid_data[-num_validation_samples:]\n",
- "y_val = trainvalid_labels[-num_validation_samples:]\n",
- "#This is the data we will use for CNN and RNN training\n",
- "print('Splitting the train data into train and valid is done')"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Splitting the train data into train and valid is done\n"
- ],
- "name": "stdout"
- }
- ]
- },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "xqUcb7NBb5--"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "WUHqg2vvb5_l",
- "colab_type": "code",
- "colab": {},
- "outputId": "32c014cc-cee4-4a84-896b-7db39a24eb21"
- },
- "source": [
- "print('Preparing embedding matrix.')\n",
- "\n",
- "# first, build index mapping words in the embeddings set\n",
- "# to their embedding vector\n",
- "embeddings_index = {}\n",
- "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:\n",
- " for line in f:\n",
- " values = line.split()\n",
- " word = values[0]\n",
- " coefs = np.asarray(values[1:], dtype='float32')\n",
- " embeddings_index[word] = coefs\n",
- "\n",
- "print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))\n",
- "#print(embeddings_index[\"google\"])\n",
- "\n",
- "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n",
- "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n",
- "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n",
- "for word, i in word_index.items():\n",
- " if i > MAX_NUM_WORDS:\n",
- " continue\n",
- " embedding_vector = embeddings_index.get(word)\n",
- " if embedding_vector is not None:\n",
- " # words not found in embedding index will be all-zeros.\n",
- " embedding_matrix[i] = embedding_vector\n",
- "\n",
- "# load these pre-trained word embeddings into an Embedding layer\n",
- "# note that we set trainable = False so as to keep the embeddings fixed\n",
- "embedding_layer = Embedding(num_words,\n",
- " EMBEDDING_DIM,\n",
- " embeddings_initializer=Constant(embedding_matrix),\n",
- " input_length=MAX_SEQUENCE_LENGTH,\n",
- " trainable=False)\n",
- "print(\"Preparing of embedding matrix is done\")"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Preparing embedding matrix.\n",
- "Found 400000 word vectors in Glove embeddings.\n",
- "Preparing of embedding matrix is done\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vEastnX8gdxR",
- "colab_type": "text"
- },
- "source": [
- "### 1D CNN Model with pre-trained embedding"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "TTY-4K-Ob5_t",
- "colab_type": "code",
- "colab": {},
- "outputId": "22425d40-4639-476a-872d-2916e9771195"
- },
- "source": [
- "print('Define a 1D CNN model.')\n",
- "\n",
- "cnnmodel = Sequential()\n",
- "cnnmodel.add(embedding_layer)\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(MaxPooling1D(5))\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(MaxPooling1D(5))\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(GlobalMaxPooling1D())\n",
- "cnnmodel.add(Dense(128, activation='relu'))\n",
- "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n",
- "\n",
- "cnnmodel.compile(loss='categorical_crossentropy',\n",
- " optimizer='rmsprop',\n",
- " metrics=['acc'])\n",
- "#Train the model. Tune to validation set. \n",
- "cnnmodel.fit(x_train, y_train,\n",
- " batch_size=128,\n",
- " epochs=1, validation_data=(x_val, y_val))\n",
- "#Evaluate on test set:\n",
- "score, acc = cnnmodel.evaluate(test_data, test_labels)\n",
- "print('Test accuracy with CNN:', acc)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Define a 1D CNN model.\n",
- "Train on 20000 samples, validate on 5000 samples\n",
- "Epoch 1/1\n",
- "20000/20000 [==============================] - 88s 4ms/step - loss: 0.6868 - acc: 0.5985 - val_loss: 0.5843 - val_acc: 0.6858\n",
- "25000/25000 [==============================] - 42s 2ms/step\n",
- "Test accuracy with CNN: 0.68772\n"
- ],
- "name": "stdout"
- }
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using TensorFlow backend.\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Make the necessary imports\n",
+ "import os\n",
+ "import sys\n",
+ "import numpy as np\n",
+ "import tarfile\n",
+ "import wget\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\") \n",
+ "from zipfile import ZipFile\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.preprocessing.sequence import pad_sequences\n",
+ "from keras.utils import to_categorical\n",
+ "from keras.layers import Dense, Input, GlobalMaxPooling1D\n",
+ "from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n",
+ "from keras.models import Model, Sequential\n",
+ "from keras.initializers import Constant"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "0MqW5vWwfiCP"
+ },
+ "source": [
+ "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " \n",
+ " from google.colab import files\n",
+ " \n",
+ " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n",
+ " !unzip DATAPATH/glove.6B.zip -C DATAPATH\n",
+ " \n",
+ " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n",
+ " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n",
+ " \n",
+ " BASE_DIR = 'DATAPATH'\n",
+ " \n",
+ "except ModuleNotFoundError:\n",
+ " \n",
+ " if not os.path.exists(os.getcwd()+'\\\\Data\\\\glove.6B'):\n",
+ " os.makdir(os.getcwd()+'\\\\Data\\\\glove.6B')\n",
+ " \n",
+ " url='http://nlp.stanford.edu/data/glove.6B.zip' \n",
+ " path=os.getcwd()+'\\Data' \n",
+ " wget.download(url,path) \n",
+ " \n",
+ " temp=path+'\\glove.6B.zip' \n",
+ " file = ZipFile(temp) \n",
+ " file.extractall(path+'\\glove.6B') \n",
+ " file.close()\n",
+ " \n",
+ " \n",
+ " \n",
+ " if not os.path.exists(os.getcwd()+'\\\\Data\\\\aclImdb'):\n",
+ " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n",
+ " path=os.getcwd()+'\\Data' \n",
+ " wget.download(url,path)\n",
+ " \n",
+ " temp=path+'\\aclImdb_v1.tar.gz' \n",
+ " tar = tarfile.open(temp, \"r:gz\")\n",
+ " tar.extractall(path) \n",
+ " tar.close()\n",
+ " \n",
+ " BASE_DIR = 'Data'\n",
+ "\n",
+ "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n",
+ "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb\\\\train')\n",
+ "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb\\\\test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Within these, I only have a pos/ and a neg/ folder containing text files \n",
+ "MAX_SEQUENCE_LENGTH = 1000\n",
+ "MAX_NUM_WORDS = 20000 \n",
+ "EMBEDDING_DIM = 100 \n",
+ "VALIDATION_SPLIT = 0.2\n",
+ "\n",
+ "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n",
+ "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "EmifkoA8b5_N"
+ },
+ "source": [
+ "### Loading and Preprocessing\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "WI4O1usEb5_O"
+ },
+ "outputs": [],
+ "source": [
+ "#Function to load the data from the dataset into the notebook. Will be called twice - for train and test.\n",
+ "def get_data(data_dir):\n",
+ " texts = [] # list of text samples\n",
+ " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n",
+ " labels = [] # list of label ids\n",
+ " for name in sorted(os.listdir(data_dir)):\n",
+ " path = os.path.join(data_dir, name)\n",
+ " if os.path.isdir(path):\n",
+ " if name=='pos' or name=='neg':\n",
+ " label_id = labels_index[name]\n",
+ " for fname in sorted(os.listdir(path)):\n",
+ " fpath = os.path.join(path, fname)\n",
+ " text = open(fpath,encoding='utf8').read()\n",
+ " texts.append(text)\n",
+ " labels.append(label_id)\n",
+ " return texts, labels\n",
+ "\n",
+ "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n",
+ "test_texts, test_labels = get_data(TEST_DATA_DIR)\n",
+ "labels_index = {'pos':1, 'neg':0} \n",
+ "\n",
+ "#Just to see how the data looks like. \n",
+ "#print(train_texts[0])\n",
+ "#print(train_labels[0])\n",
+ "#print(test_texts[24999])\n",
+ "#print(test_labels[24999])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "VdDj2FJzgi_W",
- "colab_type": "text"
- },
- "source": [
- "### 1D CNN model with training your own embedding"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Found 88582 unique tokens.\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer \n",
+ "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. \n",
+ "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) \n",
+ "tokenizer.fit_on_texts(train_texts) \n",
+ "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes \n",
+ "test_sequences = tokenizer.texts_to_sequences(test_texts) \n",
+ "word_index = tokenizer.word_index \n",
+ "print('Found %s unique tokens.' % len(word_index))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "_e0V1-bBb5_d",
+ "outputId": "deb3c74b-30b8-451a-cc37-49df30520da1"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "zI0bISwRb5_w",
- "colab_type": "code",
- "colab": {},
- "outputId": "1e96c6be-66c2-4a59-da1f-fc80e90889bd"
- },
- "source": [
- "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n",
- "cnnmodel = Sequential()\n",
- "cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(MaxPooling1D(5))\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(MaxPooling1D(5))\n",
- "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
- "cnnmodel.add(GlobalMaxPooling1D())\n",
- "cnnmodel.add(Dense(128, activation='relu'))\n",
- "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n",
- "\n",
- "cnnmodel.compile(loss='categorical_crossentropy',\n",
- " optimizer='rmsprop',\n",
- " metrics=['acc'])\n",
- "#Train the model. Tune to validation set. \n",
- "cnnmodel.fit(x_train, y_train,\n",
- " batch_size=128,\n",
- " epochs=1, validation_data=(x_val, y_val))\n",
- "#Evaluate on test set:\n",
- "score, acc = cnnmodel.evaluate(test_data, test_labels)\n",
- "print('Test accuracy with CNN:', acc)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n",
- "Train on 20000 samples, validate on 5000 samples\n",
- "Epoch 1/1\n",
- "20000/20000 [==============================] - 120s 6ms/step - loss: 0.5233 - acc: 0.7028 - val_loss: 0.3207 - val_acc: 0.8638\n",
- "25000/25000 [==============================] - 43s 2ms/step\n",
- "Test accuracy with CNN: 0.84352\n"
- ],
- "name": "stdout"
- }
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Splitting the train data into train and valid is done\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier\n",
+ "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n",
+ "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
+ "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
+ "trainvalid_labels = to_categorical(np.asarray(train_labels))\n",
+ "test_labels = to_categorical(np.asarray(test_labels))\n",
+ "\n",
+ "# split the training data into a training set and a validation set\n",
+ "indices = np.arange(trainvalid_data.shape[0])\n",
+ "np.random.shuffle(indices)\n",
+ "trainvalid_data = trainvalid_data[indices]\n",
+ "trainvalid_labels = trainvalid_labels[indices]\n",
+ "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n",
+ "x_train = trainvalid_data[:-num_validation_samples]\n",
+ "y_train = trainvalid_labels[:-num_validation_samples]\n",
+ "x_val = trainvalid_data[-num_validation_samples:]\n",
+ "y_val = trainvalid_labels[-num_validation_samples:]\n",
+ "#This is the data we will use for CNN and RNN training\n",
+ "print('Splitting the train data into train and valid is done')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "WUHqg2vvb5_l",
+ "outputId": "32c014cc-cee4-4a84-896b-7db39a24eb21"
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "6GwhXpmSgt4H",
- "colab_type": "text"
- },
- "source": [
- "### LSTM Model with training your own embedding "
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Preparing embedding matrix.\n",
+ "Found 400000 word vectors in Glove embeddings.\n",
+ "Preparing of embedding matrix is done\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Preparing embedding matrix.')\n",
+ "\n",
+ "# first, build index mapping words in the embeddings set\n",
+ "# to their embedding vector\n",
+ "embeddings_index = {}\n",
+ "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:\n",
+ " for line in f:\n",
+ " values = line.split()\n",
+ " word = values[0]\n",
+ " coefs = np.asarray(values[1:], dtype='float32')\n",
+ " embeddings_index[word] = coefs\n",
+ "\n",
+ "print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))\n",
+ "#print(embeddings_index[\"google\"])\n",
+ "\n",
+ "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n",
+ "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n",
+ "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n",
+ "for word, i in word_index.items():\n",
+ " if i > MAX_NUM_WORDS:\n",
+ " continue\n",
+ " embedding_vector = embeddings_index.get(word)\n",
+ " if embedding_vector is not None:\n",
+ " # words not found in embedding index will be all-zeros.\n",
+ " embedding_matrix[i] = embedding_vector\n",
+ "\n",
+ "# load these pre-trained word embeddings into an Embedding layer\n",
+ "# note that we set trainable = False so as to keep the embeddings fixed\n",
+ "embedding_layer = Embedding(num_words,\n",
+ " EMBEDDING_DIM,\n",
+ " embeddings_initializer=Constant(embedding_matrix),\n",
+ " input_length=MAX_SEQUENCE_LENGTH,\n",
+ " trainable=False)\n",
+ "print(\"Preparing of embedding matrix is done\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "vEastnX8gdxR"
+ },
+ "source": [
+ "### 1D CNN Model with pre-trained embedding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "TTY-4K-Ob5_t",
+ "outputId": "22425d40-4639-476a-872d-2916e9771195"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "SvBt2Brib5_4",
- "colab_type": "code",
- "colab": {},
- "outputId": "1436903e-66f2-4add-d04b-143e551c534d"
- },
- "source": [
- "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n",
- "\n",
- "#model\n",
- "rnnmodel = Sequential()\n",
- "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n",
- "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n",
- "rnnmodel.add(Dense(2, activation='sigmoid'))\n",
- "rnnmodel.compile(loss='binary_crossentropy',\n",
- " optimizer='adam',\n",
- " metrics=['accuracy'])\n",
- "print('Training the RNN')\n",
- "\n",
- "rnnmodel.fit(x_train, y_train,\n",
- " batch_size=32,\n",
- " epochs=1,\n",
- " validation_data=(x_val, y_val))\n",
- "score, acc = rnnmodel.evaluate(test_data, test_labels,\n",
- " batch_size=32)\n",
- "print('Test accuracy with RNN:', acc)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Defining and training an LSTM model, training embedding layer on the fly\n",
- "Training the RNN\n",
- "Train on 20000 samples, validate on 5000 samples\n",
- "Epoch 1/1\n",
- "20000/20000 [==============================] - 435s 22ms/step - loss: 0.5136 - acc: 0.7484 - val_loss: 0.4333 - val_acc: 0.8007\n",
- "25000/25000 [==============================] - 103s 4ms/step\n",
- "Test accuracy with RNN: 0.79736\n"
- ],
- "name": "stdout"
- }
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Define a 1D CNN model.\n",
+ "WARNING:tensorflow:From D:\\Anaconda\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n",
+ "\n",
+ "WARNING:tensorflow:From D:\\Anaconda\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n",
+ "\n",
+ "Train on 20000 samples, validate on 5000 samples\n",
+ "Epoch 1/1\n",
+ "20000/20000 [==============================] - 49s 2ms/step - loss: 0.6787 - acc: 0.6084 - val_loss: 0.5057 - val_acc: 0.7748\n",
+ "25000/25000 [==============================] - 15s 617us/step\n",
+ "Test accuracy with CNN: 0.7612800002098083\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Define a 1D CNN model.')\n",
+ "\n",
+ "cnnmodel = Sequential()\n",
+ "cnnmodel.add(embedding_layer)\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(MaxPooling1D(5))\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(MaxPooling1D(5))\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(GlobalMaxPooling1D())\n",
+ "cnnmodel.add(Dense(128, activation='relu'))\n",
+ "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n",
+ "\n",
+ "cnnmodel.compile(loss='categorical_crossentropy',\n",
+ " optimizer='rmsprop',\n",
+ " metrics=['acc'])\n",
+ "#Train the model. Tune to validation set. \n",
+ "cnnmodel.fit(x_train, y_train,\n",
+ " batch_size=128,\n",
+ " epochs=1, validation_data=(x_val, y_val))\n",
+ "#Evaluate on test set:\n",
+ "score, acc = cnnmodel.evaluate(test_data, test_labels)\n",
+ "print('Test accuracy with CNN:', acc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "VdDj2FJzgi_W"
+ },
+ "source": [
+ "### 1D CNN model with training your own embedding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "zI0bISwRb5_w",
+ "outputId": "1e96c6be-66c2-4a59-da1f-fc80e90889bd"
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "tJYzsZFSg9z-",
- "colab_type": "text"
- },
- "source": [
- "### LSTM Model using pre-trained Embedding Layer"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n",
+ "Train on 20000 samples, validate on 5000 samples\n",
+ "Epoch 1/1\n",
+ "20000/20000 [==============================] - 67s 3ms/step - loss: 0.5023 - acc: 0.7254 - val_loss: 0.2894 - val_acc: 0.8848\n",
+ "25000/25000 [==============================] - 18s 708us/step\n",
+ "Test accuracy with CNN: 0.8749200105667114\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n",
+ "cnnmodel = Sequential()\n",
+ "cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(MaxPooling1D(5))\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(MaxPooling1D(5))\n",
+ "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n",
+ "cnnmodel.add(GlobalMaxPooling1D())\n",
+ "cnnmodel.add(Dense(128, activation='relu'))\n",
+ "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n",
+ "\n",
+ "cnnmodel.compile(loss='categorical_crossentropy',\n",
+ " optimizer='rmsprop',\n",
+ " metrics=['acc'])\n",
+ "#Train the model. Tune to validation set. \n",
+ "cnnmodel.fit(x_train, y_train,\n",
+ " batch_size=128,\n",
+ " epochs=1, validation_data=(x_val, y_val))\n",
+ "#Evaluate on test set:\n",
+ "score, acc = cnnmodel.evaluate(test_data, test_labels)\n",
+ "print('Test accuracy with CNN:', acc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "6GwhXpmSgt4H"
+ },
+ "source": [
+ "### LSTM Model with training your own embedding "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "SvBt2Brib5_4",
+ "outputId": "1436903e-66f2-4add-d04b-143e551c534d"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "Eymx0IyCb5_-",
- "colab_type": "code",
- "colab": {},
- "outputId": "7d665d21-4f34-46bb-ac3a-2fab1055268b"
- },
- "source": [
- "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n",
- "\n",
- "rnnmodel2 = Sequential()\n",
- "rnnmodel2.add(embedding_layer)\n",
- "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n",
- "rnnmodel2.add(Dense(2, activation='sigmoid'))\n",
- "rnnmodel2.compile(loss='binary_crossentropy',\n",
- " optimizer='adam',\n",
- " metrics=['accuracy'])\n",
- "print('Training the RNN')\n",
- "\n",
- "rnnmodel2.fit(x_train, y_train,\n",
- " batch_size=32,\n",
- " epochs=1,\n",
- " validation_data=(x_val, y_val))\n",
- "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n",
- " batch_size=32)\n",
- "print('Test accuracy with RNN:', acc)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Defining and training an LSTM model, using pre-trained embedding layer\n",
- "Training the RNN\n",
- "Train on 20000 samples, validate on 5000 samples\n",
- "Epoch 1/1\n",
- "20000/20000 [==============================] - 361s 18ms/step - loss: 0.6102 - acc: 0.6615 - val_loss: 0.5131 - val_acc: 0.7766\n",
- "25000/25000 [==============================] - 112s 4ms/step\n",
- "Test accuracy with RNN: 0.77078\n"
- ],
- "name": "stdout"
- }
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Defining and training an LSTM model, training embedding layer on the fly\n",
+ "WARNING:tensorflow:From D:\\Anaconda\\lib\\site-packages\\tensorflow\\python\\ops\\nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
+ "Instructions for updating:\n",
+ "Use tf.where in 2.0, which has the same broadcast rule as np.where\n",
+ "Training the RNN\n",
+ "Train on 20000 samples, validate on 5000 samples\n",
+ "Epoch 1/1\n",
+ "20000/20000 [==============================] - 445s 22ms/step - loss: 0.4829 - accuracy: 0.7736 - val_loss: 0.4104 - val_accuracy: 0.8264\n",
+ "25000/25000 [==============================] - 135s 5ms/step\n",
+ "Test accuracy with RNN: 0.8212599754333496\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n",
+ "\n",
+ "#model\n",
+ "rnnmodel = Sequential()\n",
+ "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n",
+ "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n",
+ "rnnmodel.add(Dense(2, activation='sigmoid'))\n",
+ "rnnmodel.compile(loss='binary_crossentropy',\n",
+ " optimizer='adam',\n",
+ " metrics=['accuracy'])\n",
+ "print('Training the RNN')\n",
+ "\n",
+ "rnnmodel.fit(x_train, y_train,\n",
+ " batch_size=32,\n",
+ " epochs=1,\n",
+ " validation_data=(x_val, y_val))\n",
+ "score, acc = rnnmodel.evaluate(test_data, test_labels,\n",
+ " batch_size=32)\n",
+ "print('Test accuracy with RNN:', acc)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "tJYzsZFSg9z-"
+ },
+ "source": [
+ "### LSTM Model using pre-trained Embedding Layer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "Eymx0IyCb5_-",
+ "outputId": "7d665d21-4f34-46bb-ac3a-2fab1055268b"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "metadata": {
- "id": "6tKQaeHxb6AB",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- ""
- ],
- "execution_count": 0,
- "outputs": []
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Defining and training an LSTM model, using pre-trained embedding layer\n",
+ "Training the RNN\n",
+ "Train on 20000 samples, validate on 5000 samples\n",
+ "Epoch 1/1\n",
+ "20000/20000 [==============================] - 349s 17ms/step - loss: 0.6123 - accuracy: 0.6603 - val_loss: 0.4820 - val_accuracy: 0.7859\n",
+ "25000/25000 [==============================] - 131s 5ms/step\n",
+ "Test accuracy with RNN: 0.7855600118637085\n"
+ ]
}
- ]
-}
\ No newline at end of file
+ ],
+ "source": [
+ "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n",
+ "\n",
+ "rnnmodel2 = Sequential()\n",
+ "rnnmodel2.add(embedding_layer)\n",
+ "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n",
+ "rnnmodel2.add(Dense(2, activation='sigmoid'))\n",
+ "rnnmodel2.compile(loss='binary_crossentropy',\n",
+ " optimizer='adam',\n",
+ " metrics=['accuracy'])\n",
+ "print('Training the RNN')\n",
+ "\n",
+ "rnnmodel2.fit(x_train, y_train,\n",
+ " batch_size=32,\n",
+ " epochs=1,\n",
+ " validation_data=(x_val, y_val))\n",
+ "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n",
+ " batch_size=32)\n",
+ "print('Test accuracy with RNN:', acc)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "DeepNN_Example.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}