From d33d378887658f78f6bb0ed9e1f83e9a67fac4c5 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Sun, 13 Aug 2023 08:06:39 +0000 Subject: [PATCH 01/14] Updated Ch4/01_OnePipeline_ManyClassifiers.ipynb by removing errors and updating libraries --- Ch4/01_OnePipeline_ManyClassifiers.ipynb | 191 +++++++++++------------ 1 file changed, 92 insertions(+), 99 deletions(-) diff --git a/Ch4/01_OnePipeline_ManyClassifiers.ipynb b/Ch4/01_OnePipeline_ManyClassifiers.ipynb index 87ea672..0c99c5f 100644 --- a/Ch4/01_OnePipeline_ManyClassifiers.ipynb +++ b/Ch4/01_OnePipeline_ManyClassifiers.ipynb @@ -37,30 +37,43 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting scikit-learn==0.21.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9f/c5/e5267eb84994e9a92a2c6a6ee768514f255d036f3c8378acfa694e9f2c99/scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7MB)\n", - "\u001b[K |████████████████████████████████| 6.7MB 3.0MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already 
satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Installing collected packages: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n", - "Requirement already satisfied: matplotlib==3.2.2 in /usr/local/lib/python3.7/dist-packages (3.2.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.3.1)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.19.5)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.4.7)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (0.10.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib==3.2.2) (1.15.0)\n" + "Requirement already satisfied: numpy in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.24.3)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: pandas in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (2.0.3)\n", + "Requirement already satisfied: tzdata>=2022.1 in 
/root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: numpy>=1.20.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (1.24.3)\n", + "Requirement already satisfied: pytz>=2020.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: scikit-learn in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.3.0)\n", + "Requirement already satisfied: numpy>=1.17.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.24.3)\n", + "Requirement already satisfied: scipy>=1.5.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.11.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (3.2.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> 
\u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: matplotlib in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (3.7.2)\n", + "Requirement already satisfied: numpy>=1.20 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.24.3)\n", + "Requirement already satisfied: packaging>=20.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: cycler>=0.10 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: pillow>=6.2.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (10.0.0)\n", + "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (3.0.9)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: importlib-resources>=3.2.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (6.0.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.4.4)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (4.42.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.1.0)\n", + "Requirement already satisfied: zipp>=3.1.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) 
(3.16.2)\n", + "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -69,10 +82,10 @@ "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install scikit-learn==0.21.3\n", - "!pip install matplotlib==3.2.2\n", + "!pip install numpy\n", + "!pip install pandas\n", + "!pip install scikit-learn\n", + "!pip install matplotlib\n", "\n", "# ===========================" ] @@ -118,7 +131,7 @@ "\n", "# import feature extraction methods from sklearn\n", "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction import stop_words\n", + "from sklearn.feature_extraction import _stop_words # This Module has become private after sklearn 0.24 thus stop_words changed to _stop_words\n", "\n", "# pre-processing of text\n", "import string\n", @@ -159,29 +172,7 @@ "id": "fVD8N_E51lk7", "outputId": "b5893f5e-1123-43f7-d3a5-2e4fb92bfdc9" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-07-16 08:09:13-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 12383529 (12M) [text/plain]\n", - "Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’\n", - "\n", - "Full-Economic-News- 100%[===================>] 11.81M 22.9MB/s in 0.5s \n", - "\n", - "2021-07-16 08:09:14 (22.9 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]\n", - "\n", - "total 12M\n", - "drwxr-xr-x 2 root root 4.0K Jul 16 08:09 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:09 ..\n", - "-rw-r--r-- 1 root root 12M Jul 16 08:09 Full-Economic-News-DFE-839861.csv\n" - ] - } - ], + "outputs": [], "source": [ "try:\n", " from google.colab import files\n", @@ -211,24 +202,21 @@ "(8000, 15)" ] }, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ + "relevance\n", "no 0.821375\n", "yes 0.177500\n", "not sure 0.001125\n", - "Name: relevance, dtype: float64" + "Name: count, dtype: float64" ] }, "execution_count": 5, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -264,9 +252,7 @@ ] }, "execution_count": 6, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -305,7 +291,7 @@ }, "outputs": [], "source": [ - "stopwords = stop_words.ENGLISH_STOP_WORDS\n", + "stopwords = _stop_words.ENGLISH_STOP_WORDS\n", "def clean(doc): # doc is a string of text\n", " doc = doc.replace(\"
\", \" \") # This text contains a lot of
tags.\n", " doc = \"\".join([char for char in doc if char not in string.punctuation and not char.isdigit()])\n", @@ -412,8 +398,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 14 ms, sys: 994 µs, total: 14.9 ms\n", - "Wall time: 15.2 ms\n" + "CPU times: user 35.4 ms, sys: 0 ns, total: 35.4 ms\n", + "Wall time: 48.9 ms\n" ] } ], @@ -446,15 +432,12 @@ }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light", - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -537,22 +520,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 7.05 ms, sys: 7 µs, total: 7.06 ms\n", - "Wall time: 7.13 ms\n", + "CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms\n", + "Wall time: 14.6 ms\n", "Accuracy: 0.6876876876876877\n" ] }, { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsQAAAJnCAYAAACQ3UXDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB+OElEQVR4nO3dd3gU5dfG8Xs3vZBQ0iiB0HsvoYN0QZqCgCgEBURBVKSIhaogNhBEsFFFBUXxpyCCkd47gtK7ECC0hJaQ7Lx/8GZlSbImISFh9/vxmksy88yzZzaEnJycecZkGIYhAAAAwEmZszsAAAAAIDuREAMAAMCpkRADAADAqZEQAwAAwKmREAMAAMCpkRADAADAqZEQAwAAwKmREAMAAMCpkRADAADAqZEQAznMwYMH1aJFC/n7+8tkMmnRokWZOv+xY8dkMpk0a9asTJ3XEYSFhSkiIiJbYxg1apRMJlO6xkZHR2dxVMiohIQEDR06VKGhoTKbzerQoUN2hwQgBSTEQAoOHz6sZ599VsWKFZOnp6f8/PxUr149ffTRR7px40aWvnbPnj31559/6u2339bcuXNVo0aNLH09R/TXX39p1KhROnbsWHaHkinGjRuX6T8YPUhMJlOK2zvvvJNs7D///KPHH39cuXPnlp+fn9q3b68jR46kOO+XX36psmXLytPTUyVLltSUKVNSHJeeOe82Y8YMvffee+rUqZNmz56tl19+Oe0Xng5ff/21Jk2alCVzA87AZBiGkd1BADnJ4sWL1blzZ3l4eKhHjx6qUKGC4uPjtXbtWi1cuFARERH67LPPsuS1b9y4IW9vb73++ut66623suQ1DMNQXFyc3Nzc5OLikiWvkd2+//57de7cWStWrFDjxo3TfF5cXJzMZrPc3NyyLrj/kJCQoISEBHl6elr3+fr6qlOnTsmq+qNGjdLo0aN1/vx5BQQE3OdI7x+TyaTmzZurR48eNvurVq2q8uXLWz++evWqqlWrpitXruiVV16Rm5ubJk6cKMMwtHPnTuXLl8869tNPP1W/fv302GOPqWXLllqzZo3mzp2rd955R8OGDcvQnCnp2rWr1q5dq1OnTmXSu5GyRx55RHv27HGYHwKB+801uwMAcpKjR4+qa9euKlKkiP744w/lz5/feqx///46dOiQFi9enGWvf/78eUlS7ty5s+w1TCaTTbLl7AzD0M2bN+Xl5SUPD4/sDkeurq5ydeWf5ruVKlVKTz75pN0xn3zyiQ4ePKjNmzerZs2akqSHH35YFSpU0AcffKBx48ZJuv2D5+uvv642bdro+++/lyT16dNHFotFY8eOVd++fZUnT550zZmac+fOZenXc1a7fv26vL29szsMIOsZAKz69etnSDLWrVuXpvG3bt0yxowZYxQrVsxwd3c3ihQpYgwfPty4efOmzbgiRYoYbdq0MdasWWPUrFnT8PDwMIoWLWrMnj3bOmbkyJGGJJutSJEihmEYRs+ePa1/vlPSOXdatmyZUa9ePcPf39/w8fExSpUqZQwfPtx6/OjRo4YkY
+bMmTbnRUZGGvXr1ze8vb0Nf39/o127dsZff/2V4usdPHjQ6Nmzp+Hv72/4+fkZERERxrVr1/7z/WrUqJFRvnx5Y9euXUbDhg0NLy8vo3jx4sZ3331nGIZhrFy50qhVq5bh6elplCpVyli+fLnN+ceOHTOee+45o1SpUoanp6eRN29eo1OnTsbRo0etY2bOnJnsfZRkrFixwuZzsXTpUqN69eqGh4eHMXHiROuxnj17GoZhGBaLxWjcuLEREBBgnD171jp/XFycUaFCBaNYsWLG1atXU7xOi8Vi5MuXz3j55Zet+xITEw1/f3/DbDYbly5dsu5/5513DBcXFyM2NtbmPU6S0rUkxfggfD7u5X288z3o37+/cf36dePGjRupjqtZs6ZRs2bNZPtbtGhhFC9e3Prx4sWLDUnG4sWLbcatX7/ekGTMnTs33XPeLenrLLW/h4mJicbEiRONcuXKGR4eHkZQUJDRt29f4+LFizbzLFq0yGjdurWRP39+w93d3ShWrJgxZswYIyEhwTqmUaNGqf7bkfT1cOfnxDAMY8WKFTbxJM1Tvnx5Y+vWrUaDBg0MLy8v48UXXzQMwzBu3rxpjBgxwihevLjh7u5uFCpUyBgyZEiyf+v+698fIKeihxi4w88//6xixYqpbt26aRrfu3dvjRgxQtWqVdPEiRPVqFEjjR8/Xl27dk029tChQ+rUqZOaN2+uDz74QHny5FFERIT27t0rSXr00Uc1ceJESVK3bt00d+7cdPcE7t27V4888oji4uI0ZswYffDBB2rXrp3WrVtn97zff/9dLVu21Llz5zRq1CgNGjRI69evV7169VL8Fezjjz+u2NhYjR8/Xo8//rhmzZql0aNHpynGS5cu6ZFHHlF4eLjeffddeXh4qGvXrpo/f766du2q1q1b65133tG1a9fUqVMnxcbGWs/dsmWL1q9fr65du2ry5Mnq16+fIiMj1bhxY12/fl2S1LBhQw0cOFCS9Nprr2nu3LmaO3euypYta51n//796tatm5o3b66PPvpIVapUSRanyWTSjBkzdPPmTfXr18+6f+TIkdq7d69mzpwpHx+fFK/RZDKpXr16Wr16tXXf7t27deXKFUmy+XysWbNGVatWla+vb4pzzZ07Vx4eHmrQoIH1Wp599lmbMTn583Ev7+OdZs2aJR8fH3l5ealcuXL6+uuvbY5bLBbt3r07xZ77WrVq6fDhw9bYd+zYIUnJxlavXl1ms9l6PD1z3i0wMFBz585VmTJlVKhQoWR/D5999lkNGTLEem9Cr169NG/ePLVs2VK3bt2yuW5fX18NGjRIH330kapXr64RI0bo1VdftY55/fXXVaVKFQUEBFhfJ6P9xBcuXNDDDz+sKlWqaNKkSXrooYdksVjUrl07vf/++2rbtq2mTJmiDh06aOLEierSpYv13Iz++wPkCNmdkQM5xZUrVwxJRvv27dM0fufOnYYko3fv3jb7Bw8ebEgy/vjjD+u+IkWKGJKM1atXW/edO3fO8PDwMF555RXrvqSq0nvvvWczZ1orxBMnTjQkGefPn0817pQqxFWqVDGCgoKMCxcuWPft2rXLMJvNRo8ePZK93tNPP20zZ8eOHY18+fKl+ppJkipZX3/9tXXfvn37DEmG2Ww2Nm7caN3/22+/JYvz+vXryebcsGGDIcmYM2eOdd93332XrPqVJOlzsXTp0hSPJVVfk3z66aeGJOOrr74yNm7caLi4uBgvvfTSf17re++9Z7i4uBgxMTGGYRjG5MmTjSJFihi1atUyhg0bZhjG7Sph7ty5bSrJKVX9fXx8ksV159ic/vkwjIy/j4ZhGHXr1jUmTZpk/PTTT8a0adOMChUqGJKMTz75xDrm/PnzhiRjzJgxyc6fOnWqIcnYt2+fYRiG0b9/f8PFxSXF1woMDDS6du2a7jlTk1R1vdOaNWsMSca8efNs9i9dujTZ/pTe42effdbw9va2qc62adMmxX8j0lshlmRMn
z7dZuzcuXMNs9lsrFmzxmb/9OnTbX6jlpZ/f4Ccigox8P9iYmIkSbly5UrT+CVLlkiSBg0aZLP/lVdekaRkvcblypVTgwYNrB8HBgaqdOnSab5bPS2SehV/+uknWSyWNJ1z5swZ7dy5UxEREcqbN691f6VKldS8eXPrdd7pzkqfJDVo0EAXLlywvof2+Pr62lTQS5curdy5c6ts2bIKDw+37k/6853vj5eXl/XPt27d0oULF1SiRAnlzp1b27dvT8PV3la0aFG1bNkyTWP79u2rli1b6oUXXtBTTz2l4sWL/2ffqHT7PUlMTNT69esl3a4EN2jQQA0aNNCaNWskSXv27NHly5dt/l5kxIPw+cjo+yjdrqi/+OKLateunfr166dt27apQoUKeu2116yrviT9P6U+8KSe+TvHuru7p/hanp6eGZozPb777jv5+/urefPmio6Otm7Vq1eXr6+vVqxYYR1753scGxur6OhoNWjQQNevX9e+ffvS/dr/xcPDQ7169UoWb9myZVWmTBmbeJs0aSJJ1ngz8u8PkFOQEAP/z8/PT5JS/RXo3Y4fPy6z2awSJUrY7A8JCVHu3Ll1/Phxm/2FCxdONkeePHl06dKlDEacXJcuXVSvXj317t1bwcHB6tq1qxYsWGD3m1NSnKVLl052rGzZsoqOjta1a9ds9t99LUk3IKXlWgoVKpRsnV1/f3+FhoYm23f3nDdu3NCIESMUGhoqDw8PBQQEKDAwUJcvX7a2I6RF0aJF0zxWur081/Xr13Xw4EHNmjXLJklJTbVq1eTt7W1NfpMS4oYNG2rr1q26efOm9Vj9+vXTFc/dHpTPR0bex5S4u7trwIABunz5srZt2ybp38QxLi4u2fibN2/ajPHy8lJ8fHyKcyfdYJneOdPj4MGDunLlioKCghQYGGizXb16VefOnbOO3bt3rzp27Ch/f3/5+fkpMDDQenNhev7Op1XBggWT/bBw8OBB7d27N1mspUqVkiRrvBn59wfIKbiVGfh/fn5+KlCggPbs2ZOu89L6EIXUljgz0rDyYWqvkZiYaPOxl5eXVq9erRUrVmjx4sVaunSp5s+fryZNmmjZsmWZtszavVxLauemZc4XXnhBM2fO1EsvvaQ6depYH17StWvXdH3TTW8Ss3LlSmtS9Oeff6pOnTr/eY6bm5vCw8O1evVqHTp0SFFRUWrQoIGCg4N169Ytbdq0SWvWrFGZMmUUGBiYrnju9qB8PjLyPqYmKWG/ePGiJClv3rzy8PDQmTNnko1N2legQAFJUv78+ZWYmKhz584pKCjIOi4+Pl4XLlywjkvPnOlhsVgUFBSkefPmpXg86e/D5cuX1ahRI/n5+WnMmDEqXry4PD09tX37dg0bNixNf+fT+m9HkpS+NiwWiypWrKgPP/wwxXOSPhf3698fICuQEAN3eOSRR/TZZ59pw4YN//nNukiRIrJYLDp48KDNDVtnz57V5cuXVaRIkUyLK0+ePLp8+XKy/XdXoSXJbDaradOmatq0qT788EONGzdOr7/+ulasWKFmzZqleB3S7RvN7rZv3z4FBASk6aan++H7779Xz5499cEHH1j33bx5M9l7k9YfUtLizJkzeuGFF9SiRQu5u7tr8ODBatmyZZo+vw0aNNCECRP0+++/KyAgQGXKlJHJZFL58uW1Zs0arVmzRo888sh/zpOZ15OZ0vr5kO7tfUxJUutGUvJoNptVsWJFbd26NdnYTZs2qVixYtZ2qKSbKLdu3arWrVtbx23dulUWi8V6PD1zpkfx4sX1+++/q169enZ/OFu5cqUuXLigH374QQ0bNrTuP3r0aLKxqf0dSfptwd2fk5T+7bAX765du9S0adP//LuY3n9/gJyClgngDkOHDpWPj4969+6ts2fPJjt++PBhffTRR5Jk/UZ6993cSVWUNm3aZFpcxYsX15UrV7R7927rvjNnzujHH3+0GZdULbtT0jf3lH7tK92ullWpUkWzZ8+2+
aa5Z88eLVu2zCZhyG4uLi7Jqp5TpkxJVu1KSuBTSszSK2l92i+//FKfffaZXF1d9cwzz6Sp+tqgQQPFxcVp0qRJql+/vjWZSFox4vTp02nqH/bx8cmUa8lsaf18SBl/H5PW5r5TbGysJk2apICAAFWvXt26v1OnTtqyZYtNArt//3798ccf6ty5s3VfkyZNlDdvXk2bNs1m3mnTpsnb29vmazetc6bH448/rsTERI0dOzbZsYSEBOvnOqmieud7FB8fr08++STZeT4+Pim2UBQvXlySbFY8SUxMTNfDhR5//HH9888/+vzzz5Mdu3HjhrWlKiP//gA5BRVi4A7FixfX119/rS5duqhs2bI2T6pbv369vvvuO0VEREiSKleurJ49e+qzzz6z/mpz8+bNmj17tjp06KCHHnoo0+Lq2rWrhg0bpo4dO2rgwIG6fv26pk2bplKlStncvDRmzBitXr1abdq0UZEiRXTu3Dl98sknKlSokN0+1ffee08PP/yw6tSpo2eeeUY3btzQlClT5O/vr1GjRmXaddyrRx55RHPnzpW/v7/KlSunDRs26Pfff0/2tLAqVarIxcVFEyZM0JUrV+Th4aEmTZrY/Ho8LWbOnKnFixdr1qxZKlSokKTbCd+TTz6padOm6fnnn7d7fp06deTq6qr9+/erb9++1v0NGza0JmNpSYirV6+u33//XR9++KEKFCigokWL2tzwll3S+vm4l/dx6tSpWrRokdq2bavChQvrzJkzmjFjhk6cOKG5c+fa9Ls+//zz+vzzz9WmTRsNHjxYbm5u+vDDDxUcHGy92VW6/av9sWPHqn///urcubP1SXVfffWV3n77bZubS9M6Z3o0atRIzz77rMaPH6+dO3eqRYsWcnNz08GDB/Xdd9/po48+UqdOnVS3bl3lyZNHPXv21MCBA2UymTR37twUf4ioXr265s+fr0GDBqlmzZry9fVV27ZtVb58edWuXVvDhw/XxYsXlTdvXn377bdKSEhIc7xPPfWUFixYoH79+mnFihWqV6+eEhMTtW/fPi1YsEC//fabatSokeF/f4AcIZtWtwBytAMHDhh9+vQxwsLCDHd3dyNXrlxGvXr1jClTptgsdXTr1i1j9OjRRtGiRQ03NzcjNDTU7oM57taoUSOjUaNG1o9TW3bNMG4veF+hQgXD3d3dKF26tPHVV18lW6IrMjLSaN++vVGgQAHD3d3dKFCggNGtWzfjwIEDyV7j7gdz/P7770a9evUMLy8vw8/Pz2jbtm2qD+a4e1ml1JZ2Sul6716CyjBSf3/0/w9kSHLp0iWjV69eRkBAgOHr62u0bNnS2LdvX4rLpX3++edGsWLFDBcXlxQfzJGSO+c5efKk4e/vb7Rt2zbZuI4dOxo+Pj7GkSNH7F6vYdx+sIMkY9OmTdZ9p06dMiQZoaGhycantOzavn37rA/OUAoP5sjJn497fR+XLVtmNG/e3AgJCTHc3NyM3LlzGy1atDAiIyNTHH/y5EmjU6dOhp+fn+Hr62s88sgjxsGDB1Mc+9lnnxmlS5c23N3djeLFixsTJ040LBbLPc15t9Te46TXr169uuHl5WXkypXLqFixojF06FDj9OnT1jHr1q0zateubXh5eRkFChQwhg4dal0C784l065evWo88cQTRu7cuW0ezGEYhnH48GGjWbNmhoeHhxEcHGy89tprxvLly1N9MEdK4uPjjQkTJhjly5c3PDw8jDx58hjVq1c3Ro8ebVy5csUwjLT9+wPkVCbDSMPv/QAAAAAHRQ8xAAAAnBoJMQAAAJwaCTEAAACcGgkxAAAAnBoJMQAAAJwaCTEAAACcGg/mQIosFotOnz6tXLly5djHxgIAkBMZhqHY2FgVKFBAZnP21x5v3ryp+Pj4LJnb3d1dnp6eWTL3/URCjBSdPn1aoaGh2R0GAAAPrJMnT1qfzphdbt68Ka9c+aSE61kyf0hIiI4ePfrAJ8UkxEhRrly5JEnu5XrK5
OL+H6MB3G8nVr6f3SEASEVsTIxKFA21fi/NTvHx8VLCdXmU6yll9vfzxHhF/TVb8fHxJMRwTEltEiYXdxJiIAfy8/PL7hAA/Icc1XLo6pnp388NU/a3g2QWEmIAAABHZ5KU2Ql6Dsr375XjpPYAAABABlAhBgAAcHQm8+0ts+d0EI5zJQAAAEAGUCEGAABwdCZTFvQQO04TMRViAAAAODUqxAAAAI6OHmK7HOdKAAAAgAygQgwAAODo6CG2i4QYAADA4WVBy4QDNRo4zpUAAAAAGUCFGAAAwNHRMmEXFWIAAAA4NSrEAAAAjo5l1+xynCsBAAAAMoAKMQAAgKOjh9guKsQAAABwalSIAQAAHB09xHY5zpUAAAAAGUCFGAAAwNHRQ2wXCTEAAICjo2XCLse5EgAAACADqBADAAA4OpMpCyrEjtMyQYUYAAAATo0KMQAAgKMzm25vmT2ng6BCDAAAAKdGhRgAAMDRscqEXY5zJQAAAEAGUCEGAABwdDyYwy4SYgAAAEdHy4RdjnMlAAAAQAZQIQYAAHB0tEzYRYUYAAAATo0KMQAAgKOjh9gux7kSAAAAIAOoEAMAADg6eojtokIMAAAAp0aFGAAAwNHRQ2yX41wJAAAAkAFUiAEAABwdPcR2kRADAAA4vCxomXCgRgPHuRIAAAAgA6gQAwAAODpaJuyiQgwAAACnRoUYAADA0ZlMWbDsGhViAAAAwCFQIQYAAHB0PJjDLse5EgAAAOR4U6dOVVhYmDw9PRUeHq7NmzfbHX/58mX1799f+fPnl4eHh0qVKqUlS5ZYj48aNUomk8lmK1OmTLpiokIMAADg6HLIKhPz58/XoEGDNH36dIWHh2vSpElq2bKl9u/fr6CgoGTj4+Pj1bx5cwUFBen7779XwYIFdfz4ceXOndtmXPny5fX7779bP3Z1TV+KS0IMAADg6HJIy8SHH36oPn36qFevXpKk6dOna/HixZoxY4ZeffXVZONnzJihixcvav369XJzc5MkhYWFJRvn6uqqkJCQdMeThJYJAAAAZFhMTIzNFhcXl+K4+Ph4bdu2Tc2aNbPuM5vNatasmTZs2JDiOf/73/9Up04d9e/fX8HBwapQoYLGjRunxMREm3EHDx5UgQIFVKxYMXXv3l0nTpxI1zWQEAMAADi6pJaJzN4khYaGyt/f37qNHz8+xRCio6OVmJio4OBgm/3BwcGKiopK8ZwjR47o+++/V2JiopYsWaI333xTH3zwgd566y3rmPDwcM2aNUtLly7VtGnTdPToUTVo0ECxsbFpfntomQAAAECGnTx5Un5+ftaPPTw8Mm1ui8WioKAgffbZZ3JxcVH16tX1zz//6L333tPIkSMlSQ8//LB1fKVKlRQeHq4iRYpowYIFeuaZZ9L0OiTEAAAAji4Le4j9/PxsEuLUBAQEyMXFRWfPnrXZf/bs2VT7f/Pnzy83Nze5uLhY95UtW1ZRUVGKj4+Xu7t7snNy586tUqVK6dChQ2m+FFomAAAAkOXc3d1VvXp1RUZGWvdZLBZFRkaqTp06KZ5Tr149HTp0SBaLxbrvwIEDyp8/f4rJsCRdvXpVhw8fVv78+dMcGwkxAACAo8vCHuL0GDRokD7//HPNnj1bf//9t5577jldu3bNuupEjx49NHz4cOv45557ThcvXtSLL76oAwcOaPHixRo3bpz69+9vHTN48GCtWrVKx44d0/r169WxY0e5uLioW7duaY6LlgkAAADcF126dNH58+c1YsQIRUVFqUqVKlq6dKn1RrsTJ07IbP63XhsaGqrffvtNL7/8sipVqqSCBQvqxRdf1LBhw6xjTp06pW7duunChQsKDAxU/fr1tXHjRgUGBqY5LpNhGEbmXSYcRUxMjPz9/eVRsY9MLin/SgJA9rm05ePsDgFAKmJiYhScz19XrlxJU29tVsfi7+8vz7Yfy+TmlalzG7du6ObPA3LEdd4rKsQAAAAOLumRxpk8aebOl43oIQYAA
IBTo0IMAADg6Ez/v2X2nA6CCjEAAACcGhViAAAAB0cPsX1UiAEAAODUqBADAAA4OCrE9lEhBgAAgFOjQgwAAODgqBDbR4UYAAAATo0KMQAAgIOjQmwfCTEAAICj48EcdtEyAQAAAKdGhRgAAMDB0TJhHxViAAAAODUqxAAAAA7OZFIWVIgzd7rsRIUYAAAATo0KMQAAgIMzKQt6iB2oREyFGAAAAE6NCjEAAICDY5UJ+0iIAQAAHB0P5rCLlgkAAAA4NSrEAAAAji4LWiYMB2qZoEIMAAAAp0aFGAAAwMFlxU11mb+MW/ahQgwAAACnRoUYAADAwVEhto8KMQAAAJwaFWIAAABHxzrEdlEhBgAAgFOjQgwAAODg6CG2j4QYAADAwZEQ20fLBAAAAJwaFWIAAAAHR4XYPirEAAAAcGpUiAEAABwcFWL7qBADAADAqVEhBgAAcHQ8mMMuKsQAAABwalSIAQAAHBw9xPaREAMAADg4EmL7aJkAAACAU6NCDAAA4OCoENtHhRgAAABOjQoxAACAo2PZNbuoEAMPqGcfb6h9i0fr0saJWj1nsGqUL5Lq2N8+f1E3dnycbPthcj+bcW8+10ZHlr2tixs+1OLpA1S8cGBWXwbgkKZ/MlWlS4Qpt6+nGtQN15bNm1Mdu+jHH1QvvIZCAnIrn7+PwqtX0ddfzbUZYxiGxowaoaKh+ZUnl5dat2ymQwcPZvVlAE6DhBh4AHVqUU0TXumotz/9VXWemKDdB/7R/z7pr8A8vimO7/rK5wprNty6VXvsLSUkJOqH5TusY16JaKbnuzXSwHHfqmGP93XtRrx+ntpfHu78IglIj+8WzNewIYP0+hsjtWHzdlWqVFnt2rTUuXPnUhyfN29eDR3+ulau2aAt23frqZ691Ld3Ly1f9pt1zAfvv6tPPp6syVOna/W6TfLx8VHbNi118+bN+3VZeMAl9RBn9uYoSIiBB9DAJ5to5g/rNfd/G7XvSJReePtb3bgZr54d6qQ4/lLMdZ29EGvdmtYuo+s3420S4v5PPKQJn/+mX1b+qT0HT6v3m3OUP9Bf7R6qfL8uC3AIkyd9qF7P9FGPiF4qW66cpnwyXV7e3po9a0aK4xs2aqz2HTqqTNmyKla8uAYMfFEVK1bS+nVrJd2uDk+dPEnDXntDbdu1V8VKlfTFzDk6c/q0/vfTovt4ZYDjIiEGHjBuri6qWjZUf2zab91nGIb+2LRftSoVTdMcPTvU1Xe/bdf1m/GSpLCC+ZQ/0F9/bNpnHRNz9aa27Dmm8EphmRo/4Mji4+O1Y/s2NWnazLrPbDarSZNm2rxxw3+ebxiGVvwRqQMH9qt+g4aSpGNHjyoqKkpNmvw7p7+/v2rWCtemNMwJSDmrQjx16lSFhYXJ09NT4eHh2mynpUiSLl++rP79+yt//vzy8PBQqVKltGTJknua824kxMADJiCPr1xdXXTuYqzN/nMXYhSSz+8/z69RvogqlCygWT+ut+4LCbh9XvI5YxWchjkB3BYdHa3ExEQFBQXb7A8KDlZUVFSq5125ckUBuX3l5+2uju3a6MNJU9S0WXNJsp4XFJx8zrNnU58TuJNJWZAQZ+Cuuvnz52vQoEEaOXKktm/frsqVK6tly9RbiuLj49W8eXMdO3ZM33//vfbv36/PP/9cBQsWzPCcKSEhzkSjRo1SlSpVsjsMwK6eHerozwP/aOve49kdCoD/lytXLm3aulNrN2zRqLFva9iQQVq9amV2hwVkug8//FB9+vRRr169VK5cOU2fPl3e3t6aMSPllqIZM2bo4sWLWrRokerVq6ewsDA1atRIlStXzvCcKcmRCXFERIRMJpPeeecdm/2LFi1Kd3k+LCxMkyZNStO4pJ94vL29VbFiRX3xxRfpeq2cikTdsURfuqqEhEQF5c1lsz8on5+iLsTYPdfb012dW1bX7EW2v2aNir59XvI5c+nsf8wJ4F8BAQFycXHRuXNnbfafO3tWISEhqZ5nN
ptVvEQJVa5SRS+9/Io6PtpJ700YL0nW886dTT5ncHDqcwJ3ysqWiZiYGJstLi4uxRji4+O1bds2NWtm21LUrFkzbdiQcvvP//73P9WpU0f9+/dXcHCwKlSooHHjxikxMTHDc6YkRybEkuTp6akJEybo0qVL9+01x4wZozNnzmjPnj168skn1adPH/3666/37fWBtLiVkKgdf5/UQ+GlrftMJpMeqlVKm3cftXvuo82rysPdVd8s2WKz/9g/F3Tm/BWbOXP5eKpmhTBt2n0sU+MHHJm7u7uqVquuFX9EWvdZLBatWBGpWrVTvuk1JRaLxZpUhBUtqpCQEK1Y8e+cMTEx2rJ5k8LTMSeQVUJDQ+Xv72/dxo8fn+K4pJai4Lvaf4LttBQdOXJE33//vRITE7VkyRK9+eab+uCDD/TWW29leM6U5NiEuFmzZgoJCUn1TU2ycOFClS9fXh4eHgoLC9MHH3xgPda4cWMdP35cL7/8cpqav3PlyqWQkBAVK1ZMw4YNU968ebV8+XLr8cuXL6t3794KDAyUn5+fmjRpol27dtmd84svvlDZsmXl6empMmXK6JNPPrEeq1u3roYNG2Yz/vz583Jzc9Pq1aslSXPnzlWNGjWssT3xxBM2PTErV66UyWRSZGSkatSoIW9vb9WtW1f799++4WrWrFkaPXq0du3aZX0PZs2aZTdm5HyTv/pDvTrWVfe24SpdNFiTX+siby8PzflpoyTpi7FPacwL7ZKdF9Ghjn5euVsXr1xLdmzq1ys0rHcrtWlUUeVLFNCXY5/SmfNX9L8V9v+OA7A18KVBmvnl5/pqzmzt+/tvDez/nK5fu6YePXtJkp6J6KE3Xx9uHf/ehPGK/H25jh45on1//61JEz/Q1/PmqtsTT0q6/QNv/4EvacK4t/TLz//Tnj//1DO9eih/gQJq175DdlwiHkSmLNoknTx5UleuXLFuw4f/+/f7XlksFgUFBemzzz5T9erV1aVLF73++uuaPn16pr2GlIOfVOfi4qJx48bpiSee0MCBA1WoUKFkY7Zt26bHH39co0aNUpcuXbR+/Xo9//zzypcvnyIiIvTDDz+ocuXK6tu3r/r06ZPm17ZYLPrxxx916dIlubu7W/d37txZXl5e+vXXX+Xv769PP/1UTZs21YEDB5Q3b95k88ybN08jRozQxx9/rKpVq2rHjh3q06ePfHx81LNnT3Xv3l3vvvuu3nnnHWuyPn/+fBUoUEANGjSQJN26dUtjx45V6dKlde7cOQ0aNEgRERHJ7q58/fXX9cEHHygwMFD9+vXT008/rXXr1qlLly7as2ePli5dqt9//13S7buT7xYXF2fzK46YGH5NnpN9v2y7AvL4asRzbRScL5d27/9H7ftPtd4UFxqSVxaLYXNOySJBqlethNr0+zjFOT+Y9bu8vTz08RvdlDuXl9bvPKx2/T9RXHxCll8P4Eg6P95F0efPa8zoETobFaVKlavop1+WWitYJ0+ekNn8bz3q2rVrevGF5/XPqVPy8vJSqdJlNGP2V+r8eBfrmFcGD9X1a9c04Lm+unz5surWq6///bJUnp6e9/36gLv5+fnJz++/b8BOaik6e1f7z1k7LUX58+eXm5ubXFxcrPvKli2rqKgoxcfHZ2jOlJgMwzD+e9j9FRERocuXL2vRokWqU6eOypUrpy+//FKLFi1Sx44dlRRy9+7ddf78eS1btsx67tChQ7V48WLt3btX0u3e4JdeekkvvfSS3dcMCwvTmTNn5Obmpri4OCUkJChv3rzatGmTSpQoobVr16pNmzY6d+6cPDw8rOeVKFFCQ4cOVd++fTVq1CgtWrRIO3futB4bO3asunXrZh3/1ltvacmSJVq/fr3Onz+vAgUK6I8//rAmwHXr1lXDhg2T9U8n2bp1q2rWrKnY2Fj5+vpq5cqVeuihh/T777+radOmkqQlS5aoTZs2unHjhjw9PZPFlZJRo0Zp9OjRyfZ7VOwjk4t7CmcAy
E6XtqT8gw2A7BcTE6PgfP66cuVKmhLFrI7F399fRZ7/TmYP70yd2xJ3Xcc/6Zyu6wwPD1etWrU0ZcqU23NYLCpcuLAGDBigV199Ndn41157TV9//bWOHDli/UHyo48+0oQJE3T69OkMzZmSHNsykWTChAmaPXu2/v7772TH/v77b9WrV89mX7169XTw4EFrs3V6DBkyRDt37tQff/yh8PBwTZw4USVKlJAk7dq1S1evXlW+fPnk6+tr3Y4eParDhw8nm+vatWs6fPiwnnnmGZvxb731lnV8YGCgWrRooXnz5kmSjh49qg0bNqh79+7WebZt26a2bduqcOHCypUrlxo1aiRJOnHihM3rVapUyfrn/PnzS1K6lhsZPny4za87Tp48meZzAQAA0mLQoEH6/PPPrbndc889p2vXrqlXr9stRT169LBpuXjuued08eJFvfjiizpw4IAWL16scePGqX///mmeMy1ybMtEkoYNG6ply5YaPny4IiIisvS1AgICVKJECZUoUULfffedKlasqBo1aqhcuXK6evWq8ufPr5UrVyY7L3fu3Mn2Xb16VZL0+eefKzw83ObYnWX/7t27a+DAgZoyZYq+/vprVaxYURUrVpR0O6lu2bKlWrZsqXnz5ikwMFAnTpxQy5YtFR8fbzOnm5ub9c9J7RcWiyXN1+7h4WFT+QYAAI4jKx61nJH5unTpovPnz2vEiBGKiopSlSpVtHTpvy1FJ07YthSFhobqt99+08svv6xKlSqpYMGCevHFF23uwfqvOdMixyfEkvTOO++oSpUqKl26tM3+smXLat26dTb71q1bp1KlSlmTTnd39wxVi0NDQ9WlSxcNHz5cP/30k6pVq6aoqCi5uroqLCzsP88PDg5WgQIFdOTIEZuK793at2+vvn37aunSpfr666/Vo0cP67F9+/bpwoULeueddxQaGirpdstEemX0PQAAAMhsAwYM0IABA1I8llLhsU6dOtq4cWOG50yLByIhrlixorp3767Jkyfb7H/llVdUs2ZNjR07Vl26dNGGDRv08ccf26zkEBYWptWrV6tr167y8PBQQEBAml/3xRdfVIUKFbR161Y1a9ZMderUUYcOHfTuu++qVKlSOn36tBYvXqyOHTuqRo0ayc4fPXq0Bg4cKH9/f7Vq1UpxcXHaunWrLl26pEGDBkmSfHx81KFDB7355pv6+++/bfqNCxcuLHd3d02ZMkX9+vXTnj17NHbs2PS+fQoLC9PRo0e1c+dOFSpUSLly5aIaDACAEzGZbm+ZPaejyPE9xEnGjBmTrAWgWrVqWrBggb799ltVqFBBI0aM0JgxY2xaK8aMGaNjx46pePHiCgwMTNdrlitXTi1atNCIESNkMpm0ZMkSNWzYUL169VKpUqXUtWtXHT9+PNWSfO/evfXFF19o5syZqlixoho1aqRZs2apaNGiNuO6d++uXbt2qUGDBipcuLB1f2BgoGbNmqXvvvtO5cqV0zvvvKP3338/XdcgSY899phatWqlhx56SIGBgfrmm2/SPQcAAHhw3U6IM/vBHNl9VZknR64ygeyXdFcqq0wAOROrTAA5V05cZaLYC9/L7OGTqXNb4q7pyJROOeI679UD0TIBAACAe5AFLRNyoArxA9MyAQAAAGQFKsQAAAAOLqcsu5ZTUSEGAACAU6NCDAAA4OBYds0+KsQAAABwalSIAQAAHJzZbJLZnLklXSOT58tOJMQAAAAOjpYJ+2iZAAAAgFOjQgwAAODgWHbNPirEAAAAcGpUiAEAABwcPcT2USEGAACAU6NCDAAA4ODoIbaPCjEAAACcGhViAAAAB0eF2D4qxAAAAHBqVIgBAAAcHKtM2EdCDAAA4OBMyoKWCTlORkzLBAAAAJwaFWIAAAAHR8uEfVSIAQAA4NSoEAMAADg4ll2zjwoxAAAAnBoVYgAAAAdHD7F9VIgBAADg1KgQAwAAODh6iO0jIQYAAHBwtEzYR8sEAAAAnBoVYgAAA
AdHy4R9VIgBAADg1KgQAwAAOLos6CGW4xSIqRADAADAuVEhBgAAcHD0ENtHhRgAAABOjQoxAACAg2MdYvtIiAEAABwcLRP20TIBAAAAp0aFGAAAwMHRMmEfFWIAAAA4NSrEAAAADo4eYvuoEAMAAMCpUSEGAABwcFSI7aNCDAAAAKdGhRgAAMDBscqEfVSIAQAA4NRIiAEAABxcUg9xZm8ZMXXqVIWFhcnT01Ph4eHavHlzqmNnzZqV7DU9PT1txkRERCQb06pVq3TFRMsEAACAg8spLRPz58/XoEGDNH36dIWHh2vSpElq2bKl9u/fr6CgoBTP8fPz0/79++943eQv3KpVK82cOdP6sYeHR7riokIMAACA++LDDz9Unz591KtXL5UrV07Tp0+Xt7e3ZsyYkeo5JpNJISEh1i04ODjZGA8PD5sxefLkSVdcJMQAAAAOLitbJmJiYmy2uLi4FGOIj4/Xtm3b1KxZM+s+s9msZs2aacOGDanGfvXqVRUpUkShoaFq37699u7dm2zMypUrFRQUpNKlS+u5557ThQsX0vX+kBADAAAgw0JDQ+Xv72/dxo8fn+K46OhoJSYmJqvwBgcHKyoqKsVzSpcurRkzZuinn37SV199JYvForp16+rUqVPWMa1atdKcOXMUGRmpCRMmaNWqVXr44YeVmJiY5mughxgAAMDBmZQFPcT///+TJ0/Kz8/Puj+9/bv21KlTR3Xq1LF+XLduXZUtW1affvqpxo4dK0nq2rWr9XjFihVVqVIlFS9eXCtXrlTTpk3T9DpUiAEAAJBhfn5+NltqCXFAQIBcXFx09uxZm/1nz55VSEhIml7Lzc1NVatW1aFDh1IdU6xYMQUEBNgdczcSYgAAAAdnNpmyZEsPd3d3Va9eXZGRkdZ9FotFkZGRNlVgexITE/Xnn38qf/78qY45deqULly4YHfM3UiIAQAAcF8MGjRIn3/+uWbPnq2///5bzz33nK5du6ZevXpJknr06KHhw4dbx48ZM0bLli3TkSNHtH37dj355JM6fvy4evfuLen2DXdDhgzRxo0bdezYMUVGRqp9+/YqUaKEWrZsmea46CEGAABwcDllHeIuXbro/PnzGjFihKKiolSlShUtXbrUeqPdiRMnZDb/W6+9dOmS+vTpo6ioKOXJk0fVq1fX+vXrVa5cOUmSi4uLdu/erdmzZ+vy5csqUKCAWrRoobFjx6arl9lkGIaR/suBo4uJiZG/v788KvaRycU9u8MBcJdLWz7O7hAApCImJkbB+fx15coVm5vNsisWf39/NXk/Uq5ePpk6d8KNa/pjcNMccZ33ipYJAAAAODVaJgAAAByc2XR7y+w5HQUVYgAAADg1KsQAAACOziTro5Yzc05HkaaE+Omnn87wC5hMJn355ZcZPh8AAADISmlKiGfNmpXhFyAhBgAAyF45Zdm1nCpNCfGKFSuyOg4AAAAgW6QpIW7UqFFWxwEAAIAsYvr//zJ7TkfBKhMAAABwave0ykRCQoIWL16szZs3Kzo6WuHh4dYb8E6fPq3o6GiVK1dOrq4sZgEAAJBdWIfYvgxnqmvXrtWTTz6pkydPyjAMmUwm3bp1y5oQb9iwQY8//ri+++47Pfroo5kWMAAAANLHZDJl+rJrmb6MWzbKUMvEX3/9pVatWunMmTN64YUXtGDBAhmGYTOmbdu28vb21sKFCzMlUAAAACArZKhCPHbsWN28eVNLlixRixYtUhzj7u6uatWqaceOHfcUIAAAAO4Ny67Zl6EK8YoVK1SrVq1Uk+EkBQsW1OnTpzMUGAAAAHA/ZKhCfPnyZYWGhv7nuGvXrunWrVsZeQkAAABkErPJJHMml3Qze77slKEKcVBQkA4dOvSf4/7+++80Jc4AAABAdslQQtykSRPt3LnT7hPsfvzxRx06dEjNmzfPcHAAAAC4d0k9xJm9OYoMJcSvvvqq3N3d1aFDB02bNk1RUVHWY5cuXdKMGTP0zDPPyMfHR
4MGDcq0YAEAAIDMlqGEuEyZMvrmm29ksVg0YMAAFSxYUCaTSbNnz1ZAQID69OmjuLg4zZs3T0WLFs3smAEAAJAOSesQZ/bmKDL86OYOHTpoz549euGFF1SmTBl5enrK3d1dxYoV07PPPqvdu3erXbt2mRkrAAAAMoCWCfvu6ZnKRYoU0aRJkzIpFAAAAOD+u6eEGAAAADkfy67Zd08JcVxcnBYuXKg1a9ZYH8BRoEAB1a9fX4899pg8PT0zJUgAAAAgq2Q4If79998VERGhM2fOyDAMm2OfffaZhg4dqlmzZrHsGgAAQDYz/f+W2XM6igwlxJs2bdIjjzyi+Ph4hYeHq1u3bgoLC5MkHT9+XN988402btyotm3batWqVQoPD8/MmAEAAIBMk6GE+M0339StW7c0bdo0Pfvss8mOv/DCC/rss8/Ur18/jRgxQr/99ts9BwoAAICMyYpl0px+2bVNmzapRo0aKSbDSfr27auaNWtq48aNGQ4OAAAAyGoZSojNZrNKlCjxn+NKlCjhUD89AAAAPIjMpqzZHEWGWiZq1aql3bt3/+e43bt3q1atWhl5CQAAAGQSWibsy1CFeOzYsTp48KBGjhwpi8WS7LhhGBo5cqQOHjyosWPH3nOQAAAAQFZJU4V4zpw5yfb17NlTb731lubOnavHHntMRYoUkXR7lYkffvhBx44dU58+fbR//35WmQAAAMhmDlTQzXQm4+5FhFNgNptTLIvfeWrS8bunM5lMSkxMvNc4cZ/FxMTI399fHhX7yOTint3hALjLpS0fZ3cIAFIRExOj4Hz+unLlivz8/LI9Fn9/fz3+2Vq5e/tm6tzx169qQd/6OeI671WaKsQjRoxwqD4RAAAAZ0IPsX1pSohHjRqVxWEAAAAA2SPDj24GAADAgyErlklzpGXXMrTKBAAAAOAo7qlCvHbtWv300086ePCgYmNjk91QJ93uL4mMjLyXlwEAAMA9oIfYvgwlxIZh6JlnntHs2bOtSbDJZEq26oRhGA71ZgEAAMDxZKhlYvr06Zo1a5aqV6+u5cuX69FHH5Uk7d+/X7/++qsiIiJkNps1ZMgQHTlyJFMDBgAAQPqYsmhzFBmqEM+aNUs+Pj769ddflS9fPn311VeSpJIlS6pkyZJq2bKlWrdurS5duqhu3brWh3YAAADg/jObTDJn8m/tM3u+7JShCvHff/+tunXrKl++fJL+7SG58wEcnTp1UvXq1fX+++9nQpgAAABA1shQQmyxWKzJsCR5e3tLki5dumQzrmTJkvrzzz/vITwAAADcK5MpazZHkaGEuGDBgjp9+rT146SWiB07dtiMO3DggFxdWeoYAAAAOVeGEuJq1arpr7/+srZItGjRQoZhaOjQodq3b59iY2P13nvvadu2bapatWqmBgwAAID0SVp2LbM3R5GhhLhdu3aKjo7W4sWLJUmVK1dW165dtWvXLpUvX165c+fWq6++KldXV7399tuZGjAAAACQmTLUz9CtWzc9+uijNu0Qs2fPVqVKlbRo0SJdunRJpUqV0tChQ1WrVq1MCxYAAADplxU9vw5UIM74k+o8PDxsPnZzc9Orr76qV1999Z6DAgAAAO4X7ngDAABwcKxDbF+GeogBAADw4MhJy65NnTpVYWFh8vT0VHh4uDZv3pzq2FmzZiW7kc/T09NmjGEYGjFihPLnzy8vLy81a9ZMBw8eTFdMaaoQFytWLF2T3slkMunw4cMZPh8AAACOYf78+Ro0aJCmT5+u8PBwTZo0SS1bttT+/fsVFBSU4jl+fn7av3+/9eO7V7d49913NXnyZM2ePVtFixbVm2++qZYtW+qvv/5KljynJk0J8bFjx9I0GQAAAHKerFgmLSPzffjhh+rTp4969eolSZo+fboWL16sGTNmpHofmslkUkhISIrHDMPQpEmT9MYbb6h9+/aSpDlz5ig4OFiLFi1S165d0xRXmhJii8WSpsngeLb89JZy5fLL7jAA3CVPp
8+yOwQAqTBu3cjuEO6rmJgYm489PDySLb4gSfHx8dq2bZuGDx9u3Wc2m9WsWTNt2LAh1fmvXr2qIkWKyGKxqFq1aho3bpzKly8vSTp69KiioqLUrFkz63h/f3+Fh4drw4YNaU6I6SEGAABwcOYs2iQpNDRU/v7+1m38+PEpxhAdHa3ExEQFBwfb7A8ODlZUVFSK55QuXVozZszQTz/9pK+++koWi0V169bVqVOnJMl6XnrmTAmrTAAAACDDTp48KT+/f3+bnFJ1OKPq1KmjOnXqWD+uW7euypYtq08//VRjx47NtNchIQYAAHBwWdlD7OfnZ5MQpyYgIEAuLi46e/aszf6zZ8+m2iN8Nzc3N1WtWlWHDh2SJOt5Z8+eVf78+W3mrFKlSprmlGiZAAAAwH3g7u6u6tWrKzIy0rrPYrEoMjLSpgpsT2Jiov78809r8lu0aFGFhITYzBkTE6NNmzaleU6JCjEAAIDDM5kkcw54dPOgQYPUs2dP1ahRQ7Vq1dKkSZN07do166oTPXr0UMGCBa19yGPGjFHt2rVVokQJXb58We+9956OHz+u3r17/38MJr300kt66623VLJkSeuyawUKFFCHDh3SHBcJMQAAAO6LLl266Pz58xoxYoSioqJUpUoVLV261HpT3IkTJ2Q2/9vAcOnSJfXp00dRUVHKkyePqlevrvXr16tcuXLWMUOHDtW1a9fUt29fXb58WfXr19fSpUvTvAaxJJkMwzAy7zLhKGJiYuTv76/dR86y7BqQA5XtPSe7QwCQCuPWDcUteUlXrlxJU29tVkr6fv78N1vk4e2bqXPHXb+qT7rVzBHXea+oEAMAADi4nPJgjpwqUxLigwcPKjo6Wvny5VOpUqUyY0oAAADgvsjwKhNxcXF67bXXFBAQoDJlyqh+/fp65513rMe/+uorVatWTTt37syMOAEAAJBBZlPWbI4iQwnxjRs31LhxY02YMEHu7u5q3bq17m5FbtKkiXbt2qUFCxZkSqAAAABAVshQQvzuu+9q06ZNevrpp3XkyBH9/PPPycYUKFBA5cqV0++//37PQQIAACDjTKas2RxFhhLi+fPnq3Dhwpo2bZrdJS1Kly6tkydPZjg4AAAAIKtl6Ka6o0ePqk2bNnJ1tX+6u7u7Ll26lKHAAAAAkDnMJpPMmVzSzez5slOGKsReXl5pSnSPHj2qPHnyZOQlAAAAgPsiQwlxlSpVtHXrVp0/fz7VMUePHtWOHTtUs2bNDAcHAACAe2fOos1RZOha+vTpo9jYWHXr1k3R0dHJjl++fFlPP/20bt26pb59+95zkAAAAMg4bqqzL0M9xN26ddPPP/+sb7/9VsWKFVPdunUlSevWrVP79u21atUqxcTEqEePHnrkkUcyNWAAAAAgM2W42j1v3jxNmDBBnp6eWrZsmaTbT6z7+eefZTKZ9Pbbb2vmzJmZFigAAAAyxiyT9ca6TNvkOCXiDD+62WQyaciQIRo0aJC2b9+uY8eOyWKxqFChQqpZs6bc3d0zM04AAAAgS2Q4IU7i4uKimjVrcvMcAABADpUVPb+O1EPsSDcIAgAAAOmWoQrx008/neaxJpNJX375ZUZeBgAAAJnAbLq9ZfacjiJDCfGsWbP+c4zJZJJhGCTEAAAAyNEylBCvWLEixf0Wi0UnT57UsmXL9O233+rll19W27Zt7ylAAAAA3BuTKfMftexIPcQZSogbNWpk93iPHj3Upk0b9ezZU+3atctQYAAAAMgc3FRnX5bdVNetWzeVL19eo0aNyqqXAAAAAO5Zlq4yUbJkSW3dujUrXwIAAAD/IemmuszeHEWWJcQWi0W7d++W2czKbgAAAMi5Mj1bvX79unbu3Klu3brp4MGD/9lvDAAAgKxlyqL/HEWGbqpzcXH5zzGGYSgwMFDvvfdeRl4CAAAAuC8ylBCHhobKlMqthe7u7sqfP78aNWqk/v37Kygo6J4CBAAAwL3hwRz2ZSghPnbsWCaHAQAAA
GSPDCXE//vf/+Tm5qaHH344s+MBAABAJqNCbF+Gbqrr2LGjJk+enNmxAAAAAPddhirEgYGBypMnT2bHAgAAgCxgMplSvf/rXuZ0FBlKiBs3bqzNmzfLMAyHejMAAAAcES0T9mWoZWLs2LGKjo7Wyy+/rJs3b2Z2TAAAAMB9k6EK8TfffKPWrVtrypQp+vbbb9WsWTMVLlxYnp6eycaaTCa9+eab9xwoAAAAMsZkur1l9pyOIk0JcbFixdS5c2dNmDBBkjRq1CiZTCYZhqFz587p66+/TvVcEmIAAADkZGlKiI8dO6bz589bP545c2aWBQQAAIDMZTaZZM7kkm5mz5edMtQy0bNnz8yOAwAAAMgWGUqIAQAA8OBglQn7MrTKBAAAAOAo0lwh3rlzp8aMGZOhFxkxYkSGzgMAAEAmyIJVJuRAFeI0J8S7du3Srl270jV50oM7SIgBAACyj1kmmTM5g83s+bJTmhPi4sWLq169elkZCwAAAHDfpTkhrl+/vmbMmJGVsQAAACAL8GAO+7ipDgAAAE6NZdcAAAAcHMuu2UeFGAAAAE6NCjEAAICD49HN9qUpIbZYLFkdBwAAAJAtqBADAAA4OFaZsI8eYgAAADg1EmIAAAAHZ5bJ2kecaVsGn1Q3depUhYWFydPTU+Hh4dq8eXOazvv2229lMpnUoUMHm/0REREymUw2W6tWrdIVEwkxAACAg0tqmcjsLb3mz5+vQYMGaeTIkdq+fbsqV66sli1b6ty5c3bPO3bsmAYPHqwGDRqkeLxVq1Y6c+aMdfvmm2/SFRcJMQAAAO6LDz/8UH369FGvXr1Urlw5TZ8+Xd7e3nafhpyYmKju3btr9OjRKlasWIpjPDw8FBISYt3y5MmTrrhIiAEAABycOYs2SYqJibHZ4uLiUowhPj5e27ZtU7Nmzf6Ny2xWs2bNtGHDhlRjHzNmjIKCgvTMM8+kOmblypUKCgpS6dKl9dxzz+nChQv/9ZbYICEGAABAhoWGhsrf39+6jR8/PsVx0dHRSkxMVHBwsM3+4OBgRUVFpXjO2rVr9eWXX+rzzz9P9fVbtWqlOXPmKDIyUhMmTNCqVav08MMPKzExMc3XwLJrAAAADi7pZrPMnlOSTp48KT8/P+t+Dw+PTJk/NjZWTz31lD7//HMFBASkOq5r167WP1esWFGVKlVS8eLFtXLlSjVt2jRNr0VCDAAAgAzz8/OzSYhTExAQIBcXF509e9Zm/9mzZxUSEpJs/OHDh3Xs2DG1bdvWui/pYXGurq7av3+/ihcvnuy8YsWKKSAgQIcOHUpzQkzLBAAAgIMzZdGWHu7u7qpevboiIyOt+ywWiyIjI1WnTp1k48uUKaM///xTO3futG7t2rXTQw89pJ07dyo0NDTF1zl16pQuXLig/Pnzpzk2KsQAAAC4LwYNGqSePXuqRo0aqlWrliZNmqRr166pV69ekqQePXqoYMGCGj9+vDw9PVWhQgWb83Pnzi1J1v1Xr17V6NGj9dhjjykkJESHDx/W0KFDVaJECbVs2TLNcZEQAwAAOLikh2lk9pzp1aVLF50/f14jRoxQVFSUqlSpoqVLl1pvtDtx4oTM5rQ3MLi4uGj37t2aPXu2Ll++rAIFCqhFixYaO3ZsunqZSYgBAACcQOamwxk3YMAADRgwIMVjK1eutHvurFmzbD728vLSb7/9ds8x0UMMAAAAp0aFGAAAwMFl9FHL/zWno6BCDAAAAKdGhRgAAMDBZeWDORwBFWIAAAA4NSrEAAAADs6szK+COlJV1ZGuBQAAAEg3KsQAAAAOjh5i+0iIAQAAHJxJmf9gDsdJh2mZAAAAgJOjQgwAAODgaJmwjwoxAAAAnBoVYgAAAAfHsmv2OdK1AAAAAOlGhRgAAMDB0UNsHxViAAAAODUqxAAAAA6OdYjto0IMAAAAp0aFGAAAwMGZTLe3zJ7TUZAQAwAAODizTDJncpNDZs+XnWiZAAAAgFOjQgwAAODgaJmwj
woxAAAAnBoVYgAAAAdn+v//MntOR0GFGAAAAE6NCjEAAICDo4fYPirEAAAAcGpUiAEAABycKQvWIXakHmISYgAAAAdHy4R9tEwAAADAqVEhBgAAcHBUiO2jQgwAAACnRoUYAADAwfFgDvuoEAMAAMCpUSEGAABwcGbT7S2z53QUVIgBAADg1KgQAwAAODh6iO2jQgwAAACnRoUYAADAwbEOsX0kxAAAAA7OpMxvcXCgfJiWCQAAADg3KsTAAyqXp4v8vVzlYpbiEwxduHZL8QlGimO93c3y93KVm4tJMkkJiYau3EjQtTiLdUyAr5t8PV1szrsRn6izMbey9DoAR/Tsw+X0csfKCs7tpT+PXdSgz9dp68HzqY7393HXqO411b52UeXN5aET52I1ZMYG/bbtpCTp9a7V9UbX6jbn7D91WVUGLMjS64DjYNk1+0iIgQeQt7tZeX1cdeFqguISLPLzclGwn7v+uRQnSwo5scWQrtxI0K1EQ4YhebmbFeDrpkTLLd289W9SfD0+URdi/02AU06vAdjTqV4xTXi6jl6YtkZbDpzTgHYV9b+RrVW5/3ydv3Iz2Xg3V7MWj2qtc1duqvu7y/XPxWsqHJhLV67F2Yzbe/yi2oxcbP04IdFy91QAMoiEGHgA+Xu5KvZmoq7GJUqSLlxNkFceF+XydNGVG4nJxt+Z9EpS7M1E+Xq6yNPNbHvMkBLJgoF7MrB9Jc1ctk9z/zggSXph2ho9XL2wejYtrfd/2JVsfM+mpZUnl6cav/qTEv7/C/DEuavJxiVYLDp7+UbWBg+HxbJr9pEQAw8gd1eTrtywTXJv3rLIw9UsKXlCfDdPN7PcXEy6dFei7OlmVmheD1kshm7csujy9YQUK84AUubmalbV4gF6b+EO6z7DkP7Y9Y9qlQ5O8Zw2tYpo076zmvRsfT1Sq4iir9zU/DWH9MEPu2S54wuwRH5/HZnRXTfjE7Vp/1mNmLtZJ6OvZfk1Ac6Am+oyycqVK2UymXT58uXsDgUOzsUsmUwmJd6VqSZaDLnYaegymaTC+TxUJJ+Hgv3cdPFqgk11+EZ8os5fvaWoK/G6dD1Bnm5mBfu5Z9l1AI4oIJenXF3MOndXJffclRsKyeOd4jlFg/3UsW5RuZhN6jh2qd5ZsF0vtqukVztXtY7ZcuCc+k5eqXajf9XA6WsVFpxLv49rJ19Ptyy9HjiOpGXXMntzFCTE/y8iIkImk0kmk0lubm4qWrSohg4dqps3k/d7PUhI1JHEMKTTl+J15vLthDevj6s83f79J+BavEU34i26lWjoerxF52JuycPNbDMGQOYzm6TzV26q/ydrtONwtL5fd0Tvfr9DvVuWs45Ztv2kflh/VHuOX9TvO0+pw9il8vfx0GP1i2Vj5IDjoGXiDq1atdLMmTN169Ytbdu2TT179pTJZNKECROyOzTAKtEiGUZSNfjfKrGLOXnV+G4J/388/kai3FxM8vdySdZffOfYRIshVxeTxEITQJpEx95UQqJFQbm9bPYH+Xsp6tL1FM+JunRdtxItNu0R+05dVv683nJzNetWQvKv0SvX4nXo9GUVD/HL3AuAwzIp89cNdqACMRXiO3l4eCgkJEShoaHq0KGDmjVrpuXLl0uSLBaLxo8fr6JFi8rLy0uVK1fW999/b3e+tWvXqkGDBvLy8lJoaKgGDhyoa9du93u99tprCg8PT3ZO5cqVNWbMGEnSli1b1Lx5cwUEBMjf31+NGjXS9u3bbcabTCZ98cUX6tixo7y9vVWyZEn973//kyQdO3ZMDz30kCQpT548MplMioiIuKf3CDlDfIKRrHLr6WZWXArfOO0x2fl9l4v5duXqv5JsAP+6lWDRjsPReqhSQes+k0l6qFIBbd5/NsVzNuw7q+L5/W1+/VyygL/OXLyWYjIsST6erioa4pdqkg3czSyTzKZM3hwoJSYhTsWePXu0fv16ubvf7qEcP3685syZo+nTp2vv3r16+eWX9eSTT
2rVqlUpnn/48GG1atVKjz32mHbv3q358+dr7dq1GjBggCSpe/fu2rx5sw4fPmw9Z+/evdq9e7eeeOIJSVJsbKx69uyptWvXauPGjSpZsqRat26t2NhYm9caPXq0Hn/8ce3evVutW7dW9+7ddfHiRYWGhmrhwoWSpP379+vMmTP66KOPUow3Li5OMTExNhtyris3EpTL00U+Hrdvjsvn4yqT6fbqEdLtNYVze//7CyB/r9srSriaTXJzMcnPy0W+Hi66+v/jTZLyeLvKw9UkV7NJnm5mBfm5K8Fi6EY8SzsB6TH5p93q1byMuj9UUqUL5dbkfg3k7emmOZG3V5344sXGGvNkTev4z5f+pTy+Hvqgd12VKOCvVtVDNaRTFU1f8pd1zPiIcNUvn1+Fg3xVu3Sw5r/aQokWQwvWHE72+kBON3XqVIWFhcnT01Ph4eHavHlzms779ttvZTKZ1KFDB5v9hmFoxIgRyp8/v7y8vNSsWTMdPHgwXTHRMnGHX375Rb6+vkpISFBcXJzMZrM+/vhjxcXFady4cfr9999Vp04dSVKxYsW0du1affrpp2rUqFGyucaPH6/u3bvrpZdekiSVLFlSkydPVqNGjTRt2jSVL19elStX1tdff60333xTkjRv3jyFh4erRIkSkqQmTZrYzPnZZ58pd+7cWrVqlR555BHr/oiICHXr1k2SNG7cOE2ePFmbN29Wq1atlDdvXklSUFCQcufOneq1jx8/XqNHj87YG4f77nq8RRevJSiPt5v1wRxnY+KtK0K4utj+1G4ymZTP10UuZpMMQ7qVaOh87C1dvyPZdXc1ydfT/f+rwtKNW4m6dC3hfl4W4BC+X3dEAf5eGtGthoLzeGv30QtqP3qJzl25faNdaKCvLMa/v3k5FX1N7UYv0btP19GWSY/p9MXrmvrLHn1wxxJtBfP5as4rTZQ3l6eir9zQ+r/PqtGwRYqOebDvc8H9k1NaJubPn69BgwZp+vTpCg8P16RJk9SyZUvt379fQUFBqZ537NgxDR48WA0aNEh27N1339XkyZM1e/ZsFS1aVG+++aZatmypv/76S56enmmKi4T4Dg899JCmTZuma9euaeLEiXJ1ddVjjz2mvXv36vr162revLnN+Pj4eFWtWjXFuXbt2qXdu3dr3rx51n2GYchisejo0aMqW7asunfvrhkzZujNN9+UYRj65ptvNGjQIOv4s2fP6o033tDKlSt17tw5JSYm6vr16zpx4oTNa1WqVMn6Zx8fH/n5+encuXPpuvbhw4fbvHZMTIxCQ0PTNQfur9ibidaK8N2irsTbfHz5eoIu2/nNqiHxRDogE01fslfTl+xN8VjLN35Jtm/T/nNqNOynVOfr8UFkpsUGZKcPP/xQffr0Ua9evSRJ06dP1+LFizVjxgy9+uqrKZ6TmJio7t27a/To0VqzZo3NQgGGYWjSpEl644031L59e0nSnDlzFBwcrEWLFqlr165piouE+A4+Pj7W6uyMGTNUuXJlffnll6pQoYIkafHixSpYsKDNOR4eHinOdfXqVT377LMaOHBgsmOFCxeWJHXr1k3Dhg3T9u3bdePGDZ08eVJdunSxjuvZs6cuXLigjz76SEWKFJGHh4fq1Kmj+HjbZMfNzXbZHZPJJIslfb/m9vDwSPVaAADAAy4LS8R3t1mmllPEx8dr27ZtGj58uHWf2WxWs2bNtGHDhlRfZsyYMQoKCtIzzzyjNWvW2Bw7evSooqKi1KxZM+s+f39/hYeHa8OGDSTE98psNuu1117ToEGDdODAAXl4eOjEiRMptkekpFq1avrrr7+sCXZKChUqpEaNGmnevHm6ceOGmjdvbvPrgnXr1umTTz5R69atJUknT55UdHR0uq4jqQc6MfG/H9YAAACQXnf/RnnkyJEaNWpUsnHR0dFKTExUcLDtQ2qCg4O1b9++FOdeu3atvvzyS+3cuTPF41FRUdY57p4z6VhakBDb0blzZw0ZMkSffvqpB
g8erJdfflkWi0X169fXlStXtG7dOvn5+alnz57Jzh02bJhq166tAQMGqHfv3vLx8dFff/2l5cuX6+OPP7aO6969u0aOHKn4+HhNnDjRZo6SJUtq7ty5qlGjhmJiYjRkyBB5eXnd/VJ2FSlSRCaTSb/88otat24tLy8v+fr6ZuwNAQAAD6SsfHTzyZMn5ef37xKAmfUb59jYWD311FP6/PPPFRAQkClzpoZVJuxwdXXVgAED9O6772r48OF68803NX78eJUtW1atWrXS4sWLVbRo0RTPrVSpklatWqUDBw6oQYMGqlq1qkaMGKECBQrYjOvUqZMuXLig69evJ7tr8ssvv9SlS5dUrVo1PfXUUxo4cKDdhvOUFCxYUKNHj9arr76q4OBg6yoXAAAAmcHPz89mSy0hDggIkIuLi86etV2C8OzZswoJCUk2/vDhwzp27Jjatm0rV1dXubq6as6cOfrf//4nV1dXHT582HpeWudMjckwDBYZRTIxMTHy9/fX7iNnlSsXC78DOU3Z3nOyOwQAqTBu3VDckpd05coVm8ppdkj6fh6584R8M/n7+dXYGDWtUjhd1xkeHq5atWppypQpkm4/56Fw4cIaMGBAspvqbt68qUOHDtnse+ONNxQbG6uPPvpIpUqVkpubmwoUKKDBgwfrlVdekXT7moOCgjRr1ix6iAEAAHBbTll2bdCgQerZs6dq1KihWrVqadKkSbp27Zp11YkePXqoYMGCGj9+vDw9Pa0LGyRJWkL2zv0vvfSS3nrrLZUsWdK67FqBAgWS/ebdHhJiAAAA3BddunTR+fPnNWLECEVFRalKlSpaunSp9aa4EydOyGxOX0fv0KFDde3aNfXt21eXL19W/fr1tXTp0jSvQSzRMoFU0DIB5Gy0TAA5V05smfhjV9a0TDSpnL6WiZyKm+oAAADg1GiZAAAAcHBZueyaI6BCDAAAAKdGhRgAAMDBmUy3t8ye01FQIQYAAIBTo0IMAADg4HLKOsQ5FRViAAAAODUqxAAAAI6OErFdJMQAAAAOjmXX7KNlAgAAAE6NCjEAAICDY9k1+6gQAwAAwKlRIQYAAHBw3FNnHxViAAAAODUqxAAAAI6OErFdVIgBAADg1KgQAwAAODjWIbaPhBgAAMDBseyafbRMAAAAwKlRIQYAAHBw3FNnHxViAAAAODUqxAAAAI6OErFdVIgBAADg1KgQAwAAODiWXbOPCjEAAACcGhViAAAAB8c6xPZRIQYAAIBTo0IMAADg4Fhkwj4SYgAAAEdHRmwXLRMAAABwalSIAQAAHBzLrtlHhRgAAABOjQoxAACAg2PZNfuoEAMAAMCpUSEGAABwcCwyYR8VYgAAADg1KsQAAACOjhKxXSTEAAAADo5l1+yjZQIAAABOjQoxAACAo8uCZdccqEBMhRgAAADOjQoxAACAg+OeOvuoEAMAAMCpUSEGAABwdJSI7aJCDAAAAKdGhRgAAMDBsQ6xfSTEAAAADs6UBcuuZfoybtmIlgkAAAA4NRJiAAAAB2fKoi0jpk6dqrCwMHl6eio8PFybN29OdewPP/ygGjVqKHfu3PLx8VGVKlU0d+5cmzEREREymUw2W6tWrdIVEy0TAAAAuC/mz5+vQYMGafr06QoPD9ekSZPUsmVL7d+/X0FBQcnG582bV6+//rrKlCkjd3d3/fLLL+rVq5eCgoLUsmVL67hWrVpp5syZ1o89PDzSFRcVYgAAAEeXQ0rEH374ofr06aNevXqpXLlymj59ury9vTVjxowUxzdu3FgdO3ZU2bJlVbx4cb344ouqVKmS1q5dazPOw8NDISEh1i1PnjzpiouEGAAAABkWExNjs8XFxaU4Lj4+Xtu2bVOzZs2s+8xms5o1a6YNGzb85+sYhqHIyEjt379fDRs2tDm2cuVKBQUFqXTp0nruued04cKFdF0DCTEAAICDM2XRf5IUGhoqf39/6zZ+/PgUY4iOjlZiYqKCg4Nt9gcHBysqKirV2
K9cuSJfX1+5u7urTZs2mjJlipo3b2493qpVK82ZM0eRkZGaMGGCVq1apYcffliJiYlpfn/oIQYAAECGnTx5Un5+ftaP09u/+19y5cqlnTt36urVq4qMjNSgQYNUrFgxNW7cWJLUtWtX69iKFSuqUqVKKl68uFauXKmmTZum6TVIiAEAABycSVmwDvH//9/Pz88mIU5NQECAXFxcdPbsWZv9Z8+eVUhISKrnmc1mlShRQpJUpUoV/f333xo/frw1Ib5bsWLFFBAQoEOHDqU5IaZlAgAAAFnO3d1d1atXV2RkpHWfxWJRZGSk6tSpk+Z5LBZLqn3KknTq1ClduHBB+fPnT/OcVIgBAAAc3L2sG2xvzvQaNGiQevbsqRo1aqhWrVqaNGmSrl27pl69ekmSevTooYIFC1r7kMePH68aNWqoePHiiouL05IlSzR37lxNmzZNknT16lWNHj1ajz32mEJCQnT48GENHTpUJUqUsFmW7b+QEAMAADi4nPLo5i5duuj8+fMaMWKEoqKiVKVKFS1dutR6o92JEydkNv/bwHDt2jU9//zzOnXqlLy8vFSmTBl99dVX6tKliyTJxcVFu3fv1uzZs3X58mUVKFBALVq00NixY9PVy2wyDMNI/+XA0cXExMjf31+7j5xVrlz/3RcE4P4q23tOdocAIBXGrRuKW/KSrly5kqbe2qyU9P38r2PnlCuTY4mNiVG5sKAccZ33igoxAACAw8spTRM5EzfVAQAAwKlRIQYAAHBwOaWHOKeiQgwAAACnRoUYAADAwdFBbB8VYgAAADg1KsQAAAAOjh5i+0iIAQAAHJzp///L7DkdBS0TAAAAcGpUiAEAABwdd9XZRYUYAAAATo0KMQAAgIOjQGwfFWIAAAA4NSrEAAAADo5l1+yjQgwAAACnRoUYAADAwbEOsX1UiAEAAODUqBADAAA4OpaZsIuEGAAAwMGRD9tHywQAAACcGhViAAAAB8eya/ZRIQYAAIBTo0IMAADg8DJ/2TVH6iKmQgwAAACnRoUYAADAwdFDbB8VYgAAADg1EmIAAAA4NVomAAAAHBwtE/ZRIQYAAIBTo0IMAADg4ExZsOxa5i/jln2oEAMAAMCpUSEGAABwcPQQ20eFGAAAAE6NCjEAAICDMynzH7TsQAViKsQAAABwblSIAQAAHB0lYrtIiAEAABwcy67ZR8sEAAAAnBoVYgAAAAfHsmv2USEGAACAU6NCDAAA4OC4p84+KsQAAABwalSIAQAAHB0lYruoEAMAAMCpUSEGAABwcKxDbB8VYgAAADg1KsRIkWEYkqSrsbHZHAmAlBi3bmR3CABSYdy6efv///+9NCeIjY3J9HWDY2NjMnfCbERCjBTF/n8iXLdyiWyOBACAB1NsbKz8/f2zNQZ3d3eFhISoZNHQLJk/JCRE7u7uWTL3/WQyctKPL8gxLBaLTp8+rVy5csnkSI+icVIxMTEKDQ3VyZMn5efnl93hALgDX5+OxzAMxcbGqkCBAjKbs7879ebNm4qPj8+Sud3d3eXp6Zklc99PVIiRIrPZrEKFCmV3GMhkfn5+fMMFcii+Ph1LdleG7+Tp6ekQSWtWyv4fWwAAAIBsREIMAAAAp0ZCDDgBDw8PjRw5Uh4eHtkdCoC78PUJZD9uqgMAAIBTo0IMAAAAp0ZCDAAAAKdGQgwAAACnRkIMAAAAp0ZCDAAAAKdGQgzgnt25WE1iYmI2RgIAQPqREAO4J4ZhyGQy6eLFi5IkFxcXrV+/Xlu3bs3myAAASBsSYgD3xGQy6fz582rdurWmTp2qX375RfXr19fVq1ezOzTAoVksFuufExISku0DkHYkxADu2fXr19W0aVO988476ty5s7799ls1btyY9gkgC5nNZp08eVK3bt2Sq6urfv75Z7399tskxUAGkBADuGdFihRRvXr19M8//8jPz08XLlyQdLt9gm/OQNa4ceOG2rdvr/r16+ubb75R+/btVbp0aZnNfGsH0otHN
wO4JxaLRWazWbt379bBgwe1e/duffvtt3r22Wc1aNAgmzEAMteRI0dUu3ZtxcbGaurUqXr66aeVmJgoFxeX7A4NeKC4ZncAAB5MSTfTnT9/Xl5eXipfvrwqVaqkSpUq6ebNm/r0009lNpv10ksvyWw2a+HChSpatKiqVauW3aEDDsPDw0NXr16Vh4eH5s2bp6eeekpubm78EAqkExViABm2aNEiDR06VJ6envLz89PChQsVHByso0eP6tNPP9WPP/6oNm3ayNfXV2+99ZYOHz6sokWLZnfYgEM5evSobty4odatWyssLEzLly+3SYoTEhLk6kr9C7CHhBhAuiRVhv/66y/Vq1dPw4cPl7e3t+bPn6/Dhw/rt99+U8WKFXX8+HHNnz9fX331lTw9PfXpp5+qatWq2R0+8EBL+vr7+++/dfbsWRUqVEglSpSQJG3ZskWdO3dWsWLF9Ntvv8nNzU0ff/yxYmJiNHz4cJlMpmyOHsi5SIgBpNvGjRsVGxur9evXa+TIkZKk6OhoPfXUU9q1a5eWLVumChUqKCEhQQkJCbpx44by5MmTzVEDjuGHH35QRESEAgMDdfToUb377ruKiIhQQECAtmzZoq5du8pkMql27dqaP3++tm/frooVK2Z32ECORoMRgHS5du2ann/+ebVs2VJHjx617g8ICNDcuXNVuXJltWnTRrt27ZKrq6s8PT1JhoF7lFS7OnnypMaPH6/33ntPK1eu1Pvvv69Ro0bpo48+0rlz51SzZk2tXr1aDRs2lJeXl3bs2EEyDKQBFWIA6bZ7924NHTpU+/fv16ZNmxQUFGT9Ve6FCxfUtm1bXb58WTt37pS7u3t2hws4hN9//13bt2/XoUOHNGXKFHl4eEiSpk2bpmHDhunFF1/U888/r/z580uS4uPj+foD0oiEGIBdSYnunRITE7V//349+eSTio+P19q1a5U7d26bxzhfvXpVhQsXzqaoAcfz+uuva/z48SpWrJhWr16tAgUKWI9NmzZNb7zxhiIiIjR48GBrUgwgbUiIAaQqKcHdsGGD1qxZo6tXr6p169aqXbu2JOnvv//WE088oVu3biVLigFkvg8++EBDhgzRBx98oD59+sjX19d67MMPP9SkSZO0bds2BQYGZmOUwIOHhBiAXT/88IP69eunChUqyMfHR4sXL9ZXX32lJ554QtLtpLhnz576559/9Ndff8nf3z+bIwYefEk/WCYmJioxMdGm9eGNN97QO++8o8mTJ6tnz57y8fGxHrt8+bJy586dDREDDzYWJgSQqg0bNuj555/XuHHj1Lt3b506dUphYWHq1auXLl68qAEDBqhs2bKaMWOG+vfvrwsXLpAQA/coKRn+7bffNGfOHB07dkzNmzfXE088oVKlSumtt96SYRgaOHCgXFxc1L17d2ulmK8/IGNIiAGkKDExUTt37lTfvn3Vu3dvnTx5UvXr19ezzz6rwMBAvfTSS/L19VVERIQqVKig5cuXcwMPkAlMJpN++uknPfXUU3riiSfUpUsXjR07VgcOHFC/fv3UsGFDvf3223JxcdFzzz0nNzc39erVSyaTiXYlIINomQCQzLlz5xQUFKS///5bV69eVYUKFfTwww+rZMmS+vTTT3X8+HFVqVJFsbGx+vTTT9WnT5/sDhlwGHv37tWjjz6ql19+Wf369ZNhGAoKCtKtW7dUu3Ztvfnmm6pXr54k6a233tJjjz2msmXLZnPUwIONCjEAG3/++aeaNGmiP//80/pN9uDBg4qJiVFERITMZrPc3d312GOPqUyZMtZvzAAyR1xcnJ544gk988wzOnXqlBo0aKDu3burV69eql27tnx9fXX9+nU1b95cb7zxRnaHCzgEEmIANipWrKiCBQvqvffe0/vvvy+TyaRLly5p586dunTpkrUqvG/fPn388cfy9vbO7pABh7B79255enqqbNmyypUrl1xcXDR06FBri4SPj4/q1KmjH374Qd7e3qpfv768v
LyyO2zAIfCkOgBWCQkJslgs6tSpk3bs2KGLFy9KkmrVqqX+/furXbt2qlOnjiZPnqypU6eSDAOZwDAMnTt3To8++qhWr14tLy8vlSxZUomJifrnn39UsWJF60oSZcuW1ddff61Ro0aRDAOZiB5iALp48aLy5s1r/fj06dMqX768XnnlFZtfyf7yyy+6fv26atasqaJFi2ZHqIDDevHFF/XLL79oy5Ytyps3r6Kjo9W0aVNVrVpVnTt31rp16zR79mzt3r1b+fLly+5wAYdCQgw4uY0bN+rVV19VhQoVNG7cOHl4eMjDw0OTJ0/WvHnzNHPmTJUrVy67wwQcVtIjlvft26dnnnlGzzzzjJ5++mlJ0sqVK/XEE08oV65cunXrlhYuXKiqVatmc8SA46FlAnByQUFBatCggdavX6/KlStr7Nix2rt3r1q3bq3Y2Fjt379f0u1l2ABknr/++kuxsbHW5QpLly6tggULas6cOdYxjRs31qZNm7RkyRJt3LiRZBjIIlSIASeTtOj/lStXlJCQYPOr11GjRmn79u2KjIzUuHHjNHPmTF29elVbt27l6VdAJjp69Ki6du2qkydPatKkSSpXrpwqVKig48ePq169eho+fLj69++f3WECToOEGHAiScnwzz//rE8++UQHDx5UtWrVVKNGDQ0dOlSSdOXKFf3yyy+aPn26jhw5oitXrujw4cMKDg7O5ugBx3Hr1i0dPXpUH3/8sVavXi3DMNStWzd17txZEydOVEJCgiZNmiQPDw8etgHcByTEgINLSoKTLF68WJ06ddLbb7+tChUq6Ndff9VHH32kyMhIPfTQQ9Zx//zzj/bv368iRYqoePHi2RE64DCSvg4PHTqky5cvKyEhQbVr15YkbdmyRZs3b9abb76pJk2aaP/+/dq7d682bNig8PDwbI4ccA4kxIATSExMlIuLi27cuKGIiAhVrVpVr776qqKjo1W1alV17NhRkydPzu4wAYeUlAz/8MMPevPNN5WYmCiTyaS8efPqhx9+sP725ciRI1q4cKGWLl2qFStWaN++fSpVqlQ2Rw84B26qAxzUl19+qc6dO0uSXFxcrP8/dOiQypYtqzNnzqhKlSp6+OGHrcnwd999pw0bNmRbzIAjMplMWrlypXr06KGXX35Z27Zt08cff6wNGzZoyZIlkiSLxaJixYpp8ODBioyM1OnTp0mGgfuIhBhwQLdu3dKlS5f0999/q0+fPtb9CQkJKlu2rLZt26Z69eqpdevW+vTTTyVJ0dHRWrp0qfbt2yeLxZJdoQMPvKQH2tz5dbRp0yb16tVLvXv31vnz59W7d2/169dPvXr1kiSZzbe/HSet5hISEnKfowacGwkx4IDc3NzUr18/Pf/889qyZYt1TVNvb2/VrVtXb731lgoVKqTJkydb+4snTpyo1atXq3HjxtZvzgDSZ8GCBQoMDNS+fftkNputSfGOHTusP6g2aNBALVq00NSpUyVJM2fOtP7Z1dU122IHnBlfeYADMgxDvr6+6tGjhywWi7744gv16tVLM2fO1PPPP6/z589r7Nixeumll+Ti4qJr165p0aJFWrlyJU+gA+5B7dq11bx5czVp0kR//PGHypQpI0l67LHHNGPGDJUuXVodOnTQp59+KovFIovFoq1bt8psNuvmzZvy9PTM5isAnBNlIMABJVV9k5LiPn36aNu2bYqIiJAkjRw5UlOnTlVsbKz27dunvHnzav369apSpUr2BQ04gMKFC+vLL79U1apV1bBhQ+3bt0/S7YduXL58Wblz51bXrl0lSVevXtXIkSP1ww8/aMCAASTDQDZilQnAgSTdzX7kyBHduHFDCQkJqly5sm7duqUvvvhC06dPV7Vq1TRz5kxJ0vXr1+Xt7S2LxUKbBJCJTp06pb59+2rr1q1auXKlypUrp7Vr16p///4ymUxKTExU/vz5tWfPHi1evJgn0AHZjIQYcBBJyfCPP/6owYMHy9/fX0ePHlX79
u3Vv39/VapUSV988YU+//xz1ahRQ1988UV2hww4tLNnz6pnz57atm2bVq1apXLlymnPnj06cOCA1q1bp6pVq6pu3boqVqxYdocKOD0SYsCBrF69Wm3bttWECRPUr18/ffXVV+rRo4dmzJihiIgIXb16VV999ZXGjx+vdu3aacqUKdkdMvDAS/phdOvWrfrrr7905coV1a5dWzVr1tTFixfVvXt3bd261ZoUA8h5SIgBB5D0DXnEiBE6cuSIvvrqKx09elQtWrTQQw89pM8++0zS7SWdbt68qW+++UZNmjShMgVkkoULF6pv375q0KCBTpw4IbPZrBYtWmjcuHE6deqUnn32We3cuVPLli1T+fLlsztcAHehaRB4wKS0RnDSTXRRUVGqXLmyEhMTVb9+fTVt2tS6zvD8+fP1448/ysfHR8888wzJMJBJ/vzzTw0cOFDjxo3TokWL9OWXX2rv3r3Wr8tChQrpyy+/VFhYmDp27Khbt25lc8QA7kZCDDxAkm5+O378uL744gtNnDhRmzdvth4vX7683n33XRUoUECdO3fWxx9/LJPJJMMwtGTJEq1evVpxcXHWb9QA0i61B9YcOHBAhQsX1rPPPqujR4+qY8eO6tGjh95++21J0t69exUSEqKFCxcqMjJSbm5u9zNsAGnAOsTAAyIpGd69e7fatGmjsLAwbdy40fq41z59+qhz585au3atVq9ereeee06urq66ceOGxo4dq+XLl2vFihXy8PDI7ksBHjhJX38nT57UsmXLZLFYVKZMGTVo0EBubm4KDg7WyZMn1bBhQ7Vu3VqffPKJJGnNmjX67bff9MILL/D0OSAHIyEGHgBJ34z//PNP1a5dW0OHDtWQIUN06dIl1a5dWwsXLlSfPn1UoEABPf3007p06ZJq1KihmjVryjAM7d+/X4sXL1bp0qWz+1KAB86dP4y2a9dOwcHBOnz4sHLnzq0PP/xQlSpV0pIlS/Trr7+qX79++uijj6znLliwQMeOHWONYSCHo2UCeAAktUnUqVNH7du316hRo+Tj46NChQqpXLly2rlzp06ePClJevjhh7Vw4UK9++67qlGjhh5//HHrEk8A0ufOZLhOnTrq1q2bVqxYoW+//VY3btzQ9OnTFRYWpmnTpskwDBUqVEgnTpzQ4cOHNXToUM2bN0/vvPOO/P39s/tSANjBKhPAA+LYsWNq0KCBatSooUGDBqlBgwZ67733NGzYMJUoUUIVKlSQJFWpUkX9+vVTnjx56FUEMsHJkydVrVo1PfTQQ1qwYIF1f61atXT58mVt2bJFrq6umj9/vvr376/g4GB5e3vLZDLpq6++4odR4AFAQgw8AJKqVPv379djjz2m0qVLKzAwUN99952++eYbhYWFyTAMzZw5U6tWrdLOnTvVokULffPNN/L09OQpdMA9OHbsmB5//HHlz59fQ4cOVb169TR+/Hi9/vrrqlGjhvLnz698+fLpkUceUe7cuXXjxg0VKVJEgYGBCg4Ozu7wAaQBCTHwgEhKivft26cuXbrozz//1Pvvv69BgwZZxyStRzxv3jzVrVtXRYsWzcaIAcdx8OBBDRw4UO7u7goKCtJPP/2kTz75RLVq1dK2bdu0Z88eTZkyRT4+PqpWrZoWLlyY3SEDSAcSYuABkpQUHz58WB06dFBYWJiGDBmihg0bSpISEhLk6sq9skBWOHDggAYMGKA1a9Zo7NixGjx4sM3xCxcuaMWKFapcubJKliyZTVECyAgSYiCHSlrz1Gw2WxPhpP1JleJOnTqpSJEiGj58uOrXr5+d4QJO4fDhw3r++efl4uKi1157zfp1d+vWLXr2gQcYjYVADpGUAN+8eVPS7UT44MGD1j8nSUqQy5Qpo++//17//POPXn31VW3YsOH+Bw04meLFi+vjjz+WYRh66623tG7dOkkiGQYecCTEQA5hNpt15MgRvfTSS/rnn3/0/fffq2zZstq7d2+KY5OS4nnz5slisahQo
ULZEDXgfEqWLKnJkyfLzc1NgwcP1saNG7M7JAD3iJYJIAdZvXq1OnTooMqVK2vDhg367LPP1KNHD+vNcndLTEyUi4sLv64FssG+ffv05ptv6oMPPlDhwoWzOxwA94CEGMghkpLeCRMmaPjw4apdu7bmzJmjEiVK2By3dy6A+ys+Pl7u7u7ZHQaAe0TLBJBDJCYmSpI8PT01YsQInT17VqNGjdKOHTskSSaTSXf+/JrUc5x0DMD9RzIMOAYqxEA2S6ru3r1k2rJly/Tss8+qbt26Gjp0qCpXrixJ2rBhg+rUqZNd4QIA4HBIiIFslJQMR0ZG6scff9SlS5dUrlw59enTR0FBQVq2bJn69eunevXqqWvXrtq+fbtGjhypqKgoBQYGUhkGACATkBAD2WzRokXq1q2bnnzySR0/flyXLl3S+fPntXr1ahUuXFiRkZEaPHiwLBaLYmJi9P3336t69erZHTYAAA6DhBi4j+6++S06OlrNmzfXE088oSFDhkiS9uzZo1deeUUHDx7U5s2bFRAQoGPHjikmJkaBgYHKnz9/doUPAIBD4qY64D5I+rnz+vXrkv69Ie7q1as6c+aMqlSpYh1btmxZvfvuu8qTJ4++/fZbSVJYWJgqVapEMgwAQBYgIQbuA5PJpHPnziksLEwLFiywPnkuJCREoaGhWrVqlXWsi4uLKlWqJFdXV+3fvz+7QgYAwGmQEAP3idlsVrt27fTUU0/pp59+su4LDw/XH3/8oR9++ME61mQyqWDBgsqdO7cMwxCdTQAAZB16iIEsktLDMs6dO6e3335bU6ZM0cKFC9WxY0dduHBB3bt315UrVxQeHq569epp9erVmjNnjjZt2qQyZcpk0xUAAOAcSIiBLGCxWGQ2m3Xt2jUlJibKz8/PeuzMmTMaN26cpk6dqu+++06PPfaYLly4oHfeeUfr1q1TdHS0QkJCNHnyZJveYgAAkDVIiIEscvDgQT3++OPy9fVVnz59FBISohYtWkiS4uLi9Morr+iTTz7R/Pnz1blzZyUkJMhkMunixYvy9vaWj49PNl8BAADOwfW/hwBIL4vFolmzZmnXrl3y9PTU5cuXdf36deXNm1e1atXS008/rV69eilfvnzq0qWL/Pz81LJlS0lSYGBgNkcPAIBzoUIMZJGoqChNmDBBhw8fVokSJdS/f3/NmzdPa9as0e7du5U3b14VK1ZM27Zt07lz57Ry5Uo1bNgwu8MGAMDpUCEGskhISIiGDBmicePGae3atSpZsqRGjBghSdq0aZNOnz6tzz77TEFBQTp37pwCAgKyOWIAAJwTFWIgiyXdRLdp0yZ16NBBr732mvXYrVu3ZLFYdOXKFQUFBWVjlAAAOC8SYuA+iIqK0ttvv60tW7aoQ4cOevXVVyVJCQkJcnXlFzUAAGQnEmLgPklKinfs2KGmTZtq9OjR2R0SAAAQT6oD7puQkBC9/vrrKlmypNavX68LFy5kd0gAAEBUiIH77uzZs5Kk4ODgbI4EAABIJMQAAABwcrRMAAAAwKmREAMAAMCpkRADAADAqZEQAwAAwKmREAMAAMCpkRADAADAqZEQAwAAwKmREAPIMUwmk81mNpuVO3duNWjQQF988YWye9n0WbNmyWQyadSoUTb7IyIiZDKZtHLlymyJK6MaN24sk8mkY8eOpWl8atefEWFhYTKZTPc8z395UD83AO4vEmIAOU7Pnj3Vs2dPde/eXeXKldO6devUp08fPfHEE9kdWpbJzGQTAJA+rtkdAADcbdasWTYfL1++XK1bt9a3336r7t2765FHHsmewFIxfvx4vfrqqypcuHB2hwIAyAAqxAByvObNm+upp56SJC1atCh7g0lB/vz5VaZMGXl7e2d3KACADCAhBvBAqFq1qiTp5MmT1n0mk0lhYWGKj4/XmDFjVKZMGXl4eKhDhw7WMdevX9f48eNVtWpV+fr6ytfXV7Vr19bs2bNTfa1169apWbNmypUrl3Lnzq2WL
Vtq06ZNqY6316d67do1TZgwQTVq1JCfn598fHxUpkwZ9e/fXwcOHJB0u5e3V69ekqTRo0fb9FHfXS3/+++/FRERodDQUHl4eCg4OFhdu3bV3r17U4wtMTFR77//vsqUKSNPT0+FhobqxRdfVExMTKrXk15nzpzRu+++q0aNGqlgwYJyd3dXSEiIHn30UW3ZssXuuYZh6KOPPlK5cuXk6empggULauDAgbp8+XKq47/55hs1adJEefLkkaenp8qWLatRo0bp+vXrmXZNAJwLLRMAHgixsbGSJA8PD5v9FotFHTp00OrVq9WoUSNVqlRJ+fLlkySdO3dOzZs31+7duxUSEqJGjRrJMAytX79eERER2rp1q6ZMmWIz3y+//KKOHTsqISFBtWrVUrFixbRr1y41bNhQERER6Yr5zJkzat68ufbu3as8efKocePG8vDw0JEjRzR9+nSVLFlSpUqVUqtWrZSQkKB169apcuXKqlKlinWOEiVKWP+8aNEide3aVXFxcapSpYpq166tkydPasGCBfr555/166+/qmHDhjYxPPnkk/r222/l7e2tFi1ayNXVVbNnz9a6devk5uaWrutJzU8//aRhw4apdOnSqlSpkvz8/HTw4EH9+OOP+uWXX/TLL7+oRYsWKZ77wgsv6LPPPlPjxo1VsWJFrVq1SlOmTNGqVau0Zs0a+fn5WcdaLBY9+eST+uabb+Tr66saNWooT5482rp1q0aPHq1ff/1VK1eulJeXV6ZcFwAnYgBADiHJSOmfJYvFYtSpU8eQZLz++uvJxpcoUcI4depUsvNat25tSDJefPFF4+bNm9b9UVFRRo0aNQxJxq+//mrdHxMTYwQGBhqSjBkzZti8/rBhw6yvN3LkSJvX6dmzpyHJWLFihc3+pk2bGpKMxx9/3IiNjbU5dvToUWPXrl3Wj2fOnJni3HeO9/HxMXx9fY3ly5fbHPv1118NNzc3IzQ01IiLi7Pu//bbbw1JRuHChY2jR49a9589e9aoUKGC9XruPGZPajHu3r3b2LNnT7LxS5cuNdzd3Y3ixYsbFovF5liRIkUMSYafn5+xdetW6/7Y2FijSZMm1s/bnd59911DktG4cWPjzJkz1v1xcXHGM888Y0gyhg0bZnNOap8bALgTCTGAHOPuhDghIcE4cOCAERERYUgyPDw8jEOHDiUb/9133yWba8eOHYYko2bNmkZiYmKy49u3bzckGe3atbPumzFjhiHJaNiwYbLx8fHxRqFChdKcEG/atMmQZAQFBRkxMTH/ee3/lRC/+OKLhiRjypQpKR4fOHCgIcn44YcfrPsaNmyYLLlP8uuvv2ZaQmxP9+7dDUnG7t27bfYnJcSvvfZasnP27t1rmEwmw9fX17hx44ZhGIZx69YtIyAgwPDx8TGioqKSnXP9+nUjJCTEyJMnj83nm4QYQFrQQwwgx0nqn3V1dVWpUqU0a9Ys5cqVS998842KFy+ebGzbtm2TzbFs2TJJUocOHWQ2J/+nLqmnePPmzdZ9a9askSR17do12Xg3Nzd16tQpzdfw+++/S5K6deumXLlypfm81CRdz6OPPpri8QYNGkiS9Xpu3bqljRs3SpK6dOmSbHyrVq2UJ0+ee44rSVxcnH766Se9/vrr6tu3ryIiIhQREaE///xTknTw4MEUz0vpvS5XrpwqV66sq1evaseOHZKk7du3Kzo6WnXr1lVwcHCyc7y8vFS9enVdunQp1dcCgNTQQwwgx+nZs6ckyWw2y8/PTxUrVtSjjz6aYgIXFBSUrK9YkvVhE6+//rpef/31VF/r5s2b1j+fPn1aklSkSJEUx4aFhaX1Eqw3/92dwGdU0vUULFjQ7rjo6GhJ0oULFxQfH6/AwMBUV78oUqSILl26dM+x/fnnn2rXrp3dB3wk9YCnFENKwsLCtHPnTuvnJGnu5cuX/+cDPaKjo1W6dOn/DhwA/h8JMYAc5+6VFezx9PRMcb/FYpEk1a9fP9OS0uyUd
D1JPyykJjw8/H6EY2UYhh5//HEdO3ZM/fr1U79+/VSsWDH5+vrKZDLptdde0/jx4+/5KYNJ11+iRAnVq1fP7tikmyoBIK1IiAE4pEKFCkm63TLxyiuvpOmc/PnzS5KOHz+e4vHU9qckNDRUknT48OE0n2NPoUKFdPjwYX3wwQdpSvjy5csnd3d3nT9/Xjdu3Ehx5YUTJ07cc1z79u3Tvn37VKNGDU2bNi3Z8SNHjtg9//jx46pYsWKK+yWpQIECkv79fJYpUyZdPzABQFrQQwzAITVv3lyS9OOPP6b5nKQ+3AULFiQ7lpCQoIULF6Z5rmbNmkmSvvnmG129evU/x7u7u1tfJyXpvR43NzdrtTil61m2bJkuXryYprnsSWq5SEpY7z62fPlyu+enFNu+ffu0c+dO+fr6Wpegq1mzpvz9/bVq1apMiRsA7kRCDMAhhYeHq3nz5lq3bp369++f4oModu3apaVLl1o/7ty5s/Lly6eVK1faPLjDMAyNHDkyXRXVWrVq6aGHHtK5c+fUt29fXbt2zeb4sWPHrDecSf9WQvfv35/ifK+88oq8vLw0ePBg/fDDD8mOx8XF6fvvv9epU6es+5577jlJShZ7dHS0hgwZkuZrsadEiRIym836448/bG5mu3nzpvr16/efyeuUKVOsN85Jtx+k8sILL8gwDPXq1cta2fbw8NDQoUMVGxurRx99NMXK8z///KO5c+dmynUBcDLZu8gFAPxLqaxDbG98kSJFUj1+9uxZo2rVqoYkI3fu3Ebjxo2NJ554wmjTpo0RGhqa4lq3ixYtMlxcXAxJRnh4uNGtWzejXLlyhpubm9GnT590rUN86tQpo3Tp0oYkI2/evEa7du2Mzp07G9WqVTPMZrMxceJE69gbN24YQUFBhiSjUaNGRq9evYxnnnnGWLdunU1s3t7e1rWX27Zta3Tt2tVo0KCB4ePjY0gyduzYYRND586dDUmGj4+P0a5dO+PRRx81cufObVSrVs2oXbt2piy7lvS+eHl5GW3atDE6depkBAcHGwEBAdYl82bOnGlzTtKya/379zfc3NyMli1bGo8//rgREhJiSDLKly9vXL582eacxMRE46mnnjIkGe7u7kZ4eLjRtWtX49FHHzXKly9vmEwmo3Llymn63ADAnagQA3BYQUFBWr9+vSZPnqxy5cppx44d+v7777V7924VK1ZM7733ngYPHmxzTvv27bVixQo99NBD2rNnjxYvXqz8+fNr1apVqlu3brpev2DBgtqyZYvGjBmjQoUKafny5fr11191/fp1Pf/883rkkUesYz09PbV48WI1b95cO3fu1KxZs/Tll19aH++cFNvu3bv1/PPPy2Qyafny5Vq8eLHOnTuntm3basGCBSpXrpxNDF9//bUmTJigggULaunSpdq4caOeeOIJ/fHHHymuzpER06ZN0wcffKCiRYsqMjJSa9asUbNmzbR169ZUV5FIMnnyZI0fP17Hjx/XTz/9JJPJpP79+2vNmjXy9/e3GWs2mzVnzhz99NNPat68uY4ePaqFCxdq7dq18vT01JAhQzRjxoxMuSYAzsVkGPd46y8AAADwAKNCDAAAAKdGQgwAAACnRkIMAAAAp0ZCDAAAAKdGQgwAAACnRkIMAAAAp0ZCDAAAAKdGQgwAAACnRkIMAAAAp0ZCDAAAAKdGQgwAAACnRkIMAAAAp/Z/8F95wrmqYi0AAAAASUVORK5CYII=", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light", - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -596,21 +576,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.7377377377377378\n", - "AUC: 0.7251117679464362\n" + "Accuracy: 0.7367367367367368\n", + "AUC: 0.6586769358985225\n" ] }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light", - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -622,6 +599,7 @@ "\n", "# Make predictions on test data\n", "y_pred_class = logreg.predict(X_test_dtm)\n", + "y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]\n", "\n", "# calculate evaluation measures:\n", "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", @@ -657,21 +635,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Accuracy: 0.6836836836836837\n", - "AUC: 0.7251117679464362\n" + "Accuracy: 0.6841841841841841\n", + "AUC: 0.6732650365850213\n" ] }, { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsQAAAJnCAYAAACQ3UXDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAACCwUlEQVR4nOzdeVwV1fsH8M/cyw4CKquIIOKGCqgkouIWirkvuZdAilmWGZlmJa6JSxq5pFbumrvpt1xSyX3fUVNUBMWFTUEWEYQ7vz/8MXkFboAXwbmfd6955T1z5swz94o8PJw5I4iiKIKIiIiISEcpyjsAIiIiIqLyxISYiIiIiHQaE2IiIiIi0mlMiImIiIhIpzEhJiIiIiKdxoSYiIiIiHQaE2IiIiIi0mlMiImIiIhIpzEhJiIiIiKdxoSYSEfcuHEDHTt2hIWFBQRBwLZt27Q6fmxsLARBwIoVK7Q6rhw4OzsjMDCwXGOYNGkSBEEoUd/k5OQyjurNIggCJk2aJL1esWIFBEFAbGzsa42jvM5LJGdMiIleo+joaHz44YdwcXGBkZERzM3N0bJlS/z444/Iysoq03MHBATg0qVL+O6777B69Wp4eXmV6fnk6J9//sGkSZNkk4hMnz5d6z8Ykfbw8yF6fQRRFMXyDoJIF+zYsQN9+/aFoaEhhgwZgoYNGyInJwdHjhzBli1bEBgYiJ9//rlMzp2VlQUTExN88803mDZtWpmcQxRFZGdnQ19fH0qlskzOUd42b96Mvn37Yv/+/Wjbtm2xj8vOzoZCoYC+vn7ZBfcfcnNzkZubCyMjI6nNzMwM7777boGq/qRJkzB58mQkJSXBysrqNUdacQmCgIkTJ0pV4ry8PDx79gyGhobFrr6XRFGfT1mfl0gX6ZV3AES6ICYmBgMGDICTkxP+/vtv2NvbS/tGjhyJmzdvYseOHWV2/qSkJACApaVlmZ1DEAS1ZEvXiaKIp0+fwtjYGIaGhuUdDvT09KCnJ59/8nNzc6FSqWBgYFBuMSiVynL54a+8zkskZ5wyQfQazJo1CxkZGVi6dKlaMpzP1dUVn332mfQ6NzcXU6dORa1atWBoaAhnZ2d8/fXXyM7OVjvO2dkZXbt2xZEjR9CsWTMYGRnBxcUFq1atkvpMmjQJTk5OAIAvv/wSgiDA2dkZABAYGCj9+UWFzTfdu3cvWrVqBUtLS5iZmaFu3br4+uuvpf1FzSH++++/4evrC1NTU1haWqJHjx64evVqoee7efMmAgMDYWlpCQsLCwQFBeHJkydFv7H/r23btmjYsCEiIyPRpk0bmJiYwNXVFZs3bwYAHDx4EN7e3jA
2NkbdunWxb98+teNv376Njz/+GHXr1oWxsTGqVq2Kvn37qk2NWLFiBfr27QsAaNeuHQRBgCAIOHDgAIB/P4u//voLXl5eMDY2xpIlS6R9+XOIRVFEu3btYG1tjcTERGn8nJwcNGrUCLVq1UJmZmah1ymKIqysrBASEiK1qVQqWFpaQqlUIjU1VWqfOXMm9PT0kJGRofYe5xMEAZmZmVi5cqV0LS/Pc05NTX2lz+Off/5Bu3btYGJiAgcHB8yaNatA38TERAwdOhS2trYwMjKCh4cHVq5cqdYn/+/W999/j/DwcOnrIn8KiyAIuH79Ot577z1YWFjA2toaEyZMgCiKiIuLQ48ePWBubg47OzvMmTNHbeycnByEhoaiadOmsLCwgKmpKXx9fbF///7/vM6X5/Lmx1LY9uJ7+/3336NFixaoWrUqjI2N0bRpU+nvaj5Nn09Rc4h/+uknNGjQAIaGhqhWrRpGjhyp9neipJ8NkS5hQkz0Gvzxxx9wcXFBixYtitV/2LBhCA0NRZMmTfDDDz+gTZs2CAsLw4ABAwr0vXnzJt5991106NABc+bMQeXKlREYGIgrV64AAHr37o0ffvgBADBw4ECsXr0a4eHhJYr/ypUr6Nq1K7KzszFlyhTMmTMH3bt3x9GjRzUet2/fPvj7+yMxMRGTJk1CSEgIjh07hpYtWxY6D7dfv35IT09HWFgY+vXrhxUrVmDy5MnFijElJQVdu3aFt7c3Zs2aBUNDQwwYMAAbNmzAgAED0LlzZ8yYMQOZmZl49913kZ6eLh17+vRpHDt2DAMGDMC8efMwYsQIREREoG3btlIC2Lp1a4waNQoA8PXXX2P16tVYvXo16tevL40TFRWFgQMHokOHDvjxxx/h6elZIE5BELBs2TI8ffoUI0aMkNonTpyIK1euYPny5TA1NS30GgVBQMuWLXHo0CGpLTIyEo8fPwYAtc/j8OHDaNy4MczMzAoda/Xq1TA0NISvr690LR9++KFan1f9PDp16gQPDw/MmTMH9erVw7hx47Br1y6pT1ZWFtq2bYvVq1dj8ODBmD17NiwsLBAYGIgff/yxwJjLly/H/PnzMXz4cMyZMwdVqlSR9vXv3x8qlQozZsyAt7c3pk2bhvDwcHTo0AEODg6YOXMmXF1dMWbMGLX3Ly0tDb/++ivatm2LmTNnYtKkSUhKSoK/vz8uXLhQrGvN17t3b+m9zN9Gjx4NALCxsZH6/fjjj2jcuDGmTJmC6dOnQ09PD3379lX7LVFxPp8XTZo0CSNHjkS1atUwZ84c9OnTB0uWLEHHjh3x7Nkztb7F+WyIdI5IRGXq8ePHIgCxR48exep/4cIFEYA4bNgwtfYxY8aIAMS///5banNychIBiIcOHZLaEhMTRUNDQ/GLL76Q2mJiYkQA4uzZs9XGDAgIEJ2cnArEMHHiRPHFfx5++OEHEYCYlJRUZNz551i+fLnU5unpKdrY2IgPHz6U2i5evCgqFApxyJAhBc73wQcfqI3Zq1cvsWrVqkWeM1+bNm1EAOJvv/0mtV27dk0EICoUCvHEiRNS+19//VUgzidPnhQY8/jx4yIAcdWqVVLbpk2bRADi/v37C/TP/yx2795d6L6AgAC1tiVLlogAxDVr1ognTpwQlUqlOHr06P+81tmzZ4tKpVJMS0sTRVEU582bJzo5OYnNmjUTx40bJ4qiKObl5YmWlpbi559/Lh338mcqiqJoampaIK4X+77q5/Hie5ednS3a2dmJffr0kdrCw8Ol9yBfTk6O6OPjI5qZmUnXmP93y9zcXExMTCw01uHDh0ttubm5YvXq1UVBEMQZM2ZI7SkpKaKxsbHaNefm5orZ2dlqY6akpIi2trYFrh+AOHHiROn18uXLRQBiTExMoe9DUlKSWKNGDbFRo0ZiRkaG1P7y37ecnByxYcOGYvv27dXai/p8Xj5vYmKiaGBgIHbs2FHMy8uT+i1YsEAEIC5btkxqK+5nQ6RrWCEmKmN
paWkAgEqVKhWr/86dOwFA7dfiAPDFF18AQIG5xm5ubvD19ZVeW1tbo27durh161apY35Z/tzj7du3Q6VSFeuYBw8e4MKFCwgMDFSr5Lm7u6NDhw7Sdb7oxYopAPj6+uLhw4fSe6iJmZmZWgW9bt26sLS0RP369eHt7S215//5xffH2NhY+vOzZ8/w8OFDuLq6wtLSEufOnSvG1T5Xs2ZN+Pv7F6vv8OHD4e/vj08//RTvv/8+atWqhenTp//ncb6+vsjLy8OxY8cAPK8E+/r6wtfXF4cPHwYAXL58GampqWp/L0rjVT+P9957T3ptYGCAZs2aqb3vO3fuhJ2dHQYOHCi16evrY9SoUcjIyMDBgwfVxuzTpw+sra0LPd+wYcOkPyuVSnh5eUEURQwdOlRqt7S0LPC1oVQqpXnIKpUKjx49Qm5uLry8vEr02b8sLy8PAwcORHp6On7//Xe1qv+Lf99SUlLw+PFj+Pr6lvp8+/btQ05ODkaPHg2F4t9v68HBwTA3Ny/wb0ZxPhsiXcOEmKiMmZubA4Dar+g1uX37NhQKBVxdXdXa7ezsYGlpidu3b6u116hRo8AYlStXRkpKSikjLqh///5o2bIlhg0bBltbWwwYMAAbN27UmBznx1m3bt0C++rXr4/k5OQCc2VfvpbKlSsDQLGupXr16gXmPVtYWMDR0bFA28tjZmVlITQ0FI6OjjA0NISVlRWsra2RmpoqTUcojpo1axa7LwAsXboUT548wY0bN7BixQq1RKkoTZo0gYmJiZT85ifErVu3xpkzZ/D06VNpX6tWrUoUz8u0/Xm8/Pfy9u3bqF27tloSB0CahvLy33VN7+/LsVpYWMDIyKjAKhkWFhYF4l+5ciXc3d1hZGSEqlWrwtraGjt27CjRZ/+yb7/9Fn///Td+++031KpVS23fn3/+iebNm8PIyAhVqlSBtbU1Fi1aVOrzFfW1ZmBgABcXlwLvY3E+GyJdw4SYqIyZm5ujWrVquHz5comOK+5ySkXdbS4WY0XFos6Rl5en9trY2BiHDh3Cvn378P777yMyMhL9+/dHhw4dCvR9Fa9yLUUdW5wxP/30U3z33Xfo168fNm7ciD179mDv3r2oWrVqsSviAIqV0L7owIED0o2Sly5dKtYx+vr68Pb2xqFDh3Dz5k3Ex8fD19cXrVq1wrNnz3Dy5EkcPnwY9erVK7KaWlxl8XkU59iiaHp/CztfcWJYs2YNAgMDUatWLSxduhS7d+/G3r170b59+xJ99i/atm0bZs6ciSlTpqBTp05q+w4fPozu3bvDyMgIP/30E3bu3Im9e/di0KBBr/TelERZfDZEbzomxESvQdeuXREdHY3jx4//Z18nJyeoVCrcuHFDrT0hIQGpqanSihHaULly5QJ3oQMFK3MAoFAo8Pbbb2Pu3Ln4559/8N133+Hvv/8u8m78/DijoqIK7Lt27RqsrKyKvHnsddu8eTMCAgIwZ84c6QbFVq1aFXhvtLnm64MHD/Dpp5+iY8eO6Nq1K8aMGVPo+14YX19fnDp1Cvv27YOVlRXq1auHKlWqoEGDBjh8+DAOHz6M1q1b/+c45b2GrZOTE27cuFEg8bx27Zq0v6xt3rwZLi4u2Lp1K95//334+/vDz88PT58+LdV4169fR0BAAHr27Km2Cku+LVu2wMjICH/99Rc++OADvPPOO/Dz8yt0rOJ+PkV9reXk5CAmJua1vI9EbzomxESvwdixY2Fqaophw4YhISGhwP7o6GjprvrOnTsDQIGVIObOnQsA6NKli9biqlWrFh4/fozIyEip7cGDB/j999/V+j169KjAsfkrKLy8FFw+e3t7eHp6YuXKlWqJ5eXLl7Fnzx7pOisCpVJZoDo2f/78AtXv/AS+sB8iSio4OBgqlQpLly7Fzz//DD09PQwdOrRYVTpfX19kZ2cjPDwcrVq1khKn/BUJ7t+/X6z5w6amplq5ltLq3Lkz4uPjsWHDBqktNzcX8+fPh5m
ZGdq0aVPmMeRXS19830+ePFmsH15flpGRgV69esHBwUFaLq2w8wmCoPZ3KzY2ttAn0hX38/Hz84OBgQHmzZundh1Lly7F48ePtfpvBpFcyWeVdqIKrFatWvjtt9/Qv39/1K9fX+1JdceOHcOmTZukNUY9PDwQEBCAn3/+GampqWjTpg1OnTqFlStXomfPnmjXrp3W4howYADGjRuHXr16YdSoUXjy5AkWLVqEOnXqqN3gM2XKFBw6dAhdunSBk5MTEhMT8dNPP6F69eoa56nOnj0b77zzDnx8fDB06FBkZWVh/vz5sLCwkJ72VRF07doVq1evhoWFBdzc3HD8+HHs27cPVatWVevn6ekJpVKJmTNn4vHjxzA0NET79u3VltQqjuXLl2PHjh1YsWIFqlevDuB5Av7ee+9h0aJF+PjjjzUe7+PjAz09PURFRWH48OFSe+vWrbFo0SIAKFZC3LRpU+zbtw9z585FtWrVULNmTbUbEMva8OHDsWTJEgQGBuLs2bNwdnbG5s2bcfToUYSHhxf7RtRX0bVrV2zduhW9evVCly5dEBMTg8WLF8PNzU1aw7m4Jk+ejH/++Qfffvsttm/frravVq1a8PHxQZcuXTB37lx06tQJgwYNQmJiIhYuXAhXV1e1H0yB4n8+1tbWGD9+PCZPnoxOnTqhe/fuiIqKwk8//YS33npL7QY6IiocE2Ki16R79+6IjIzE7NmzsX37dixatAiGhoZwd3fHnDlzEBwcLPX99ddf4eLighUrVuD333+HnZ0dxo8fj4kTJ2o1pqpVq+L3339HSEgIxo4di5o1ayIsLAw3btxQS4i7d++O2NhYLFu2DMnJybCyskKbNm0wefJk6Sa1wvj5+WH37t2YOHEiQkNDoa+vjzZt2mDmzJklvgGtLP34449QKpVYu3Ytnj59ipYtW0prKL/Izs4OixcvRlhYGIYOHYq8vDzs37+/RAnx3bt38fnnn6Nbt24ICAiQ2gcPHowtW7Zg7NixeOeddzS+P6ampmjcuDFOnz6t9gNJfhLs6OhYrF+Tz507F8OHD8e3336LrKwsBAQEvNaE2NjYGAcOHMBXX32FlStXIi0tDXXr1sXy5csLPCSkrAQGBiI+Ph5LlizBX3/9BTc3N6xZswabNm2SHrpSXPlPhCzs8egBAQHw8fFB+/btsXTpUsyYMQOjR49GzZo1MXPmTMTGxhZIiEvy+UyaNAnW1tZYsGABPv/8c1SpUgXDhw/H9OnTy/WR4URvCkHkLHoiIiIi0mGcQ0xEREREOo0JMRERERHpNCbERERERKTTmBATERERkU5jQkxEREREOo0JMRERERHpNK5DTIVSqVS4f/8+KlWqVO6PdyUiInqTiKKI9PR0VKtWDQpF+dcenz59ipycnDIZ28DAAEZGRmUy9uvEhJgKdf/+fTg6OpZ3GERERG+suLg46WmU5eXp06cwrlQVyH1SJuPb2dkhJibmjU+KmRBTofIfmWrgFgBBaVDO0RDRy+4c+L68QyCiIqSnpcG1puNrefz4f8nJyQFyn8DQLQDQ9vfzvBzE/7MSOTk5TIhJnvKnSQhKAybERBWQubl5eYdARP+hQk051DPS+vdzUSj/6SDawoSYiIiISO4EANpO0CtQvv+q5JPaExERERGVAivERERERHInKJ5v2h5TJuRzJUREREREpcAKMREREZHcCUIZzCGWzyRiVoiJiIiISKexQkxEREQkd5xDrJF8roSIiIiIqBRYISYiIiKSO84h1ogJMREREZHslcGUCRlNNJDPlRARERERlQIrxERERERyxykTGrFCTEREREQ6jQkxERERkdzlL7um7a0UFi5cCGdnZxgZGcHb2xunTp0qsm/btm0hCEKBrUuXLlIfURQRGhoKe3t7GBsbw8/PDzdu3ChRTEyIiYiIiOi12LBhA0JCQjBx4kScO3cOHh4e8Pf3R2JiYqH9t27digcPHkjb5cuXoVQq0bdvX6nPrFmzMG/
ePCxevBgnT56Eqakp/P398fTp02LHxYSYiIiISO7y5xBreyuhuXPnIjg4GEFBQXBzc8PixYthYmKCZcuWFdq/SpUqsLOzk7a9e/fCxMRESohFUUR4eDi+/fZb9OjRA+7u7li1ahXu37+Pbdu2FTsuJsREREREVGppaWlqW3Z2dqH9cnJycPbsWfj5+UltCoUCfn5+OH78eLHOtXTpUgwYMACmpqYAgJiYGMTHx6uNaWFhAW9v72KPCTAhJiIiIpK/MpxD7OjoCAsLC2kLCwsrNITk5GTk5eXB1tZWrd3W1hbx8fH/eQmnTp3C5cuXMWzYMKkt/7jSjpmPy64RERERUanFxcXB3Nxcem1oaFgm51m6dCkaNWqEZs2aaX1sVoiJiIiI5K4M5xCbm5urbUUlxFZWVlAqlUhISFBrT0hIgJ2dncbwMzMzsX79egwdOlStPf+40oz5IibERERERHJXAZZdMzAwQNOmTRERESG1qVQqREREwMfHR+OxmzZtQnZ2Nt577z219po1a8LOzk5tzLS0NJw8efI/x3wRp0wQERER0WsREhKCgIAAeHl5oVmzZggPD0dmZiaCgoIAAEOGDIGDg0OBechLly5Fz549UbVqVbV2QRAwevRoTJs2DbVr10bNmjUxYcIEVKtWDT179ix2XEyIiYiIiOROEEr9IA2NY5ZQ//79kZSUhNDQUMTHx8PT0xO7d++Wboq7c+cOFAr1OKOionDkyBHs2bOn0DHHjh2LzMxMDB8+HKmpqWjVqhV2794NIyOj4l+KKIpiia+GZC8tLQ0WFhYwbBQMQWlQ3uEQ0UtSTi8o7xCIqAhpaWmwrWqBx48fq91sVl6xWFhYwLDFeAh6xU8Qi0PMfYrsY2EV4jpfFSvERERERHKnEJ5v2h5TJnhTHRERERHpNFaIiYiIiOSuFKtCFGtMmZDPlRARERERlQIrxERERERy98KDNLQ6pkwwISYiIiKSO06Z0Eg+V0JEREREVAqsEBMRERHJHadMaMQKMRERERHpNFaIiYiIiOSOc4g1ks+VEBERERGVAivERERERHLHOcQasUJMRERERDqNFWIiIiIiueMcYo3kcyVERERERKXACjERERGR3HEOsUZMiImIiIhkrwymTMhoooF8roSIiIiIqBRYISYiIiKSO06Z0IgVYiIiIiLSaawQExEREcmdIJTBsmusEBMRERERyQIrxERERERyxwdzaCSfKyEiIiIiKgVWiImIiIjkjqtMaMSEmIiIiEjuOGVCI/lcCRERERFRKbBCTERERCR3nDKhESvERERERKTTWCEmIiIikjvOIdZIPldCRERERFQKrBATERERyR3nEGvECjERERER6TRWiImIiIhkThAECKwQF4kJMREREZHMMSHWjFMmiIiIiEinsUJMREREJHfC/2/aHlMmWCEmIiIiIp3GCjERERGRzHEOsWasEBMRERGRTmOFmIiIiEjmWCHWjBViIiIiItJpTIiJiIiIZC6/QqztrTQWLlwIZ2dnGBkZwdvbG6dOndLYPzU1FSNHjoS9vT0MDQ1Rp04d7Ny5U9o/adKkAnHVq1evRDFxygQRERERvRYbNmxASEgIFi9eDG9vb4SHh8Pf3x9RUVGwsbEp0D8nJwcdOnSAjY0NNm/eDAcHB9y+fRuWlpZq/Ro0aIB9+/ZJr/X0SpbiMiEmIiIikrmKMod47ty5CA4ORlBQEABg8eLF2LFjB5YtW4avvvqqQP9ly5bh0aNHOHbsGPT19QEAzs7OBfrp6enBzs6uxPHk45QJIiIiIrkTymgDkJaWprZlZ2cXGkJOTg7Onj0LPz8/qU2hUMDPzw/Hjx8v9Jj//e9/8PHxwciRI2Fra4uGDRti+vTpyMvLU+t348YNVKtWDS4uLhg8eDDu3LlToreHCTERERERlZqjoyMsLCykLSwsrNB+ycnJyMvLg62trVq7ra0t4uPjCz3m1q1b2Lx5M/Ly8rBz505MmDABc+bMwbR
p06Q+3t7eWLFiBXbv3o1FixYhJiYGvr6+SE9PL/Y1cMoEERERkcyV5ZSJuLg4mJubS82GhoZaO4VKpYKNjQ1+/vlnKJVKNG3aFPfu3cPs2bMxceJEAMA777wj9Xd3d4e3tzecnJywceNGDB06tFjnYUJMRERERKVmbm6ulhAXxcrKCkqlEgkJCWrtCQkJRc7/tbe3h76+PpRKpdRWv359xMfHIycnBwYGBgWOsbS0RJ06dXDz5s1iXwOnTBARERHJnCCUxdJrJYvBwMAATZs2RUREhNSmUqkQEREBHx+fQo9p2bIlbt68CZVKJbVdv34d9vb2hSbDAJCRkYHo6GjY29sXOzYmxERERET0WoSEhOCXX37BypUrcfXqVXz00UfIzMyUVp0YMmQIxo8fL/X/6KOP8OjRI3z22We4fv06duzYgenTp2PkyJFSnzFjxuDgwYOIjY3FsWPH0KtXLyiVSgwcOLDYcXHKBBEREZHMCSiDOcQo+Xj9+/dHUlISQkNDER8fD09PT+zevVu60e7OnTtQKP6t1zo6OuKvv/7C559/Dnd3dzg4OOCzzz7DuHHjpD53797FwIED8fDhQ1hbW6NVq1Y4ceIErK2ti38loiiKJb4akr20tDRYWFjAsFEwBGXhv5IgovKTcnpBeYdAREVIS0uDbVULPH78uFhza8s6FgsLC1j2+wWCgYlWxxZzniB1Y3CFuM5XxQoxERERkcxVlAdzVFRMiImIiIjk7oUHaWh1TJngTXVEREREpNNYISYiIiKSuzKYMiHKaMoEK8REREREpNNYISYiIiKSubK4qU77y7iVH1aIiYiIiEinsUJMREREJHOsEGvGCjERERER6TRWiImIiIjkjusQa8QKMRERERHpNFaIiYiIiGSOc4g1Y0JMREREJHNMiDXjlAkiIiIi0mmsEBMRERHJHCvEmrFCTEREREQ6jRViIiIiIpljhVgzVoiJiIiISKexQkxEREQkd3wwh0asEBMRERGRTmOFmIiIiEjmOIdYMybERERERDLHhFgzTpkgIiIiIp3GCjERERGRzLFCrBkrxERERESk01ghJiIiIpI7LrumESvERG+oD/u1xrUdk5Fy4gccWjUGXg2ciuz71y+fIev8ggLb1nkjAAB6egpMG9UDpzd+jeRjc3Brz3f4der7sLe2eF2XQyQri39aiLquzrA0M4JvC2+cPnWqyL7Lfv0Fb7f1hb11ZdhbV0Znf78C/TMyMjB61Ceo5VwdlSsZo7G7G35ZsrisL4NIZzAhJnoDvduxCWZ+0QvfLdkFn0EzEXn9Hv7300hYVzYrtP+AL36Bs994aWvSZxpyc/Owde95AICJkQE86ztixi+74DNwJgZ88QvqONliU/iHr/OyiGRh08YNGPdlCL75diKOnzoHd3cPdO/ij8TExEL7Hzp4AP36D8Tuvftx4PBxVK/uiG6dO+LevXtSn3FjQrB3z24sX7kGFy5dxSefjsbnn32CP//43+u6LHrD5c8h1vYmF0yIid5Ao95rj+Vbj2H1/07g2q14fPrdemQ9zUFAT59C+6ekPUHCw3Rpe7t5PTx5miMlxGkZT9H1owXYsvc8btxOxKlLsfh8xkY0dasBR7vKr/PSiN5488LnImhoMIYEBqG+mxvm/7QYxiYmWLliWaH9V6xeiw8/+hgenp6oW68eFv38K1QqFQ78HSH1OXHiGN57PwCt27SFk7MzhgYPh7u7B86cLrryTETFx4SY6A2jr6dE4/qO+PtklNQmiiL+PhmFZu41izVGQM8W2PTXOTx5mlNkH/NKxlCpVEhNz3rlmIl0RU5ODs6fO4v2b/tJbQqFAu3b++HUiePFGuPJkyd49uwZKlepIrU1b94Cf/7xP9y7dw+iKOLggf24ceM6/Dp01Po1kDyxQqwZE2KiN4xVZTPo6SmR+ChdrT3xYRrsqpr/5/FeDZzQsHY1rPj9WJF9DA30MG1UD2zcfRbpmU9fOWYiXZGcnIy8vDz
Y2NiqtdvY2iI+Pr5YY3w7fhzsq1VTS6rn/jgf9eu7wdW5OsxNDNC9SyeEz1uIVr6ttRo/yZeAMkiIZXRXHRNiLZo0aRI8PT3LOwwijQJ6+uDS9Xs4c+V2ofv19BRYM2soBEHAqOkbXnN0RLpt9qwZ2LRxPTZs+h1GRkZS+08L5+PUqRPY/Pv/cOzkWcyYNQejR43E3xH7yjFaIvmokAlxYGAgBEHAjBkz1Nq3bdtW4vK8s7MzwsPDi9Uv/yceExMTNGrUCL/++muJzlVRMVGXl+SUDOTm5sGmSiW1dpuq5oh/mKbxWBMjA/T1b4qV2wr/1a2engJrZw5FDfvK6PrRAlaHiUrIysoKSqUSiYkJau2JCQmws7PTeOwPc7/HnFkz8MfOPWjk7i61Z2VlYeK3X2Pm7Lno0rUbGrm746ORn+Ddvv0RPvf7MrkOkh9OmdCsQibEAGBkZISZM2ciJSXltZ1zypQpePDgAS5fvoz33nsPwcHB2LVr12s7P1FxPMvNw/mrcWjnXVdqEwQB7ZrVwanIGI3H9u7QGIYGeli383SBffnJcK0a1ugyYgEePc7UeuxEcmdgYIDGTZpi/ws3xKlUKuzfH4FmzQu/6RUA5nw/CzO+m4rtf+5GUy8vtX3Pnj3Ds2fPoFCof8tWKpVQqVTavQAiHVVhE2I/Pz/Y2dkhLCxMY78tW7agQYMGMDQ0hLOzM+bMmSPta9u2LW7fvo3PP/+8WD/JVKpUCXZ2dnBxccG4ceNQpUoV7N27V9qfmpqKYcOGwdraGubm5mjfvj0uXryoccxff/0V9evXh5GREerVq4effvpJ2teiRQuMGzdOrX9SUhL09fVx6NAhAMDq1avh5eUlxTZo0CC1pXsOHDgAQRAQEREBLy8vmJiYoEWLFoiKen7D1YoVKzB58mRcvHhReg9WrFihMWaq+Oat+RtBvVpgcDdv1K1pi3lf94eJsSFWbT8BAPh16vuY8mn3AscF9vTBHwciCyS7enoK/DZ7GJq41UDQNyuhVAiwrVoJtlUrQV9P+VquiUguRo0OwfKlv2DNqpW4dvUqRo38CE8yMzEkIAgAMDRwCCZ8M17q//3smZgycQIW/7IMTs7OiI+PR3x8PDIyMgAA5ubm8G3dBl9/9SUOHTyA2JgYrF65AmvXrEL3Hr3K5RrpDSSU0SYTFfZJdUqlEtOnT8egQYMwatQoVK9evUCfs2fPol+/fpg0aRL69++PY8eO4eOPP0bVqlURGBiIrVu3wsPDA8OHD0dwcHCxz61SqfD7778jJSUFBgYGUnvfvn1hbGyMXbt2wcLCAkuWLMHbb7+N69evo8oLdwPnW7t2LUJDQ7FgwQI0btwY58+fR3BwMExNTREQEIDBgwdj1qxZmDFjhpSsb9iwAdWqVYOvry+A55WBqVOnom7dukhMTERISAgCAwOxc+dOtXN98803mDNnDqytrTFixAh88MEHOHr0KPr374/Lly9j9+7d2Lfv+VwzC4uCD1vIzs5Gdna29DotTfOv3ql8bd5zDlaVzRD6URfYVq2EyKh76DFyoXSjnaNdFahUotoxtZ1s0LKJK7qMWFBgvGrWlujW9vmvaE9tGK+2r+OwH3H47I0yuhIi+enbrz+Sk5IwZXIoEuLj4e7hie1/7oat7fMb7eLi7qhVe39Zsgg5OTkY1P9dtXG+mTAR34ZOAgCsWrseod+MR+CQwUh59Ag1nJwwacp3CP5wxGu7LiI5E0RRFP+72+sVGBiI1NRUbNu2DT4+PnBzc8PSpUuxbds29OrVC/khDx48GElJSdizZ4907NixY7Fjxw5cuXIFwPO5waNHj8bo0aM1ntPZ2RkPHjyAvr4+srOzkZubiypVquDkyZNwdXXFkSNH0KVLFyQmJsLQ0FA6ztXVFWPHjsXw4cMxadIkbNu2DRcuXJD2TZ06FQMHDpT6T5s2DTt37sSxY8eQlJSEatWq4e+//5YS4BYtWqB169YF5k/nO3PmDN566y2
kp6fDzMwMBw4cQLt27bBv3z68/fbbAICdO3eiS5cuyMrKgpGRUYG4CjNp0iRMnjy5QLtho2AISoNCjiCi8pRyuuAPNkRUMaSlpcG2qgUeP34Mc/P/Xv2nrGOxsLCA08eboDA00erYquwnuP1T3wpxna+qwk6ZyDdz5kysXLkSV69eLbDv6tWraNmypVpby5YtcePGDeTl5ZX4XF9++SUuXLiAv//+G97e3vjhhx/g6uoKALh48SIyMjJQtWpVmJmZSVtMTAyio6MLjJWZmYno6GgMHTpUrf+0adOk/tbW1ujYsSPWrl0LAIiJicHx48cxePBgaZyzZ8+iW7duqFGjBipVqoQ2bdoAAO7cuaN2PvcXbsCwt7cHgCKfilSY8ePH4/Hjx9IWFxdX7GOJiIiI3mQVdspEvtatW8Pf3x/jx49HYGBgmZ7LysoKrq6ucHV1xaZNm9CoUSN4eXnBzc0NGRkZsLe3x4EDBwocZ2lpWaAtf+7XL7/8Am9vb7V9SuW/czIHDx6MUaNGYf78+fjtt9/QqFEjNGrUCMDzpNrf3x/+/v5Yu3YtrK2tcefOHfj7+yMnR/2BCvr6+tKf86dflORmC0NDQ7XKNxEREclHWawKIadVJip8QgwAM2bMgKenJ+rWravWXr9+fRw9elSt7ejRo6hTp46UdBoYGJSqWuzo6Ij+/ftj/Pjx2L59O5o0aYL4+Hjo6enB2dn5P4+3tbVFtWrVcOvWLbWK78t69OiB4cOHY/fu3fjtt98wZMgQad+1a9fw8OFDzJgxA46OjgCeT5koqdK+B0RERES64I1IiBs1aoTBgwdj3rx5au1ffPEF3nrrLUydOhX9+/fH8ePHsWDBArWVHJydnXHo0CEMGDAAhoaGsLKyKvZ5P/vsMzRs2BBnzpyBn58ffHx80LNnT8yaNQt16tTB/fv3sWPHDvTq1QteLy2TAwCTJ0/GqFGjYGFhgU6dOiE7OxtnzpxBSkoKQkJCAACmpqbo2bMnJkyYgKtXr6rNN65RowYMDAwwf/58jBgxApcvX8bUqVNL+vbB2dkZMTExuHDhAqpXr45KlSqxGkxERKRDBOH5pu0x5aLCzyHON2XKlAJTAJo0aYKNGzdi/fr1aNiwIUJDQzFlyhS1qRVTpkxBbGwsatWqBWtr6xKd083NDR07dkRoaCgEQcDOnTvRunVrBAUFoU6dOhgwYABu374t3Tn8smHDhuHXX3/F8uXL0ahRI7Rp0wYrVqxAzZo11foNHjwYFy9ehK+vL2rUqCG1W1tbY8WKFdi0aRPc3NwwY8YMfP99yRdh79OnDzp16oR27drB2toa69atK/EYRERE9OZ6nhBr+8Ec5X1V2lMhV5mg8pd/VypXmSCqmLjKBFHFVRFXmXD5dDMUhqZaHVuVnYlb898t8XUuXLgQs2fPRnx8PDw8PDB//nw0a9asyP6pqan45ptvsHXrVjx69AhOTk4IDw9H586dSz3my96YCjERERERlZLw77QJbW2leTDHhg0bEBISgokTJ+LcuXPw8PCAv79/kStj5eTkoEOHDoiNjcXmzZsRFRWFX375BQ4ODqUeszBMiImIiIjotZg7dy6Cg4MRFBQENzc3LF68GCYmJli2bFmh/ZctW4ZHjx5h27ZtaNmyJZydndGmTRt4eHiUeszCMCEmIiIikjntzx/+dxm3tLQ0te3FJ9++KCcnB2fPnoWfn5/UplAo4Ofnh+PHjxd6zP/+9z/4+Phg5MiRsLW1RcOGDTF9+nRp9azSjFkYJsREREREVGqOjo6wsLCQtrCwsEL7JScnIy8vr8BiBLa2toiPjy/0mFu3bmHz5s3Iy8vDzp07MWHCBMyZMwfTpk0r9ZiFeSOWXSMiIiKi0ivLZdfi4uLUbqrT5tKuKpUKNjY2+Pnnn6FUKtG0aVPcu3cPs2fPxsSJE7V2HibERERERFRq5ubmxVplwsrKCkqlEgkJCWrtCQkJsLOzK/QYe3t76Ovrqz3lt37
9+oiPj0dOTk6pxiwMp0wQERERyZxCIZTJVhIGBgZo2rQpIiIipDaVSoWIiAj4+PgUekzLli1x8+ZNtWdRXL9+Hfb29jAwMCjVmIW+PyW6EiIiIiJ642h7ybXSTsEICQnBL7/8gpUrV+Lq1av46KOPkJmZiaCgIADAkCFDMH78eKn/Rx99hEePHuGzzz7D9evXsWPHDkyfPh0jR44s9pjFwSkTRERERPRa9O/fH0lJSQgNDUV8fDw8PT2xe/du6aa4O3fuQKH4t17r6OiIv/76C59//jnc3d3h4OCAzz77DOPGjSv2mMXBJ9VRofikOqKKjU+qI6q4KuKT6uqN+R1KLT+pLi87E9e+71UhrvNVccoEEREREek0TpkgIiIikrmyXHZNDlghJiIiIiKdxgoxERERkcy9+KhlbY4pF6wQExEREZFOY4WYiIiISOZYIdaMFWIiIiIi0mmsEBMRERHJHFeZ0IwJMREREZHMCSiDKROQT0bMKRNEREREpNNYISYiIiKSOU6Z0IwVYiIiIiLSaawQExEREckcl13TjBViIiIiItJprBATERERyRznEGvGCjERERER6TRWiImIiIhkjnOINWNCTERERCRznDKhGadMEBEREZFOY4WYiIiISOY4ZUIzVoiJiIiISKexQkxEREQkd2UwhxjyKRCzQkxEREREuo0VYiIiIiKZ4xxizVghJiIiIiKdxgoxERERkcxxHWLNmBATERERyRynTGjGKRNEREREpNNYISYiIiKSOU6Z0IwVYiIiIiLSaawQExEREckc5xBrxgoxEREREek0VoiJiIiIZI4VYs1YISYiIiIincYKMREREZHMcZUJzVghJiIiIiKdxgoxERERkcxxDrFmTIiJiIiIZI5TJjTjlAkiIiIi0mmsEBMRERHJHKdMaMYKMRERERHpNFaIiYiIiGROQBnMIdbucOWKFWIiIiIiem0WLlwIZ2dnGBkZwdvbG6dOnSqy74oVK6TpHvmbkZGRWp/AwMACfTp16lSimFghJiIiIpI5hSBAoeUScWnG27BhA0JCQrB48WJ4e3sjPDwc/v7+iIqKgo2NTaHHmJubIyoqSnpd2NzlTp06Yfny5dJrQ0PDEsXFCjERERERvRZz585FcHAwgoKC4ObmhsWLF8PExATLli0r8hhBEGBnZydttra2BfoYGhqq9alcuXKJ4mJCTERERCRz+esQa3sDgLS0NLUtOzu70BhycnJw9uxZ+Pn5SW0KhQJ+fn44fvx4kbFnZGTAyckJjo6O6NGjB65cuVKgz4EDB2BjY4O6devio48+wsOHD0v0/jAhJiIiIpK5l+fYamsDAEdHR1hYWEhbWFhYoTEkJycjLy+vQIXX1tYW8fHxhR5Tt25dLFu2DNu3b8eaNWugUqnQokUL3L17V+rTqVMnrFq1ChEREZg5cyYOHjyId955B3l5ecV+fziHmIiIiIhKLS4uDubm5tLrks7f1cTHxwc+Pj7S6xYtWqB+/fpYsmQJpk6dCgAYMGCAtL9Ro0Zwd3dHrVq1cODAAbz99tvFOg8rxEREREQypxDKZgOe3/T24lZUQmxlZQWlUomEhAS19oSEBNjZ2RXrOvT19dG4cWPcvHmzyD4uLi6wsrLS2OdlTIiJiIiIqMwZGBigadOmiIiIkNpUKhUiIiLUqsCa5OXl4dKlS7C3ty+yz927d/Hw4UONfV7GKRNEREREcieUwaOWSzFcSEgIAgIC4OXlhWbNmiE8PByZmZkICgoCAAwZMgQODg7SPOQpU6agefPmcHV1RWpqKmbPno3bt29j2LBhAJ7fcDd58mT06dMHdnZ2iI6OxtixY+Hq6gp/f/9ix1WshPiDDz4o6fVKBEHA0qVLS308EREREclD//79kZSUhNDQUMTHx8PT0xO7d++WbrS7c+cOFIp/JzCkpKQgODgY8fHxqFy5Mpo2bYpjx47Bzc0NAKBUKhEZGYmVK1ciNTUV1apVQ8eOHTF16tQSzWU
WRFEU/6vTi4GVlCAIJbrLjyqGtLQ0WFhYwLBRMASlQXmHQ0QvSTm9oLxDIKIipKWlwbaqBR4/fqx2s1l5xWJhYYEOP0RA39hMq2M/y8rA3s/frhDX+aqKVSHev39/WcdBRERERFQuipUQt2nTpqzjICIiIqIyIvz/f9oeUy64ygQRERER6bRXWmUiNzcXO3bswKlTp5CcnAxvb2/pBrz79+8jOTkZbm5u0NPjYhZERERE5eXFdYO1OaZclDpTPXLkCN577z3ExcVBFEUIgoBnz55JCfHx48fRr18/bNq0Cb1799ZawERERERUMi8+almbY8pFqaZM/PPPP+jUqRMePHiATz/9FBs3bsTLi1V069YNJiYm2LJli1YCJSIiIiIqC6WqEE+dOhVPnz7Fzp070bFjx0L7GBgYoEmTJjh//vwrBUhEREREr0YQnm/aHlMuSlUh3r9/P5o1a1ZkMpzPwcEB9+/fL1VgRERERESvQ6kqxKmpqXB0dPzPfpmZmXj27FlpTkFEREREWqIQBCi0XNLV9njlqVQVYhsbG9y8efM/+129erVYiTMRERERUXkpVULcvn17XLhwQeMT7H7//XfcvHkTHTp0KHVwRERERPTq8ucQa3uTi1IlxF999RUMDAzQs2dPLFq0CPHx8dK+lJQULFu2DEOHDoWpqSlCQkK0FiwRERERkbaVKiGuV68e1q1bB5VKhU8++QQODg4QBAErV66ElZUVgoODkZ2djbVr16JmzZrajpmIiIiISiB/HWJtb3JR6kc39+zZE5cvX8ann36KevXqwcjICAYGBnBxccGHH36IyMhIdO/eXZuxEhEREVEpcMqEZq/0TGUnJyeEh4drKRQiIiIiotfvlRJiIiIiIqr4uOyaZq+UEGdnZ2PLli04fPiw9ACOatWqoVWrVujTpw+MjIy0EiQRERERUVkpdUK8b98+BAYG4sGDBxBFUW3fzz//jLFjx2LFihVcdo2IiIionAn/v2l7TLkoVUJ88uRJdO3aFTk5OfD29sbAgQPh7OwMALh9+zbWrVuHEydOoFu3bjh48CC8vb21GTMRERERkdaUKiGeMGECnj17hkWLFuHDDz8ssP/TTz/Fzz//jBEjRiA0NBR//fXXKwdKRERERKVTFsuk6fyyaydPnoSXl1ehyXC+4cOH46233sKJEydKHRwRERERUVkrVUKsUCjg6ur6n/1cXV1l9dMDERER0ZtIIZTNJhelmjLRrFkzREZG/me/yMhINGvWrDSnICIiIiIt4ZQJzUpVIZ46dSpu3LiBiRMnQqVSFdgviiImTpyIGzduYOrUqa8cJBERERFRWSlWhXjVqlUF2gICAjBt2jSsXr0affr0gZOTE4Dnq0xs3boVsbGxCA4ORlRUFFeZICIiIipnMiroap0gvryIcCEUCkWhZfEXD83f//JwgiAgLy/vVeOk1ywtLQ0WFhYwbBQMQWlQ3uEQ0UtSTi8o7xCIqAhpaWmwrWqBx48fw9zcvNxjsbCwQL+fj8DAxEyrY+c8ycDG4a0qxHW+qmJViENDQ2U1T4SIiIhIl3AOsWbFSognTZpUxmEQEREREZWPUj+6mYiIiIjeDGWxTJqcll0r1SoTRERERERy8UoV4iNHjmD79u24ceMG0tPTC9xQBzyfXxIREfEqpyEiIiKiV8A5xJqVKiEWRRFDhw7FypUrpSRYEIQCq06IoiirN4uIiIiI5KdUUyYWL16MFStWoGnTpti7dy969+4NAIiKisKuXbsQGBgIhUKBL7/8Erdu3dJqwERERERUMkIZbXJRqgrxihUrYGpqil27dqFq1apYs2YNAKB27dqoXbs2/P390blzZ/Tv3x8tWrSQHtpBRERERK+fQhCg0PJv7bU9XnkqVYX46tWraNGiBapWrQrg3zkkLz6A491330XTpk3x/fffayFMIiIiIqKyUaqEWKVSSckwAJiYmAAAUlJS1PrVrl0bly5deoXwiIiIiOhVCULZbHJ
RqoTYwcEB9+/fl17nT4k4f/68Wr/r169DT49LHRMRERFRxVWqhLhJkyb4559/pCkSHTt2hCiKGDt2LK5du4b09HTMnj0bZ8+eRePGjbUaMBERERGVTP6ya9re5KJUCXH37t2RnJyMHTt2AAA8PDwwYMAAXLx4EQ0aNIClpSW++uor6Onp4bvvvtNqwERERERE2lSq+QwDBw5E79691aZDrFy5Eu7u7ti2bRtSUlJQp04djB07Fs2aNdNasERERERUcmUx51dGBeLSP6nO0NBQ7bW+vj6++uorfPXVV68cFBERERHR68I73oiIiIhkjusQa1aqOcRERERE9OaoSMuuLVy4EM7OzjAyMoK3tzdOnTpVZN8VK1YUuJHPyMhIrY8oiggNDYW9vT2MjY3h5+eHGzdulCimYlWIXVxcSjToiwRBQHR0dKmPJyIiIiJ52LBhA0JCQrB48WJ4e3sjPDwc/v7+iIqKgo2NTaHHmJubIyoqSnr98uoWs2bNwrx587By5UrUrFkTEyZMgL+/P/75558CyXNRipUQx8bGFmswIiIiIqp4ymKZtNKMN3fuXAQHByMoKAgAsHjxYuzYsQPLli0r8j40QRBgZ2dX6D5RFBEeHo5vv/0WPXr0AACsWrUKtra22LZtGwYMGFCsuIqVEKtUqmINRvLz6cQRMDQ1K+8wiOglwzdcLO8QiKgIOU8yyjuE1yotLU3ttaGhYYHFFwAgJycHZ8+exfjx46U2hUIBPz8/HD9+vMjxMzIy4OTkBJVKhSZNmmD69Olo0KABACAmJgbx8fHw8/OT+ltYWMDb2xvHjx8vdkLMOcREREREMqcoow0AHB0dYWFhIW1hYWGFxpCcnIy8vDzY2tqqtdva2iI+Pr7QY+rWrYtly5Zh+/btWLNmDVQqFVq0aIG7d+8CgHRcScYsDFeZICIiIqJSi4uLg7m5ufS6sOpwafn4+MDHx0d63aJFC9SvXx9LlizB1KlTtXYeJsREREREMleWc4jNzc3VEuKiWFlZQalUIiEhQa09ISGhyDnCL9PX10fjxo1x8+ZNAJCOS0hIgL29vdqYnp6exRoT4JQJIiIiInoNDAwM0LRpU0REREhtKpUKERERalVgTfLy8nDp0iUp+a1Zsybs7OzUxkxLS8PJkyeLPSbACjERERGR7AkCoKgAj24OCQlBQEAAvLy80KxZM4SHhyMzM1NadWLIkCFwcHCQ5iFPmTIFzZs3h6urK1JTUzF79mzcvn0bw4YN+/8YBIwePRrTpk1D7dq1pWXXqlWrhp49exY7LibERERERPRa9O/fH0lJSQgNDUV8fDw8PT2xe/du6aa4O3fuQKH4dwJDSkoKgoODER8fj8qVK6Np06Y4duwY3NzcpD5jx45FZmYmhg8fjtTUVLRq1Qq7d+8u9hrEACCIoihq7zJJLtLS0mBhYYExm89y2TWiCujOw6zyDoGIipDzJAMbhrfC48ePizW3tizlfz//eN1pGJpo9/t59pMM/DTwrQpxna+KFWIiIiIimasoD+aoqLSSEN+4cQPJycmoWrUq6tSpo40hiYiIiIhei1KvMpGdnY2vv/4aVlZWqFevHlq1aoUZM2ZI+9esWYMmTZrgwoUL2oiTiIiIiEpJIZTNJhelSoizsrLQtm1bzJw5EwYGBujcuTNenorcvn17XLx4ERs3btRKoEREREREZaFUCfGsWbNw8uRJfPDBB7h16xb++OOPAn2qVasGNzc37Nu375WDJCIiIqLSE4Sy2eSiVAnxhg0bUKNGDSxatEjjkhZ169ZFXFxcqYMjIiIiIiprpbqpLiYmBl26dIGenubDDQwMkJKSUqrAiIiIiEg7FIIAhZZLutoerzyVqkJsbGxcrEQ3JiYGlStXLs0piIiIiIhei1IlxJ6enjhz5gySkpKK7BMTE4Pz58/jrbfeKnVwRERERPTqFGW0yUWpriU4OBjp6ekYOHAgkpOTC+xPTU3FBx98gGfPnmH48OGvHCQ
RERERlR5vqtOsVHOIBw4ciD/++APr16+Hi4sLWrRoAQA4evQoevTogYMHDyItLQ1DhgxB165dtRowEREREZE2lbravXbtWsycORNGRkbYs2cPgOdPrPvjjz8gCAK+++47LF++XGuBEhEREVHpKCBIN9ZpbYN8SsSlfnSzIAj48ssvERISgnPnziE2NhYqlQrVq1fHW2+9BQMDA23GSURERERUJkqdEOdTKpV46623ePMcERERUQVVFnN+5TSHWE43CBIRERERlVipKsQffPBBsfsKgoClS5eW5jREREREpAUK4fmm7THlolQJ8YoVK/6zjyAIEEWRCTERERERVWilSoj3799faLtKpUJcXBz27NmD9evX4/PPP0e3bt1eKUAiIiIiejWCoP1HLctpDnGpEuI2bdpo3D9kyBB06dIFAQEB6N69e6kCIyIiIiLt4E11mpXZTXUDBw5EgwYNMGnSpLI6BRERERHRKyvTVSZq166NM2fOlOUpiIiIiOg/5N9Up+1NLsosIVapVIiMjIRCwZXdiIiIiKji0nq2+uTJE1y4cAEDBw7EjRs3/nO+MRERERGVLaGM/pOLUt1Up1Qq/7OPKIqwtrbG7NmzS3MKIiIiIqLXolQJsaOjI4Qibi00MDCAvb092rRpg5EjR8LGxuaVAiQiIiKiV8MHc2hWqoQ4NjZWy2EQEREREZWPUiXE//vf/6Cvr4933nlH2/EQERERkZaxQqxZqW6q69WrF+bNm6ftWIiIiIiIXrtSVYitra1RuXJlbcdCRERERGVAEIQi7/96lTHlolQJcdu2bXHq1CmIoiirN4OIiIhIjjhlQrNSTZmYOnUqkpOT8fnnn+Pp06fajomIiIiI6LUpVYV43bp16Ny5M+bPn4/169fDz88PNWrUgJGRUYG+giBgwoQJrxwoEREREZWOIDzftD2mXBQrIXZxcUHfvn0xc+ZMAMCkSZMgCAJEUURiYiJ+++23Io9lQkxEREREFVmxEuLY2FgkJSVJr5cvX15mARERERGRdikEAQotl3S1PV55KtWUiYCAAG3HQURERERULkqVEBMRERHRm4OrTGhWqlUmiIiIiIjkotgV4gsXLmDKlCmlOkloaGipjiMiIiIiLSiDVSYgowpxsRPiixcv4uLFiyUaPP/BHUyIiYiIiMqPAgIUWs5gtT1eeSp2QlyrVi20bNmyLGMhIiIiInrtip0Qt2rVCsuWLSvLWIiIiIioDPDBHJrxpjoiIiIi0mlMiImIiIhkLn/ZNW1vpbFw4UI4OzvDyMgI3t7eOHXqVLGOW79+PQRBQM+ePdXaAwMDIQiC2tapU6cSxcSEmIiIiIheiw0bNiAkJAQTJ07EuXPn4OHhAX9/fyQmJmo8LjY2FmPGjIGvr2+h+zt16oQHDx5I27p160oUFxNiIiIiIpnLf3SztreSmjt3LoKDgxEUFAQ3NzcsXrwYJiYmGu9Ty8vLw+DBgzF58mS4uLgU2sfQ0BB2dnbSVrly5RLFVayEWKVS8YY6IiIiIiogLS1NbcvOzi60X05ODs6ePQs/Pz+pTaFQwM/PD8ePHy9y/ClTpsDGxgZDhw4tss+BAwdgY2ODunXr4qOPPsLDhw9LdA2sEBMRERHJXP4qE9reAMDR0REWFhbSFhYWVmgMycnJyMvLg62trVq7ra0t4uPjCz3myJEjWLp0KX755Zcir61Tp05YtWoVIiIiMHPmTBw8eBDvvPMO8vLyiv3+FHvZNSIiIiKil8XFxcHc3Fx6bWhoqJVx09PT8f777+OXX36BlZVVkf0GDBgg/blRo0Zwd3dHrVq1cODAAbz99tvFOhcTYiIiIiKZU6B0c37/a0wAMDc3V0uIi2JlZQWlUomEhAS19oSEBNjZ2RXoHx0djdjYWHTr1k1qU6lUAAA9PT1ERUWhVq1aBY5zcXGBlZUVbt68WeyEmFMmiIiIiGSuLKdMFJeBgQGaNm2KiIgIqU2lUiEiIgI+Pj4F+terVw+
XLl3ChQsXpK179+5o164dLly4AEdHx0LPc/fuXTx8+BD29vbFjo0VYiIiIiJ6LUJCQhAQEAAvLy80a9YM4eHhyMzMRFBQEABgyJAhcHBwQFhYGIyMjNCwYUO14y0tLQFAas/IyMDkyZPRp08f2NnZITo6GmPHjoWrqyv8/f2LHRcTYiIiIiKZU0D70wJKM17//v2RlJSE0NBQxMfHw9PTE7t375ZutLtz5w4UiuKPrFQqERkZiZUrVyI1NRXVqlVDx44dMXXq1BLNZWZCTERERESvzSeffIJPPvmk0H0HDhzQeOyKFSvUXhsbG+Ovv/565ZiYEBMRERHJXP4jjbU9plzwpjoiIiIi0mmsEBMRERHJnPD/m7bHlAtWiImIiIhIp7FCTERERCRzCqEMHswhoznETIiJiIiIdIB80lft45QJIiIiItJprBATERERyVxpHrVcnDHlghViIiIiItJprBATERERyRwfzKEZK8REREREpNNYISYiIiKSOQW0XwWVU1VVTtdCRERERFRirBATERERyRznEGvGhJiIiIhI5gRo/8Ec8kmHOWWCiIiIiHQcK8REREREMscpE5qxQkxEREREOo0VYiIiIiKZ47JrmsnpWoiIiIiISowVYiIiIiKZ4xxizVghJiIiIiKdxgoxERERkcxxHWLNWCEmIiIiIp3GCjERERGRzAnC803bY8oFE2IiIiIimVNAgELLkxy0PV554pQJIiIiItJprBATERERyRynTGjGCjERERER6TRWiImIiIhkTvj//7Q9plywQkxEREREOo0VYiIiIiKZ4xxizVghJiIiIiKdxgoxERERkcwJZbAOsZzmEDMhJiIiIpI5TpnQjFMmiIiIiEinsUJMREREJHOsEGvGCjERERER6TRWiImIiIhkjg/m0IwVYiIiIiLSaawQExEREcmcQni+aXtMuWCFmIiIiIh0GhNiIiIiIpkTyui/0li4cCGcnZ1hZGQEb29vnDp1qljHrV+/HoIgoGfPnmrtoigiNDQU9vb2MDY2hp+fH27cuFGimJgQExEREdFrsWHDBoSEhGDixIk4d+4cPDw84O/vj8TERI3HxcbGYsyYMfD19S2wb9asWZg3bx4WL16MkydPwtTUFP7+/nj69Gmx42JCTERERCRz+esQa3srqblz5yI4OBhBQUFwc3PD4sWLYWJigmXLlhV5TF5eHgYPHozJkyfDxcVFbZ8oiggPD8e3336LHj16wN3dHatWrcL9+/exbdu2YsfFhJiIiIhI5gSUxbSJ59LS0tS27OzsQmPIycnB2bNn4efnJ7UpFAr4+fnh+PHjRcY+ZcoU2NjYYOjQoQX2xcTEID4+Xm1MCwsLeHt7axzzZUyIiYiIiKjUHB0dYWFhIW1hYWGF9ktOTkZeXh5sbW3V2m1tbREfH1/oMUeOHMHSpUvxyy+/FLo//7iSjFkYLrtG9IbyrmGBVjWrwMxAifj0bPx5NQn3Hv/3fKlGdpXQ39Me/yRk4Lfz96V2A6WAjnWsUd/WFCb6SqRkPcPx26k4Hfe4LC+DSJberlMVnevbwMJYD3EpWVh95h5uPcwqtG8rl8oY7lNDrS0nT4Vh6y8BAJQC0MfDHh4OlWBjZoAnOSpciU/HxgsPkJqVW+bXQvJQlsuuxcXFwdzcXGo3NDTUyvjp6el4//338csvv8DKykorYxaFCTHRG6ihnRneqWeN/11JRFzqU7RwtkSglwPCD8ciMyevyOMsjfXQqZ4VYh89KbDvnXrWcKligs2R8UjJegbXqqbo5maD9Ke5uJaUWZaXQyQr3k6WGNSkGlacuovo5Cfwr2eNL9u5YOwfUUjPLjyBfZKTh3F/XJNeiy/sM9BTwLmKMbZfSsCdlKcwNVDiPa9q+LxNTUzcXbI76YnKgrm5uVpCXBQrKysolUokJCSotSckJMDOzq5A/+joaMTGxqJbt25Sm0qlAgDo6ekhKipKOi4hIQH29vZqY3p6ehb7Gjhlgug
N1NK5Ms7EpeHcvTQkZebgf1cS8SxPRFOHov9BEgD0dbfH3zce4lHWswL7a1ga4/z9NMQ8ykJqVi7O3H2M+PRsVLc0KsMrIZKfTvWscODmIxy+lYL7adlYceousvNEtKlVpchjRACPn+ZKW9rTfxPnrGcqzPr7Fk7def41Gf3wCVadvoeaVU1Q1UT/NVwRyUFFWHbNwMAATZs2RUREhNSmUqkQEREBHx+fAv3r1auHS5cu4cKFC9LWvXt3tGvXDhcuXICjoyNq1qwJOzs7tTHT0tJw8uTJQscsCivERG8YpQBUMzfCoVuPpDYRQPTDTDhaGgNIKfS4dq5VkZmTh7P30uBUxbjA/jupWahnY4azd9OQnp2LmlWMYWVqgF3XksroSojkR6kQ4FzFBH9c+XcJKRHAP/HpcLUyKfI4Iz0F5vasDwHA7UdZ2HTxAe49LvzGJAAwMVBCJYoafyNEVBGFhIQgICAAXl5eaNasGcLDw5GZmYmgoCAAwJAhQ+Dg4ICwsDAYGRmhYcOGasdbWloCgFr76NGjMW3aNNSuXRs1a9bEhAkTUK1atQLrFWvChFhLDhw4gHbt2iElJUX6sIjKgomBEkqFgIyXvhFmZOfBytSg0GOcLI3QtLo5Fh69XeS4f/6ThJ4NbTCunQvyVCJEiNh2OQGxKYXPeySigioZPv/6fLHCCzyv/tqbFz6vMj4tG7+eiENcahaM9ZXoXN8aEzrWxvg/o5BSyG9z9BUC+nna40RsKp7mqsrkOkh+SrtM2n+NWVL9+/dHUlISQkNDER8fD09PT+zevVu6Ke7OnTtQKEo2gWHs2LHIzMzE8OHDkZqailatWmH37t0wMir+bziZEP+/wMBArFy5EsDzeSnVq1dH3759MWXKlBK9oRUNE3UyUAp4190e2y4n4Mmzor95NneyRHVLY6w+ew+pWc/gXMUE3dxskZ6dh+iHBeccE5F23Ex+gpvJ/36N3UzKxIxu9dC+dlVsiVS/S14pACN9nSAIwIpTd193qERa8cknn+CTTz4pdN+BAwc0HrtixYoCbYIgYMqUKZgyZUqpY2JC/IJOnTph+fLlePbsGc6ePYuAgAAIgoCZM2eWd2hEkic5echTiTAzUKq1mxkqkZFd8NenVUwMUNlEH+81cZDa8n+qn9yxNn48HIu07Fx0qGOF387fx/X/v4EuISMH9pUM0dK5MhNiomJKz37+9WlupP7t1cJID4+LuSJEnvh82oRNJfXf+DxPhp1hZWqAGfuiWR2mEhH+f9P2mHLBm+peYGhoCDs7Ozg6OqJnz57w8/PD3r17ATyf9B0WFoaaNWvC2NgYHh4e2Lx5s8bxjhw5Al9fXxgbG8PR0RGjRo1CZubzZOPrr7+Gt7d3gWM8PDykn3BOnz6NDh06wMrKChYWFmjTpg3OnTun1l8QBPz666/o1asXTExMULt2bfzvf/8D8Pwxh+3atQMAVK5cGYIgIDAw8JXeIyp/eSJwP+0pXKr+Ox9RAOBS1QRxqQWnNyRn5mDekVgsPHZb2q4lZiLmURYWHruNx0+fQSkI0FMIEEVR7VgRotaX6SGSszyViNhHT9DArpLUJgBwszNTqwJrIghAdUsjtQQ6Pxm2q2SAmRHRBaZMEf0XBQQoBC1vMkqJmRAX4fLlyzh27BgMDJ7/hB4WFoZVq1Zh8eLFuHLlCj7//HO89957OHjwYKHHR0dHo1OnTujTpw8iIyOxYcMGHDlyRPoVweDBg3Hq1ClER0dLx1y5cgWRkZEYNGgQgOfr7wUEBODIkSM4ceIEateujc6dOyM9PV3tXJMnT0a/fv0QGRmJzp07Y/DgwXj06BEcHR2xZcsWAEBUVBQePHiAH3/8sdB4s7OzCzxphiquo7Ep8KpugcbVzGFtaoDuDWxgoFTg7L3nn1ufRnboUOf5mo25KhGJGTlq29PcPGTnqpCYkYM8EcjOUyHm0RN0qmuNmlWMUdlYD40dzOFZzRz/JGSU56USvXF2X0t
GG9cqaFWzMqqZGyKgWXUYKhXSjbDDfRzR1/PfJaZ6NLRFQzszWJsZwKmyMUa0qAErUwMcuPkQwPNk+FNfZ9SsYoxFR+9AIQiwMNKDhZEelPyJlUgrOGXiBX/++SfMzMyQm5uL7OxsKBQKLFiwANnZ2Zg+fTr27dsnLeHh4uKCI0eOYMmSJWjTpk2BscLCwjB48GCMHj0aAFC7dm3MmzcPbdq0waJFi9CgQQN4eHjgt99+w4QJEwAAa9euhbe3N1xdXQEA7du3Vxvz559/hqWlJQ4ePIiuXbtK7YGBgRg4cCAAYPr06Zg3bx5OnTqFTp06oUqV58v82NjYaJxDHBYWhsmTJ5fujaPX7nJ8BkwNkvF27aowM1TiQVo2Vp65J91xbmmsBxHif4yibsOFB+hYxwp93e1hrK9AalYu9t5Ixik+mIOoRE7eTkUlQyV6e9jBwkgPd1KyMHt/jHSjXVVTA7z4yxhTAyU+aO4ICyM9ZObkIfZRFqbuuYn7ac9Xmahsoo8mjhYAgO+61FU71/S9N3EtkeuE03/jlAnNmBC/oF27dli0aBEyMzPxww8/QE9PD3369MGVK1fw5MkTdOjQQa1/Tk4OGjduXOhYFy9eRGRkJNauXSu1iaIIlUqFmJgY1K9fH4MHD8ayZcswYcIEiKKIdevWISQkROqfkJCAb7/9FgcOHEBiYiLy8vLw5MkT3LlzR+1c7u7u0p9NTU1hbm6OxMRElMT48ePVzp2WlgZHR8cSjUGv18k7qTh5J7XQfUv/42abrZcSCrRl5ORh6+WC7URUcvuuP8S+6w8L3Re2L1rt9W/n7uO3c/cL7QsAyZnPMGTtRa3GR0TqmBC/wNTUVKrOLlu2DB4eHli6dKm01t2OHTvg4OCgdkxRjyfMyMjAhx9+iFGjRhXYV6PG80d0Dhw4EOPGjcO5c+eQlZWFuLg49O/fX+oXEBCAhw8f4scff4STkxMMDQ3h4+ODnJwctfH09dUXZhcEQXqSS3EZGhpq7VGLREREVMGwRKwRE+IiKBQKfP311wgJCcH169dhaGiIO3fuFDo9ojBNmjTBP//8IyXYhalevTratGmDtWvXIisrCx06dICNjY20/+jRo/jpp5/QuXNnAM+fFZ6cnFyi68ifA52XxxswiIiIiArDm+o06Nu3L5RKJZYsWYIxY8bg888/x8qVKxEdHY1z585h/vz50trFLxs3bhyOHTuGTz75BBcuXMCNGzewffv2AuvuDR48GOvXr8emTZswePBgtX21a9fG6tWrcfXqVZw8eRKDBw+GsXHBJ4xp4uTkBEEQ8OeffyIpKQkZGbxBioiISNdUhEc3V2RMiDXQ09PDJ598glmzZmH8+PGYMGECwsLCUL9+fXTq1Ak7duxAzZo1Cz3W3d0dBw8exPXr1+Hr64vGjRsjNDQU1apVU+v37rvv4uHDh3jy5EmBRwwuXboUKSkpaNKkCd5//32MGjVKrYJcHA4ODpg8eTK++uor2NraFrkQNhEREZGuEsSXFx4lwvOb6iwsLDBm81kYmpqVdzhE9JI7D/lIbaKKKudJBjYMb4XHjx/D3Ny8XGPJ/34eceEOzCppN5aM9DS87VmjQlznq+IcYiIiIiKZ4z11mnHKBBERERHpNFaIiYiIiOSOJWKNWCEmIiIiIp3GCjERERGRzJXFMmlcdo2IiIiISCZYISYiIiKSOUF4vml7TLlghZiIiIiIdBorxEREREQyx0UmNGOFmIiIiIh0GivERERERHLHErFGTIiJiIiIZI7LrmnGKRNEREREpNNYISYiIiKSOS67phkrxERERESk01ghJiIiIpI53lOnGSvERERERKTTWCEmIiIikjuWiDVihZiIiIiIdBorxEREREQyx3WINWNCTERERCRzXHZNM06ZICIiIiKdxgoxERERkczxnjrNWCEmIiIiIp3GCjERERGR3LFErBErxERERESk01ghJiIiIpI5LrumGSvERERERKTTWCEmIiI
ikjmuQ6wZK8REREREpNNYISYiIiKSOS4yoRkTYiIiIiK5Y0asEadMEBEREZFOY0JMREREJHNCGf1XGgsXLoSzszOMjIzg7e2NU6dOFdl369at8PLygqWlJUxNTeHp6YnVq1er9QkMDIQgCGpbp06dShQTp0wQERER0WuxYcMGhISEYPHixfD29kZ4eDj8/f0RFRUFGxubAv2rVKmCb775BvXq1YOBgQH+/PNPBAUFwcbGBv7+/lK/Tp06Yfny5dJrQ0PDEsXFhJiIiIhI5spy2bW0tDS1dkNDwyIT0rlz5yI4OBhBQUEAgMWLF2PHjh1YtmwZvvrqqwL927Ztq/b6s88+w8qVK3HkyBG1hNjQ0BB2dnalvhZOmSAiIiKiUnN0dISFhYW0hYWFFdovJycHZ8+ehZ+fn9SmUCjg5+eH48eP/+d5RFFEREQEoqKi0Lp1a7V9Bw4cgI2NDerWrYuPPvoIDx8+LNE1sEJMREREJHNluchEXFwczM3NpfaiqsPJycnIy8uDra2tWrutrS2uXbtW5HkeP34MBwcHZGdnQ6lU4qeffkKHDh2k/Z06dULv3r1Rs2ZNREdH4+uvv8Y777yD48ePQ6lUFutamBATERERUamZm5urJcTaVqlSJVy4cAEZGRmIiIhASEgIXFxcpOkUAwYMkPo2atQI7u7uqFWrFg4cOIC33367WOdgQkxEREQkdxVgHWIrKysolUokJCSotSckJGic/6tQKODq6goA8PT0xNWrVxEWFlZgfnE+FxcXWFlZ4ebNm8VOiDmHmIiIiEjmKsKyawYGBmjatCkiIiKkNpVKhYiICPj4+BR7HJVKhezs7CL33717Fw8fPoS9vX2xx2SFmIiIiIhei5CQEAQEBMDLywvNmjVDeHg4MjMzpVUnhgwZAgcHB+nGvLCwMHh5eaFWrVrIzs7Gzp07sXr1aixatAgAkJGRgcmTJ6NPnz6ws7NDdHQ0xo4dC1dXV7VVKP4LE2IiIiIiuSuDZddKMwWjf//+SEpKQmhoKOLj4+Hp6Yndu3dLN9rduXMHCsW/ExgyMzPx8ccf4+7duzA2Nka9evWwZs0a9O/fHwCgVCoRGRmJlStXIjU1FdWqVUPHjh0xderUEq1FLIiiKJb8ckju0tLSYGFhgTGbz8LQ1Ky8wyGil9x5mFXeIRBREXKeZGDD8FZ4/Phxmd5sVhz538/P3YxHpUrajSU9PQ1NXO0qxHW+KlaIiYiIiGSuAtxTV6HxpjoiIiIi0mmsEBMRERHJHUvEGrFCTEREREQ6jRViIiIiIpkrzbrBxRlTLpgQExEREcmcUAbLrml9GbdyxCkTRERERKTTWCEmIiIikjneU6cZK8REREREpNNYISYiIiKSO5aINWKFmIiIiIh0GivERERERDLHZdc0Y4WYiIiIiHQaK8REREREMiegDNYh1u5w5YoVYiIiIiLSaawQExEREckcF5nQjAkxERERkczx0c2accoEEREREek0VoiJiIiIZI+TJjRhhZiIiIiIdBorxEREREQyxznEmrFCTEREREQ6jRViIiIiIpnjDGLNWCEmIiIiIp3GCjERERGRzHEOsWZMiImIiIhkTvj//7Q9plxwygQRERER6TRWiImIiIjkjnfVacQKMRERERHpNFaIiYiIiGSOBWLNWCEmIiIiIp3GCjERERGRzHHZNc1YISYiIiIincYKMREREZHMcR1izVghJiIiIiKdxgoxERERkdxxmQmNmBATERERyRzzYc04ZYKIiIiIdBorxEREREQyx2XXNGOFmIiIiIh0GivERERERLKn/WXX5DSLmBViIiIiItJpTIiJiIiIZC5/DrG2t9JYuHAhnJ2dYWRkBG9vb5w6darIvlu3boWXlxcsLS1hamoKT09PrF69Wq2PKIoIDQ2Fvb09jI2N4efnhxs3bpQoJibERERERPRabNiwASEhIZg4cSLOnTsHDw8P+Pv7IzExsdD+VapUwTfffIPjx48jMjI
SQUFBCAoKwl9//SX1mTVrFubNm4fFixfj5MmTMDU1hb+/P54+fVrsuJgQExEREdFrMXfuXAQHByMoKAhubm5YvHgxTExMsGzZskL7t23bFr169UL9+vVRq1YtfPbZZ3B3d8eRI0cAPK8Oh4eH49tvv0WPHj3g7u6OVatW4f79+9i2bVux42JCTERERCRzZTllIi0tTW3Lzs4uNIacnBycPXsWfn5+UptCoYCfnx+OHz/+n9cgiiIiIiIQFRWF1q1bAwBiYmIQHx+vNqaFhQW8vb2LNaYUR7F7EhERERG9xNHRERYWFtIWFhZWaL/k5GTk5eXB1tZWrd3W1hbx8fFFjv/48WOYmZnBwMAAXbp0wfz589GhQwcAkI4r6Zgv47JrRERERDInlMGya/njxcXFwdzcXGo3NDTU6nkqVaqECxcuICMjAxEREQgJCYGLiwvatm2rtXMwISYiIiKiUjM3N1dLiItiZWUFpVKJhIQEtfaEhATY2dkVeZxCoYCrqysAwNPTE1evXkVYWBjatm0rHZeQkAB7e3u1MT09PYt9DZwyQURERCRzFWHZNQMDAzRt2hQRERFSm0qlQkREBHx8fIo9jkqlkuYp16xZE3Z2dmpjpqWl4eTJkyUakxViIiIiInotQkJCEBAQAC8vLzRr1gzh4eHIzMxEUFAQAGDIkCFwcHCQ5iGHhYXBy8sLtWrVQnZ2Nnbu3InVq1dj0aJFAABBEDB69GhMmzYNtWvXRs2aNTFhwgRUq1YNPXv2LHZcTIiJiIiIZE6A9h+0XJrx+vfvj6SkJISGhiI+Ph6enp7YvXu3dFPcnTt3oFD8O4EhMzMTH3/8Me7evQtjY2PUq1cPa9asQf/+/aU+Y8eORWZmJoYPH47U1FS0atUKu3fvhpGRUfGvRRRFsRTXQzKXlpYGCwsLjNl8FoamZuUdDhG95M7DrPIOgYiKkPMkAxuGt8Ljx4+LNbe2LOV/P7+bkKL1WNLS0lDdtnKFuM5XxQoxERERkdxVlBJxBcWEmIiIiEjmynLZNTngKhNEREREpNNYISYiIiKSudIsk1acMeWCFWIiIiIi0mmsEBMRERHJHO+p04wVYiIiIiLSaawQExEREckdS8QasUJMRERERDqNFWIiIiIimeM6xJqxQkxEREREOo0VYiqUKIoAgOwnGeUcCREVJudJVnmHQERFeJaVCeDf76UVQXp6mtbXDU5PT9PugOWICTEVKj09HQAwf0ibco6EiIjozZSeng4LC4tyjcHAwAB2dnaoXdOxTMa3s7ODgYFBmYz9OgliRfrxhSoMlUqF+/fvo1KlShDk9CgaHZWWlgZHR0fExcXB3Ny8vMMhohfw61N+RFFEeno6qlWrBoWi/GenPn36FDk5OWUytoGBAYyMjMpk7NeJFWIqlEKhQPXq1cs7DNIyc3NzfsMlqqD49Skv5V0ZfpGRkZEsktayVP4/thARERERlSMmxERERESk05gQE+kAQ0NDTJw4EYaGhuUdChG9hF+fROWPN9URERERkU5jhZiIiIiIdBoTYiIiIiLSaUyIiYiIiEinMSEmIiIiIp3GhJiIiIiIdBoTYiJ6ZS8uVpOXl1eOkRAREZUcE2IieiWiKEIQBDx69AgAoFQqcezYMZw5c6acIyMiIioeJsRE9EoEQUBSUhI6d+6MhQsX4s8//0SrVq2QkZFR3qERyZpKpZL+nJubW6CNiIqPCTERvbInT57g7bffxowZM9C3b1+sX78ebdu25fQJojKkUCgQFxeHZ8+eQU9PD3/88Qe+++47JsVEpcCEmIhemZOTE1q2bIl79+7B3NwcDx8+BPB8+gS/OROVjaysLPTo0QOtWrXCunXr0KNHD9StWxcKBb+1E5UUH91MRK9EpVJBoVAgMjISN27cQGRkJNavX48PP/wQISEhan2ISLtu3bqF5s2bIz09HQsXLsQHH3yAvLw8KJXK8g6N6I2iV94BENGbKf9muqSkJBgbG6NBgwZwd3e
Hu7s7nj59iiVLlkChUGD06NFQKBTYsmULatasiSZNmpR36ESyYWhoiIyMDBgaGmLt2rV4//33oa+vzx9CiUqIFWIiKrVt27Zh7NixMDIygrm5ObZs2QJbW1vExMRgyZIl+P3339GlSxeYmZlh2rRpiI6ORs2aNcs7bCJZiYmJQVZWFjp37gxnZ2fs3btXLSnOzc2Fnh7rX0SaMCEmohLJrwz/888/aNmyJcaPHw8TExNs2LAB0dHR+Ouvv9CoUSPcvn0bGzZswJo1a2BkZIQlS5agcePG5R0+0Rst/+vv6tWrSEhIQPXq1eHq6goAOH36NPr27QsXFxf89ddf0NfXx4IFC5CWlobx48dDEIRyjp6o4mJCTEQlduLECaSnp+PYsWOYOHEiACA5ORnvv/8+Ll68iD179qBhw4bIzc1Fbm4usrKyULly5XKOmkgetm7disDAQFhbWyMmJgazZs1CYGAgrKyscPr0aQwYMACCIKB58+bYsGEDzp07h0aNGpV32EQVGicYEVGJZGZm4uOPP4a/vz9iYmKkdisrK6xevRoeHh7o0qULLl68CD09PRgZGTEZJnpF+bWruLg4hIWFYfbs2Thw4AC+//57TJo0CT/++CMSExPx1ltv4dChQ2jdujWMjY1x/vx5JsNExcAKMRGVWGRkJMaOHYuoqCicPHkSNjY20q9yHz58iG7duiE1NRUXLlyAgYFBeYdLJAv79u3DuXPncPPmTcyfPx+GhoYAgEWLFmHcuHH47LPP8PHHH8Pe3h4AkJOTw68/omJiQkxEGuUnui/Ky8tDVFQU3nvvPeTk5ODIkSOwtLRUe4xzRkYGatSoUU5RE8nPN998g7CwMLi4uODQoUOoVq2atG/RokX49ttvERgYiDFjxkhJMREVDxNiIipSfoJ7/PhxHD58GBkZGejcuTOaN28OALh69SoGDRqEZ8+eFUiKiUj75syZgy+//BJz5sxBcHAwzMzMpH1z585FeHg4zp49C2tr63KMkujNw4SYiDTaunUrRowYgYYNG8LU1BQ7duzAmjVrMGjQIADPk+KAgADcu3cP//zzDywsLMo5YqI3X/4Plnl5ecjLy1Ob+vDtt99ixowZmDdvHgICAmBqairtS01NhaWlZTlETPRm48KERFSk48eP4+OPP8b06dMxbNgw3L17F87OzggKCsKjR4/wySefoH79+li2bBlGjhyJhw8fMiEmekX5yfBff/2FVatWITY2Fh06dMCgQYNQp04dTJs2DaIoYtSoUVAqlRg8eLBUKebXH1HpMCEmokLl5eXhwoULGD58OIYNG4a4uDi0atUKH374IaytrTF69GiYmZkhMDAQDRs2xN69e3kDD5EWCIKA7du34/3338egQYPQv39/TJ06FdevX8eIESPQunVrfPfdd1Aqlfjoo4+gr6+PoKAgCILA6UpEpcQpE0RUQGJiImxsbHD16lVkZGSgYcOGeOedd1C7dm0sWbIEt2/fhqenJ9LT07FkyRIEBweXd8hEsnHlyhX07t0bn3/+OUaMGAFRFGFjY4Nnz56hefPmmDBhAlq2bAkAmDZtGvr06YP69euXc9REbzZWiIlIzaVLl9C+fXtcunRJ+iZ748YNpKWlITAwEAqFAgYGBujTpw/q1asnfWMmIu3Izs7GoEGDMHToUNy9exe+vr4YPHgwgoKC0Lx5c5iZmeHJkyfo0KEDvv322/IOl0gWmBATkZpGjRrBwcEBs2fPxvfffw9BEJCSkoILFy4gJSVFqgpfu3YNCxYsgImJSXmHTCQLkZGRMDIyQv369VGpUiUolUqMHTtWmiJhamoKHx8fbN26FSYmJmjVqhWMjY3LO2wiWeCT6ohIkpubC5VKhXfffRfnz5/Ho0ePAADNmjXDyJEj0b17d/j4+GDevHlYuHAhk2EiLRBFEYmJiejduzcOHToEY2Nj1K5dG3l5ebh37x4aNWokrSRRv359/Pbbb5g0aRKTYSIt4hxiIsKjR49
QpUoV6fX9+/fRoEEDfPHFF2q/kv3zzz/x5MkTvPXWW6hZs2Z5hEokW5999hn+/PNPnD59GlWqVEFycjLefvttNG7cGH379sXRo0excuVKREZGomrVquUdLpGsMCEm0nEnTpzAV199hYYNG2L69OkwNDSEoaEh5s2bh7Vr12L58uVwc3Mr7zCJZCv/EcvXrl3D0KFDMXToUHzwwQcAgAMHDmDQoEGoVKkSnj17hi1btqBx48blHDGR/HDKBJGOs7Gxga+vL44dOwYPDw9MnToVV65cQefOnZGeno6oqCgAz5dhIyLt+eeff5Ceni4tV1i3bl04ODhg1apVUp+2bdvi5MmT2LlzJ06cOMFkmKiMsEJMpGPyF/1//PgxcnNz1X71OmnSJJw7dw4RERGYPn06li9fjoyMDJw5c4ZPvyLSopiYGAwYMABxcXEIDw+Hm5sbGjZsiNu3b6Nly5YYP348Ro4cWd5hEukMJsREOiQ/Gf7jjz/w008/4caNG2jSpAm8vLwwduxYAMDjx4/x559/YvHixbh16xYeP36M6Oho2NralnP0RPLx7NkzxMTEYMGCBTh06BBEUcTAgQPRt29f/PDDD8jNzUV4eDgMDQ35sA2i14AJMZHM5SfB+Xbs2IF3330X3333HRo2bIhdu3bhxx9/REREBNq1ayf1u3fvHqKiouDk5IRatWqVR+hEspH/dXjz5k2kpqYiNzcXzZs3BwCcPn0ap06dwoQJE9C+fXtERUXhypUrOH78OLy9vcs5ciLdwISYSAfk5eVBqVQiKysLgYGBaNy4Mb766iskJyejcePG6NWrF+bNm1feYRLJUn4yvHXrVkyYMAF5eXkQBAFVqlTB1q1bpd++3Lp1C1u2bMHu3buxf/9+XLt2DXXq1Cnn6Il0A2+qI5KppUuXom/fvgAApVIp/f/mzZuoX78+Hjx4AE9PT7zzzjtSMrxp0yYcP3683GImkiNBEHDgwAEMGTIEn3/+Oc6ePYsFCxbg+PHj2LlzJwBApVLBxcUFY8aMQUREBO7fv89kmOg1YkJMJEPPnj1DSkoKrl69iuDgYKk9NzcX9evXx9mzZ9GyZUt07twZS5YsAQAkJydj9+7duHbtGlQqVXmFTvTGy3+gzYtfRydPnkRQUBCGDRuGpKQkDBs2DCNGjEBQUBAAQKF4/u04fzUXOzu71xw1kW5jQkwkQ/r6+hgxYgQ+/vhjnD59WlrT1MTEBC1atMC0adNQvXp1zJs3T5pf/MMPP+DQoUNo27at9M2ZiEpm48aNsLa2xrVr16BQKKSk+Pz589IPqr6+vujYsSMWLlwIAFi+fLn0Zz09vXKLnUiX8SuPSIZEUYSZmRmGDBkClUqFX3/9FUFBQVi+fDk+/vhjJCUlYerUqRg9ejSUSiUyMzOxbds2HDhwgE+gI3oFzZs3R4cOHdC+fXv8/fffqFevHgCgT58+WLZsGerWrYuePXtiyZIlUKlUUKlUOHPmDBQKBZ4+fQojI6NyvgIi3cQyEJEM5Vd985Pi4OBgnD17FoGBgQCAiRMnYuHChUhPT8e1a9dQpUoVHDt2DJ6enuUXNJEM1KhRA0uXLkXjxo3RunVrXLt2DcDzh26kpqbC0tISAwYMAABkZGRg4sSJ2Lp1Kz755BMmw0TliKtMEMlI/t3st27dQlZWFnJzc+Hh4YFnz57h119/xeLFi9GkSRMsX74cAPDkyROYmJhApVJxmgSRFt29exfDhw/HmTNncODAAbi5ueHIkSMYOXIkBEFAXl4e7O3tcfnyZezYsYNPoCMqZ0yIiWQiPxn+/fffMWbMGFhYWCAmJgY9evTAyJEj4e7ujl9//RW//PILvLy88Ouvv5Z3yESylpCQgICAAJw9exYHDx6Em5sbLl++jOvXr+Po0aNo3LgxWrRoARcXl/IOlUjnMSEmkpFDhw6hW7dumDlzJkaMGIE1a9ZgyJAhWLZsGQIDA5GRkYE1a9YgLCwM3bt3x/z588s7ZKI
3Xv4Po2fOnME///yDx48fo3nz5njrrbfw6NEjDB48GGfOnJGSYiKqeJgQE8lA/jfk0NBQ3Lp1C2vWrEFMTAw6duyIdu3a4eeffwbwfEmnp0+fYt26dWjfvj0rU0RasmXLFgwfPhy+vr64c+cOFAoFOnbsiOnTp+Pu3bv48MMPceHCBezZswcNGjQo73CJ6CWcNEj0hilsjeD8m+ji4+Ph4eGBvLw8tGrVCm+//ba0zvCGDRvw+++/w9TUFEOHDmUyTKQlly5dwqhRozB9+nRs27YNS5cuxZUrV6Svy+rVq2Pp0qVwdnZGr1698OzZs3KOmIhexoSY6A2Sf/Pb7du38euvv+KHH37AqVOnpP0NGjTArFmzUK1aNfTt2xcLFiyAIAgQRRE7d+7EoUOHkJ2dLX2jJqLiK+qBNdevX0eNGjXw4YcfIiYmBr169cKQIUPw3XffAQCuXLkCOzs7bNmyBREREdDX13+dYRNRMXAdYqI3RH4yHBkZiS5dusDZ2RknTpyQHvcaHByMvn374siRIzh06BA++ugj6OnpISsrC1OnTsXevXuxf/9+GBoalvelEL1x8r/+4uLisGfPHqhUKtSrVw++vr7Q19eHra0t4uLi0Lp1a3Tu3Bk//fQTAODw4cP466+/8Omnn/Lpc0QVGBNiojdA/jfjS5cuoXnz5hg7diy+/PJLpKSkoHnz5tiyZQuCg4NRrVo1fPDBB0hJSYGXlxfeeustiKKIqKgo7NixA3Xr1i3vSyF647z4w2j37t1ha2uL6OhoWFpaYu7cuXB3d8fOnTuxa9cujBgxAj/++KN07MaNGxEbG8s1hokqOE6ZIHoD5E+T8PHxQY8ePTBp0iSYmpqievXqcHNzw4ULFxAXFwcAeOedd7BlyxbMmjULXl5e6Nevn7TEExGVzIvJsI+PDwYOHIj9+/dj/fr1yMrKwuLFi+Hs7IxFixZBFEVUr14dd+7cQXR0NMaOHYu1a9dixowZsLCwKO9LISINuMoE0RsiNjYWvr6+8PLyQkhICHx9fTF79myMGzcOrq6uaNiwIQDA09MTI0aMQOXKlTlXkUgL4uLi0KRJE7Rr1w4bN26U2ps1a4bU1FScPn0aenp62LBhA0aOHAlbW1uYmJhAEASsWbOGP4wSvQGYEBO9AfKrVFFRUejTpw/q1q0La2trbNq0CevWrYOzszNEUcTy5ctx8OBBXLhwAR07dsS6detgZGTEp9ARvYLY2Fj069cP9vb2GDt2LFq2bImwsDB888038PLygr29PapWrYquXbvC0tISWVlZcHJygrW1NWxtbcs7fCIqBibERG+I/KT42rVr6N+/Py5duoTvv/8eISEhUp/89YjXrl2LFi1aoGbNmuUYMZF83LhxA6NGjYKBgQFsbGywfft2/PTTT2jWrBnOnj2Ly5cvY/78+TA1NUWTJk2wZcuW8g6ZiEqACTHRGyQ/KY6OjkbPnj3h7OyML7/8Eq1btwYA5ObmQk+P98oSlYXr16/jk08+weHDhzF16lSMGTNGbf/Dhw+xf/9+eHh4oHbt2uUUJRGVBhNiogoqf81ThUIhJcL57fmV4nfffRdOTk4YP348WrVqVZ7hEumE6OhofPzxx1Aqlfj666+lr7tnz55xzj7RG4wTC4kqiPwE+OnTpwCeJ8I3btyQ/pwvP0GuV68eNm/ejHv37uGrr77C8ePHX3/QRDqmVq1aWLBgAURRxLRp03D06FEAYDJM9IZjQkxUQSgUCty6dQujR4/GvXv3sHnzZtSvXx9XrlwptG9+Urx27VqoVCpUr169HKIm0j21a9fGvHnzoK+vjzFjxuDEiRPlHRIRvSJOmSCqQA4dOoSePXvCw8MDx48fx88//4whQ4ZIN8u9LC8vD0qlkr+uJSoH165dw4QJEzBnzhzUqFGjvMMholfAhJiogshPemfOnInx48ejefPmWLVqFVxdXdX2azqWiF6vnJwcGBgYlHcYRPSKOGWCqILIy8sDABg
ZGSE0NBQJCQmYNGkSzp8/DwAQBAEv/vyaP+c4fx8RvX5MhonkgRVionKWX919ecm0PXv24MMPP0SLFi0wduxYeHh4AACOHz8OHx+f8gqXiIhIdpgQE5Wj/GQ4IiICv//+O1JSUuDm5obg4GDY2Nhgz549GDFiBFq2bIkBAwbg3LlzmDhxIuLj42Ftbc3KMBERkRYwISYqZ9u2bcPAgQPx3nvv4fbt20hJSUFSUhIOHTqEGjVqICIiAmPGjIFKpUJaWho2b96Mpk2blnfYREREssGEmOg1evnmt+TkZHTo0AGDBg3Cl19+CQC4fPkyvvjiC9y4cQOnTp2ClZUVYmNjkZaWBmtra9jb25dX+ERERLLEm+qIXoP8nzufPHkC4N8b4jIyMvDgwQN4enpKfevXr49Zs2ahcuXKWL9+PQDA2dkZ7u7uTIaJiIjKABNiotdAEAQkJibC2dkZGzdulJ48Z2dnB0dHRxw8eFDqq1Qq4e7uDj09PURFRZVXyERERDqDCTHRa6JQKNC9e3e8//772L59u9Tm7e2Nv//+G1u3bpX6CoIABwcHWFpaQhRFcGYTERFR2eEcYqIyUtjDMhITE/Hdd99h/vz52LJlC3r16oWHDx9i8ODBePz4Mby9vdGyZUscOnQIq1atwsmTJ1GvXr1yugIiIiLdwISYqAyoVCooFApkZmYiLy8P5ubm0r4HDx5g+vTpWLhwITZt2oQ+ffrg4cOHmDFjBo4ePYrk5GTY2dlh3rx5anOLiYiIqGwwISYqIzdu3EC/fv1gZmaG4OBg2NnZoWPHjgCA7OxsfPHFF/jpp5+wYcMG9O3bF7m5uRAEAY8ePYKJiQlMTU3L+QqIiIh0g95/dyGiklKpVFixYgUuXrwIIyMjpKam4smTJ6hSpQqaNWuGDz74AEFBQahatSr69+8Pc3Nz+Pv7AwCsra3LOXoiIiLdwgoxURmJj4/HzJkzER0dDVdXV4wcORJr167F4cOHERkZiSpVqsDFxQVnz55FYmIiDhw4gNatW5d32ERERDqHFWKiMmJnZ4cvv/wS06dPx5EjR1C7dm2EhoYCAE6ePIn79+/j559/ho2NDRITE2FlZVXOERMREekmVoiJylj+TXQnT55Ez5498fXXX0v7nj17BpVKhcePH8PGxqYcoyQiItJdTIiJXoP4+Hh89913OH36NHr27ImvvvoKAJCbmws9Pf6ihoiIqDwxISZ6TfKT4vPnz+Ptt9/G5MmTyzskIiIiAp9UR/Ta2NnZ4ZtvvkHt2rVx7NgxPHz4sLxDIiIiIrBCTPTaJSQkAABsbW3LORIiIiICmBATERERkY7jlAkiIiIi0mlMiImIiIhIpzEhJiIiIiKdxoSYiIiIiHQaE2IiIiIi0mlMiImIiIhIpzEhJiIiIiKdxoSYiCoMQRDUNoVCAUtLS/j6+uLXX39FeS+bvmLFCgiCgEmTJqm1BwYGQhAEHDhwoFziKq22bdtCEATExsYWq39R118azs7OEAThlcf5L2/qZ0NErxcTYiKqcAICAhAQEIDBgwfDzc0NR48eRXBwMAYNGlTeoZUZbSabRERUMnrlHQAR0ctWrFih9nrv3r3o3Lkz1q9fj8GDB6Nr167lE1gRwsLC8NVXX6FGjRrlHQoREZUCK8REVOF16NAB77//PgBg27Zt5RtMIezt7VGvXj2YmJiUdyhERFQKTIiJ6I3QuHFjAEBcXJzUJggCnJ2dkZOTgylTpqBevXowNDREz549pT5PnjxBWFgYGjduDDMzM5iZmaF58+ZYuXJlkec6evQo/Pz8UKlSJVhaWsLf3x8nT54ssr+meaqZmZmYOXMmvLy8YG5uDlNTU9SrVw8jR47E9evXATyfyxsUFAQAmDx5sto86per5VevXkVgYCAcHR1haGgIW1tbDBgwAFeuXCk0try8PHz//feoV68ejIyM4OjoiM8++wxpaWlFXk9JPXjwALNmzUKbNm3g4OAAAwMD2NnZoXfv3jh
9+rTGY0VRxI8//gg3NzcYGRnBwcEBo0aNQmpqapH9161bh/bt26Ny5cowMjJC/fr1MWnSJDx58kRr10REuoVTJojojZCeng4AMDQ0VGtXqVTo2bMnDh06hDZt2sDd3R1Vq1YFACQmJqJDhw6IjIyEnZ0d2rRpA1EUcezYMQQGBuLMmTOYP3++2nh//vknevXqhdzcXDRr1gwuLi64ePEiWrdujcDAwBLF/ODBA3To0AFXrlxB5cqV0bZtWxgaGuLWrVtYvHgxateujTp16qBTp07Izc3F0aNH4eHhAU9PT2kMV1dX6c/btm3DgAEDkJ2dDU9PTzRv3hxxcXHYuHEj/vjjD+zatQutW7dWi+G9997D+vXrYWJigo4dO0JPTw8rV67E0aNHoa+vX6LrKcr27dsxbtw41K1bF+7u7jA3N8eNGzfw+++/488//8Sff/6Jjh07Fnrsp59+ip9//hlt27ZFo0aNcPDgQcyfPx8HDx7E4cOHYW5uLvVVqVR47733sG7dOpiZmcHLywuVK1fGmTNnMHnyZOzatQsHDhyAsbGxVq6LiHSISERUQQAQC/tnSaVSiT4+PiIA8ZtvvinQ39XVVbx7926B4zp37iwCED/77DPx6dOnUnt8fLzo5eUlAhB37doltaelpYnW1tYiAHHZsmVq5x83bpx0vokTJ6qdJyAgQAQg7t+/X6397bffFgGI/fr1E9PT09X2xcTEiBcvXpReL1++vNCxX+xvamoqmpmZiXv37lXbt2vXLlFfX190dHQUs7Ozpfb169eLAMQaNWqIMTExUntCQoLYsGFD6Xpe3KdJUTFGRkaKly9fLtB/9+7dooGBgVirVi1RpVKp7XNychIBiObm5uKZM2ek9vT0dLF9+/bS5/aiWbNmiQDEtm3big8ePJDas7OzxaFDh4oAxHHjxqkdU9RnQ0T0IibERFRhvJwQ5+bmitevXxcDAwNFAKKhoaF48+bNAv03bdpUYKzz58+LAMS33npLzMvLK7D/3LlzIgCxe/fuUtuyZctEAGLr1q0L9M/JyRGrV69e7IT45MmTIgDRxsZGTEtL+89r/6+E+LPPPhMBiPPnzy90/6hRo0QA4tatW6W21q1bF0ju8+3atUtrCbEmgwcPFgGIkZGRau35CfHXX39d4JgrV66IgiCIZmZmYlZWliiKovjs2TPRyspKNDU1FePj4wsc8+TJE9HOzk6sXLmy2ufNhJiIioNziImowsmfP6unp4c6depgxYoVqFSpEtatW4datWoV6NutW7cCY+zZswcA0LNnTygUBf+py59TfOrUKant8OHDAIABAwYU6K+vr49333232Newb98+AMDAgQNRqVKlYh9XlPzr6d27d6H7fX19AUC6nmfPnuHEiRMAgP79+xfo36lTJ1SuXPmV48qXnZ2N7du345tvvsHw4cMRGBiIwMBAXLp0CQBw48aNQo8r7L12c3ODh4cHMjIycP78eQDAuXPnkJycjBYtWsDW1rbAMcbGxmjatClSUlKKPBcRUVE4h5iIKpyAgAAAgEKhgLm5ORo1aoTevXsXmsDZ2NgUmFcMQHrYxDfffINvvvmmyHM9ffpU+vP9+/cBAE5OToX2dXZ2Lu4lSDf/vZzAl1b+9Tg4OGjsl5ycDAB4+PAhcnJyYG1tXeTqF05OTkhJSXnl2C5duoTu3btrfMBH/hzwwmIojLOzMy5cuCB9Jvlj79279z8f6JGcnIy6dev+d+BERP+PCTERVTgvr6ygiZGRUaHtKpUKANCqVSutJaXlKf968n9YKIq3t/frCEciiiL69euH2NhYjBgxAiNGjICLiwvMzMwgCAK+/vprhIWFvfJTBvOv39XVFS1bttTYN/+mSiKi4mJCTESyVL16dQDPp0x88cUXxTrG3t4eAHD79u1C9xfVXhhHR0cAQHR0dLGP0aR69eqIjo7GnDlzipXwVa1aFQYGBkhKSkJWVlahKy/cuXPnleO6du0arl27Bi8vLyx
atKjA/lu3bmk8/vbt22jUqFGh7QBQrVo1AP9+nvXq1SvRD0xERMXBOcREJEsdOnQAAPz+++/FPiZ/Hu7GjRsL7MvNzcWWLVuKPZafnx8AYN26dcjIyPjP/gYGBtJ5ClPS69HX15eqxYVdz549e/Do0aNijaVJ/pSL/IT15X179+7VeHxhsV27dg0XLlyAmZmZtATdW2+9BQsLCxw8eFArcRMRvYgJMRHJkre3Nzp06ICjR49i5MiRhT6I4uLFi9i9e7f0um/fvqhatSoOHDig9uAOURQxceLEElVUmzVrhnbt2iExMRHDhw9HZmam2v7Y2FjphjPg30poVFRUoeN98cUXMDY2xpgxY7B169YC+7Ozs7F582bcvXtXavvoo48AoEDsycnJ+PLLL4t9LZq4urpCoVDg77//VruZ7enTpxgxYsR/Jq/z58+XbpwDnj9I5dNPP4UoiggKCpIq24aGhhg7dizS09PRu3fvQivP9+7dw+rVq7VyXUSkY8p3kQsion+hiHWINfV3cnIqcn9CQoLYuHFjEYBoaWkptm3bVhw0aJDYpUsX0dHRsdC1brdt2yYqlUoRgOjt7S0OHDhQdHNzE/X19cXg4OASrUN89+5dsW7duiIAsUqVKmL37t3Fvn37ik2aNBEVCoX4ww8/SH2zsrJEGxsbEYDYpk0bMSgoSBw6dKh49OhRtdhMTEyktZe7desmDhgwQPT19RVNTU1FAOL58+fVYujbt68IQDQ1NRW7d+8u9u7dW7S0tBSbNGkiNm/eXCvLruW/L8bGxmKXLl3Ed999V7S1tRWtrKykJfOWL1+udkz+smsjR44U9fX1RX9/f7Ffv36inZ2dCEBs0KCBmJqaqnZMXl6e+P7774sARAMDA9Hb21scMGCA2Lt3b7FBgwaiIAiih4dHsT4bIqIXsUJMRLJlY2ODY8eOYd68eXBzc8P58+exefNmREZGwsXFBbNnz8aYMWPUjunRowf279+Pdu3a4fLly9ixYwfs7e1x8OBBtGjRokTnd3BwwOnTpzFlyhRUr14de/fuxa5du/DkyRN8/PHH6Nq1q9TXyMgIO3bsQIcOHXDhwgWsWLECS5culR7vnB9bZGQkPv74YwiCgL1792LHjh1ITExEt27dsHHjRri5uanF8Ntvv2HmzJlwcHDA7t27ceLECQwaNAh///13oatzlMaiRYvwf+3coXGFUBBA0QWFowZ6+VSDpgYUhgIQeAQFoEDSA0Ml/6uIiKjEZPYcvea5OzuzbxzHaJom9n2P4zji9XrFdV0//iLxZZqmGIYhnueJbduiKIroui6O44i6rr/NlmUZy7LEtm3Rtm3c9x3rusZ5nlFVVfR9H/M8/8mbgFyK9/uXp78AAPCP2RADAJCaIAYAIDVBDABAaoIYAIDUBDEAAKkJYgAAUhPEAACkJogBAEhNEAMAkJogBgAgNUEMAEBqghgAgNQ+jSW5Ri/X0/cAAAAASUVORK5CYII=", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light", - "tags": [] - }, + "metadata": {}, "output_type": "display_data" } ], @@ -682,12 +657,23 @@ "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", "X_test_dtm = vect.transform(X_test)\n", "\n", - "classifier = LinearSVC(class_weight='balanced') # instantiate a logistic regression model\n", + "classifier = LinearSVC(class_weight='balanced') # instantiate a Linear Support Vector Machine model\n", "classifier.fit(X_train_dtm, y_train) # fit the model with training data\n", "\n", "# Make predictions on test data\n", "y_pred_class = classifier.predict(X_test_dtm)\n", "\n", + "# Unlike many other sklearn classifiers, LinearSVC doesn't implement .predict_proba, so we approximate the scores\n", + "# by using .decision_function (predicts the confidence scores) and then applying softmax on the output; the ranking of the scores is preserved, which is what roc_auc_score needs\n", + "\n", + "# Softmax Function\n", + "def softmax(x):\n", + " e_x = np.exp(x - np.max(x))\n", + " return e_x / e_x.sum(axis=0)\n", + "\n", + "y_prob_intermediate = classifier.decision_function(X_test_dtm) ## Predicts the Confidence Scores\n", + "y_pred_prob = softmax(y_prob_intermediate)\n", + "\n", "# calculate evaluation measures:\n", "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", @@ -705,6 +691,13 @@ "source": [ "So, how do we choose whats the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how good it is doing with \"relevant\" category, we should choose the second one we built. If we choose purely based on how good it is doing with \"irrelevant\" category, surely, nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our usecase!
" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -714,7 +707,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -728,9 +721,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From f021e4a7083972ee2858d4ed6df020b0bf2a1d48 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Sun, 13 Aug 2023 08:38:45 +0000 Subject: [PATCH 02/14] Updated Ch4/02_Doc2Vec_Example.ipynb by removing errors and updating libraries --- Ch4/02_Doc2Vec_Example.ipynb | 169 +++++++++++++---------------------- 1 file changed, 63 insertions(+), 106 deletions(-) diff --git a/Ch4/02_Doc2Vec_Example.ipynb b/Ch4/02_Doc2Vec_Example.ipynb index c1a371e..cd54b8c 100644 --- a/Ch4/02_Doc2Vec_Example.ipynb +++ b/Ch4/02_Doc2Vec_Example.ipynb @@ -28,51 +28,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting nltk==3.5\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)\n", - "\u001b[K |████████████████████████████████| 1.4MB 5.1MB/s \n", - "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (1.0.1)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (2019.12.20)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (4.41.1)\n", - "Building wheels for collected packages: nltk\n", - " Building wheel for nltk (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for nltk: filename=nltk-3.5-cp37-none-any.whl size=1434691 sha256=a68222bfb8c06405a2c5f264264ffa3daf49f5d73637541f961a024360751028\n", - " Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306\n", - "Successfully built nltk\n", - "Installing collected packages: nltk\n", - " Found existing installation: nltk 3.2.5\n", - " Uninstalling nltk-3.2.5:\n", - " Successfully uninstalled nltk-3.2.5\n", - "Successfully installed nltk-3.5\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting gensim==3.8.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)\n", - "\u001b[K |████████████████████████████████| 24.2MB 1.3MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.4.1)\n", - "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (5.1.0)\n", - "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.15.0)\n", - "Installing collected packages: 
gensim\n", - " Found existing installation: gensim 3.6.0\n", - " Uninstalling gensim-3.6.0:\n", - " Successfully uninstalled gensim-3.6.0\n", - "Successfully installed gensim-3.8.3\n", - "Collecting scikit-learn==0.21.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9f/c5/e5267eb84994e9a92a2c6a6ee768514f255d036f3c8378acfa694e9f2c99/scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7MB)\n", - "\u001b[K |████████████████████████████████| 6.7MB 5.1MB/s \n", - "\u001b[?25hRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Installing collected packages: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n" + "Requirement already satisfied: nltk in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (3.8.1)\n", + "Requirement already satisfied: tqdm in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (4.66.1)\n", + "Requirement already satisfied: joblib in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (1.3.2)\n", + "Requirement already satisfied: click in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (8.1.6)\n", + "Requirement already satisfied: regex>=2021.8.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (2023.8.8)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: 
\u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: pandas in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (2.0.3)\n", + "Requirement already satisfied: tzdata>=2022.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: pytz>=2020.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", + "Requirement already satisfied: numpy>=1.20.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (1.24.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: gensim in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (4.3.1)\n", + "Requirement already satisfied: numpy>=1.18.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from gensim) (1.24.3)\n", + "Requirement already satisfied: scipy>=1.7.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from gensim) (1.11.1)\n", + "Requirement already satisfied: smart-open>=1.8.1 in 
/root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from gensim) (6.3.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: scikit-learn in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.3.0)\n", + "Requirement already satisfied: numpy>=1.17.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.24.3)\n", + "Requirement already satisfied: scipy>=1.5.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.11.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (3.2.0)\n", + "Requirement already satisfied: joblib>=1.1.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.3.2)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -81,10 +68,10 @@ "\n", "# ===========================\n", "\n", - "!pip install nltk==3.5\n", - "!pip install pandas==1.1.5\n", - "!pip install gensim==3.8.3\n", - "!pip install scikit-learn==0.21.3\n", + "!pip install nltk\n", + "!pip install pandas\n", + "!pip install gensim\n", + "!pip install scikit-learn\n", "\n", "# ===========================" ] @@ -126,7 +113,7 @@ 
"output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n" + "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], @@ -152,41 +139,7 @@ "id": "NGAFbmrA4EpM", "outputId": "f78def1c-c291-4fba-dd41-f24f1456757c" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2021-07-16 08:27:55-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 2479133 (2.4M) [text/plain]\n", - "Saving to: ‘DATAPATH/train_data.csv’\n", - "\n", - "train_data.csv 100%[===================>] 2.36M --.-KB/s in 0.1s \n", - "\n", - "2021-07-16 08:27:55 (22.4 MB/s) - ‘DATAPATH/train_data.csv’ saved [2479133/2479133]\n", - "\n", - "--2021-07-16 08:27:55-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 783640 (765K) [text/plain]\n", - "Saving to: ‘DATAPATH/test_data.csv’\n", - "\n", - "test_data.csv 100%[===================>] 765.27K --.-KB/s in 0.05s \n", - "\n", - "2021-07-16 08:27:55 (15.4 MB/s) - ‘DATAPATH/test_data.csv’ saved [783640/783640]\n", - "\n", - "total 3.2M\n", - "drwxr-xr-x 2 root root 4.0K Jul 16 08:27 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:27 ..\n", - "-rw-r--r-- 1 root root 766K Jul 16 08:27 test_data.csv\n", - "-rw-r--r-- 1 root root 2.4M Jul 16 08:27 train_data.csv\n" - ] - } - ], + "outputs": [], "source": [ "#Load the dataset and explore.\n", "try:\n", @@ -283,9 +236,7 @@ ] }, "execution_count": 5, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -309,6 +260,7 @@ { "data": { "text/plain": [ + "sentiment\n", "worry 7433\n", "neutral 6340\n", "sadness 4828\n", @@ -322,13 +274,11 @@ "enthusiasm 522\n", "boredom 157\n", "anger 98\n", - "Name: sentiment, dtype: int64" + "Name: count, dtype: int64" ] }, "execution_count": 6, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -354,9 +304,7 @@ ] }, "execution_count": 7, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -445,6 +393,7 @@ "\n", "#prepare training data in doc2vec format:\n", "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", + "\n", "#Train a doc2vec model to learn tweet representations. 
Use only training data!!\n", "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", "model.build_vocab(train_doc2vec)\n", @@ -470,13 +419,13 @@ "text": [ " precision recall f1-score support\n", "\n", - " happiness 0.37 0.45 0.41 713\n", - " neutral 0.46 0.53 0.49 1595\n", - " worry 0.58 0.46 0.51 1882\n", + " happiness 0.34 0.54 0.41 713\n", + " neutral 0.47 0.54 0.50 1595\n", + " worry 0.61 0.39 0.48 1882\n", "\n", - " accuracy 0.48 4190\n", - " macro avg 0.47 0.48 0.47 4190\n", - "weighted avg 0.50 0.48 0.49 4190\n", + " accuracy 0.47 4190\n", + " macro avg 0.47 0.49 0.46 4190\n", + "weighted avg 0.51 0.47 0.47 4190\n", "\n" ] } @@ -484,9 +433,10 @@ "source": [ "#Infer the feature representation for training and test data using the trained model\n", "model= Doc2Vec.load(\"d2v.model\")\n", + "\n", "#infer in multiple steps to get a stable representation. \n", - "train_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in train_data]\n", - "test_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in test_data]\n", + "train_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]\n", + "test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]\n", "\n", "#Use any regular classifier like logistic regression\n", "from sklearn.linear_model import LogisticRegression\n", @@ -500,6 +450,13 @@ "\n", "#print(confusion_matrix(test_cats,preds))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -509,7 +466,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -523,9 +480,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + 
"nbformat_minor": 4 } From 2f30c8aa1d248ac74fc2656623c88142c8286d1f Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 09:26:42 +0000 Subject: [PATCH 03/14] Updated Ch4/03_Word2Vec_Example.ipynb by removing errors and updating libraries --- Ch4/03_Word2Vec_Example.ipynb | 980 +++++++++++++++------------------- 1 file changed, 423 insertions(+), 557 deletions(-) diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index 20ec9b5..ed185ac 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -1,572 +1,438 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sVtvH58nb_Hp" + }, + "source": [ + "# Word2Vec for Text Classification\n", + "\n", + "In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.\n", + "\n", + "We will use the sentiment labelled sentences dataset from UCI repository\n", + "http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences\n", + "\n", + "The dataset consists of 1500 positive, and 1500 negative sentiment sentences from Amazon, Yelp, IMDB. Let us first combine all the three separate data files into one using the following unix command:\n", + "\n", + "```cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt```\n", + "\n", + "For a pre-trained embedding model, we will use the Google News vectors.\n", + "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", + "\n", + "Let us get started!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "03_Word2Vec_Example.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } + "id": "77UP8YyEdS2W", + "outputId": "1bb0a097-0232-42fd-ec29-b2e96ce857f5" + }, + "outputs": [], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# !pip install numpy\n", + "# !pip install pandas\n", + "# !pip install gensim\n", + "# !pip install wget\n", + "# !pip install nltk\n", + "# !pip install scikit-learn\n", + "# !pip install gdown\n", + "\n", + "# ===========================" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "sVtvH58nb_Hp" - }, - "source": [ - "# Word2Vec for Text Classification\n", - "\n", - "In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.\n", - "\n", - "We will use the sentiment labelled sentences dataset from UCI repository\n", - "http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences\n", - "\n", - "The dataset consists of 1500 positive, and 1500 negative sentiment sentences from Amazon, Yelp, IMDB. 
Let us first combine all the three separate data files into one using the following unix command:\n", - "\n", - "```cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt```\n", - "\n", - "For a pre-trained embedding model, we will use the Google News vectors.\n", - "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", - "\n", - "Let us get started!" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "77UP8YyEdS2W", - "outputId": "1bb0a097-0232-42fd-ec29-b2e96ce857f5" - }, - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install gensim==3.8.3\n", - "!pip install wget==3.2\n", - "!pip install nltk==3.5\n", - "!pip install scikit-learn==0.21.3\n", - "\n", - "# ===========================" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting gensim==3.8.3\n", - " Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)\n", - "\u001b[K |████████████████████████████████| 24.2 MB 84.7 MB/s 
\n", - "\u001b[?25hRequirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.19.5)\n", - "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.15.0)\n", - "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (1.4.1)\n", - "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.7/dist-packages (from gensim==3.8.3) (5.1.0)\n", - "Installing collected packages: gensim\n", - " Attempting uninstall: gensim\n", - " Found existing installation: gensim 3.6.0\n", - " Uninstalling gensim-3.6.0:\n", - " Successfully uninstalled gensim-3.6.0\n", - "Successfully installed gensim-3.8.3\n", - "Collecting wget==3.2\n", - " Downloading wget-3.2.zip (10 kB)\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9673 sha256=4877de9e41ccfba395a6bc044ccad7ba2ea4f6324ca63bbf9da41b644eb8efea\n", - " Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting nltk==3.5\n", - " Downloading nltk-3.5.zip (1.4 MB)\n", - "\u001b[K |████████████████████████████████| 1.4 MB 14.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (7.1.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (1.0.1)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (2019.12.20)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from nltk==3.5) (4.41.1)\n", - "Building wheels for collected packages: nltk\n", - " 
Building wheel for nltk (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434691 sha256=efc90917aca010ac50551beb55d51252d0b46e103b87d83a9b66c70d6b6fd4ba\n", - " Stored in directory: /root/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266\n", - "Successfully built nltk\n", - "Installing collected packages: nltk\n", - " Attempting uninstall: nltk\n", - " Found existing installation: nltk 3.2.5\n", - " Uninstalling nltk-3.2.5:\n", - " Successfully uninstalled nltk-3.2.5\n", - "Successfully installed nltk-3.5\n", - "Collecting scikit-learn==0.21.3\n", - " Downloading scikit_learn-0.21.3-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)\n", - "\u001b[K |████████████████████████████████| 6.7 MB 14.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Installing collected packages: scikit-learn\n", - " Attempting uninstall: scikit-learn\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed scikit-learn-0.21.3\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "URLGvBLv9T0M" - }, - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except 
ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JQX8DAmBb_Hr", - "outputId": "a89dcee7-f76f-4bd9-ba60-8642b88ab50c" - }, - "source": [ - "#basic imports\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import os\n", - "import wget\n", - "import gzip\n", - "import shutil\n", - "from time import time\n", - "\n", - "#pre-processing imports\n", - "import nltk\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt')\n", - "from nltk.tokenize import word_tokenize\n", - "from nltk.corpus import stopwords\n", - "from string import punctuation\n", - "\n", - "#imports related to modeling\n", - "import numpy as np\n", - "from gensim.models import Word2Vec, KeyedVectors\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import classification_report" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Unzipping tokenizers/punkt.zip.\n" - ], - "name": "stderr" - } - ] + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "URLGvBLv9T0M" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r 
\"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "JQX8DAmBb_Hr", + "outputId": "a89dcee7-f76f-4bd9-ba60-8642b88ab50c" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dE
lkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJy
c7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IG
ZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "" - } - }, - "base_uri": "https://localhost:8080/", - "height": 140 - }, - "id": "S8RM8c6AS8AX", - "outputId": "0b366a76-49b0-4170-dce6-33572a37a929" - }, - "source": [ - "try:\n", - " from google.colab import files\n", - " \n", - " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in \"sentiment labelled sentences\" folder\n", - " uploaded = 
files.upload()\n", - " \n", - " !mkdir DATAPATH\n", - " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", - " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", - " \n", - "except ModuleNotFoundError:\n", - "\n", - " fil = 'sentiment_sentences.txt'\n", - "\n", - " if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n", - " file = open(os.path.join(path, fil), 'w')\n", - " file.close()\n", - " \n", - " # combined the three files to make sentiment_sentences.txt\n", - " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", - "\n", - " with open('Data/sentiment_sentences.txt', 'w') as outfile:\n", - " for fname in filenames:\n", - " with open('Data/sentiment labelled sentences/' + fname) as infile:\n", - " outfile.write(infile.read())\n", - " print(\"File created\")\n", - " else:\n", - " print(\"File already exists\")" - ], - "execution_count": 4, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. 
Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt\n", - "Saving imdb_labelled.txt to imdb_labelled.txt\n", - "Saving yelp_labelled.txt to yelp_labelled.txt\n" - ], - "name": "stdout" - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + } + ], + "source": [ + "#basic imports\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import os\n", + "import gzip\n", + "import shutil\n", + "from time import time\n", + "\n", + "#pre-processing imports\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from string import punctuation\n", + "\n", + "#imports related to modeling\n", + "import numpy as np\n", + "from gensim.models import Word2Vec, KeyedVectors\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "\n", + "#google-drive download imports\n", + "import gdown" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140, + "resources": { + "http://localhost:8080/nbextensions/google.colab/files.js": { + "data": 
"Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24
gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmV
zb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwo
gICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "ok": true, + "status": 200, + "status_text": "" + } + } }, + "id": "S8RM8c6AS8AX", + "outputId": "0b366a76-49b0-4170-dce6-33572a37a929" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "COUGXAxcb_H5", - "scrolled": true, - "outputId": "b88ee64f-6c36-412e-ce57-f9387eec3051" - }, - "source": [ - "#Load the pre-trained word2vec model and the dataset\n", - "try:\n", - " \n", - " from google.colab import files\n", - " data_path= \"DATAPATH\"\n", - " !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", - " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n", - " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", - " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", - " \n", - "except ModuleNotFoundError:\n", - " \n", - " 
data_path= \"Data\"\n", - " \n", - " if not os.path.exists('GoogleNews-vectors-negative300.bin'):\n", - " if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n", - " if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n", - " wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n", - "\n", - " with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:\n", - " with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:\n", - " shutil.copyfileobj(f_in, f_out)\n", - "\n", - " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", - " else:\n", - " path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n", - "\n", - " else:\n", - " path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n", - " else:\n", - " path_to_model = 'GoogleNews-vectors-negative300.bin'\n", - " \n", - " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", - " \n", - " \n", - "#Load W2V model. This will take some time. \n", - "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", - "print('done loading Word2Vec')\n", - "\n", - "#Read text data, cats.\n", - "#the file path consists of tab separated sentences and cats.\n", - "texts = []\n", - "cats = []\n", - "fh = open(training_data_path)\n", - "for line in fh:\n", - " text, sentiment = line.split(\"\\t\")\n", - " texts.append(text)\n", - " cats.append(sentiment)" - ], - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "text": [ - "--2021-07-20 08:36:30-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", - "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.130.248\n", - "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.130.248|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 1647046227 (1.5G) [application/x-gzip]\n", - "Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’\n", - "\n", - "GoogleNews-vectors- 100%[===================>] 1.53G 35.4MB/s in 46s \n", - "\n", - "2021-07-20 08:37:16 (34.1 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n", - "\n", - "CPU times: user 19.6 s, sys: 3.11 s, total: 22.7 s\n", - "Wall time: 35.2 s\n", - "done loading Word2Vec\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "File already exists\n" + ] + } + ], + "source": [ + "try:\n", + " from google.colab import files\n", + " \n", + " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in \"sentiment labelled sentences\" folder\n", + " uploaded = files.upload()\n", + " \n", + " !mkdir DATAPATH\n", + " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", + " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", + " \n", + "except ModuleNotFoundError:\n", + "\n", + " fil = 'sentiment_sentences.txt'\n", + "\n", + " if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n", + " file = open(os.path.join(path, fil), 'w')\n", + " file.close()\n", + " \n", + " # combined the three files to make sentiment_sentences.txt\n", + " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", + "\n", + " with open('Data/sentiment_sentences.txt', 'w') as outfile:\n", + " for fname in filenames:\n", + " with open('Data/sentiment labelled sentences/' + fname) as infile:\n", + " outfile.write(infile.read())\n", + " print(\"File created\")\n", + " else:\n", + " print(\"File already exists\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "COUGXAxcb_H5", + "outputId": 
"b88ee64f-6c36-412e-ce57-f9387eec3051", + "scrolled": true + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "m-WjFyC6b_IE", - "outputId": "87270b42-96b9-4420-f22a-6f13160e5cbe" - }, - "source": [ - "#Inspect the model\n", - "word2vec_vocab = w2v_model.vocab.keys()\n", - "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", - "print(len(word2vec_vocab))" - ], - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "text": [ - "3000000\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Data Present at location : ./Data/GoogleNews-vectors-negative300.bin\n" + ] + } + ], + "source": [ + "#Load the pre-trained word2vec model and the dataset\n", + "\n", + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_data(location: str) -> None:\n", + " with gzip.open(os.path.join(location, 'GoogleNews-vectors-negative300.bin.gz'), 'rb') as f_in:\n", + " with open(os.path.join('./Data', './GoogleNews-vectors-negative300.bin'), 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + "\n", + "try:\n", + " from google.colab import files\n", + " data_path= \"DATAPATH\"\n", + " !gdown -O DATAPATH https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\n", + " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n", + " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", + " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", + " \n", + "except ModuleNotFoundError:\n", + "\n", + " data_path = './Data/'\n", + " compressed_file_name = 'GoogleNews-vectors-negative300.bin.gz'\n", + " extracted_file_name = 'GoogleNews-vectors-negative300.bin'\n", + " \n", + " # Check if Extracted File exists\n", + " 
location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + " \n", + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + " \n", + " else:\n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + " \n", + " if location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_data(os.path.join(location_of_compressed_file))\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + " \n", + " else:\n", + " # Download File\n", + " output_path = './Data/'\n", + " gdown.download(\"https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_data(output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")\n", + " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", + " \n", + " \n", + "#Load W2V model. This will take some time. 
\n", + "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", + "print('done loading Word2Vec')\n", + "\n", + "#Read text data, cats.\n", + "#the file path consists of tab separated sentences and cats.\n", + "texts = []\n", + "cats = []\n", + "fh = open(training_data_path)\n", + "for line in fh:\n", + " text, sentiment = line.split(\"\\t\")\n", + " texts.append(text)\n", + " cats.append(sentiment)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "XEz30Jztb_IP", - "outputId": "18794f4b-828f-4c7c-9708-b9af3143d700" - }, - "source": [ - "#Inspect the dataset\n", - "print(len(cats), len(texts))\n", - "print(texts[1])\n", - "print(cats[1])" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "text": [ - "3000 3000\n", - "Good case, Excellent value.\n", - "1\n", - "\n" - ], - "name": "stdout" - } - ] + "id": "m-WjFyC6b_IE", + "outputId": "87270b42-96b9-4420-f22a-6f13160e5cbe" + }, + "outputs": [], + "source": [ + "#Inspect the model\n", + "word2vec_vocab = w2v_model.key_to_index.keys()\n", + "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", + "print(len(word2vec_vocab))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MFOGaDTwb_Ig", - "outputId": "b9983e21-f00e-4c3e-ebe4-e2c8be738398" - }, - "source": [ - "#preprocess the text.\n", - "def preprocess_corpus(texts):\n", - " mystopwords = set(stopwords.words(\"english\"))\n", - " def remove_stops_digits(tokens):\n", - " #Nested function that lowercases, removes stopwords and digits from a list of tokens\n", - " return [token.lower() for token in tokens if 
token.lower() not in mystopwords and not token.isdigit()\n", - " and token not in punctuation]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. \n", - " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n", - "\n", - "texts_processed = preprocess_corpus(texts)\n", - "print(len(cats), len(texts_processed))\n", - "print(texts_processed[1])\n", - "print(cats[1])" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "text": [ - "3000 3000\n", - "['good', 'case', 'excellent', 'value']\n", - "1\n", - "\n" - ], - "name": "stdout" - } - ] + "id": "XEz30Jztb_IP", + "outputId": "18794f4b-828f-4c7c-9708-b9af3143d700" + }, + "outputs": [], + "source": [ + "#Inspect the dataset\n", + "print(len(cats), len(texts))\n", + "print(texts[1])\n", + "print(cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fXRiGtY1b_Iq", - "outputId": "fdba211b-e6bd-453e-b70d-79546d6ef005" - }, - "source": [ - "# Creating a feature vector by averaging all embeddings for all sentences\n", - "def embedding_feats(list_of_lists):\n", - " DIMENSION = 300\n", - " zero_vector = np.zeros(DIMENSION)\n", - " feats = []\n", - " for tokens in list_of_lists:\n", - " feat_for_this = np.zeros(DIMENSION)\n", - " count_for_this = 0 + 1e-5 # to avoid divide-by-zero \n", - " for token in tokens:\n", - " if token in w2v_model:\n", - " feat_for_this += w2v_model[token]\n", - " count_for_this +=1\n", - " if(count_for_this!=0):\n", - " feats.append(feat_for_this/count_for_this) \n", - " else:\n", - " feats.append(zero_vector)\n", - " return feats\n", - "\n", - "\n", - "train_vectors = embedding_feats(texts_processed)\n", - "print(len(train_vectors))" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - 
"text": [ - "3000\n" - ], - "name": "stdout" - } - ] + "id": "MFOGaDTwb_Ig", + "outputId": "b9983e21-f00e-4c3e-ebe4-e2c8be738398" + }, + "outputs": [], + "source": [ + "#preprocess the text.\n", + "def preprocess_corpus(texts):\n", + " mystopwords = set(stopwords.words(\"english\"))\n", + " def remove_stops_digits(tokens):\n", + " #Nested function that lowercases, removes stopwords and digits from a list of tokens\n", + " return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n", + " and token not in punctuation]\n", + " #This return statement below uses the above function to process twitter tokenizer output further. \n", + " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n", + "\n", + "texts_processed = preprocess_corpus(texts)\n", + "print(len(cats), len(texts_processed))\n", + "print(texts_processed[1])\n", + "print(cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mr9IaQppb_Ix", - "outputId": "2c372ab4-38d8-4884-99dc-9bb3bbba16d0" - }, - "source": [ - "#Take any classifier (LogisticRegression here, and train/test it like before.\n", - "classifier = LogisticRegression(random_state=1234)\n", - "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", - "classifier.fit(train_data, train_cats)\n", - "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", - "preds = classifier.predict(test_data)\n", - "print(classification_report(test_cats, preds))" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Accuracy: 0.8453333333333334\n", - " precision recall f1-score support\n", - "\n", - " 0\n", - " 0.87 0.83 0.85 388\n", - " 1\n", - " 0.82 0.86 0.84 362\n", - "\n", - " accuracy 0.85 750\n", - " macro avg 0.85 0.85 0.85 
750\n", - "weighted avg 0.85 0.85 0.85 750\n", - "\n" - ], - "name": "stdout" - } - ] + "id": "fXRiGtY1b_Iq", + "outputId": "fdba211b-e6bd-453e-b70d-79546d6ef005" + }, + "outputs": [], + "source": [ + "# Creating a feature vector by averaging all embeddings for all sentences\n", + "def embedding_feats(list_of_lists):\n", + " DIMENSION = 300\n", + " zero_vector = np.zeros(DIMENSION)\n", + " feats = []\n", + " for tokens in list_of_lists:\n", + " feat_for_this = np.zeros(DIMENSION)\n", + " count_for_this = 0 + 1e-5 # to avoid divide-by-zero \n", + " for token in tokens:\n", + " if token in w2v_model:\n", + " feat_for_this += w2v_model[token]\n", + " count_for_this +=1\n", + " if(count_for_this!=0):\n", + " feats.append(feat_for_this/count_for_this) \n", + " else:\n", + " feats.append(zero_vector)\n", + " return feats\n", + "\n", + "\n", + "train_vectors = embedding_feats(texts_processed)\n", + "print(len(train_vectors))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "k7wjLB8rb_JB" - }, - "source": [ - "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!" - ] - } - ] -} \ No newline at end of file + "id": "mr9IaQppb_Ix", + "outputId": "2c372ab4-38d8-4884-99dc-9bb3bbba16d0" + }, + "outputs": [], + "source": [ + "#Take any classifier (LogisticRegression here, and train/test it like before.\n", + "classifier = LogisticRegression(random_state=1234)\n", + "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", + "classifier.fit(train_data, train_cats)\n", + "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", + "preds = classifier.predict(test_data)\n", + "print(classification_report(test_cats, preds))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k7wjLB8rb_JB" + }, + "source": [ + "Not bad. 
With little efforts we got 81% accuracy. Thats a great starting model to have!!" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "03_Word2Vec_Example.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 236bbc01d2df8c6f02de0577b0ae0b1a74e151c9 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 09:57:29 +0000 Subject: [PATCH 04/14] Updated Ch4/04_FastText_Example.ipynb by removing errors and updating libraries --- Ch4/04_FastText_Example.ipynb | 248 +++++++++++++++++----------------- 1 file changed, 121 insertions(+), 127 deletions(-) diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb index b04f9d4..6608a17 100644 --- a/Ch4/04_FastText_Example.ipynb +++ b/Ch4/04_FastText_Example.ipynb @@ -20,49 +20,15 @@ "id": "UBnT5t_LiCU2", "outputId": "ca0bcea9-75a7-4237-e58e-154c3d72e89f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - 
"Collecting wget==3.2\n", - " Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=0e1e014b6bf086637aea4bfe15707b7d8d825e7280cd2f9c6ec1943ef00e80c7\n", - " Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting fasttext==0.9.2\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)\n", - "\u001b[K |████████████████████████████████| 71kB 6.8MB/s \n", - "\u001b[?25hRequirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (2.6.2)\n", - "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (57.0.0)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from fasttext==0.9.2) (1.19.5)\n", - "Building wheels for collected packages: fasttext\n", - " Building wheel for fasttext (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3091748 sha256=f30effec512519a72b11f0eaf7aa8a6b57df1643345f8e51bf7b1cb010552792\n", - " Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592\n", - "Successfully built fasttext\n", - "Installing collected packages: fasttext\n", - "Successfully installed fasttext-0.9.2\n" - ] - } - ], + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install pandas==1.1.5\n", - "!pip install wget==3.2\n", - "!pip install fasttext==0.9.2\n", + "# !pip install pandas==1.1.5\n", + "# !pip install wget==3.2\n", + "# !pip install fasttext==0.9.2\n", "\n", "# ===========================" ] @@ -90,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "metadata": { "id": "YKgZXvTGb61z" }, @@ -100,12 +66,13 @@ "import os\n", "import pandas as pd\n", "import wget\n", - "import tarfile" + "import tarfile\n", + "import gdown" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -114,44 +81,37 @@ "outputId": "debf3639-77d2-4a2c-8aa1-3ff8438b9585" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From (uriginal): https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\n", + "From (redirected): https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k&confirm=t&uuid=a7accc21-6f49-4d9d-b323-f6fedaec8e8f\n", + "To: /root/Working/Working/practical-nlp-code/Ch4/Data/dbpedia_csv.tar.gz\n", + "100%|█████████████████████████████████████████████████████████████| 68.3M/68.3M [00:09<00:00, 7.58MB/s]\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "--2021-07-16 08:57:35-- 
https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "Resolving github.com (github.com)... 140.82.121.4\n", - "Connecting to github.com (github.com)|140.82.121.4|:443... connected.\n", - "HTTP request sent, awaiting response... 301 Moved Permanently\n", - "Location: https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz [following]\n", - "--2021-07-16 08:57:35-- https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "Reusing existing connection to github.com:443.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz [following]\n", - "--2021-07-16 08:57:35-- https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: 68431223 (65M) [application/octet-stream]\n", - "Saving to: ‘DATAPATH/dbpedia_csv.tar.gz’\n", - "\n", - "dbpedia_csv.tar.gz 100%[===================>] 65.26M 206MB/s in 0.3s \n", - "\n", - "2021-07-16 08:57:42 (206 MB/s) - ‘DATAPATH/dbpedia_csv.tar.gz’ saved [68431223/68431223]\n", - "\n", - "dbpedia_csv/\n", - "dbpedia_csv/test.csv\n", - "dbpedia_csv/classes.txt\n", - "dbpedia_csv/train.csv\n", - "dbpedia_csv/readme.txt\n", - "total 66M\n", - "drwxr-xr-x 3 root root 4.0K Jul 16 08:57 .\n", - "drwxr-xr-x 1 root root 4.0K Jul 16 08:57 ..\n", - "drwxrwxr-x 2 1000 1000 4.0K Mar 29 2015 dbpedia_csv\n", - "-rw-r--r-- 1 root root 66M Jul 16 08:57 dbpedia_csv.tar.gz\n" + "Data Present at location : ./Data/dbpedia_csv\n" ] } ], "source": [ + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_tar_file(file_path: str, extraction_path: str) -> None:\n", + " tar = tarfile.open(file_path, \"r:gz\")\n", + " tar.extractall(extraction_path)\n", + " tar.close()\n", + "\n", "try :\n", " \n", " from google.colab import files\n", @@ -169,26 +129,41 @@ " data_path = 'DATAPATH'\n", " \n", "except ModuleNotFoundError:\n", + " data_path = './Data/'\n", + " compressed_file_name = 'dbpedia_csv.tar.gz'\n", + " extracted_file_name = 'dbpedia_csv'\n", " \n", - " if not os.path.exists(os.getcwd()+'\\\\Data\\\\dbpedia_csv') :\n", - " # downloading the data\n", - " url=\"https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\"\n", - " path=os.getcwd()+'\\Data'\n", - " wget.download(url,path)\n", - "\n", - " # untaring the required file\n", - " temp=path+'\\dbpedia_csv.tar.gz'\n", - " tar = tarfile.open(temp, \"r:gz\")\n", - " tar.extractall(path) \n", - " tar.close()\n", + " # Check if Extracted File exists\n", + " location_of_extracted_file = 
check_if_file_exists(extracted_file_name, ['./Data'])\n", " \n", - " # specifying the data_path\n", - " data_path='Data'" + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + " \n", + " else:\n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data'])\n", + " \n", + " if location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_tar_file(os.path.join(location_of_compressed_file, compressed_file_name), data_path)\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + " \n", + " else:\n", + " # Download File\n", + " output_path = './Data/'\n", + " gdown.download(\"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_data(output_path+compressed_file_name, output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -218,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -296,20 +271,23 @@ "" ], "text/plain": [ - " class ... class_name\n", - "0 1 ... Company\n", - "1 1 ... Company\n", - "2 1 ... Company\n", - "3 1 ... Company\n", - "4 1 ... Company\n", + " class name \\\n", + "0 1 E. D. Abbott Ltd \n", + "1 1 Schwan-Stabilo \n", + "2 1 Q-workshop \n", + "3 1 Marvell Software Solutions Israel \n", + "4 1 Bergan Mercy Medical Center \n", "\n", - "[5 rows x 4 columns]" + " description class_name \n", + "0 Abbott of Farnham E D Abbott Limited was a Br... Company \n", + "1 Schwan-STABILO is a German maker of pens for ... 
Company \n", + "2 Q-workshop is a Polish company located in Poz... Company \n", + "3 Marvell Software Solutions Israel known as RA... Company \n", + "4 Bergan Mercy Medical Center is a hospital loc... Company " ] }, - "execution_count": 6, - "metadata": { - "tags": [] - }, + "execution_count": 18, + "metadata": {}, "output_type": "execute_result" } ], @@ -340,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -352,27 +330,26 @@ { "data": { "text/plain": [ + "class_name\n", + "Company 40000\n", + "EducationalInstitution 40000\n", + "Artist 40000\n", "Athlete 40000\n", + "OfficeHolder 40000\n", "MeanOfTransportation 40000\n", - "Film 40000\n", - "Artist 40000\n", "Building 40000\n", - "Company 40000\n", - "Plant 40000\n", - "Album 40000\n", "NaturalPlace 40000\n", "Village 40000\n", - "EducationalInstitution 40000\n", "Animal 40000\n", + "Plant 40000\n", + "Album 40000\n", + "Film 40000\n", "WrittenWork 40000\n", - "OfficeHolder 40000\n", - "Name: class_name, dtype: int64" + "Name: count, dtype: int64" ] }, - "execution_count": 7, - "metadata": { - "tags": [] - }, + "execution_count": 19, + "metadata": {}, "output_type": "execute_result" } ], @@ -382,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 20, "metadata": { "id": "Sn-3kIqMb62d" }, @@ -420,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -433,8 +410,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.38 s, sys: 193 ms, total: 4.57 s\n", - "Wall time: 4.63 s\n" + "CPU times: user 3.98 s, sys: 206 ms, total: 4.19 s\n", + "Wall time: 4.26 s\n" ] } ], @@ -447,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, "metadata": { "id": "imMZ9-Bkb62t" }, @@ -472,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 
11, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -481,12 +458,29 @@ "outputId": "3d7c130a-fd3b-472c-8585-2e965017763f" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Read 31M words\n", + "Number of words: 1116962\n", + "Number of labels: 14\n", + "Progress: 100.0% words/sec/thread: 1168450 lr: 0.000174 avg.loss: 0.003673 ETA: 0h 0m 0s 23.8% words/sec/thread: 1198254 lr: 0.762147 avg.loss: 0.009301 ETA: 0h12m38s% words/sec/thread: 1173386 lr: 0.577732 avg.loss: 0.006055 ETA: 0h 9m47s 0.419437 avg.loss: 0.004894 ETA: 0h 7m 5s 73.1% words/sec/thread: 1159819 lr: 0.268968 avg.loss: 0.004059 ETA: 0h 4m36s 0.003774 ETA: 0h 3m23s 81.4% words/sec/thread: 1157751 lr: 0.186418 avg.loss: 0.003744 ETA: 0h 3m12s 0.003730 ETA: 0h 3m 7s 82.8% words/sec/thread: 1155522 lr: 0.172032 avg.loss: 0.003694 ETA: 0h 2m57s 96.3% words/sec/thread: 1169432 lr: 0.036623 avg.loss: 0.003380 ETA: 0h 0m37s" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1h 3min 10s, sys: 12.8 s, total: 1h 3min 23s\n", - "Wall time: 32min 17s\n" + "CPU times: user 33min 7s, sys: 19 s, total: 33min 26s\n", + "Wall time: 17min 5s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Progress: 100.0% words/sec/thread: 1168425 lr: 0.000000 avg.loss: 0.003688 ETA: 0h 0m 0s\n" ] } ], @@ -505,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -518,11 +512,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Test Samples: 70000 Precision@1 : 92.2486 Recall@1 : 92.2486\n", - "Test Samples: 70000 Precision@2 : 48.5014 Recall@2 : 97.0029\n", - "Test Samples: 70000 Precision@3 : 32.5619 Recall@3 : 97.6857\n", - "Test Samples: 70000 Precision@4 : 24.4968 Recall@4 : 97.9871\n", - "Test Samples: 70000 Precision@5 : 19.6420 Recall@5 : 98.2100\n" + "Test Samples: 70000 Precision@1 : 
90.7343 Recall@1 : 90.7343\n", + "Test Samples: 70000 Precision@2 : 48.0407 Recall@2 : 96.0814\n", + "Test Samples: 70000 Precision@3 : 32.3319 Recall@3 : 96.9957\n", + "Test Samples: 70000 Precision@4 : 24.3021 Recall@4 : 97.2086\n", + "Test Samples: 70000 Precision@5 : 19.4711 Recall@5 : 97.3557\n" ] } ], @@ -549,7 +543,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -563,9 +557,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From 40c096d08b19f3288daf8294e02dcf5f79d550f7 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 09:58:07 +0000 Subject: [PATCH 05/14] Updated Ch4/03_Word2Vec_Example.ipynb by removing errors and updating libraries --- Ch4/03_Word2Vec_Example.ipynb | 84 ++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index ed185ac..232eeee 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -188,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -202,7 +202,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data Present at location : ./Data/GoogleNews-vectors-negative300.bin\n" + "Data Present at location : ./Data/GoogleNews-vectors-negative300.bin\n", + "CPU times: user 46.2 s, sys: 2 s, total: 48.2 s\n", + "Wall time: 49.1 s\n", + "done loading Word2Vec\n" ] } ], @@ -280,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -288,7 +291,15 @@ "id": "m-WjFyC6b_IE", "outputId": "87270b42-96b9-4420-f22a-6f13160e5cbe" }, - "outputs": 
[], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3000000\n" + ] + } + ], "source": [ "#Inspect the model\n", "word2vec_vocab = w2v_model.key_to_index.keys()\n", @@ -298,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -306,7 +317,18 @@ "id": "XEz30Jztb_IP", "outputId": "18794f4b-828f-4c7c-9708-b9af3143d700" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3000 3000\n", + "Good case, Excellent value.\n", + "1\n", + "\n" + ] + } + ], "source": [ "#Inspect the dataset\n", "print(len(cats), len(texts))\n", @@ -316,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -324,7 +346,18 @@ "id": "MFOGaDTwb_Ig", "outputId": "b9983e21-f00e-4c3e-ebe4-e2c8be738398" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3000 3000\n", + "['good', 'case', 'excellent', 'value']\n", + "1\n", + "\n" + ] + } + ], "source": [ "#preprocess the text.\n", "def preprocess_corpus(texts):\n", @@ -344,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -352,7 +385,15 @@ "id": "fXRiGtY1b_Iq", "outputId": "fdba211b-e6bd-453e-b70d-79546d6ef005" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3000\n" + ] + } + ], "source": [ "# Creating a feature vector by averaging all embeddings for all sentences\n", "def embedding_feats(list_of_lists):\n", @@ -379,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -387,7 +428,26 @@ "id": "mr9IaQppb_Ix", "outputId": "2c372ab4-38d8-4884-99dc-9bb3bbba16d0" }, - "outputs": 
[], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.796\n", + " precision recall f1-score support\n", + "\n", + " 0\n", + " 0.82 0.77 0.79 383\n", + " 1\n", + " 0.78 0.82 0.80 367\n", + "\n", + " accuracy 0.80 750\n", + " macro avg 0.80 0.80 0.80 750\n", + "weighted avg 0.80 0.80 0.80 750\n", + "\n" + ] + } + ], "source": [ "#Take any classifier (LogisticRegression here, and train/test it like before.\n", "classifier = LogisticRegression(random_state=1234)\n", From de686dd0145ff7579c5c46965678b8d9ac813dd8 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 10:31:38 +0000 Subject: [PATCH 06/14] Updated Ch4/05_DeepNN_Example.ipynb by removing errors and updating libraries --- Ch4/05_DeepNN_Example.ipynb | 1249 +++++++++++++++++------------------ 1 file changed, 604 insertions(+), 645 deletions(-) diff --git a/Ch4/05_DeepNN_Example.ipynb b/Ch4/05_DeepNN_Example.ipynb index a9c7abb..c8b184b 100644 --- a/Ch4/05_DeepNN_Example.ipynb +++ b/Ch4/05_DeepNN_Example.ipynb @@ -1,669 +1,628 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "aLNg_Puse6EX" + }, + "source": [ + "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "05_DeepNN_Example.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } + "id": "eOJLveJqtEO3", + "outputId": "067a74b2-c5df-464d-a3fa-3f4517a9090a" + }, + "outputs": [], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# !pip install numpy==1.19.5\n", + "# !pip install wget==3.2\n", + "# !pip install tensorflow==1.14.0\n", + "\n", + "# ===========================" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aLNg_Puse6EX" - }, - "source": [ - "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset. 
" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eOJLveJqtEO3", - "outputId": "067a74b2-c5df-464d-a3fa-3f4517a9090a" - }, - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install numpy==1.19.5\n", - "!pip install wget==3.2\n", - "!pip install tensorflow==1.14.0\n", - "\n", - "# ===========================" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Collecting wget==3.2\n", - " Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=0590de33e3a5654cc81a0a21cf66fa3e8af32bf31e65c5a543d101b6d3fba858\n", - " Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n", - "Collecting tensorflow==1.14.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/28/96efba1a516cdacc2e2d6d081f699c001d414cc8ca3250e6d59ae657eb2b/tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (109.3MB)\n", - "\u001b[K |████████████████████████████████| 109.3MB 104kB/s \n", - "\u001b[?25hRequirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.12.1)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.34.1)\n", - "Collecting 
tensorflow-estimator<1.15.0rc0,>=1.14.0rc0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)\n", - "\u001b[K |████████████████████████████████| 491kB 49.6MB/s \n", - "\u001b[?25hRequirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.0)\n", - "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.19.5)\n", - "Collecting tensorboard<1.15.0,>=1.14.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)\n", - "\u001b[K |████████████████████████████████| 3.2MB 33.6MB/s \n", - "\u001b[?25hRequirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.2)\n", - "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.8.1)\n", - "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.2.0)\n", - "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.12.0)\n", - "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.15.0)\n", - "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (3.17.3)\n", - "Collecting keras-applications>=1.0.6\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)\n", - "\u001b[K |████████████████████████████████| 51kB 8.4MB/s \n", - 
"\u001b[?25hRequirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.4.0)\n", - "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.36.2)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (57.0.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.3.4)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (1.0.1)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from keras-applications>=1.0.6->tensorflow==1.14.0) (3.1.0)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (4.6.1)\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->keras-applications>=1.0.6->tensorflow==1.14.0) (1.5.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.7.4.3)\n", - "\u001b[31mERROR: kapre 0.3.5 has requirement tensorflow>=2.0.0, but you'll have tensorflow 1.14.0 which is incompatible.\u001b[0m\n", - "Installing collected packages: tensorflow-estimator, tensorboard, keras-applications, 
tensorflow\n", - " Found existing installation: tensorflow-estimator 2.5.0\n", - " Uninstalling tensorflow-estimator-2.5.0:\n", - " Successfully uninstalled tensorflow-estimator-2.5.0\n", - " Found existing installation: tensorboard 2.5.0\n", - " Uninstalling tensorboard-2.5.0:\n", - " Successfully uninstalled tensorboard-2.5.0\n", - " Found existing installation: tensorflow 2.5.0\n", - " Uninstalling tensorflow-2.5.0:\n", - " Successfully uninstalled tensorflow-2.5.0\n", - "Successfully installed keras-applications-1.0.8 tensorboard-1.14.0 tensorflow-1.14.0 tensorflow-estimator-1.14.0\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ixb_5zcYtEO5" - }, - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xqUcb7NBb5--" - }, - "source": [ - "#Make the necessary imports\n", - "import os\n", - "import sys\n", - "import numpy as np\n", - "import tarfile\n", - "import wget\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\") \n", - "from zipfile import ZipFile\n", - "from tensorflow.keras.preprocessing.text import Tokenizer\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", - "from tensorflow.keras.utils import to_categorical\n", - "from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D\n", - "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n", - "from tensorflow.keras.models import Model, Sequential\n", - "from 
tensorflow.keras.initializers import Constant" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0MqW5vWwfiCP" - }, - "source": [ - "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HUKTqLHud7fo", - "scrolled": false - }, - "source": [ - "%%capture\n", - "try:\n", - " \n", - " from google.colab import files\n", - " \n", - " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n", - " !unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B\n", - " \n", - " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", - " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n", - " \n", - " BASE_DIR = 'DATAPATH'\n", - " \n", - "except ModuleNotFoundError:\n", - " \n", - " if not os.path.exists('Data/glove.6B'):\n", - " os.mkdir('Data/glove.6B')\n", - " \n", - " url='http://nlp.stanford.edu/data/glove.6B.zip' \n", - " wget.download(url,'Data') \n", - " \n", - " temp='Data/glove.6B.zip' \n", - " file = ZipFile(temp) \n", - " file.extractall('Data/glove.6B') \n", - " file.close()\n", - " \n", - " \n", - " \n", - " if not os.path.exists('Data/aclImdb'):\n", - " \n", - " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n", - " wget.download(url,'Data')\n", - " \n", - " temp='Data/aclImdb_v1.tar.gz' \n", - " tar = tarfile.open(temp, \"r:gz\")\n", - " tar.extractall('Data') \n", - " tar.close()\n", - " \n", - " BASE_DIR = 'Data'" - ], - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qvl1qb78fUib" - }, - "source": [ - "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n", - "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')\n", - "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')" - ], - "execution_count": 5, - 
"outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Yu9xmAZEd7fp" - }, - "source": [ - "#Within these, I only have a pos/ and a neg/ folder containing text files \n", - "MAX_SEQUENCE_LENGTH = 1000\n", - "MAX_NUM_WORDS = 20000 \n", - "EMBEDDING_DIM = 100 \n", - "VALIDATION_SPLIT = 0.2\n", - "\n", - "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n", - "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EmifkoA8b5_N" - }, - "source": [ - "### Loading and Preprocessing\n", - " " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "WI4O1usEb5_O" - }, - "source": [ - "#Function to load the data from the dataset into the notebook. Will be called twice - for train and test.\n", - "def get_data(data_dir):\n", - " texts = [] # list of text samples\n", - " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n", - " labels = [] # list of label ids\n", - " for name in sorted(os.listdir(data_dir)):\n", - " path = os.path.join(data_dir, name)\n", - " if os.path.isdir(path):\n", - " if name=='pos' or name=='neg':\n", - " label_id = labels_index[name]\n", - " for fname in sorted(os.listdir(path)):\n", - " fpath = os.path.join(path, fname)\n", - " text = open(fpath,encoding='utf8').read()\n", - " texts.append(text)\n", - " labels.append(label_id)\n", - " return texts, labels\n", - "\n", - "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n", - "test_texts, test_labels = get_data(TEST_DATA_DIR)\n", - "labels_index = {'pos':1, 'neg':0} \n", - "\n", - "#Just to see how the data looks like. 
\n", - "#print(train_texts[0])\n", - "#print(train_labels[0])\n", - "#print(test_texts[24999])\n", - "#print(test_labels[24999])" - ], - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "QhhqM0Jdd7fs", - "outputId": "9b5b394e-bc52-4779-d85d-a0383446051d" - }, - "source": [ - "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer \n", - "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. \n", - "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) \n", - "tokenizer.fit_on_texts(train_texts) \n", - "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes \n", - "test_sequences = tokenizer.texts_to_sequences(test_texts) \n", - "word_index = tokenizer.word_index \n", - "print('Found %s unique tokens.' % len(word_index))" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Found 88582 unique tokens.\n" - ], - "name": "stdout" - } - ] - }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Ixb_5zcYtEO5" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "xqUcb7NBb5--" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_e0V1-bBb5_d", - "outputId": "d866429d-5bb6-43a7-c66e-ed5abbafc4cd" - }, - 
"source": [ - "#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier\n", - "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n", - "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", - "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", - "trainvalid_labels = to_categorical(np.asarray(train_labels))\n", - "test_labels = to_categorical(np.asarray(test_labels))\n", - "\n", - "# split the training data into a training set and a validation set\n", - "indices = np.arange(trainvalid_data.shape[0])\n", - "np.random.shuffle(indices)\n", - "trainvalid_data = trainvalid_data[indices]\n", - "trainvalid_labels = trainvalid_labels[indices]\n", - "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n", - "x_train = trainvalid_data[:-num_validation_samples]\n", - "y_train = trainvalid_labels[:-num_validation_samples]\n", - "x_val = trainvalid_data[-num_validation_samples:]\n", - "y_val = trainvalid_labels[-num_validation_samples:]\n", - "#This is the data we will use for CNN and RNN training\n", - "print('Splitting the train data into train and valid is done')" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Splitting the train data into train and valid is done\n" - ], - "name": "stdout" - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-15 09:43:18.128696: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 09:43:18.575098: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 09:43:20.130588: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + } + ], + "source": [ + "#Make the necessary imports\n", + "import os\n", + "import sys\n", + "import numpy as np\n", + "import tarfile\n", 
+ "import wget\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\") \n", + "from zipfile import ZipFile\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.utils import to_categorical\n", + "from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D\n", + "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n", + "from tensorflow.keras.models import Model, Sequential\n", + "from tensorflow.keras.initializers import Constant" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0MqW5vWwfiCP" + }, + "source": [ + "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "HUKTqLHud7fo" + }, + "outputs": [], + "source": [ + "%%capture\n", + "try:\n", + " \n", + " from google.colab import files\n", + " \n", + " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n", + " !unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B\n", + " \n", + " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n", + " \n", + " BASE_DIR = 'DATAPATH'\n", + " \n", + "except ModuleNotFoundError:\n", + " \n", + " if not os.path.exists('Data/glove.6B'):\n", + " os.mkdir('Data/glove.6B')\n", + " \n", + " url='http://nlp.stanford.edu/data/glove.6B.zip' \n", + " wget.download(url,'Data') \n", + " \n", + " temp='Data/glove.6B.zip' \n", + " file = ZipFile(temp) \n", + " file.extractall('Data/glove.6B') \n", + " file.close()\n", + " \n", + " \n", + " \n", + " if not os.path.exists('Data/aclImdb'):\n", + " \n", + " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n", + " wget.download(url,'Data')\n", + 
" \n", + " temp='Data/aclImdb_v1.tar.gz' \n", + " tar = tarfile.open(temp, \"r:gz\")\n", + " tar.extractall('Data') \n", + " tar.close()\n", + " \n", + " BASE_DIR = 'Data'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "qvl1qb78fUib" + }, + "outputs": [], + "source": [ + "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n", + "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')\n", + "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Yu9xmAZEd7fp" + }, + "outputs": [], + "source": [ + "#Within these, I only have a pos/ and a neg/ folder containing text files \n", + "MAX_SEQUENCE_LENGTH = 1000\n", + "MAX_NUM_WORDS = 20000 \n", + "EMBEDDING_DIM = 100 \n", + "VALIDATION_SPLIT = 0.2\n", + "\n", + "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n", + "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EmifkoA8b5_N" + }, + "source": [ + "### Loading and Preprocessing\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "WI4O1usEb5_O" + }, + "outputs": [], + "source": [ + "#Function to load the data from the dataset into the notebook. 
Will be called twice - for train and test.\n", + "def get_data(data_dir):\n", + " texts = [] # list of text samples\n", + " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n", + " labels = [] # list of label ids\n", + " for name in sorted(os.listdir(data_dir)):\n", + " path = os.path.join(data_dir, name)\n", + " if os.path.isdir(path):\n", + " if name=='pos' or name=='neg':\n", + " label_id = labels_index[name]\n", + " for fname in sorted(os.listdir(path)):\n", + " fpath = os.path.join(path, fname)\n", + " text = open(fpath,encoding='utf8').read()\n", + " texts.append(text)\n", + " labels.append(label_id)\n", + " return texts, labels\n", + "\n", + "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n", + "test_texts, test_labels = get_data(TEST_DATA_DIR)\n", + "labels_index = {'pos':1, 'neg':0} \n", + "\n", + "#Just to see how the data looks like. \n", + "#print(train_texts[0])\n", + "#print(train_labels[0])\n", + "#print(test_texts[24999])\n", + "#print(test_labels[24999])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "QhhqM0Jdd7fs", + "outputId": "9b5b394e-bc52-4779-d85d-a0383446051d" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "WUHqg2vvb5_l", - "outputId": "8387eda1-18f0-4254-9819-e63191b8fc04" - }, - "source": [ - "print('Preparing embedding matrix.')\n", - "\n", - "# first, build index mapping words in the embeddings set\n", - "# to their embedding vector\n", - "embeddings_index = {}\n", - "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:\n", - " for line in f:\n", - " values = line.split()\n", - " word = values[0]\n", - " coefs = np.asarray(values[1:], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "\n", - "print('Found %s word vectors in Glove embeddings.' 
% len(embeddings_index))\n", - "#print(embeddings_index[\"google\"])\n", - "\n", - "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n", - "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", - "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", - "for word, i in word_index.items():\n", - " if i > MAX_NUM_WORDS:\n", - " continue\n", - " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None:\n", - " # words not found in embedding index will be all-zeros.\n", - " embedding_matrix[i] = embedding_vector\n", - "\n", - "# load these pre-trained word embeddings into an Embedding layer\n", - "# note that we set trainable = False so as to keep the embeddings fixed\n", - "embedding_layer = Embedding(num_words,\n", - " EMBEDDING_DIM,\n", - " embeddings_initializer=Constant(embedding_matrix),\n", - " input_length=MAX_SEQUENCE_LENGTH,\n", - " trainable=False)\n", - "print(\"Preparing of embedding matrix is done\")" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Preparing embedding matrix.\n", - "Found 400000 word vectors in Glove embeddings.\n", - "Preparing of embedding matrix is done\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 88582 unique tokens.\n" + ] + } + ], + "source": [ + "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer \n", + "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. \n", + "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) \n", + "tokenizer.fit_on_texts(train_texts) \n", + "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes \n", + "test_sequences = tokenizer.texts_to_sequences(test_texts) \n", + "word_index = tokenizer.word_index \n", + "print('Found %s unique tokens.' 
% len(word_index))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "_e0V1-bBb5_d", + "outputId": "d866429d-5bb6-43a7-c66e-ed5abbafc4cd" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "vEastnX8gdxR" - }, - "source": [ - "### 1D CNN Model with pre-trained embedding" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Splitting the train data into train and valid is done\n" + ] + } + ], + "source": [ + "#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier\n", + "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n", + "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", + "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", + "trainvalid_labels = to_categorical(np.asarray(train_labels))\n", + "test_labels = to_categorical(np.asarray(test_labels))\n", + "\n", + "# split the training data into a training set and a validation set\n", + "indices = np.arange(trainvalid_data.shape[0])\n", + "np.random.shuffle(indices)\n", + "trainvalid_data = trainvalid_data[indices]\n", + "trainvalid_labels = trainvalid_labels[indices]\n", + "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n", + "x_train = trainvalid_data[:-num_validation_samples]\n", + "y_train = trainvalid_labels[:-num_validation_samples]\n", + "x_val = trainvalid_data[-num_validation_samples:]\n", + "y_val = trainvalid_labels[-num_validation_samples:]\n", + "#This is the data we will use for CNN and RNN training\n", + "print('Splitting the train data into train and valid is done')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "WUHqg2vvb5_l", + "outputId": "8387eda1-18f0-4254-9819-e63191b8fc04" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - 
"colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "TTY-4K-Ob5_t", - "outputId": "836681ca-936e-400a-8973-0754759bb7cd" - }, - "source": [ - "print('Define a 1D CNN model.')\n", - "\n", - "cnnmodel = Sequential()\n", - "cnnmodel.add(embedding_layer)\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(GlobalMaxPooling1D())\n", - "cnnmodel.add(Dense(128, activation='relu'))\n", - "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", - "\n", - "cnnmodel.compile(loss='categorical_crossentropy',\n", - " optimizer='rmsprop',\n", - " metrics=['acc'])\n", - "#Train the model. Tune to validation set. \n", - "cnnmodel.fit(x_train, y_train,\n", - " batch_size=128,\n", - " epochs=1, validation_data=(x_val, y_val))\n", - "#Evaluate on test set:\n", - "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", - "print('Test accuracy with CNN:', acc)" - ], - "execution_count": 11, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Define a 1D CNN model.\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 156s 8ms/sample - loss: 0.6706 - acc: 0.5972 - val_loss: 0.5116 - val_acc: 0.7512\n", - "25000/25000 [==============================] - 67s 3ms/sample - loss: 0.5239 - acc: 0.7415\n", - "Test accuracy with CNN: 0.74152\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Preparing embedding matrix.\n", + "Found 400000 word vectors in Glove embeddings.\n", + "Preparing of embedding matrix is done\n" + ] + } + ], + "source": [ + "print('Preparing embedding matrix.')\n", + "\n", + "# first, build index mapping words in the embeddings set\n", + "# to their embedding vector\n", + "embeddings_index = {}\n", + "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:\n", + " for line in f:\n", + " values = line.split()\n", + " word = values[0]\n", + " coefs = np.asarray(values[1:], dtype='float32')\n", + " embeddings_index[word] = coefs\n", + "\n", + "print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))\n", + "#print(embeddings_index[\"google\"])\n", + "\n", + "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n", + "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", + "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", + "for word, i in word_index.items():\n", + " if i > MAX_NUM_WORDS:\n", + " continue\n", + " embedding_vector = embeddings_index.get(word)\n", + " if embedding_vector is not None:\n", + " # words not found in embedding index will be all-zeros.\n", + " embedding_matrix[i] = embedding_vector\n", + "\n", + "# load these pre-trained word embeddings into an Embedding layer\n", + "# note that we set trainable = False so as to keep the embeddings fixed\n", + "embedding_layer = Embedding(num_words,\n", + " EMBEDDING_DIM,\n", + " embeddings_initializer=Constant(embedding_matrix),\n", + " input_length=MAX_SEQUENCE_LENGTH,\n", + " trainable=False)\n", + "print(\"Preparing of embedding matrix is done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vEastnX8gdxR" + }, + "source": [ + "### 1D CNN Model with pre-trained embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "TTY-4K-Ob5_t", + 
"outputId": "836681ca-936e-400a-8973-0754759bb7cd" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "VdDj2FJzgi_W" - }, - "source": [ - "### 1D CNN model with training your own embedding" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Define a 1D CNN model.\n" + ] }, { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zI0bISwRb5_w", - "outputId": "d7697504-dacb-415c-b131-b89d6b10c771" - }, - "source": [ - "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n", - "cnnmodel = Sequential()\n", - "cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(GlobalMaxPooling1D())\n", - "cnnmodel.add(Dense(128, activation='relu'))\n", - "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", - "\n", - "cnnmodel.compile(loss='categorical_crossentropy',\n", - " optimizer='rmsprop',\n", - " metrics=['acc'])\n", - "#Train the model. Tune to validation set. 
\n", - "cnnmodel.fit(x_train, y_train,\n", - " batch_size=128,\n", - " epochs=1, validation_data=(x_val, y_val))\n", - "#Evaluate on test set:\n", - "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", - "print('Test accuracy with CNN:', acc)" - ], - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 234s 12ms/sample - loss: 0.5323 - acc: 0.6927 - val_loss: 0.3179 - val_acc: 0.8644\n", - "25000/25000 [==============================] - 84s 3ms/sample - loss: 0.3409 - acc: 0.8495\n", - "Test accuracy with CNN: 0.84948\n" - ], - "name": "stdout" - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-15 09:55:06.494956: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 80000000 exceeds 10% of free system memory.\n", + "2023-08-15 09:55:07.639684: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51200000 exceeds 10% of free system memory.\n", + "2023-08-15 09:55:07.663242: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65273856 exceeds 10% of free system memory.\n", + "2023-08-15 09:55:08.392478: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 26083328 exceeds 10% of free system memory.\n", + "2023-08-15 09:55:08.392575: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65273856 exceeds 10% of free system memory.\n" + ] }, { - 
"cell_type": "markdown", - "metadata": { - "id": "6GwhXpmSgt4H" - }, - "source": [ - "### LSTM Model with training your own embedding " - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "157/157 [==============================] - 122s 768ms/step - loss: 0.6612 - acc: 0.6183 - val_loss: 0.5199 - val_acc: 0.7728\n", + "782/782 [==============================] - 46s 59ms/step - loss: 0.5269 - acc: 0.7656\n", + "Test accuracy with CNN: 0.76555997133255\n" + ] + } + ], + "source": [ + "print('Define a 1D CNN model.')\n", + "\n", + "cnnmodel = Sequential()\n", + "cnnmodel.add(embedding_layer)\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(GlobalMaxPooling1D())\n", + "cnnmodel.add(Dense(128, activation='relu'))\n", + "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", + "\n", + "cnnmodel.compile(loss='categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['acc'])\n", + "#Train the model. Tune to validation set. 
\n", + "cnnmodel.fit(x_train, y_train,\n", + " batch_size=128,\n", + " epochs=1, validation_data=(x_val, y_val))\n", + "#Evaluate on test set:\n", + "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", + "print('Test accuracy with CNN:', acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VdDj2FJzgi_W" + }, + "source": [ + "### 1D CNN model with training your own embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "zI0bISwRb5_w", + "outputId": "d7697504-dacb-415c-b131-b89d6b10c771" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SvBt2Brib5_4", - "outputId": "008fe9fa-13bf-4127-ba46-67916426ddbe" - }, - "source": [ - "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n", - "\n", - "#model\n", - "rnnmodel = Sequential()\n", - "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", - "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", - "rnnmodel.add(Dense(2, activation='sigmoid'))\n", - "rnnmodel.compile(loss='binary_crossentropy',\n", - " optimizer='adam',\n", - " metrics=['accuracy'])\n", - "print('Training the RNN')\n", - "\n", - "rnnmodel.fit(x_train, y_train,\n", - " batch_size=32,\n", - " epochs=1,\n", - " validation_data=(x_val, y_val))\n", - "score, acc = rnnmodel.evaluate(test_data, test_labels,\n", - " batch_size=32)\n", - "print('Test accuracy with RNN:', acc)" - ], - "execution_count": 13, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, training embedding layer on the fly\n", - "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, 
which has the same broadcast rule as np.where\n", - "Training the RNN\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 1365s 68ms/sample - loss: 0.4997 - acc: 0.7506 - val_loss: 0.3839 - val_acc: 0.8403\n", - "25000/25000 [==============================] - 198s 8ms/sample - loss: 0.3962 - acc: 0.8300\n", - "Test accuracy with RNN: 0.82998\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", + "157/157 [==============================] - 200s 1s/step - loss: 0.6526 - acc: 0.5850 - val_loss: 0.5804 - val_acc: 0.7002\n", + "782/782 [==============================] - 55s 70ms/step - loss: 0.5655 - acc: 0.7103\n", + "Test accuracy with CNN: 0.7102800011634827\n" + ] + } + ], + "source": [ + "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n", + "cnnmodel = Sequential()\n", + "cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(GlobalMaxPooling1D())\n", + "cnnmodel.add(Dense(128, activation='relu'))\n", + "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", + "\n", + "cnnmodel.compile(loss='categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['acc'])\n", + "#Train the model. Tune to validation set. 
\n", + "cnnmodel.fit(x_train, y_train,\n", + " batch_size=128,\n", + " epochs=1, validation_data=(x_val, y_val))\n", + "#Evaluate on test set:\n", + "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", + "print('Test accuracy with CNN:', acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6GwhXpmSgt4H" + }, + "source": [ + "### LSTM Model with training your own embedding " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "SvBt2Brib5_4", + "outputId": "008fe9fa-13bf-4127-ba46-67916426ddbe" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "tJYzsZFSg9z-" - }, - "source": [ - "### LSTM Model using pre-trained Embedding Layer" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Defining and training an LSTM model, training embedding layer on the fly\n", + "Training the RNN\n", + "625/625 [==============================] - 709s 1s/step - loss: 0.5129 - accuracy: 0.7477 - val_loss: 0.4001 - val_accuracy: 0.8254\n", + "782/782 [==============================] - 156s 199ms/step - loss: 0.3995 - accuracy: 0.8270\n", + "Test accuracy with RNN: 0.8270000219345093\n" + ] + } + ], + "source": [ + "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n", + "\n", + "#model\n", + "rnnmodel = Sequential()\n", + "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", + "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", + "rnnmodel.add(Dense(2, activation='sigmoid'))\n", + "rnnmodel.compile(loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])\n", + "print('Training the RNN')\n", + "\n", + "rnnmodel.fit(x_train, y_train,\n", + " batch_size=32,\n", + " epochs=1,\n", + " validation_data=(x_val, y_val))\n", + "score, acc = rnnmodel.evaluate(test_data, test_labels,\n", + " batch_size=32)\n", + "print('Test accuracy with RNN:', acc)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "tJYzsZFSg9z-" + }, + "source": [ + "### LSTM Model using pre-trained Embedding Layer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "Eymx0IyCb5_-", + "outputId": "da0fa303-a4c4-4b92-ff42-54f1a1d51e45" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Eymx0IyCb5_-", - "outputId": "da0fa303-a4c4-4b92-ff42-54f1a1d51e45" - }, - "source": [ - "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n", - "\n", - "rnnmodel2 = Sequential()\n", - "rnnmodel2.add(embedding_layer)\n", - "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", - "rnnmodel2.add(Dense(2, activation='sigmoid'))\n", - "rnnmodel2.compile(loss='binary_crossentropy',\n", - " optimizer='adam',\n", - " metrics=['accuracy'])\n", - "print('Training the RNN')\n", - "\n", - "rnnmodel2.fit(x_train, y_train,\n", - " batch_size=32,\n", - " epochs=1,\n", - " validation_data=(x_val, y_val))\n", - "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n", - " batch_size=32)\n", - "print('Test accuracy with RNN:', acc)" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, using pre-trained embedding layer\n", - "Training the RNN\n", - "Train on 20000 samples, validate on 5000 samples\n", - "20000/20000 [==============================] - 1156s 58ms/sample - loss: 0.6122 - acc: 0.6602 - val_loss: 0.4538 - val_acc: 0.8017\n", - "25000/25000 [==============================] - 200s 8ms/sample - loss: 0.4666 - acc: 0.7930\n", - "Test accuracy with RNN: 0.793\n" - ], - "name": "stdout" - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Defining and training an LSTM model, using pre-trained embedding layer\n", + "Training the RNN\n", + "625/625 [==============================] - 548s 
875ms/step - loss: 0.6192 - accuracy: 0.6578 - val_loss: 0.4754 - val_accuracy: 0.7876\n", + "782/782 [==============================] - 140s 179ms/step - loss: 0.4757 - accuracy: 0.7847\n", + "Test accuracy with RNN: 0.7847200036048889\n" + ] } - ] -} \ No newline at end of file + ], + "source": [ + "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n", + "\n", + "rnnmodel2 = Sequential()\n", + "rnnmodel2.add(embedding_layer)\n", + "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", + "rnnmodel2.add(Dense(2, activation='sigmoid'))\n", + "rnnmodel2.compile(loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])\n", + "print('Training the RNN')\n", + "\n", + "rnnmodel2.fit(x_train, y_train,\n", + " batch_size=32,\n", + " epochs=1,\n", + " validation_data=(x_val, y_val))\n", + "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n", + " batch_size=32)\n", + "print('Test accuracy with RNN:', acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "05_DeepNN_Example.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 042f5bfa5dc86f4028a09fd64b5d48143fae5f88 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 11:41:08 +0000 Subject: [PATCH 07/14] Updated Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb by removing errors and updating libraries --- ...6_BERT_IMDB_Sentiment_Classification.ipynb | 541 ++---------------- 1 file 
changed, 49 insertions(+), 492 deletions(-) diff --git a/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb b/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb index 30eb9b3..a82ddd6 100644 --- a/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb +++ b/Ch4/06_BERT_IMDB_Sentiment_Classification.ipynb @@ -24,91 +24,21 @@ "id": "MK-POIlJE0Eu", "outputId": "490a8c7e-e8b3-4522-e448-37b50ef91109" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: tensorflow==1.14.0 in /usr/local/lib/python3.7/dist-packages (1.14.0)\n", - "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.12.0)\n", - "Requirement already satisfied: tensorflow-estimator<1.15.0rc0,>=1.14.0rc0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.14.0)\n", - "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.0)\n", - "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.12.1)\n", - "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.36.2)\n", - "Requirement already satisfied: tensorboard<1.15.0,>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.14.0)\n", - "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.15.0)\n", - "Requirement already satisfied: google-pasta>=0.1.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.2.0)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.34.1)\n", - "Requirement already satisfied: numpy<2.0,>=1.14.5 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow==1.14.0) (1.19.5)\n", - "Requirement already satisfied: gast>=0.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.4.0)\n", - "Requirement already satisfied: keras-applications>=1.0.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.0.8)\n", - "Requirement already satisfied: protobuf>=3.6.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (3.17.3)\n", - "Requirement already satisfied: astor>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (0.8.1)\n", - "Requirement already satisfied: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from tensorflow==1.14.0) (1.1.2)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (57.2.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.3.4)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (1.0.1)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from keras-applications>=1.0.6->tensorflow==1.14.0) (3.1.0)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (4.6.1)\n", - "Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->keras-applications>=1.0.6->tensorflow==1.14.0) (1.5.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; 
python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<1.15.0,>=1.14.0->tensorflow==1.14.0) (3.7.4.3)\n", - "Requirement already satisfied: torch==1.9.0 in /usr/local/lib/python3.7/dist-packages (1.9.0+cu102)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.9.0) (3.7.4.3)\n", - "Requirement already satisfied: scikit-learn==0.21.3 in /usr/local/lib/python3.7/dist-packages (0.21.3)\n", - "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.19.5)\n", - "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.21.3) (1.0.1)\n", - "Requirement already satisfied: pytorch_pretrained_bert==0.6.2 in /usr/local/lib/python3.7/dist-packages (0.6.2)\n", - "Requirement already satisfied: pytorch-nlp==0.5.0 in /usr/local/lib/python3.7/dist-packages (0.5.0)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (2019.12.20)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.19.5)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (2.23.0)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (4.41.1)\n", - "Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.9.0+cu102)\n", - "Requirement already satisfied: boto3 in /usr/local/lib/python3.7/dist-packages (from pytorch_pretrained_bert==0.6.2) (1.18.1)\n", - "Requirement already 
satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (2021.5.30)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->pytorch_pretrained_bert==0.6.2) (3.0.4)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch>=0.4.1->pytorch_pretrained_bert==0.6.2) (3.7.4.3)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (0.10.0)\n", - "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (0.5.0)\n", - "Requirement already satisfied: botocore<1.22.0,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from boto3->pytorch_pretrained_bert==0.6.2) (1.21.1)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.7/dist-packages (from botocore<1.22.0,>=1.21.1->boto3->pytorch_pretrained_bert==0.6.2) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.22.0,>=1.21.1->boto3->pytorch_pretrained_bert==0.6.2) (1.15.0)\n", - "Requirement already satisfied: tqdm==4.41.1 in /usr/local/lib/python3.7/dist-packages (4.41.1)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: 
pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Requirement already satisfied: matplotlib==3.2.2 in /usr/local/lib/python3.7/dist-packages (3.2.2)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.3.1)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.8.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (2.4.7)\n", - "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (1.19.5)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.2.2) (0.10.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib==3.2.2) (1.15.0)\n", - "Requirement already satisfied: beautifulsoup4==4.6.3 in /usr/local/lib/python3.7/dist-packages (4.6.3)\n" - ] - } - ], + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install tensorflow==1.14.0\n", - "!pip install torch==1.9.0\n", - "!pip install scikit-learn==0.21.3\n", - "!pip install pytorch_pretrained_bert==0.6.2 pytorch-nlp==0.5.0 \n", - "!pip install tqdm==4.41.1\n", - "!pip install pandas==1.1.5\n", - "!pip install matplotlib==3.2.2\n", - "!pip install beautifulsoup4==4.6.3\n", + "# !pip install numpy==1.19.5\n", + "# !pip 
install tensorflow==1.14.0\n", + "# !pip install torch==1.9.0\n", + "# !pip install scikit-learn==0.21.3\n", + "# !pip install pytorch_pretrained_bert==0.6.2 pytorch-nlp==0.5.0 \n", + "# !pip install tqdm==4.41.1\n", + "# !pip install pandas==1.1.5\n", + "# !pip install matplotlib==3.2.2\n", + "# !pip install beautifulsoup4==4.6.3\n", "\n", "# ===========================" ] @@ -140,7 +70,17 @@ "metadata": { "id": "TtokjlkCQbiw" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-15 11:05:48.226287: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 11:05:48.661080: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-08-15 11:05:50.154202: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + } + ], "source": [ "#importing a few necessary packages and setting the DATA directory\n", "DATA_DIR=\".\"\n", @@ -151,8 +91,6 @@ "import pickle\n", "import tensorflow as tf\n", "\n", - "\n", - "\n", "# BERT imports\n", "import torch\n", "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", @@ -199,36 +137,7 @@ "id": "BI8AvyFZRAha", "outputId": "b254d1da-f187-4c77-f1e0-748a5e6a8e90" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. 
Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving IMDB Dataset.csv to IMDB Dataset.csv\n" - ] - } - ], + "outputs": [], "source": [ "# uploading and reading the dataset\n", "# source for dataset: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews\n", @@ -242,7 +151,7 @@ "except ModuleNotFoundError :\n", " \n", " # After downnloading the dataset, put the IMDB Dataset.csv file in Data folder.\n", - " df = pd.read_csv(\"Data/IMDB Dataset.csv\",engine='python', error_bad_lines=False)" + " df = pd.read_csv(\"Data/IMDB Dataset.csv\",engine='python', on_bad_lines='warn')" ] }, { @@ -322,9 +231,7 @@ ] }, "execution_count": 5, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -361,15 +268,14 @@ { "data": { "text/plain": [ + "sentiment\n", "1 25000\n", "0 25000\n", - "Name: sentiment, dtype: int64" + "Name: count, dtype: int64" ] }, "execution_count": 7, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -476,9 +382,7 @@ ] }, "execution_count": 9, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -526,17 +430,12 @@ "outputs": [ { "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, "text/plain": [ "\"[CLS] One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked They are right, as this is exactly what happened with meThe first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO Trust me, this is not a show for the faint hearted or timid This show pulls no punches with regards to drugs, sex or violence Its is hardcore, in the classic use of the wordIt is called OZ as that is the nickname given to the Oswald 
Maximum Security State Penitentary It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda Em City is home to manyAryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and moreso scuffles, death stares, dodgy dealings and shady agreements are never far awayI would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare Forget pretty pictures painted for mainstream audiences, forget charm, forget romanceOZ doesn't mess around The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence Not just violence, but injustice crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience Watching Oz, you may become comfortable with what is uncomfortable viewingthats if you can get in touch with your darker side [SEP]\"" ] }, "execution_count": 11, - "metadata": { - "tags": [] - }, + "metadata": {}, "output_type": "execute_result" } ], @@ -564,13 +463,6 @@ "outputId": "e80ff8c7-991d-45a4-9caf-600f9e694998" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 231508/231508 [00:00<00:00, 312015.27B/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -718,13 +610,6 @@ "outputId": "d36884cd-ea8b-4954-ad2d-303d065f0ea0" }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 407873900/407873900 [00:34<00:00, 11838492.74B/s]\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -740,260 +625,7 @@ " )\n", " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", - " (0): BertLayer(\n", - " (attention): 
BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (1): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (2): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " 
(value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (3): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (4): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, 
out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (5): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (6): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): 
Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (7): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (8): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): 
BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (9): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (10): BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): BertLayerNorm()\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (11): BertLayer(\n", + " (0-11): 12 x BertLayer(\n", " (attention): BertAttention(\n", " (self): 
BertSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", @@ -1058,7 +690,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1073,97 +705,8 @@ "output_type": "stream", "text": [ "t_total value of -1 results in schedule not being applied\n", - "Epoch: 0%| | 0/4 [00:00" - ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" } ], "source": [ @@ -1257,6 +800,20 @@ "plt.plot(train_loss_set)\n", "plt.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1267,7 +824,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1281,9 +838,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From 335d0ea3f7ceab28cd92648b1b2a94da47a0ab3c Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 11:41:35 +0000 Subject: [PATCH 08/14] Updated Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb by removing errors and updating libraries --- ...Sentiment_Classification_IMDB_ktrain.ipynb | 394 +++++------------- 1 file changed, 109 insertions(+), 285 deletions(-) diff --git a/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb b/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb index ffed776..83eed16 100644 --- a/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb +++ b/Ch4/07_BERT_Sentiment_Classification_IMDB_ktrain.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" @@ -19,170 +19,22 @@ "id": "TF5qfV_flTbr", "outputId": "b536d10d-767d-4a8d-9cd6-2ea607550b1b" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy==1.19.5 in /usr/local/lib/python3.7/dist-packages (1.19.5)\n", - "Requirement already satisfied: pandas==1.1.5 in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2.8.1)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (2018.9)\n", - "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas==1.1.5) (1.19.5)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas==1.1.5) (1.15.0)\n", - "Collecting ktrain==0.26.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/4c/88/10d29578f47d0d140bf669d5598e9f5a50465ddc423b32031c65e840d003/ktrain-0.26.3.tar.gz (25.3MB)\n", - "\u001b[K |████████████████████████████████| 25.3MB 1.6MB/s \n", - "\u001b[?25hCollecting scikit-learn==0.23.2\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)\n", - "\u001b[K |████████████████████████████████| 6.8MB 41.4MB/s \n", - "\u001b[?25hRequirement already satisfied: matplotlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (3.2.2)\n", - "Requirement already satisfied: pandas>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.1.5)\n", - "Requirement already satisfied: fastprogress>=0.1.21 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.0.0)\n", - "Requirement already satisfied: requests in 
/usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (2.23.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (1.0.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (21.0)\n", - "Requirement already satisfied: ipython in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (5.5.0)\n", - "Collecting langdetect\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981kB)\n", - "\u001b[K |████████████████████████████████| 983kB 43.3MB/s \n", - "\u001b[?25hRequirement already satisfied: jieba in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (0.42.1)\n", - "Collecting cchardet\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)\n", - "\u001b[K |████████████████████████████████| 266kB 50.8MB/s \n", - "\u001b[?25hCollecting syntok\n", - " Downloading https://files.pythonhosted.org/packages/8c/76/a49e73a04b3e3a14ce232e8e28a1587f8108baa665644fe8c40e307e792e/syntok-1.3.1.tar.gz\n", - "Collecting seqeval==0.0.19\n", - " Downloading https://files.pythonhosted.org/packages/93/e5/b7705156a77f742cfe4fc6f22d0c71591edb2d243328dff2f8fc0f933ab6/seqeval-0.0.19.tar.gz\n", - "Collecting transformers<=4.3.3,>=4.0.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)\n", - "\u001b[K |████████████████████████████████| 1.9MB 37.5MB/s \n", - "\u001b[?25hCollecting sentencepiece\n", - "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)\n", - "\u001b[K |████████████████████████████████| 1.2MB 41.8MB/s \n", - "\u001b[?25hCollecting keras_bert>=0.86.0\n", - " Downloading https://files.pythonhosted.org/packages/6a/e4/3b2e2927c15c22f44005cb0ab0eaf2f7e623ea2b6488e4b7c5aca6c162c2/keras-bert-0.88.0.tar.gz\n", - "Requirement already satisfied: networkx>=2.3 in /usr/local/lib/python3.7/dist-packages (from ktrain==0.26.3) (2.5.1)\n", - "Collecting whoosh\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)\n", - "\u001b[K |████████████████████████████████| 471kB 34.8MB/s \n", - "\u001b[?25hRequirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.23.2->ktrain==0.26.3) (1.4.1)\n", - "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.7/dist-packages (from scikit-learn==0.23.2->ktrain==0.26.3) (1.19.5)\n", - "Collecting threadpoolctl>=2.0.0\n", - " Downloading https://files.pythonhosted.org/packages/c6/e8/c216b9b60cbba4642d3ca1bae7a53daa0c24426f662e0e3ce3dc7f6caeaa/threadpoolctl-2.2.0-py3-none-any.whl\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (0.10.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (1.3.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (2.4.7)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib>=3.0.0->ktrain==0.26.3) (2.8.1)\n", - "Requirement 
already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0.1->ktrain==0.26.3) (2018.9)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (2021.5.30)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->ktrain==0.26.3) (1.24.3)\n", - "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (0.8.1)\n", - "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (5.0.5)\n", - "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (57.2.0)\n", - "Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (4.4.2)\n", - "Requirement already satisfied: pexpect; sys_platform != \"win32\" in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (4.8.0)\n", - "Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (1.0.18)\n", - "Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (0.7.5)\n", - "Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython->ktrain==0.26.3) (2.6.1)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from langdetect->ktrain==0.26.3) (1.15.0)\n", - "Requirement already satisfied: regex in 
/usr/local/lib/python3.7/dist-packages (from syntok->ktrain==0.26.3) (2019.12.20)\n", - "Requirement already satisfied: Keras>=2.2.4 in /usr/local/lib/python3.7/dist-packages (from seqeval==0.0.19->ktrain==0.26.3) (2.4.3)\n", - "Collecting sacremoses\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", - "\u001b[K |████████████████████████████████| 901kB 41.4MB/s \n", - "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)\n", - "\u001b[K |████████████████████████████████| 3.3MB 37.9MB/s \n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (4.41.1)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.0.12)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (4.6.1)\n", - "Collecting keras-transformer>=0.39.0\n", - " Downloading https://files.pythonhosted.org/packages/8a/35/6b079e920fe09a9349028bc2f209447e5636d90e29c5cf060bcc3177803a/keras-transformer-0.39.0.tar.gz\n", - "Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.7/dist-packages (from traitlets>=4.2->ipython->ktrain==0.26.3) (0.2.0)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.7/dist-packages (from pexpect; sys_platform != \"win32\"->ipython->ktrain==0.26.3) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from 
prompt-toolkit<2.0.0,>=1.0.4->ipython->ktrain==0.26.3) (0.2.5)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (3.13)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.7/dist-packages (from Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (3.1.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (7.1.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.5.0)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers<=4.3.3,>=4.0.0->ktrain==0.26.3) (3.7.4.3)\n", - "Collecting keras-pos-embd>=0.12.0\n", - " Downloading https://files.pythonhosted.org/packages/d8/d2/1cc072ea68b573f366e08936177a33e237e66fa7d5338289d4bee64696cf/keras-pos-embd-0.12.0.tar.gz\n", - "Collecting keras-multi-head>=0.28.0\n", - " Downloading https://files.pythonhosted.org/packages/a5/e6/a83f26b2e1582de237b125f595874d808e40698f31d44d5903e872d5b64d/keras-multi-head-0.28.0.tar.gz\n", - "Collecting keras-layer-normalization>=0.15.0\n", - " Downloading https://files.pythonhosted.org/packages/33/e1/0da586d544a0940a56a2f4aa704b7dbd95eaa8ceda6168b48f5ac95e6608/keras-layer-normalization-0.15.0.tar.gz\n", - "Collecting keras-position-wise-feed-forward>=0.7.0\n", - " Downloading https://files.pythonhosted.org/packages/58/02/cd3e7e51cf45d3825818384a2f7d9c340b60c9bf55a5682b7318e1c16eab/keras-position-wise-feed-forward-0.7.0.tar.gz\n", - "Collecting keras-embed-sim>=0.9.0\n", - " Downloading https://files.pythonhosted.org/packages/2d/48/78f6d134f1ede597d91186819c9e428ada51cd8d9ea28e5faf37ed2ee602/keras-embed-sim-0.9.0.tar.gz\n", - 
"Requirement already satisfied: cached-property; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from h5py->Keras>=2.2.4->seqeval==0.0.19->ktrain==0.26.3) (1.5.2)\n", - "Collecting keras-self-attention>=0.50.0\n", - " Downloading https://files.pythonhosted.org/packages/ea/75/e6bc5b43ee968fef714f2f10a2a1674639ec85d2428cc47b2fe1f9af0115/keras-self-attention-0.50.0.tar.gz\n", - "Building wheels for collected packages: ktrain, langdetect, syntok, seqeval, keras-bert, keras-transformer, keras-pos-embd, keras-multi-head, keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-self-attention\n", - " Building wheel for ktrain (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for ktrain: filename=ktrain-0.26.3-cp37-none-any.whl size=25282390 sha256=0f129e50aaa4d78ab674e5f6b95d1c66df8f4fa6b62a1ac02b1867c70bbbdecd\n", - " Stored in directory: /root/.cache/pip/wheels/16/05/be/d6e659b3349016b1059e19fa028f165af4eeae2c196f329112\n", - " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for langdetect: filename=langdetect-1.0.9-cp37-none-any.whl size=993242 sha256=489499c000ae032ae91b31fdedc7ec2d0ef1fccbd3b997508abed818cd45e520\n", - " Stored in directory: /root/.cache/pip/wheels/7e/18/13/038c34057808931c7ddc6c92d3aa015cf1a498df5a70268996\n", - " Building wheel for syntok (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for syntok: filename=syntok-1.3.1-cp37-none-any.whl size=20919 sha256=d2ed41e31e9075584cdf09f7dcd35228826294d924d63d47ea163b49411be409\n", - " Stored in directory: /root/.cache/pip/wheels/51/c6/a4/be1920586c49469846bcd2888200bdecfe109ec421dab9be2d\n", - " Building wheel for seqeval (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for seqeval: filename=seqeval-0.0.19-cp37-none-any.whl size=9932 sha256=0f0d22a626859918451e3439effefce4ee362409cc6a0afe0d953ebb60ab7e3b\n", - " Stored in directory: /root/.cache/pip/wheels/8d/1f/bf/1198beceed805a2099060975f6281d1b01046dd279e19c97be\n", - " Building wheel for keras-bert (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-bert: filename=keras_bert-0.88.0-cp37-none-any.whl size=34206 sha256=847f5cdc7a31d9961b28e9c07b757d07c882be271c96a892b97c68b0ce425518\n", - " Stored in directory: /root/.cache/pip/wheels/7f/d8/86/b4d91b941f6f3256c487b258d5e4268a3301203b717dd11f11\n", - " Building wheel for keras-transformer (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-transformer: filename=keras_transformer-0.39.0-cp37-none-any.whl size=12841 sha256=ddc1c3d23d4f739bf6269455f093aeac6995c0ec9a6d38ac3af3e5744d012e57\n", - " Stored in directory: /root/.cache/pip/wheels/77/42/35/d33c5907bca04ac5742e9eceefb644b680286de26728506a70\n", - " Building wheel for keras-pos-embd (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-pos-embd: filename=keras_pos_embd-0.12.0-cp37-none-any.whl size=7471 sha256=c3a4694a4c7002edf0f340cc03c67b10a1d2b07aa9a6d8cadd4c417d56be27dd\n", - " Stored in directory: /root/.cache/pip/wheels/36/d8/36/06ed09215806dca9ff504d8c0dda5da68d7f2c67d34a231d82\n", - " Building wheel for keras-multi-head (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-multi-head: filename=keras_multi_head-0.28.0-cp37-none-any.whl size=15559 sha256=8bdc034bc047b17f8a5b6c5f9e22e5a9d500f0d7c6068048d8549ce75d8a0237\n", - " Stored in directory: /root/.cache/pip/wheels/ec/92/bd/b3407bc29501f7e28eb970a6c425a9a375485c5d8197df6a8f\n", - " Building wheel for keras-layer-normalization (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-layer-normalization: filename=keras_layer_normalization-0.15.0-cp37-none-any.whl size=5224 sha256=1f6ae80fd5d9dfe471270c411aff14452b56bcad9abb0ada2e955ee9de5ad0b4\n", - " Stored in directory: /root/.cache/pip/wheels/de/ea/db/833c8a9b8326e703e9f8a78c0d4153294e6a1b1f97a1836397\n", - " Building wheel for keras-position-wise-feed-forward (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-position-wise-feed-forward: filename=keras_position_wise_feed_forward-0.7.0-cp37-none-any.whl size=5542 sha256=8794e806be2e654221719710e19e5219aa71df372998360a12e73281735ff2b8\n", - " Stored in directory: /root/.cache/pip/wheels/d2/d2/f6/58ce0aae0055dbccba8b40e62a6c22ab997105ad8c431a9e80\n", - " Building wheel for keras-embed-sim (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-embed-sim: filename=keras_embed_sim-0.9.0-cp37-none-any.whl size=4505 sha256=1167638915a16210f6e8f52ed29f112b1ff7abfb096bbb3c25f9b9c3d0ab52c6\n", - " Stored in directory: /root/.cache/pip/wheels/c1/d5/7d/bef5ee93c88bc6150294cc74cbb081647c505bf816918dd7ff\n", - " Building wheel for keras-self-attention (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for keras-self-attention: filename=keras_self_attention-0.50.0-cp37-none-any.whl size=19416 sha256=40ca0b3fc31fcb65f163ad549b188cedd452b68d48d21e636b467155b7e43d73\n", - " Stored in directory: /root/.cache/pip/wheels/29/93/0d/891573db60f74d0e43bd7db1496c3ef898f8b5946a4c24cbda\n", - "Successfully built ktrain langdetect syntok seqeval keras-bert keras-transformer keras-pos-embd keras-multi-head keras-layer-normalization keras-position-wise-feed-forward keras-embed-sim keras-self-attention\n", - "Installing collected packages: threadpoolctl, scikit-learn, langdetect, cchardet, syntok, seqeval, sacremoses, tokenizers, transformers, sentencepiece, keras-pos-embd, keras-self-attention, keras-multi-head, keras-layer-normalization, keras-position-wise-feed-forward, keras-embed-sim, keras-transformer, keras-bert, whoosh, ktrain\n", - " Found existing installation: scikit-learn 0.22.2.post1\n", - " Uninstalling scikit-learn-0.22.2.post1:\n", - " Successfully uninstalled scikit-learn-0.22.2.post1\n", - "Successfully installed cchardet-2.1.7 keras-bert-0.88.0 keras-embed-sim-0.9.0 keras-layer-normalization-0.15.0 keras-multi-head-0.28.0 keras-pos-embd-0.12.0 keras-position-wise-feed-forward-0.7.0 keras-self-attention-0.50.0 keras-transformer-0.39.0 ktrain-0.26.3 langdetect-1.0.9 sacremoses-0.0.45 scikit-learn-0.23.2 sentencepiece-0.1.96 seqeval-0.0.19 syntok-1.3.1 threadpoolctl-2.2.0 tokenizers-0.10.3 transformers-4.3.3 whoosh-2.7.4\n" - ] - } - ], + "outputs": [], "source": [ "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", "\n", "# ===========================\n", "\n", - "!pip install numpy==1.19.5\n", - "!pip install pandas==1.1.5\n", - "!pip install ktrain==0.26.3\n", + "# !pip install numpy==1.19.5\n", + "# !pip install pandas==1.1.5\n", + "# !pip install ktrain==0.26.3\n", "\n", "# ===========================" ] }, { "cell_type": "code", - "execution_count": 2, + 
"execution_count": 3, "metadata": { "id": "_UN7tuqnlTbs" }, @@ -203,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -211,91 +63,28 @@ "id": "58WB13Jx3rQm", "outputId": "9af6cd3f-771e-4807-d041-bb8a3290bea1" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting tensorflow==2.4.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/94/0a/012cc33c643d844433d13001dd1db179e7020b05ddbbd0a9dc86c38a8efa/tensorflow-2.4.0-cp37-cp37m-manylinux2010_x86_64.whl (394.7MB)\n", - "\u001b[K |████████████████████████████████| 394.7MB 41kB/s \n", - "\u001b[?25hCollecting h5py~=2.10.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/3f/c0/abde58b837e066bca19a3f7332d9d0493521d7dd6b48248451a9e3fe2214/h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9MB)\n", - "\u001b[K |████████████████████████████████| 2.9MB 45.8MB/s \n", - "\u001b[?25hRequirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.1.0)\n", - "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.12.0)\n", - "Requirement already satisfied: numpy~=1.19.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.19.5)\n", - "Requirement already satisfied: six~=1.15.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.15.0)\n", - "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.12.1)\n", - "Collecting tensorflow-estimator<2.5.0,>=2.4.0rc0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/74/7e/622d9849abf3afb81e482ffc170758742e392ee129ce1540611199a59237/tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462kB)\n", - "\u001b[K |████████████████████████████████| 471kB 39.7MB/s \n", - "\u001b[?25hCollecting 
grpcio~=1.32.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/06/54/1c8be62beafe7fb1548d2968e518ca040556b46b0275399d4f3186c56d79/grpcio-1.32.0-cp37-cp37m-manylinux2014_x86_64.whl (3.8MB)\n", - "\u001b[K |████████████████████████████████| 3.8MB 37.5MB/s \n", - "\u001b[?25hRequirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.3.0)\n", - "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.7.4.3)\n", - "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.12)\n", - "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (3.17.3)\n", - "Requirement already satisfied: tensorboard~=2.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (2.5.0)\n", - "Collecting gast==0.3.3\n", - " Downloading https://files.pythonhosted.org/packages/d6/84/759f5dd23fec8ba71952d97bcc7e2c9d7d63bdc582421f3cd4be845f0c98/gast-0.3.3-py2.py3-none-any.whl\n", - "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.36.2)\n", - "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.6.3)\n", - "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (0.2.0)\n", - "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.4.0) (1.1.2)\n", - "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (0.4.4)\n", - "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.32.1)\n", - 
"Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.8.0)\n", - "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (1.0.1)\n", - "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (2.23.0)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (57.2.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (3.3.4)\n", - "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow==2.4.0) (0.6.1)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow==2.4.0) (1.3.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (0.2.8)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (4.2.2)\n", - "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3.6\" in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (4.7.2)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from 
requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (2021.5.30)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow==2.4.0) (2.10)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard~=2.4->tensorflow==2.4.0) (4.6.1)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow==2.4.0) (3.1.1)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow==2.4.0) (0.4.8)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.4->tensorflow==2.4.0) (3.5.0)\n", - "Installing collected packages: h5py, tensorflow-estimator, grpcio, gast, tensorflow\n", - " Found existing installation: h5py 3.1.0\n", - " Uninstalling h5py-3.1.0:\n", - " Successfully uninstalled h5py-3.1.0\n", - " Found existing installation: tensorflow-estimator 2.5.0\n", - " Uninstalling tensorflow-estimator-2.5.0:\n", - " Successfully uninstalled tensorflow-estimator-2.5.0\n", - " Found existing installation: grpcio 1.34.1\n", - " Uninstalling grpcio-1.34.1:\n", - " Successfully uninstalled grpcio-1.34.1\n", - " Found existing installation: gast 0.4.0\n", - " Uninstalling gast-0.4.0:\n", - " Successfully uninstalled gast-0.4.0\n", - " Found existing installation: tensorflow 2.5.0\n", - " Uninstalling tensorflow-2.5.0:\n", - " Successfully uninstalled 
tensorflow-2.5.0\n", - "Successfully installed gast-0.3.3 grpcio-1.32.0 h5py-2.10.0 tensorflow-2.4.0 tensorflow-estimator-2.4.0\n" - ] - } - ], + "outputs": [], "source": [ "# use tensorflow 2.4.0 for this notebook\n", - "!pip install tensorflow==2.4.0" + "# !pip install tensorflow==2.4.0" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "id": "KN6N85ah8VXf" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "#Importing\n", "import ktrain\n", @@ -304,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -312,16 +101,7 @@ "id": "Mr1YXudk8Vti", "outputId": "4634f5ee-9c9d-4a32-9118-1845b6c43b7f" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", - "84131840/84125825 [==============================] - 6s 0us/step\n" - ] - } - ], + "outputs": [], "source": [ "##obtain the dataset\n", "import os\n", @@ -335,7 +115,7 @@ " )\n", " IMDB_DATADIR = os.path.join(os.path.dirname(dataset), \"aclImdb\")\n", "except ModuleNotFoundError :\n", - " if not os.path.exists(os.getcwd()+\"\\\\Data\\\\aclImdb\") :\n", + " if not os.path.exists(os.getcwd()+\"/Data/aclImdb\") :\n", " import tensorflow as tf\n", " dataset = tf.keras.utils.get_file(\n", " fname=\"aclImdb.tar.gz\", \n", @@ -348,7 +128,7 @@ " else :\n", "\n", " # set path to dataset\n", - " IMDB_DATADIR=os.getcwd()+\"\\\\Data\\\\aclImdb\"" + " IMDB_DATADIR=os.getcwd()+\"/Data/aclImdb\"" ] }, { @@ -363,7 +143,7 @@ }, { "cell_type": "code", - 
"execution_count": 6, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -378,14 +158,6 @@ "output_type": "stream", "text": [ "detected encoding: utf-8\n", - "downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...\n", - "[██████████████████████████████████████████████████]\n", - "extracting pretrained BERT model...\n", - "done.\n", - "\n", - "cleanup downloaded zip...\n", - "done.\n", - "\n", "preprocessing train...\n", "language: en\n" ] @@ -393,15 +165,40 @@ { "data": { "text/html": [ - "done." + "\n", + "\n" ], "text/plain": [ "" ] }, - "metadata": { - "tags": [] + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "done." + ], + "text/plain": [ + "" + ] }, + "metadata": {}, "output_type": "display_data" }, { @@ -416,15 +213,40 @@ { "data": { "text/html": [ - "done." + "\n", + "\n" ], "text/plain": [ "" ] }, - "metadata": { - "tags": [] + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "done." + ], + "text/plain": [ + "" + ] }, + "metadata": {}, "output_type": "display_data" } ], @@ -448,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -462,7 +284,21 @@ "output_type": "stream", "text": [ "Is Multi-Label? False\n", - "maxlen is 500\n", + "maxlen is 500\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.9/site-packages/keras/src/initializers/initializers.py:120: UserWarning: The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). 
Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "done.\n" ] } @@ -483,7 +319,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -499,32 +335,20 @@ "\n", "\n", "begin training using onecycle policy with max lr of 2e-05...\n", - "Epoch 1/4\n", - "4167/4167 [==============================] - 2358s 561ms/step - loss: 0.3327 - accuracy: 0.8504 - val_loss: 0.1843 - val_accuracy: 0.9311\n", - "Epoch 2/4\n", - "4167/4167 [==============================] - 2325s 558ms/step - loss: 0.1542 - accuracy: 0.9423 - val_loss: 0.2223 - val_accuracy: 0.9138\n", - "Epoch 3/4\n", - "4167/4167 [==============================] - 2323s 557ms/step - loss: 0.0899 - accuracy: 0.9677 - val_loss: 0.1847 - val_accuracy: 0.9350\n", - "Epoch 4/4\n", - "4167/4167 [==============================] - 2322s 557ms/step - loss: 0.0247 - accuracy: 0.9934 - val_loss: 0.2330 - val_accuracy: 0.9416\n" + "Epoch 1/4\n" ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" } ], "source": [ "learner.fit_onecycle(2e-5, 4)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -535,7 +359,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -549,9 +373,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.9.17" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From ca04c1e535e84bb431886ebe5745849534afe06d Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Meena Date: Tue, 15 Aug 2023 11:42:21 +0000 
Subject: [PATCH 09/14] Updated Ch4/03_and_04 by changing Markdown related to Results --- Ch4/03_Word2Vec_Example.ipynb | 9 ++++++++- Ch4/04_FastText_Example.ipynb | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index 232eeee..19b5e87 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -464,8 +464,15 @@ "id": "k7wjLB8rb_JB" }, "source": [ - "Not bad. With little efforts we got 81% accuracy. Thats a great starting model to have!!" + "Not bad. With little efforts we got 80% accuracy. Thats a great starting model to have!!" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb index 6608a17..3a57ee4 100644 --- a/Ch4/04_FastText_Example.ipynb +++ b/Ch4/04_FastText_Example.ipynb @@ -532,8 +532,15 @@ "id": "nrxSYRs3b621" }, "source": [ - "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!" + "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 90% Precision and Recall are hard numbers to beat, too!" 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 7fecdff4e97461bb59e4e03ddd1d429e2c9b74e1 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Date: Tue, 22 Aug 2023 23:50:38 +0530 Subject: [PATCH 10/14] fix: Added Freezed Library Versions to Ch4/01_OnePipeline_ManyClassifiers.ipynb --- Ch4/01_OnePipeline_ManyClassifiers.ipynb | 1391 +++++++++++----------- 1 file changed, 703 insertions(+), 688 deletions(-) diff --git a/Ch4/01_OnePipeline_ManyClassifiers.ipynb b/Ch4/01_OnePipeline_ManyClassifiers.ipynb index 0c99c5f..b5b5122 100644 --- a/Ch4/01_OnePipeline_ManyClassifiers.ipynb +++ b/Ch4/01_OnePipeline_ManyClassifiers.ipynb @@ -1,729 +1,744 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "janWv1vG5xUD" - }, - "source": [ - "# Text Classification with Naive Bayes, Logistic Regression, SVM" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gBCjEALX5xWj" - }, - "source": [ - "**Overview:** This notebook aims to give you a brief overview of performing text classification using Naive Bayes, Logistic Regression and Support Vector Machines. We will be using a dataset called \"Economic news article tone and relevance\" from [Figure-Eight](https://github.com/practical-nlp/practical-nlp/blob/master/Ch4/Data/Full-Economic-News-DFE-839861.csv) which consists of approximately 8000 news articles, which were tagged as relevant or not relevant to the US Economy. Our goal in this notebook is to explore the process of training and testing text classifiers for this problem, using this data set and two text classification algorithms: Multinomial Naive Bayes and Logistic Regression, implemented in sklearn. \n", - "\n", - "##### Dataset Link: In the a folder called Data in folder Ch4 of this repo\n", - "

\n", - "Let's import few necessary packages before we start our work" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "janWv1vG5xUD" + }, + "source": [ + "# Text Classification with Naive Bayes, Logistic Regression, SVM" + ] }, - "id": "Mee0VQbBXDto", - "outputId": "459d0120-aa17-4536-bc9e-e2395bfa6886" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: numpy in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.24.3)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: pandas in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (2.0.3)\n", - "Requirement already satisfied: tzdata>=2022.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: numpy>=1.20.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (1.24.3)\n", - "Requirement already satisfied: pytz>=2020.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", - "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", - "\n", - 
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: scikit-learn in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.3.0)\n", - "Requirement already satisfied: numpy>=1.17.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.24.3)\n", - "Requirement already satisfied: scipy>=1.5.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.11.1)\n", - "Requirement already satisfied: joblib>=1.1.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (3.2.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: matplotlib in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (3.7.2)\n", - "Requirement already satisfied: numpy>=1.20 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.24.3)\n", - "Requirement already satisfied: packaging>=20.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (23.1)\n", - "Requirement already satisfied: cycler>=0.10 in 
/root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (0.11.0)\n", - "Requirement already satisfied: pillow>=6.2.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (10.0.0)\n", - "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (3.0.9)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (2.8.2)\n", - "Requirement already satisfied: importlib-resources>=3.2.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (6.0.1)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.4.4)\n", - "Requirement already satisfied: fonttools>=4.22.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (4.42.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from matplotlib) (1.1.0)\n", - "Requirement already satisfied: zipp>=3.1.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.16.2)\n", - "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below 
and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install numpy\n", - "!pip install pandas\n", - "!pip install scikit-learn\n", - "!pip install matplotlib\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "n7dE_FbM1lk5" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "QBvvarqE5xWm" - }, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import numpy as np\n", - "import pandas as pd # to work with csv files\n", - "\n", - "# matplotlib imports are used to plot confusion matrices for the classifiers\n", - "import matplotlib as mpl \n", - "import matplotlib.cm as cm \n", - "import matplotlib.pyplot as plt \n", - "\n", - "# import feature extraction methods from sklearn\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.feature_extraction import _stop_words # This Module has become private after sklearn 0.24 thus stop_words changed to _stop_words\n", - "\n", - "# pre-processing of text\n", - "import string\n", - "import re\n", - "\n", - "# import classifiers from sklearn\n", - "from sklearn.naive_bayes import MultinomialNB\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.svm import LinearSVC\n", - "\n", - "# import different metrics to evaluate the classifiers\n", - "from sklearn.metrics import accuracy_score\n", - 
"\n", - "# from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix \n", - "from sklearn import metrics\n", - "\n", - "# import time function from time module to track the training duration\n", - "from time import time" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1giNRemr1lk7" - }, - "source": [ - "### Section 1: Load and explore the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "gBCjEALX5xWj" + }, + "source": [ + "**Overview:** This notebook aims to give you a brief overview of performing text classification using Naive Bayes, Logistic Regression and Support Vector Machines. We will be using a dataset called \"Economic news article tone and relevance\" from [Figure-Eight](https://github.com/practical-nlp/practical-nlp/blob/master/Ch4/Data/Full-Economic-News-DFE-839861.csv) which consists of approximately 8000 news articles, which were tagged as relevant or not relevant to the US Economy. Our goal in this notebook is to explore the process of training and testing text classifiers for this problem, using this data set and two text classification algorithms: Multinomial Naive Bayes and Logistic Regression, implemented in sklearn.\n", + "\n", + "##### Dataset Link: In the a folder called Data in folder Ch4 of this repo\n", + "

\n", + "Let's import few necessary packages before we start our work" + ] }, - "id": "fVD8N_E51lk7", - "outputId": "b5893f5e-1123-43f7-d3a5-2e4fb92bfdc9" - }, - "outputs": [], - "source": [ - "try:\n", - " from google.colab import files\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", - " !ls -lah DATAPATH\n", - " our_data = pd.read_csv(\"DATAPATH/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )\n", - "\n", - "except ModuleNotFoundError:\n", - " our_data = pd.read_csv(\"Data/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 102 + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mee0VQbBXDto", + "outputId": "7ee35588-1066-4c90-dd1a-f5d30bb13f02" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: numpy==1.23.5 in /usr/local/lib/python3.10/dist-packages (1.23.5)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: matplotlib==3.7.1 in /usr/local/lib/python3.10/dist-packages (3.7.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in 
/usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (4.42.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.4.4)\n", + "Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib==3.7.1) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib==3.7.1) (1.16.0)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the 
lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install numpy==1.23.5\n", + "!pip install pandas==1.5.3\n", + "!pip install matplotlib==3.7.1\n", + "!pip install scikit-learn==1.2.2\n", + "\n", + "# ===========================" + ] }, - "id": "LbED8Q185xWu", - "outputId": "2ded8ddf-5553-4f4a-b55f-16454270648d" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(8000, 15)" + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "n7dE_FbM1lk5" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "relevance\n", - "no 0.821375\n", - "yes 0.177500\n", - "not sure 0.001125\n", - "Name: count, dtype: float64" + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "QBvvarqE5xWm" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import numpy as np\n", + "import pandas as pd # to work with csv files\n", + "\n", + "# matplotlib imports are used to plot confusion matrices for the classifiers\n", + "import matplotlib as mpl\n", + "import matplotlib.cm as cm\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# import feature extraction methods from sklearn\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction import _stop_words # This Module has become private after sklearn 0.24 thus stop_words changed to _stop_words\n", + "\n", + "# pre-processing of 
text\n", + "import string\n", + "import re\n", + "\n", + "# import classifiers from sklearn\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "# import different metrics to evaluate the classifiers\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "# from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn import metrics\n", + "\n", + "# import time function from time module to track the training duration\n", + "from time import time" ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "display(our_data.shape) # Number of rows (instances) and columns in the dataset\n", - "our_data[\"relevance\"].value_counts()/our_data.shape[0] # Class distribution in the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vCED1t7F5xW9" - }, - "source": [ - "There is an imbalance in the data with **not relevant** being 82% in the dataset. That is, most of the articles are not relevant to US Economy, which makes sense in a real-world scenario, as news articles discuss various topics. We should keep this class imbalance mind when interpreting the classifier performance later. Let us first convert the class labels into binary outcome variables for convenience. 1 for Yes (relevant), and 0 for No (not relevant), and ignore \"Not sure\". 
" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "BYW_S3585xXF", - "outputId": "b64bb281-6512-43b5-eda9-73d43becb1ae" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(7991, 2)" + "cell_type": "markdown", + "metadata": { + "id": "1giNRemr1lk7" + }, + "source": [ + "### Section 1: Load and explore the dataset" ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# convert label to a numerical variable\n", - "our_data = our_data[our_data.relevance != \"not sure\"] # removing the data where we don't want relevance=\"not sure\".\n", - "our_data.shape\n", - "our_data['relevance'] = our_data.relevance.map({'yes':1, 'no':0}) # relevant is 1, not-relevant is 0. \n", - "our_data = our_data[[\"text\",\"relevance\"]] # Let us take only the two columns we need.\n", - "our_data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fOKz8xQr5xXJ" - }, - "source": [ - "### Section 2: Text Pre-processing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yhC5TZuL5xXK" - }, - "source": [ - "Typical steps involve tokenization, lower casing, removing, stop words, punctuation markers etc, and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK) or sometimes, custom stopword lists are needed depending on the task. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "7MZSHdHZ5xXL" - }, - "outputs": [], - "source": [ - "stopwords = _stop_words.ENGLISH_STOP_WORDS\n", - "def clean(doc): # doc is a string of text\n", - " doc = doc.replace(\"
\", \" \") # This text contains a lot of
tags.\n", - " doc = \"\".join([char for char in doc if char not in string.punctuation and not char.isdigit()])\n", - " doc = \" \".join([token for token in doc.split() if token not in stopwords])\n", - " # remove punctuation and numbers\n", - " return doc" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3CfVm42o5xXS" - }, - "source": [ - "### Section 3: Modeling\n", - "\n", - "Now we are ready for the modelling. We are going to use algorithms from sklearn package. We will go through the following steps:\n", - "\n", - "1 Split the data into training and test sets (75% train, 25% test) \n", - "2 Extract features from the training data using CountVectorizer, which is a bag of words feature implementation. We will use the pre-processing function above in conjunction with Count Vectorizer \n", - "3 Transform the test data into the same feature vector as the training data. \n", - "4 Train the classifier \n", - "5 Evaluate the classifier " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "GimJJHhg5xYl", - "outputId": "7ed9cad8-3bd8-416d-a352-4a44fad9dc80" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(7991,) (7991,)\n", - "(5993,) (5993,)\n", - "(1998,) (1998,)\n" - ] - } - ], - "source": [ - "import sklearn\n", - "#from sklearn.cross_validation import train_test_split\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "# Step 1: train-test split\n", - "X = our_data.text # the column text contains textual data to extract features from\n", - "y = our_data.relevance # this is the column we are learning to predict. \n", - "print(X.shape, y.shape)\n", - "# split X and y into training and testing sets. 
By default, it splits 75% training and 25% test\n", - "# random_state=1 for reproducibility\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n", - "print(X_train.shape, y_train.shape)\n", - "print(X_test.shape, y_test.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "fVD8N_E51lk7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a36f603b-afc2-475e-f6c9-46b3299db3f4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-08-22 16:03:42-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 12383529 (12M) [text/plain]\n", + "Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’\n", + "\n", + "Full-Economic-News- 100%[===================>] 11.81M 71.4MB/s in 0.2s \n", + "\n", + "2023-08-22 16:03:43 (71.4 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]\n", + "\n", + "total 12M\n", + "drwxr-xr-x 2 root root 4.0K Aug 22 16:03 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 16:03 ..\n", + "-rw-r--r-- 1 root root 12M Aug 22 16:03 Full-Economic-News-DFE-839861.csv\n" + ] + } + ], + "source": [ + "try:\n", + " from google.colab import files\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + " !ls -lah DATAPATH\n", + " our_data = pd.read_csv(\"DATAPATH/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )\n", + "\n", + "except ModuleNotFoundError:\n", + " our_data = pd.read_csv(\"Data/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )" + ] }, - "id": "gsUyIBUD5xZI", - "outputId": "f4082e6a-a1e9-4b4a-c247-8b1b84c7edae" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(5993, 49753) (1998, 49753)\n" - ] - } - ], - "source": [ - "# Step 2-3: Preprocess and Vectorize train and test data\n", - "vect = CountVectorizer(preprocessor=clean) # instantiate a vectoriezer\n", - "X_train_dtm = vect.fit_transform(X_train)# use it to extract features from training data\n", - "# transform testing data (using training data's features)\n", - "X_test_dtm = vect.transform(X_test)\n", - "print(X_train_dtm.shape, X_test_dtm.shape)\n", - "# i.e., the dimension of our feature vector is 49753!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 106 + }, + "id": "LbED8Q185xWu", + "outputId": "7672d092-6fda-401a-9651-05e35794a3a0" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "(8000, 15)" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "no 0.821375\n", + "yes 0.177500\n", + "not sure 0.001125\n", + "Name: relevance, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "display(our_data.shape) # Number of rows (instances) and columns in the dataset\n", + "our_data[\"relevance\"].value_counts()/our_data.shape[0] # Class distribution in the dataset" + ] }, - "id": "nDLwA4CL5xZq", - "outputId": "3cb119d8-3017-4ebb-89b9-86dca66e3e92" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 35.4 ms, sys: 0 ns, total: 35.4 ms\n", - "Wall time: 48.9 ms\n" - ] - } - ], - "source": [ - "# Step 3: Train the classifier and predict for test data\n", - "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", - "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", - "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 + "cell_type": "markdown", + "metadata": { + "id": "vCED1t7F5xW9" + }, + "source": [ + "There is an imbalance in the data with **not relevant** being 82% in the dataset. That is, most of the articles are not relevant to US Economy, which makes sense in a real-world scenario, as news articles discuss various topics. 
We should keep this class imbalance mind when interpreting the classifier performance later. Let us first convert the class labels into binary outcome variables for convenience. 1 for Yes (relevant), and 0 for No (not relevant), and ignore \"Not sure\"." + ] }, - "id": "LiCHjvc75xZ3", - "outputId": "1409e48f-0ed6-4705-8688-4e6126662863" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.7822822822822822\n", - "ROC_AOC_Score: 0.7251117679464362\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BYW_S3585xXF", + "outputId": "a3e800a7-e175-4308-dbfe-33ef45e4ba85" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(7991, 2)" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# convert label to a numerical variable\n", + "our_data = our_data[our_data.relevance != \"not sure\"] # removing the data where we don't want relevance=\"not sure\".\n", + "our_data.shape\n", + "our_data['relevance'] = our_data.relevance.map({'yes':1, 'no':0}) # relevant is 1, not-relevant is 0.\n", + "our_data = our_data[[\"text\",\"relevance\"]] # Let us take only the two columns we need.\n", + "our_data.shape" + ] }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAArsAAAJnCAYAAACXn5vWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB3DUlEQVR4nO3dd1xW9fvH8fd9MxUEVJYDxZV77z1ylCsb5irFlDI1S8uVX3dpNsw0UytXZq606ciRe++Ve6ei4sIJwn1+f/jjrjtQAUHg8Hr2OI9vfM7nnHMdiC8XF9f5HIthGIYAAAAAE7KmdgAAAABASiHZBQAAgGmR7AIAAMC0SHYBAABgWiS7AAAAMC2SXQAAAJgWyS4AAABMi2QXAAAApkWyCwAAANMi2QXg4MiRI2rYsKG8vb1lsVj0888/J+v5T548KYvFomnTpiXrec0gODhYISEhqRrDkCFDZLFYEjU3PDz8iceU0M/VzZs31blzZwUGBspiseidd95JmSABpFkku0AadOzYMb3xxhvKnz+/3N3d5eXlperVq+uLL77QnTt3UvTaHTp00N69e/Xhhx9qxowZqlChQopez4z++usvDRkyRCdPnkztUJLFiBEjkv2XnidlxIgRmjZtmt58803NmDFDr776aopc56uvvuIXOCCNck7tAAA4WrhwoVq2bCk3Nze1b99eJUqUUFRUlNatW6fevXtr//79+vrrr1Pk2nfu3NHGjRs1YMAAde/ePUWukTdvXt25c0cuLi4pcv604K+//tLQoUNVp04dBQcHJ/i4Q4cOyWpN3RrE//73P/Xr189hbMSIEXrppZfUokWL1AnqMfz555+qUqWKBg8enKLX+eqrr+Tr65vqlXkAcZHsAmnIiRMn1Lp1a+XNm1d//vmncuTIYd/XrVs3HT16VAsXLkyx61+6dEmS5OPjk2LXsFgscnd3T7HzpzeGYeju3bvKlCmT3NzcUjscOTs7y9nZPD8aLl68qGLFiqV2GEny7/82ACQdbQxAGvLxxx/r5s2bmjx5skOiG6tgwYJ6++237R9HR0dr+PDhKlCggNzc3BQcHKz3339fkZGRDscFBweradOmWrdunSpVqiR3d3flz59f3333nX3OkCFDlDdvXklS7969ZbFY7FXJkJCQeCuU8fVSLlu2TDVq1JCPj488PT1VuHBhvf/++/b9D+rZ/fPPP1WzZk15eHjIx8dHzz33nA4cOBDv9Y4ePaqQkBD5+PjI29tbHTt21O3btx/8if1/derUUYkSJbRnzx7Vrl1bmTNnVsGCBfXjjz9KklavXq3KlSsrU6ZMKly4sJYvX+5w/KlTp9S1a1cVLlxYmTJlUvbs2dWyZUuHdoVp06apZcuWkqS6devKYrHIYrFo1apVkv75Wvzxxx+qUKGCMmXKpEmTJtn3xVYGDcNQ3bp15efnp4sXL9rPHxUVpZIlS6pAgQK6detWvPdpGIZ8fX3Vq1cv+5jNZpOPj4+cnJx07do1+/ioUaPk7OysmzdvOnyOY1ksFt26dUvTp0+338t/q5fXrl1L0tdj7dq1atmypfLkySM3NzcFBQWpZ8+eydKqs2rVKlksFp04cUILFy60xx77tYqMjNTgwYNVsGBB+7X79OkT53tn6tSpqlevnvz9/eXm5qZixYppwoQJDnOCg4O1f/9+rV692n6dOnXqSHpwD/S0adMc4ok9z4P+27h27ZreeecdBQUFyc3NTQULFtSoUaNks9kczjt79myVL19eWbJkkZeXl0qWLKkvvvjiMT+bQPpmnl/fARP47bfflD9/flWrVi1B8zt37qzp06frpZde0rvvvqvNmzdr5MiROnDggH766SeHuUePHtVLL72kTp06qUOHDpoyZYpCQkJUvnx5FS9eXC+88IJ8fHzUs2dPtWnTRo0bN5anp2ei4t+/f7+aNm2qUqVKadiwYXJzc9PRo0e1fv36hx63fPlyPfvss8qfP7+GDBmiO3fuaNy4cap
evbp27NgRJ9F++eWXlS9fPo0cOVI7duzQt99+K39/f40aNeqRMV69elVNmzZV69at1bJlS02YMEGtW7fWzJkz9c4776hLly5q27atPvnkE7300ks6c+aMsmTJIknaunWrNmzYoNatWyt37tw6efKkJkyYoDp16uivv/5S5syZVatWLfXo0UNjx47V+++/r6JFi0qS/X+l++0Kbdq00RtvvKHQ0FAVLlw4TpwWi0VTpkxRqVKl1KVLFy1YsECSNHjwYO3fv1+rVq2Sh4dHvPdosVhUvXp1rVmzxj62Z88eXb9+XVarVevXr1eTJk0k3U84y5Yt+8Cv9YwZM9S5c2dVqlRJr7/+uiSpQIECyfL1mDdvnm7fvq0333xT2bNn15YtWzRu3Dj9/fffmjdv3kOPfZSiRYtqxowZ6tmzp3Lnzq13331XkuTn5yebzabmzZtr3bp1ev3111W0aFHt3btXn3/+uQ4fPuzQnzxhwgQVL15czZs3l7Ozs3777Td17dpVNptN3bp1kySNGTNGb731ljw9PTVgwABJUkBAQJLiju+/jdu3b6t27do6e/as3njjDeXJk0cbNmxQ//79df78eY0ZM0bS/V8027Rpo6efftr+uT9w4IDWr1/v8EsykOEYANKE69evG5KM5557LkHzd+3aZUgyOnfu7DD+3nvvGZKMP//80z6WN29eQ5KxZs0a+9jFixcNNzc3491337WPnThxwpBkfPLJJw7n7NChg5E3b944MQwePNj49/+NfP7554Yk49KlSw+MO/YaU6dOtY+VKVPG8Pf3Ny5fvmwf2717t2G1Wo327dvHud5rr73mcM7nn3/eyJ49+wOvGat27dqGJOOHH36wjx08eNCQZFitVmPTpk328T/++CNOnLdv345zzo0bNxqSjO+++84+Nm/ePEOSsXLlyjjzY78WS5YsiXdfhw4dHMYmTZpkSDK+//57Y9OmTYaTk5PxzjvvPPJeP/nkE8PJycmIiIgwDMMwxo4da+TNm9eoVKmS0bdvX8MwDCMmJsbw8fExevbsaT/uv19TwzAMDw+POHH9e25Svx7xfT5HjhxpWCwW49SpUw+NKb7PVXzy5s1rNGnSxGFsxowZhtVqNdauXeswPnHiREOSsX79+ofG2KhRIyN//vwOY8WLFzdq164dZ258sRuGYUydOtWQZJw4ccIh1vj+2xg+fLjh4eFhHD582GG8X79+hpOTk3H69GnDMAzj7bffNry8vIzo6Og41wMyMtoYgDQiIiJCkuxVxEdZtGiRJDn8qVqSvYL1397eYsWKqWbNmvaP/fz8VLhwYR0/fjzJMf9XbK/vL7/8EufPqw9y/vx57dq1SyEhIcqWLZt9vFSpUmrQoIH9Pv+tS5cuDh/XrFlTly9ftn8OH8bT01OtW7e2f1y4cGH5+PioaNGiqly5sn089t///fn5d+/kvXv3dPnyZRUsWFA+Pj7asWNHAu72vnz58qlRo0YJmvv666+rUaNGeuutt/Tqq6+qQIECGjFixCOPq1mzpmJiYrRhwwZJ9yu4NWvWVM2aNbV27VpJ0r59+3Tt2jWH/y6SIqlfj39/Pm/duqXw8HBVq1ZNhmFo586djxXTw8ybN09FixZVkSJFFB4ebt/q1asnSVq5cmW8MV6/fl3h4eGqXbu2jh8/ruvXryd7bPH9tzFv3jzVrFlTWbNmdYi3fv36iomJsVfwfXx8dOvWLS1btizZ4wLSM5JdII3w8vKSJN24cSNB80+dOiWr1aqCBQs6jAcGBsrHx0enTp1yGM+TJ0+cc2TNmlVXr15NYsRxtWrVStWrV1fnzp0VEBCg1q1ba+7cuQ9NfGPjjO9P+UWLFlV4eHic3tT/3kvWrFklKUH3kjt37jg9lN7e3goKCooz9t9z3rlzR4MGDbL3Tfr6+srPz0/Xrl1LVOKTL1++BM+VpMmTJ+v27ds6cuSIpk2blqAHlsqVK6fMmTPbE9vYZLdWrVratm2b7t69a99
Xo0aNRMXzX0n9epw+fdr+S46np6f8/PxUu3ZtSUqRRDLWkSNHtH//fvn5+TlsTz31lCQ59EivX79e9evXt/eS+/n52XvQUyrZjS/eJUuWxIm3fv36DvF27dpVTz31lJ599lnlzp1br732mpYsWZLsMQLpDT27QBrh5eWlnDlzat++fYk6LqEvAHBycop33DCMJF8jJibG4eNMmTJpzZo1WrlypRYuXKglS5Zozpw5qlevnpYuXfrAGBLrce7lQccm5JxvvfWWpk6dqnfeeUdVq1a1v3ijdevWCa5kS0r00/WrVq2yPzi1d+9eVa1a9ZHHuLi4qHLlylqzZo2OHj2qsLAw1axZUwEBAbp37542b96stWvXqkiRIvLz80tUPP+VlK9HTEyMGjRooCtXrqhv374qUqSIPDw8dPbsWYWEhCTq85lYNptNJUuW1OjRo+PdH/uLz7Fjx/T000+rSJEiGj16tIKCguTq6qpFixbp888/T1CMCf3eiRXffxs2m00NGjRQnz594j0mNkn39/fXrl279Mcff2jx4sVavHixpk6dqvbt22v69OmPjBUwK5JdIA1p2rSpvv76a23cuPGRCU3evHlls9l05MgRh4efLly4oGvXrtlXVkgOWbNmdXiCP9Z/q8eSZLVa9fTTT+vpp5/W6NGjNWLECA0YMEArV660V6L+ex/S/Qdz/uvgwYPy9fV94INYT9qPP/6oDh066LPPPrOP3b17N87nJqG/gCTE+fPn9dZbb6lhw4ZydXXVe++9p0aNGiXo61uzZk2NGjVKy5cvl6+vr4oUKSKLxaLixYtr7dq1Wrt2rZo2bfrI8yTn/cTau3evDh8+rOnTp6t9+/b28SfxJ/gCBQpo9+7devrppx96b7/99psiIyP166+/OlSv/93mEOtB54mtcl+7ds1hSb/4vnceFu/Nmzfj/f75L1dXVzVr1kzNmjWTzWZT165dNWnSJA0cODDOX4GAjII2BiAN6dOnjzw8PNS5c2dduHAhzv5jx47ZlxFq3LixJNmfxI4VW62Kfdo+ORQoUEDXr1/Xnj177GPnz5+Ps+LDlStX4hxbpkwZSYqzpFOsHDlyqEyZMpo+fbpD0rhv3z4tXbrUfp9pgZOTU5xq5bhx4+JU6WKT8/h+QUis0NBQ2Ww2TZ48WV9//bWcnZ3VqVOnBFWxa9asqcjISI0ZM0Y1atSwJ2Q1a9bUjBkzdO7cuQT163p4eCTLvfxbbDX43/dhGMYTWSbr5Zdf1tmzZ/XNN9/E2Xfnzh1720x8MV6/fl1Tp06Nc9yDPkexK1f8e2WM2KXcEhPvxo0b9ccff8TZd+3aNUVHR0uSLl++7LDParWqVKlSkh78/QdkBFR2gTSkQIEC+uGHH9SqVSsVLVrU4Q1qGzZs0Lx58+xrnJYuXVodOnTQ119/rWvXrql27drasmWLpk+frhYtWqhu3brJFlfr1q3Vt29fPf/88+rRo4du376tCRMm6KmnnnJ4MGvYsGFas2aNmjRporx58+rixYv66quvlDt37of2hX7yySd69tlnVbVqVXXq1Mm+9Ji3t7eGDBmSbPfxuJo2baoZM2bI29tbxYoV08aNG7V8+XJlz57dYV6ZMmXk5OSkUaNG6fr163Jzc7Ov1ZoYU6dO1cKFCzVt2jTlzp1b0v3k+pVXXtGECRPUtWvXhx5ftWpVOTs769ChQ/ZlwySpVq1a9rViE5Lsli9fXsuXL9fo0aOVM2dO5cuXz+FhvqQoUqSIChQooPfee09nz56Vl5eX5s+fn6w95A/y6quvau7cuerSpYtWrlyp6tWrKyYmRgcPHtTcuXPt69zGVtObNWumN954Qzdv3tQ333wjf39/nT9/3uGc5cuX14QJE/TBBx+oYMGC8vf3V7169dSwYUPlyZNHnTp1Uu/eveXk5KQpU6bIz89Pp0+fTlC8vXv31q+//qqmTZvalwu8deuW9u7dqx9//FEnT56Ur6+vOnfurCtXrqhevXr
KnTu3Tp06pXHjxqlMmTIOf/0BMpzUWgYCwIMdPnzYCA0NNYKDgw1XV1cjS5YsRvXq1Y1x48YZd+/etc+7d++eMXToUCNfvnyGi4uLERQUZPTv399hjmHEv/ySYdxfiuvfyyU9aOkxwzCMpUuXGiVKlDBcXV2NwoULG99//32cZZVWrFhhPPfcc0bOnDkNV1dXI2fOnEabNm0clkyKb+kxwzCM5cuXG9WrVzcyZcpkeHl5Gc2aNTP++usvhzmx1/vv0mbxLeMUn9q1axvFixePM/6gz48ko1u3bvaPr169anTs2NHw9fU1PD09jUaNGhkHDx6Mdxmsb775xsifP7/h5OTksAzZg64Vuy/2PGfOnDG8vb2NZs2axZn3/PPPGx4eHsbx48cfer+GYRgVK1Y0JBmbN2+2j/3999+GJCMoKCjO/PiWyjp48KBRq1YtI1OmTIYke4yP+/X466+/jPr16xuenp6Gr6+vERoaauzevTvOfx/JvfSYYRhGVFSUMWrUKKN48eKGm5ubkTVrVqN8+fLG0KFDjevXr9vn/frrr0apUqUMd3d3Izg42Bg1apQxZcqUOPcXFhZmNGnSxMiSJYshyeH7avv27UblypUNV1dXI0+ePMbo0aMfuPTYg/7buHHjhtG/f3+jYMGChqurq+Hr62tUq1bN+PTTT42oqCjDMAzjxx9/NBo2bGj4+/vbr/XGG28Y58+ff+TnCTAzi2Ek4G9hAAAAQDpEzy4AAABMi2QXAAAApkWyCwAAANMi2QUAAIBpkewCAADAtEh2AQAAYFq8VALxstlsOnfunLJkyZIirwoFAMCsDMPQjRs3lDNnTlmtqV9XvHv3rqKiolLk3K6urnJ3d0+RcycXkl3E69y5cwoKCkrtMAAASLfOnDljf/tharl7964yZckuRd9OkfMHBgbqxIkTaTrhJdlFvLJkySJJci3WQRYn11SOBsB/nV71aWqHAOABbkREqGC+IPvP0tQUFRUlRd+WW7EOUnL/PI+JUthf0xUVFUWyi/QntnXB4uRKsgukQV5eXqkdAoBHSFNtgM7uyf7z3LCkfotGQpDsAgAAmJ1FUnIn32kol3+Y9JGSAwAAAElAZRcAAMDsLNb7W3KfMx1IH1ECAAAASUBlFwAAwOwslhTo2U0fTbtUdgEAAGBaVHYBAADMjp5dAAAAwHyo7AIAAJhdBu7ZJdkFAAAwvRRoY0gnDQLpI0oAAAAgCajsAgAAmF0GbmOgsgsAAADTorILAABgdiw9BgAAAJgPlV0AAACzo2cXAAAAMB8quwAAAGZHzy4AAABgPlR2AQAAzC4D9+yS7AIAAJgdbQwAAACA+VDZBQAAMDuLJQUqu+mjjYHKLgAAAEyLyi4AAIDZWS33t+Q+ZzpAZRcAAACmRWUXAADA7FiNAQAAADAfKrsAAABmx0slAAAAYFq0MQAAAAApb/z48QoODpa7u7sqV66sLVu2PHT+mDFjVLhwYWXKlElBQUHq2bOn7t69m+DrkewCAACYXWwbQ3JviTRnzhz16tVLgwcP1o4dO1S6dGk1atRIFy9ejHf+Dz/8oH79+mnw4ME6cOCAJk+erDlz5uj9999P8DVJdgEAAPBEjB49WqGhoerYsaOKFSumiRMnKnPmzJoyZUq88zds2KDq1aurbdu2Cg4OVsOGDdWmTZtHVoP/jWQXAADA7GJ7dpN7kxQREeGwRUZGxhtCVFSUtm/frvr169vHrFar6tevr40bN8Z7TLVq1bR9+3Z7cnv8+HEtWrRIjRs3TvCtk+wCAAAgyYKCguTt7W3fRo4cGe+88PBwxcTEKCAgwGE8ICBAYWFh8R7Ttm1bDRs2TDVq1JCLi4sKFCigOnXqJKqNgdUYAAAAzC4Flx47c+aMvLy87MNubm7JdolVq1ZpxIgR+uqrr1S5cmUdPXpUb7/9toYPH66BAwcm6BwkuwAAAEgyLy8vh2T3QXx9feXk5KQLFy44jF+4cEGBgYHxHjNw4EC
9+uqr6ty5sySpZMmSunXrll5//XUNGDBAVuujmxRoYwAAADC7FOzZTShXV1eVL19eK1assI/ZbDatWLFCVatWjfeY27dvx0lonZycJEmGYSToulR2AQAA8ET06tVLHTp0UIUKFVSpUiWNGTNGt27dUseOHSVJ7du3V65cuex9v82aNdPo0aNVtmxZexvDwIED1axZM3vS+ygkuwAAAGaXRl4X3KpVK126dEmDBg1SWFiYypQpoyVLltgfWjt9+rRDJfd///ufLBaL/ve//+ns2bPy8/NTs2bN9OGHHyY8TCOhNWBkKBEREfL29pZbyVBZnFxTOxwA/3F165epHQKAB4iIiFBAdm9dv349Qb2sKR2Lt7e33Op/JIuLe7Ke27h3V5HL+6WJ+3wYenYBAABgWrQxAAAAmF0aaWNIDVR2AQAAYFpUdgEAAMzOYkn0UmEJOmc6QGUXAAAApkVlFwAAwOyS8BKIBJ0zHUgfUQIAAABJQGUXAADA7DLwagwkuwAAAGZHGwMAAABgPlR2AQAAzC4DtzFQ2QUAAIBpUdkFAAAwO3p2AQAAAPOhsgsAAGB29OwCAAAA5kNlFwAAwOQsFossGbSyS7ILAABgchk52aWNAQAAAKZFZRcAAMDsLP+/Jfc50wEquwAAADAtKrsAAAAmR88uAAAAYEJUdgEAAEyOyi4AAABgQlR2AQAATI7KLgAAAGBCVHYBAABMLiNXdkl2AQAAzI6XSgAAAADmQ2UXAADA5DJyGwOVXQAAAJgWlV0AAACTs1iUApXd5D1dSqGyCwAAANOisgsAAGByFqVAz246Ke1S2QUAAIBpUdkFAAAwuYy8GgPJLgAAgNnxUgkAAADAfKjsAgAAmF0KtDEY6aSNgcouAAAATIvKLgAAgMmlxANqyb+UWcqgsgsAAADTorILAABgclR2AQAAABOisgsAAGB2rLMLAAAAmA+VXQAAAJPLyD27JLsAAAAml5GTXdoYAAAAYFpUdgEAAEyOyi4AAABgQlR2AQAATI7KLgAAAGBCJLsAAABmZ0mhLQnGjx+v4OBgubu7q3LlytqyZcsD59apU8delf731qRJkwRfj2QXAAAAT8ScOXPUq1cvDR48WDt27FDp0qXVqFEjXbx4Md75CxYs0Pnz5+3bvn375OTkpJYtWyb4miS7AAAAJhdfdTQ5tsQaPXq0QkND1bFjRxUrVkwTJ05U5syZNWXKlHjnZ8uWTYGBgfZt2bJlypw5c6KSXR5QAwAAMLmUfEAtIiLCYdzNzU1ubm5x5kdFRWn79u3q37+/fcxqtap+/frauHFjgq45efJktW7dWh4eHgmOk8ouAAAAkiwoKEje3t72beTIkfHOCw8PV0xMjAICAhzGAwICFBYW9sjrbNmyRfv27VPnzp0TFR+VXQAAAJNLycrumTNn5OXlZR+Pr6qbHCZPnqySJUuqUqVKiTqOZBcAAABJ5uXl5ZDsPoivr6+cnJx04cIFh/ELFy4oMDDwocfeunVLs2fP1rBhwxIdH20MAAAAZpcGlh5zdXVV+fLltWLFCvuYzWbTihUrVLVq1YceO2/ePEVGRuqVV15J3EVFsgukW2+8XEsHFw7V1U2fa81376lC8bwPnd+9bR3t/mmgrmwcrSOLh+vjd1+Qm6vjH3dy+nlrygft9ffKUbqycbS2zn1f5YrlScnbAExp4lfjVbhgsHw83VWzWmVtfcg6on/t36/WL7+owgWDlcnFonFfjHnouT/5+CNlcrHovV7vJG/QwBPQq1cvffPNN5o+fboOHDigN998U7du3VLHjh0lSe3bt3d4gC3W5MmT1aJFC2XPnj3R16SNAUiHXmpYTqPefV5vfThHW/edVPe2dfXrV91UusUwXbp6M878Vs9U0PAez6nLkJnauPu4CuX11zfDXpUhqe9nCyRJPlky6c9pvbR66xG16P6VLl29qYJ5/HQ14vYTvjsgfZs3d4769u6lceMnqmKlyvpy7Bg1b9JIu/c
fkr+/f5z5t2/fVr58+fXCiy3V972eDz33tq1bNfmbSSpZslRKhQ+TSiuvC27VqpUuXbqkQYMGKSwsTGXKlNGSJUvsD62dPn1aVqtjLfbQoUNat26dli5dmqQ4SXaBdKjHK/U0dcEGzfh1kyTprQ9n69maxdWhRVV9OnVZnPlVSufTxl3HNWfJNknS6fNXNHfJNlUsEWyf827HBvo77KreGPK9fezUucspeyOACY0dM1odO4Wqfcj9StW4ryZq8eKFmj5tinr36RdnfoWKFVWhYkVJ0sABcffHunnzpjp2aKevJn6jj0Z8kDLBA09A9+7d1b1793j3rVq1Ks5Y4cKFZRhGkq9HGwOQzrg4O6ls0SD9ufmQfcwwDP25+ZAqlcoX7zGbdp9Q2WJB9laH4FzZ1ah6cS1Zt98+p0ntktrx12nN/Pg1nVoxUhtn9VXH56ul7M0AJhMVFaWdO7ar3tP17WNWq1X16tXXlk0JW0f0Qd55q5ueebaJw7mBhEorL5VIDVR2gXTGN6unnJ2ddPHKDYfxi5cjVDg4IN5j5izZpuxZPbRiak9ZZJGLi5O+nrdWn0z5509C+XL5KrRlTY39/k99PHmpyhfPq8/6vKSo6BjN/G1zit4TYBax64j6+zt+L/oHBOjQoYNJPu/cObO1a+cOrdu09XFDRAZlUQq0MST2CbVUQmU3GQ0ZMkRlypRJ7TCAOGqWL6TerzXS2yPnqGrbUWrV62s9W6O4+oU+Y59jtVq06+AZDf7yN+0+9LemLFivqT9tUOhLNVIxcgBnzpxR715va+p3M+Xu7p7a4QDpTppMdkNCQmSxWPTRRx85jP/888+J/q0kODhYY8aMSdC82JJ85syZVbJkSX377beJulZaRRJuLuFXbyo6Okb+2bI4jPtn91LY5Yh4jxnctYlmLdyiaT9t1P6j5/Tryj0a9OVv6t2xof17Kiw8QgeOO77B5uCJMAUFZk2ZGwFMKHYd0YsXHdcRvZiAdUQfZOeO7bp48aKqVionT3dnebo7a+2a1frqy7HydHdWTExMcoQOk8vIbQxpMtmVJHd3d40aNUpXr159YtccNmyYzp8/r3379umVV15RaGioFi9e/MSuDyTEvegY7TxwRnUrF7aPWSwW1a30lLbsORHvMZncXWWzOTb322y2/z/2/scbdx3XU3kdnxQvlMdfp89fScboAXNzdXVV2XLltfJPx3VEV65coUpVHr6O6IPUrfe0tu3cq83bdtm3cuUrqHWbdtq8bZecnJySK3zAlNJsslu/fn0FBgY+8P3KsebPn6/ixYvLzc1NwcHB+uyzz+z76tSpo1OnTqlnz54J+g0kS5YsCgwMVP78+dW3b19ly5ZNy5b982T7tWvX1LlzZ/n5+cnLy0v16tXT7t27H3rOb7/9VkWLFpW7u7uKFCmir776yr6vWrVq6tu3r8P8S5cuycXFRWvWrJEkzZgxQxUqVLDH1rZtW128eNE+f9WqVbJYLFqxYoUqVKigzJkzq1q1ajp06P7DS9OmTdPQoUO1e/du++dg2rRpD40Zad/Y7/9Ux+erqV2zyiqcL0Bj32+lzJnc9N0v91dn+Hb4qxr2VnP7/EVr9im0ZQ21bFReeXNmV73KRTTozaZatGavPQke9/2fqlQyn3q/1lD5g3zV6pkKeu3F6po0Z02q3COQXvV4p5emTv5G3383XQcPHFCPbm/q9q1bat/h/uoMnULaa+CAf9YRjYqK0u5du7R71y5FRUXp3Lmz2r1rl44dPSrp/s+m4iVKOGweHh7Klj27ipcokSr3iHQoDbxUIrWk2QfUnJycNGLECLVt21Y9evRQ7ty548zZvn27Xn75ZQ0ZMkStWrXShg0b1LVrV2XPnl0hISFasGCBSpcurddff12hoaEJvrbNZtNPP/2kq1evytXV1T7esmVLZcqUSYsXL5a3t7cmTZqkp59+WocPH1a2bNninGfmzJkaNGiQvvzyS5UtW1Y7d+5UaGioPDw
81KFDB7Vr104ff/yxPvroI3siPmfOHOXMmVM1a9aUJN27d0/Dhw9X4cKFdfHiRfXq1UshISFatGiRw7UGDBigzz77TH5+furSpYtee+01rV+/Xq1atdK+ffu0ZMkSLV++XJLk7e0dJ9bIyEhFRkbaP46IiP/P4Ugbfly6Q75ZPTXozSYKyJ5Few6d1XPdxtsfWgsKzOZQyf3o2yUyDEODuzZVTn9vhV+9qYVr9mnIl7/Z52z/67RavfuNhr3VXO+//qxOnr2s3p/M1+zF2574/QHpWcuXWyn80iUNGzpIF8LCVKp0Gf3y+z/riJ4547iO6Plz51SlYln7x2NGf6oxoz9VzVq1tXTFqicdPmA6FuNxFi5LISEhIbp27Zp+/vlnVa1aVcWKFdPkyZP1888/6/nnn7evtdauXTtdunTJYZHhPn36aOHChdq///6SSsHBwXrnnXf0zjvvPPSawcHBOn/+vFxcXBQZGano6Ghly5ZNmzdvVsGCBbVu3To1adJEFy9elJubm/24ggULqk+fPnr99dc1ZMgQ/fzzz9q1a5d93/Dhw9WmTRv7/A8++ECLFi3Shg0bdOnSJeXMmVN//vmnPbmtVq2aatWqFadfOda2bdtUsWJF3bhxQ56enlq1apXq1q2r5cuX6+mnn5YkLVq0SE2aNNGdO3fk7u4eJ674DBkyREOHDo0z7lYyVBYn13iOAJCarm79MrVDAPAAERERCsjurevXr8vLyyvVY/H29lbervNkdcucrOe2Rd7Wqa9apon7fJg028YQa9SoUfZXyv3XgQMHVL16dYex6tWr68iRI0lq2O/du7d27dqlP//8U5UrV9bnn3+uggULSpJ2796tmzdvKnv27PL09LRvJ06c0LFjx+Kc69atWzp27Jg6derkMP+DDz6wz/fz81PDhg01c+ZMSdKJEye0ceNGtWvXzn6e7du3q1mzZsqTJ4+yZMmi2rVrS7r/hpF/K1Xqn7fp5MiRQ5Ic2h0epX///rp+/bp9O3PmTIKPBQAASKvSbBtDrFq1aqlRo0bq37+/QkJCUvRavr6+KliwoAoWLKh58+apZMmSqlChgooVK6abN28qR44c8b7Zw8fHJ87YzZv3X9n6zTffqHLlyg77/v0wQbt27dSjRw+NGzdOP/zwg0qWLKmSJUtKup8wN2rUSI0aNdLMmTPl5+en06dPq1GjRoqKinI4p4uLi/3fY1siYh9ASgg3NzeHijUAADCPtPK64NSQ5pNdSfroo49UpkwZFS5c2GG8aNGiWr9+vcPY+vXr9dRTT9kTSldX1yRVeYOCgtSqVSv1799fv/zyi8qVK6ewsDA5OzsrODj4kccHBAQoZ86cOn78uEOl9r+ee+45vf7661qyZIl++OEHtW/f3r7v4MGDunz5sj766CMFBQVJut/GkFhJ/RwAAACkd+ki2S1ZsqTatWunsWPHOoy/++67qlixooYPH65WrVpp48aN+vLLLx1WPAgODtaaNWvUunVrubm5ydfXN8HXffvtt1WiRAlt27ZN9evXV9WqVdWiRQt9/PHHeuqpp3Tu3DktXLhQzz//vCpUqBDn+KFDh6pHjx7y9vbWM888o8jISG3btk1Xr15Vr169JEkeHh5q0aKFBg4cqAMHDjj09+bJk0eurq4aN26cunTpon379mn48OGJ/fQpODhYJ06c0K5du5Q7d25lyZKFKi4AABmIxfLPUpPJec70IM337MYaNmxYnD/LlytXTnPnztXs2bNVokQJDRo0SMOGDXNodxg2bJhOnjypAgUKyM/PL1HXLFasmBo2bKhBgwbJYrFo0aJFqlWrljp27KinnnpKrVu31qlTp+xP2P5X586d9e2332rq1KkqWbKkateurWnTpilfvnwO89q1a6fdu3erZs2aypMnj33cz89P06ZN07x581SsWDF99NFH+vTTTxN1D5L04osv6plnnlHdunXl5+enWbNmJfocAAAg/bqf7Cb3SyVS+64SJk2
uxoDUF/v0JqsxAGkTqzEAaVdaXI0h/1s/yurmkazntkXe0vFxL6WJ+3yYdNHGAAAAgMeQAm0M6eWlEummjQEAAABILCq7AAAAJpeRlx6jsgsAAADTorILAABgciw9BgAAAJgQlV0AAACTs1otslqTtxRrJPP5UgrJLgAAgMnRxgAAAACYEJVdAAAAk2PpMQAAAMCEqOwCAACYHD27AAAAgAlR2QUAADA5enYBAAAAE6KyCwAAYHJUdgEAAAATorILAABgchl5NQaSXQAAAJOzKAXaGJQ+sl3aGAAAAGBaVHYBAABMLiO3MVDZBQAAgGlR2QUAADA5lh4DAAAATIjKLgAAgMnRswsAAACYEJVdAAAAk8vIPbskuwAAACZHGwMAAABgQlR2AQAATC4jtzFQ2QUAAIBpUdkFAAAwuxTo2VX6KOxS2QUAAIB5UdkFAAAwOXp2AQAAABOisgsAAGByGXmdXZJdAAAAk6ONAQAAADAhKrsAAAAml5HbGKjsAgAAwLRIdgEAAEwutmc3ubekGD9+vIKDg+Xu7q7KlStry5YtD51/7do1devWTTly5JCbm5ueeuopLVq0KMHXo40BAAAAT8ScOXPUq1cvTZw4UZUrV9aYMWPUqFEjHTp0SP7+/nHmR0VFqUGDBvL399ePP/6oXLly6dSpU/Lx8UnwNUl2AQAATC6trMYwevRohYaGqmPHjpKkiRMnauHChZoyZYr69esXZ/6UKVN05coVbdiwQS4uLpKk4ODgRF2TNgYAAAAkWUREhMMWGRkZ77yoqCht375d9evXt49ZrVbVr19fGzdujPeYX3/9VVWrVlW3bt0UEBCgEiVKaMSIEYqJiUlwfCS7AAAAJhe7GkNyb5IUFBQkb29v+zZy5Mh4YwgPD1dMTIwCAgIcxgMCAhQWFhbvMcePH9ePP/6omJgYLVq0SAMHDtRnn32mDz74IMH3ThsDAAAAkuzMmTPy8vKyf+zm5pZs57bZbPL399fXX38tJycnlS9fXmfPntUnn3yiwYMHJ+gcJLsAAAAml5I9u15eXg7J7oP4+vrKyclJFy5ccBi/cOGCAgMD4z0mR44ccnFxkZOTk32saNGiCgsLU1RUlFxdXR95XdoYAAAATC4l2xgSytXVVeXLl9eKFSvsYzabTStWrFDVqlXjPaZ69eo6evSobDabfezw4cPKkSNHghJdiWQXAAAAT0ivXr30zTffaPr06Tpw4IDefPNN3bp1y746Q/v27dW/f3/7/DfffFNXrlzR22+/rcOHD2vhwoUaMWKEunXrluBr0sYAAABgcmll6bFWrVrp0qVLGjRokMLCwlSmTBktWbLE/tDa6dOnZbX+U4sNCgrSH3/8oZ49e6pUqVLKlSuX3n77bfXt2zfB1yTZBQAAwBPTvXt3de/ePd59q1atijNWtWpVbdq0KcnXI9kFAAAwOYsS32ObkHOmB/TsAgAAwLSo7AIAAJic1WKRNZlLu8l9vpRCZRcAAACmRWUXAADA5JKyLm5CzpkekOwCAACYXFpZeiw10MYAAAAA06KyCwAAYHJWy/0tuc+ZHlDZBQAAgGlR2QUAADA7Swr02KaTym6Ckt3XXnstyRewWCyaPHlyko8HAAAAkipBye60adOSfAGSXQAAgNTF0mOPsHLlypSOAwAAAEh2CUp2a9eundJxAAAAIIVY/v+f5D5nesBqDAAAADCtx1qNITo6WgsXLtSWLVsUHh6uypUr2x9mO3funMLDw1WsWDE5O7PoAwAAQGrJyOvsJjkLXbdunV555RWdOXNGhmHIYrHo3r179mR348aNevnllzVv3jy98MILyRYwAAAAEofXBSfSX3/9pWeeeUbnz5/XW2+9pblz58owDIc5zZo1U+bMmTV//vxkCRQAAABIrCRVdocPH667d+9q0aJFatiwYbxzXF1dVa5cOe3cufOxAgQAAMDjychLjyWpsrty5UpVqlTpgYlurFy5cuncuXNJCgwAAAB4XEm
q7F67dk1BQUGPnHfr1i3du3cvKZcAAABAMrFaLLImcyk2uc+XUpJU2fX399fRo0cfOe/AgQMJSooBAACAlJCkZLdevXratWvXQ9+s9tNPP+no0aNq0KBBkoMDAADA44vt2U3uLT1IUrLbr18/ubq6qkWLFpowYYLCwsLs+65evaopU6aoU6dO8vDwUK9evZItWAAAACAxkpTsFilSRLNmzZLNZlP37t2VK1cuWSwWTZ8+Xb6+vgoNDVVkZKRmzpypfPnyJXfMAAAASITYdXaTe0sPkvy64BYtWmjfvn166623VKRIEbm7u8vV1VX58+fXG2+8oT179qh58+bJGSsAAACSICO3MTzWe3zz5s2rMWPGJFMoAAAAQPJ6rGQXAAAAaV9GXnrssZLdyMhIzZ8/X2vXrrW/PCJnzpyqUaOGXnzxRbm7uydLkAAAAEBSJDnZXb58uUJCQnT+/HkZhuGw7+uvv1afPn00bdo0lh4DAABIZZb/35L7nOlBkpLdzZs3q2nTpoqKilLlypXVpk0bBQcHS5JOnTqlWbNmadOmTWrWrJlWr16typUrJ2fMAAAAQIIkKdkdOHCg7t27pwkTJuiNN96Is/+tt97S119/rS5dumjQoEH6448/HjtQAAAAJE1KLBVm6qXHNm/erAoVKsSb6MZ6/fXXVbFiRW3atCnJwQEAAACPI0nJrtVqVcGCBR85r2DBgukm6wcAADArqyVltvQgSW0MlSpV0p49ex45b8+ePapUqVJSLgEAAIBkQhtDIg0fPlxHjhzR4MGDZbPZ4uw3DEODBw/WkSNHNHz48McOEgAAAEiKBFV2v/vuuzhjHTp00AcffKAZM2boxRdfVN68eSXdX41hwYIFOnnypEJDQ3Xo0CFWYwAAAEhl6aQQm+wsxn8XyY2H1WqNt1T970Nj9//3dBaLRTExMY8bJ56wiIgIeXt7y61kqCxOrqkdDoD/uLr1y9QOAcADREREKCC7t65fvy4vL69Uj8Xb21svf71Orpk9k/XcUbdvau7rNdLEfT5Mgiq7gwYNSjd9GQAAAHCUkXt2E5TsDhkyJIXDAAAAAJJfkl8XDAAAgPQhJZYKSy9LjyVpNQYAAAAgPXisyu66dev0yy+/6MiRI7px40ach9Ok+/0cK1aseJzLAAAA4DHQs5tIhmGoU6dOmj59uj3BtVgscVZnMAwj3XwiAAAAYD5JamOYOHGipk2bpvLly2vZsmV64YUXJEmHDh3S4sWLFRISIqvVqt69e+v48ePJGjAAAAASx5JCW3qQpMrutGnT5OHhocWLFyt79uz6/vvvJUmFChVSoUKF1KhRIzVu3FitWrVStWrV7C+cAAAAwJNntVhkTea/tif3+VJKkiq7Bw4cULVq1ZQ9e3ZJ//Rs/PvlES+99JLKly+vTz/9NBnCBAAAABIvScmuzWazJ7qSlDlzZknS1atXHeYVKlRIe/fufYzwAAAA8LgslpTZ0oMkJbu5cuXSuXPn7B/Htins3LnTYd7hw4fl7MxSvgAAAEgdSUp2y5Urp7/++svettCwYUMZhqE+ffro4MGDunHjhj755BNt375dZcuWTdaAAQAAkDixS48l95YeJCnZbd68ucLDw7Vw4UJJUunSpdW6dWvt3r1bxYsXl4+Pj/r16ydnZ2d9+OGHyRowAAAAkFBJ6jFo06aNXnjhBYcWhenTp6tUqVL6+eefdfXqVT311FPq06ePKlWqlGzBAgAAIPFSosc2nRR2k/4GNTc3N4ePXVxc1K9fP/Xr1++xgwIAAACSA0+PAQAAmBzr7AIAAMC00tLSY+PHj1dwcLDc3d1VuXJlbdmy5YFzp02bFuehOHd390RdL0GV3fz58yfqpP9msVh07NixJB8PAAAAc5gzZ4569eqliRMnqnLlyhozZowaNWqkQ4cOyd/fP95jvLy8dOjQIfvHiV0FIkHJ7smTJxN1UgAAAKQdKbFUWFLON3r0aIWGhqpjx46SpIkTJ2rhwoW
aMmXKA5/7slgsCgwMTHKcCUp2bTZbki+A9K1a+1ZyzuSR2mEA+I8Z206ldggAHuDOrRupHcITFRER4fCxm5tbnIUMJCkqKkrbt29X//797WNWq1X169fXxo0bH3j+mzdvKm/evLLZbCpXrpxGjBih4sWLJzg+enYBAABMzppCmyQFBQXJ29vbvo0cOTLeGMLDwxUTE6OAgACH8YCAAIWFhcV7TOHChTVlyhT98ssv+v7772Wz2VStWjX9/fffCb53VmMAAABAkp05c0ZeXl72j+Or6iZV1apVVbVqVfvH1apVU9GiRTVp0iQNHz48Qecg2QUAADC5lOzZ9fLyckh2H8TX11dOTk66cOGCw/iFCxcS3JPr4uKismXL6ujRowmOkzYGAAAApDhXV1eVL19eK1assI/ZbDatWLHCoXr7MDExMdq7d69y5MiR4OtS2QUAADA5i0WypoHXBffq1UsdOnRQhQoVVKlSJY0ZM0a3bt2yr87Qvn175cqVy973O2zYMFWpUkUFCxbUtWvX9Mknn+jUqVPq3Llzgq9JsgsAAIAnolWrVrp06ZIGDRqksLAwlSlTRkuWLLE/tHb69GlZrf80Hly9elWhoaEKCwtT1qxZVb58eW3YsEHFihVL8DVJdgEAAEzOmgKV3aSer3v37urevXu8+1atWuXw8eeff67PP/88aRf6fyS7AAAAJpdWXiqRGpIl2T1y5IjCw8OVPXt2PfXUU8lxSgAAAOCxJXk1hsjISL3//vvy9fVVkSJFVKNGDX300Uf2/d9//73KlSunXbt2JUecAAAASKLYNobk3tKDJCW7d+7cUZ06dTRq1Ci5urqqcePGMgzDYU69evW0e/duzZ07N1kCBQAAABIrScnuxx9/rM2bN+u1117T8ePH9dtvv8WZkzNnThUrVkzLly9/7CABAACQdBZLymzpQZKS3Tlz5ihPnjyaMGGC3N3dHzivcOHCOnPmTJKDAwAAAB5Hkh5QO3HihJo0aSJn54cf7urqqqtXryYpMAAAACQPq8UiazKXYpP7fCklSZXdTJkyJSiJPXHihLJmzZqUSwAAAACPLUnJbpkyZbRt2zZdunTpgXNOnDihnTt3qmLFikkODgAAAI/PmkJbepCkOENDQ3Xjxg21adNG4eHhcfZfu3ZNr732mu7du6fXX3/9sYMEAABA0mXkB9SS1LPbpk0b/fbbb5o9e7by58+vatWqSZLWr1+v5557TqtXr1ZERITat2+vpk2bJmvAAAAAQEIluQI9c+ZMjRo1Su7u7lq6dKmk+29S++2332SxWPThhx9q6tSpyRYoAAAAksYqi/0htWTblD5Ku0l+XbDFYlHv3r3Vq1cv7dixQydPnpTNZlPu3LlVsWJFubq6JmecAAAAQKIlOdmN5eTkpIoVK/IgGgAAQBqVEj226aVnN708SAcAAAAkWpIqu6+99lqC51osFk2ePDkplwEAAEAysFrub8l9zvQgScnutGnTHjnHYrHIMAySXQAAAKSaJCW7K1eujHfcZrPpzJkzWrp0qWbPnq2ePXuqWbNmjxUgAAAAHo/Fkvyv900vPbtJSnZr16790P3t27dXkyZN1KFDBzVv3jxJgQEAACB58IBaCmjTpo2KFy+uIUOGpNQlAAAAgIdK0dUYChUqpG3btqXkJQAAAPAIsQ+oJfeWHqRYsmuz2bRnzx5ZraxuBgAAgNSR7Jno7du3tWvXLrVp00ZHjhx5ZH8vAAAAUpYlhf5JD5L0gJqTk9Mj5xiGIT8/P33yySdJuQQAAADw2JKU7AYFBcnygEfwXF1dlSNHDtWuXVvdunWTv7//YwUIAACAx8NLJRLp5MmTyRwGAAAAkPySlOz++uuvcnFx0bPPPpvc8QAAACCZZeTKbpIeUHv++ec1duzY5I4FAAAASFZJquz6+fkpa9asyR0LAAAAUoDFYnng81aPc870IEnJbp06dbRlyxYZhpFubhQAACCjoo0hkYYPH67w8HD17Nl
Td+/eTe6YAAAAgGSRpMrurFmz1LhxY40bN06zZ89W/fr1lSdPHrm7u8eZa7FYNHDgwMcOFAAAAEljsdzfkvuc6UGCkt38+fOrZcuWGjVqlCRpyJAhslgsMgxDFy9e1A8//PDAY0l2AQAAkFoSlOyePHlSly5dsn88derUFAsIAAAAyctqsciazKXY5D5fSklSG0OHDh2SOw4AAAAg2SUp2QUAAED6wWoMAAAAgAkluLK7a9cuDRs2LEkXGTRoUJKOAwAAQDJIgdUYlE4quwlOdnfv3q3du3cn6uSxL50g2QUAAEg9VllkTebsNLnPl1ISnOwWKFBA1atXT8lYAAAAgGSV4GS3Ro0amjJlSkrGAgAAgBSQkV8qwQNqAAAAMC2WHgMAADA5lh4DAAAATIjKLgAAgMnxuuBHsNlsKR0HAAAAkOyo7AIAAJgcqzEAAAAAJkRlFwAAwOSsSoGeXbO9QQ0AAADpE20MAAAAgAlR2QUAADA5q5K/wpleKqbpJU4AAAAg0ajsAgAAmJzFYpElmZtsk/t8KYXKLgAAAJ6Y8ePHKzg4WO7u7qpcubK2bNmSoONmz54ti8WiFi1aJOp6JLsAAAAmZ0mhLbHmzJmjXr16afDgwdqxY4dKly6tRo0a6eLFiw897uTJk3rvvfdUs2bNRF+TZBcAAABPxOjRoxUaGqqOHTuqWLFimjhxojJnzqwpU6Y88JiYmBi1a9dOQ4cOVf78+RN9TZJdAAAAk7NaLCmySVJERITDFhkZGW8MUVFR2r59u+rXr/9PXFar6tevr40bNz4w9mHDhsnf31+dOnVK2r0n6SgAAACkKynVwhAUFCRvb2/7NnLkyHivHx4erpiYGAUEBDiMBwQEKCwsLN5j1q1bp8mTJ+ubb75J0j1LrMYAAACAx3DmzBl5eXnZP3Zzc0uW8964cUOvvvqqvvnmG/n6+ib5PCS7AAAAJpeSrwv28vJySHYfxNfXV05OTrpw4YLD+IULFxQYGBhn/rFjx3Ty5Ek1a9bMPmaz2SRJzs7OOnTokAoUKPDI69LGAAAAgBTn6uqq8uXLa8WKFfYxm82mFStWqGrVqnHmFylSRHv37tWuXbvsW/PmzVW3bl3t2rVLQUFBCboulV0AAACTSysvlejVq5c6dOigChUqqFKlShozZoxu3bqljh07SpLat2+vXLlyaeTIkXJ3d1eJEiUcjvfx8ZGkOOMPQ7ILAACAJ6JVq1a6dOmSBg0apLCwMJUpU0ZLliyxP7R2+vRpWa3J23hAsgsAAGByViV/72pSz9e9e3d179493n2rVq166LHTpk1L9PXo2QUAAIBpUdkFAAAwubTSs5saSHYBAABM7r8vgkiuc6YHtDEAAADAtKjsAgAAmFxGbmOgsgsAAADTorILAABgcmlp6bEnLb3ECQAAACQalV0AAACTo2cXAAAAMCEquwAAACbHOrsAAACACVHZBQAAMDmL5f6W3OdMD0h2AQAATM4qi6zJ3HiQ3OdLKbQxAAAAwLSo7AIAAJhcRm5joLILAAAA06KyCwAAYHKW//8nuc+ZHlDZBQAAgGlR2QUAADA5enYBAAAAE6KyCwAAYHKWFFhnN7307JLsAgAAmBxtDAAAAIAJUdkFAAAwOSq7AAAAgAlR2QUAADA5XioBAAAAmBCVXQAAAJOzWu5vyX3O9IDKLgAAAEyLyi4AAIDJ0bMLAAAAmBCVXQAAAJPLyOvskuwCAACYnEXJ33aQTnJd2hgAAABgXlR2gXSqWYkAtSybU9kyu+j45dsav+aEDl28Fe/cBkX81PvpAg5jUdE2NZ20xf6xu4tVnarkUbX8WeXl7qKwiLv6eU+YFu6/mKL3AZjRyh+/09KZk3T9yiXlLlhUbXoNVb7iZR553JZlv+rbQT1UulYDdRv1jSQpOvqefpn0qfZuWKXwc6eVyTOLilaooRe69pWPX0AK3wnMIiMvPUayC6RDtQtm1xs18mrsqhM6eOGmXigdqBHNiqr
TD7t07U50vMfciozWaz/stn9sGI77u1TPq9K5vTVq2TFduBGp8kHeeqt2Pl2+dU+bTl5NydsBTGXr8t80b+wHatfnA+UrXlYr5kzRFz3ba9jsP+WVzfeBx4WfP6Mfx41QoTKVHMaj7t7R6UP71bTjW8pdqKhu37iu2Z8P1fg+nTVg6m8pfTtAukcbA5AOvVgmhxbvv6ilBy/p9NU7+mLVCUVG29SoqP8DjzEkXb19z75du3PPYX+xwCxafvCS9pyL0IUbkVr010UdD7+lIgEeKXw3gLksm/WtajRvrepNX1bOfIXUrs+HcnXLpPW/z33gMbaYGE0e/I6ad+4p35xBDvsye3qp59jvVaF+UwXmLaD8Jcqp7bvDdOrgXl0OO5vStwOTsKTQP+kByS6QzjhbLSrk56Gdf1+3jxmSdv59XUUDPR94XCYXJ81oX1Yz25fVkMZPKW+2TA77/wq7oSrBWZXdw0WSVDqXl3L5ZNL209fjOx2AeETfi9LpQ/tUtGJ1+5jValXRitV1fN+OBx73+5QvlCVrdtVo3ipB17l984YsFosyZ/F67JgBs6ONIZmsWrVKdevW1dWrV+Xj45Pa4cDEvNyd5WS16Optx8rs1dv3FJQ1U7zH/H31jj7785iOX74tD1cntSyTU2NeKK7QWXsUfitKkjR+zUm9Uze/ZoWUV3SMTTZJY1Ye197zN1L6lgDTuHntqmwxMXHaFbJk89P5U8fiPebI7q1a99tcDfxuUYKucS/yrhZ89ZEqNmiuTB5ZHjtmZAwZeekxKrv/LyQkRBaLRRaLRS4uLsqXL5/69Omju3fvpnZoj2XVqlWyWCy6du1aaoeCVHTgwk0tPxSu4+G3tffcDQ1dcljX7karSfF/2h6eKxWoIgGeGrTwoLrN26ev159S91r5VDY3lSMgpdy9dVNThvbUq/1HKotPtkfOj46+p0n/6y7DMNSuzwdPIEIg/aOy+y/PPPOMpk6dqnv37mn79u3q0KGDLBaLRo0aldqhAXYRd6MVYzOUNbOLw3jWzC66cjsqQeeIsRk6dumWcnq7S5JcnSzqWCVIQxcf1pZT1yRJJy7fVgFfD71UJqd2/h2RrPcAmJWnT1ZZnZwUcSXcYfzGlUvyzu4XZ/6ls6d0+fzfGt+7s33MsNkkSV1qFNCw2X/KP3deSfcT3a8HdNOVsL/V68tZVHWRKBYl/7q46aSwS2X339zc3BQYGKigoCC1aNFC9evX17JlyyRJNptNI0eOVL58+ZQpUyaVLl1aP/7440PPt27dOtWsWVOZMmVSUFCQevTooVu37i8N9f7776ty5cpxjildurSGDRsmSdq6dasaNGggX19feXt7q3bt2tqxw7Hny2Kx6Ntvv9Xzzz+vzJkzq1ChQvr1118lSSdPnlTdunUlSVmzZpXFYlFISMhjfY6Q+qJtho5cuqUyub3tYxZJZXJ76UDYzQSdw2qR8mXPrCv/3wrhbLXKxckaZ4UGm2Gkm6VlgLTA2cVVeQqX0MFtG+xjNptNB7ZtUP4S5eLMD8xbQIO//0MDpy+yb6Vq1lfhclU1cPoiZQvIIemfRPfi3yfVc+xMeXpnfWL3BHOwyiKrJZm3dJLukuw+wL59+7Rhwwa5urpKkkaOHKnvvvtOEydO1P79+9WzZ0+98sorWr16dbzHHzt2TM8884xefPFF7dmzR3PmzNG6devUvXt3SVK7du20ZcsWHTv2Tw/X/v37tWfPHrVt21aSdOPGDXXo0EHr1q3Tpk2bVKhQITVu3Fg3bjj2UA4dOlQvv/yy9uzZo8aNG6tdu3a6cuWKgoKCNH/+fEnSoUOHdP78eX3xxRfxxhsZGamIiAiHDWnX/F3n1biYvxoU9lVQVnf1qJNP7s5O+uPAJUlS76cL6LUq/zzR3a5CLpUP8lagl5sK+mZW3/oF5Z/FTYv/ur+G7u17Mdp9NkKh1fKoVE4vBWZxU4Mifqpf2E/rj7PsGJAYDdp01tpfZ2nDwh91/uR
Rzfx4gKLu3lb1pi0lSVOG9tKCr+7/xdDFzV25ChR22DJ7esnNw0O5ChSWs4vr/daF99/UqYN71WnIGNlsMbp++aKuX76o6HsJ+2sOkJHRxvAvv//+uzw9PRUdHa3IyEhZrVZ9+eWXioyM1IgRI7R8+XJVrVpVkpQ/f36tW7dOkyZNUu3ateOca+TIkWrXrp3eeecdSVKhQoU0duxY1a5dWxMmTFDx4sVVunRp/fDDDxo4cKAkaebMmapcubIKFiwoSapXr57DOb/++mv5+Pho9erVatq0qX08JCREbdq0kSSNGDFCY8eO1ZYtW/TMM88oW7b7PWD+/v4PfXBu5MiRGjp0aNI+cXjiVh+9LO9MzmpfOUhZM7voePhtDfj9oH05Mf8sbg5V2ixuznqnbn5lzeyim5HROnLxlt6Zv0+nr96xzxmx9IheqxKkfg0KKou7sy7eiNS0Taf1+/4LT/r2gHStYv1munH1in799nNFXL6k3IWKqsfn0+WV7X4bw5ULZ2VJxJ9Mrl0K0+61yyVJw9s3dtj37vhZKlyuavIFD9PKyG0MJLv/UrduXU2YMEG3bt3S559/LmdnZ7344ovav3+/bt++rQYNGjjMj4qKUtmyZeM91+7du7Vnzx7NnDnTPmYYhmw2m06cOKGiRYuqXbt2mjJligYOHCjDMDRr1iz16tXLPv/ChQv63//+p1WrVunixYuKiYnR7du3dfr0aYdrlSpVyv7vHh4e8vLy0sWLiXvrVf/+/R2uHRERoaCgoIccgdT2694L+nVv/Ilo75//cvh44vpTmrj+1EPPd/X2PX325/Fkiw/IyOq17KB6LTvEu++9r+Y89NiOAz9z+Ng3R5C+3ngyuUIDMhyS3X/x8PCwV1WnTJmi0qVLa/LkySpRooQkaeHChcqVK5fDMW5ubvGe6+bNm3rjjTfUo0ePOPvy5MkjSWrTpo369u2rHTt26M6dOzpz5oxatfpnjcUOHTro8uXL+uKLL5Q3b165ubmpatWqiopy/LOVi4vjg0oWi0W2/3/AIaHc3NweeC8AACCdy8ClXZLdB7BarXr//ffVq1cvHT58WG5ubjp9+nS8LQvxKVeunP766y978hyf3Llzq3bt2po5c6bu3LmjBg0ayN//n6Wg1q9fr6+++kqNG9//s9WZM2cUHh7+oNPFK7bnOCYmJlHHAQAAmAEPqD1Ey5Yt5eTkpEmTJum9995Tz549NX36dB07dkw7duzQuHHjNH369HiP7du3rzZs2KDu3btr165dOnLkiH755Rf7A2qx2rVrp9mzZ2vevHlq166dw75ChQppxowZOnDggDZv3qx27dopU6b4XxrwIHnz5pXFYtHvv/+uS5cu6ebNhD2tDwAAzIPXBSNezs7O6t69uz7++GP1799fAwcO1MiRI1W0aFE988wzWrhwofLlyxfvsaVKldLq1at1+PBh1axZU2XLltWgQYOUM2dOh3kvvfSSLl++rNu3b6tFixYO+yZPnqyrV6+qXLlyevXVV9WjRw+Hym9C5MqVS0OHDlW/fv0UEBAQJ9kGAAAwM4th/HdlTeD+A2re3t6q+8lyOWfySO1wAPxHy/I5UjsEAA9w59YNvV2/pK5fvy4vr9R9C2Xsz/MVu07LM0vyxnLzRoSeLpMnTdznw9CzCwAAYHIZ+Pk02hgAAABgXlR2AQAAzC4Dl3ap7AIAAOCJGT9+vIKDg+Xu7q7KlStry5YtD5y7YMECVahQQT4+PvLw8FCZMmU0Y8aMRF2PZBcAAMDk0srSY3PmzFGvXr00ePBg7dixQ6VLl1ajRo0e+ObXbNmyacCAAdq4caP27Nmjjh07qmPHjvrjjz8SfE2SXQAAADwRo0ePVmhoqDp27KhixYpp4sSJypw5s6ZMmRLv/Dp16uj5559X0aJFVaBAAb399tsqVaqU1q1bl+BrkuwCAACYnMWSMpt0f3mzf2+RkZHxxhAVFaXt27erfv369jGr1ar69etr48aNj7w
HwzC0YsUKHTp0SLVq1UrwvZPsAgAAIMmCgoLk7e1t30aOHBnvvPDwcMXExCggIMBhPCAgQGFhYQ88//Xr1+Xp6SlXV1c1adJE48aNU4MGDRIcH6sxAAAAmFxKLsZw5swZh5dKuLm5Jet1smTJol27dunmzZtasWKFevXqpfz586tOnToJOp5kFwAAAEnm5eWVoDeo+fr6ysnJSRcuXHAYv3DhggIDAx94nNVqVcGCBSVJZcqU0YEDBzRy5MgEJ7u0MQAAAJidJYW2RHB1dVX58uW1YsUK+5jNZtOKFStUtWrVBJ/HZrM9sC84PlR2AQAATC6pS4U96pyJ1atXL3Xo0EEVKlRQpUqVNGbMGN26dUsdO3aUJLVv3165cuWy9/2OHDlSFSpUUIECBRQZGalFixZpxowZmjBhQoKvSbILAACAJ6JVq1a6dOmSBg0apLCwMJUpU0ZLliyxP7R2+vRpWa3/NB7cunVLXbt21d9//61MmTKpSJEi+v7779WqVasEX9NiGIaR7HeCdC8iIkLe3t6q+8lyOWfySO1wAPxHy/I5UjsEAA9w59YNvV2/pK5fv56gXtaUFPvzfO2+v+WZJXljuXkjQjVL5E4T9/kw9OwCAADAtGhjAAAAMLmUXHosraOyCwAAANOisgsAAGB2Gbi0S2UXAAAApkVlFwAAwOTSyjq7qYFkFwAAwOQslvtbcp8zPaCNAQAAAKZFZRcAAMDkMvDzaVR2AQAAYF5UdgEAAMwuA5d2qewCAADAtKjsAgAAmFxGXnqMyi4AAABMi8ouAACAybHOLgAAAGBCVHYBAABMLgMvxkCyCwAAYHoZONuljQEAAACmRWUXAADA5Fh6DAAAADAhKrsAAAAmx9JjAAAAgAlR2QUAADC5DLwYA5VdAAAAmBeVXQAAALPLwKVdkl0AAACTY+kxAAAAwISo7AIAAJhdCiw9lk4Ku1R2AQAAYF5UdgEAAEwuAz+fRmUXAAAA5kVlFwAAwOwycGmXyi4AAABMi8ouAACAyWXkdXZJdgEAAEzOkgJLjyX7UmYphDYGAAAAmBaVXQAAAJPLwM+nUdkFAACAeVHZBQAAMLsMXNqlsgsAAADTorILAABgchl56TEquwAAADAtKrsAAAAmZ1EKrLObvKdLMVR2AQAAYFpUdgEAAEwuAy/GQLILAABgdrwuGAAAADAhKrsAAACml3EbGajsAgAAwLSo7AIAAJgcPbsAAACACVHZBQAAMLmM27FLZRcAAAAmRrILAABgcrE9u8m9JcX48eMVHBwsd3d3Va5cWVu2bHng3G+++UY1a9ZU1qxZlTVrVtWvX/+h8+NDsgsAAGBylhT6J7HmzJmjXr16afDgwdqxY4dKly6tRo0a6eLFi/HOX7Vqldq0aaOVK1dq48aNCgoKUsOGDXX27NkEX5NkFwAAAE/E6NGjFRoaqo4dO6pYsWKaOHGiMmfOrClTpsQ7f+bMmeratavKlCmjIkWK6Ntvv5XNZtOKFSsSfE2SXQAAALOzpNAmKSIiwmGLjIyMN4SoqCht375d9evXt49ZrVbVr19fGzduTNBt3L59W/fu3VO2bNkSfOskuwAAAEiyoKAgeXt727eRI0fGOy88PFwxMTEKCAhwGA8ICFBYWFiCrtW3b1/lzJnTIWF+FJYeAwAAMLmUXHrszJkz8vLyso+7ubkl85Xu++ijjzR79mytWrVK7u7uCT6OZBcAAABJ5uXl5ZDsPoivr6+cnJx04cIFh/ELFy4oMDDwocd++umn+uijj7R8+XKVKlUqUfHRxgAAAGByaWHpMVdXV5UvX97h4bLYh82qVq36wOM+/vhjDR8+XEuWLFGFChUSfe9UdgEAAPBE9OrVSx06dFCFChVUqVIljRkzRrdu3VLHjh0lSe3bt1euXLnsfb+jRo3SoEGD9MMPPyg4ONje2+vp6SlPT88EXZNkFwAAwOSSui7uo86ZWK1atdKlS5c0aNAghYWFqUyZMlqyZIn9obX
Tp0/Lav2n8WDChAmKiorSSy+95HCewYMHa8iQIQm6JskuAAAAnpju3bure/fu8e5btWqVw8cnT5587OuR7AIAAJhdSi7HkMaR7AIAAJhcBs51WY0BAAAA5kVlFwAAwOSSslRYQs6ZHlDZBQAAgGlR2QUAADC95F96LL107VLZBQAAgGlR2QUAADA5enYBAAAAEyLZBQAAgGnRxgAAAGBytDEAAAAAJkRlFwAAwOQsKbD0WPIvZZYyqOwCAADAtKjsAgAAmBw9uwAAAIAJUdkFAAAwOYuS/+W+6aSwS2UXAAAA5kVlFwAAwOwycGmXZBcAAMDkWHoMAAAAMCEquwAAACbH0mMAAACACVHZBQAAMLkM/HwalV0AAACYF5VdAAAAs8vApV0quwAAADAtKrsAAAAmxzq7AAAAgAlR2UW8DMOQJEXfvZXKkQCIz51bN1I7BAAPcPfWTUn//CxNC27ciEj2dXFv3IhI3hOmEJJdxOvGjfs/SNcOfC6VIwEQn5WpHQCAR7px44a8vb1TNQZXV1cFBgaqUL6gFDl/YGCgXF1dU+TcycVipKVfO5Bm2Gw2nTt3TlmyZJElvbwiBQ8UERGhoKAgnTlzRl5eXqkdDoB/4fvTfAzD0I0bN5QzZ05ZranfMXr37l1FRUWlyLldXV3l7u6eIudOLlR2ES+r1arcuXOndhhIZl5eXvwwBdIovj/NJbUruv/m7u6e5hPSlJT6v24AAAAAKYRkFwAAAKZFsgtkAG5ubho8eLDc3NxSOxQA/8H3J5CyeEANAAAApkVlFwAAAKZFsgsAAADTItkFAACAaZHsAgAAwLRIdgEAAGBaJLsAHtu/F3WJiYlJxUgAAHBEsgvgsRiGIYvFoitXrkiSnJyctGHDBm3bti2VIwMAgGQXwGOyWCy6dOmSGjdurPHjx+v3339XjRo1dPPmzdQODTA1m81m//fo6Og4YwDuI9kF8Nhu376tp59+Wh999JFatmyp2bNnq06dOrQ0ACnIarXqzJkzunfvnpydnfXbb7/pww8/JOEF/oNkF8Bjy5s3r6pXr66zZ8/Ky8tLly9flnS/pYEfvEDKuHPnjp577jnVqFFDs2bN0nPPPafChQvLauVHO/BvvC4YwGOx2WyyWq3as2ePjhw5oj179mj27Nl644031KtXL4c5AJLX8ePHVaVKFd24cUPjx4/Xa6+9ppiYGDk5OaV2aECa4ZzaAQBIn2IfTLt06ZIyZcqk4sWLq1SpUipVqpTu3r2rSZMmyWq16p133pHVatX8+fOVL18+lStXLrVDB0zDzc1NN2/elJubm2bOnKlXX31VLi4u/IIJ/AuVXQBJ9vPPP6tPnz5yd3eXl5eX5s+fr4CAAJ04cUKTJk3STz/9pCZNmsjT01MffPCBjh07pnz58qV22ICpnDhxQnfu3FHjxo0VHBysZcuWOSS80dHRcnamtoWMi2QXQKLEVnT/+usvVa9eXf3791fmzJk1Z84cHTt2TH/88YdKliypU6dOac6cOfr+++/l7u6uSZMmqWzZsqkdPpCuxX7/HThwQBcuXFDu3LlVsGBBSdLWrVvVsmVL5c+fX3/88YdcXFz05ZdfKiIiQv3795fFYknl6IHUQbILINE2bdqkGzduaMOGDRo8eLAkKTw8XK+++qp2796tpUuXqkSJEoqOjlZ0dLTu3LmjrFmzpnLUgDksWLBAISEh8vPz04kTJ/Txxx8rJCREvr6+2rp1q1q3bi2LxaIqVapozpw52rFjh0qWLJnaYQOphoYeAIly69Ytde3aVY0aNdKJEyfs476+vpoxY4ZKly6tJk2aaPfu3XJ2dpa7uzuJLvCYYutSZ86c0ciRI/XJJ59o1apV+vTTTzVkyBB98cUXunjxoipWrKg1a9aoVq1aypQpk3bu3EmiiwyPyi6ARNuzZ4/69OmjQ4cOafPmzfL397f/efXy5ctq1qyZrl27pl27dsnV1TW1wwVMYfny5dqxY4eOHj2qceP
Gyc3NTZI0YcIE9e3bV2+//ba6du2qHDlySJKioqL4/gNEsgvgEWKT2H+LiYnRoUOH9MorrygqKkrr1q2Tj4+Pw6uDb968qTx58qRS1ID5DBgwQCNHjlT+/Pm1Zs0a5cyZ075vwoQJ+t///qeQkBC999579oQXAMkugIeITV43btyotWvX6ubNm2rcuLGqVKkiSTpw4IDatm2re/fuxUl4ASS/zz77TL1799Znn32m0NBQeXp62veNHj1aY8aM0fbt2+Xn55eKUQJpC8kugIdasGCBunTpohIlSsjDw0MLFy7U999/r7Zt20q6n/B26NBBZ8+e1V9//SVvb+9UjhhI/2J/aYyJiVFMTIxDO8L//vc/ffTRRxo7dqw6dOggDw8P+75r167Jx8cnFSIG0i4W3gPwQBs3blTXrl01YsQIde7cWX///beCg4PVsWNHXblyRd27d1fRokU1ZcoUdevWTZcvXybZBR5TbKL7xx9/6LvvvtPJkyfVoEEDtW3bVk899ZQ++OADGYahHj16yMnJSe3atbNXePn+A+Ii2QUQr5iYGO3atUuvv/66OnfurDNnzqhGjRp644035Ofnp3feeUeenp4KCQlRiRIltGzZMh6GAZKBxWLRL7/8oldffVVt27ZVq1atNHz4cB0+fFhdunRRrVq19OGHH8rJyUlvvvmmXFxc1LFjR1ksFlqIgHjQxgAgjosXL8rf318HDhzQzZs3VaJECT377LMqVKiQJk2apFOnTqlMmTK6ceOGJk2apNDQ0NQOGTCN/fv364UXXlDPnj3VpUsXGYYhf39/3bt3T1WqVNHAgQNVvXp1SdIHH3ygF198UUWLFk3lqIG0i8ouAAd79+5VvXr1tHfvXvsP0CNHjigiIkIhISGyWq1ydXXViy++qCJFith/6AJIHpGRkWrbtq06deqkv//+WzVr1lS7du3UsWNHValSRZ6enrp9+7YaNGig//3vf6kdLpDmkewCcFCyZEnlypVLn3zyiT799FNZLBZdvXpVu3bt0tWrV+3V3IMHD+rLL79U5syZUztkwBT27Nkjd3d3FS1aVFmyZJGTk5P69Oljb1vw8PBQ1apVtWDBAmXOnFk1atRQpkyZUjtsIM3jDWoA7KKjo2Wz2fTSSy9p586dunLliiSpUqVK6tatm5o3b66qVatq7NixGj9+PIkukAwMw9DFixf1wgsvaM2aNcqUKZMKFSqkmJgYnT17ViVLlrSvuFC0aFH98MMPGjJkCIkukED07ALQlStXlC1bNvvH586dU/HixfXuu+86/Jn0999/1+3bt1WxYkXly5cvNUIFTOvtt9/W77//rq1btypbtmwKDw/X008/rbJly6ply5Zav369pk+frj179ih79uypHS6QbpDsAhncpk2b1K9fP5UoUUIjRoyQm5ub3NzcNHbsWM2cOVNTp05VsWLFUjtMwLRiX+t78OBBderUSZ06ddJrr70mSVq1apXatm2rLFmy6N69e5o/f77Kli2byhED6QttDEAG5+/vr5o1a2rDhg0qXbq0hg8frv3796tx48a6ceOGDh06JOn+UmQAks9ff/2lGzdu2JfsK1y4sHLlyqXvvvvOPqdOnTravHmzFi1apE2bNpHoAklAZRfIYGIXrL9+/bqio6Md/hw6ZMgQ7dixQytWrNCIESM0depU3bx5U9u2beOtTEAyOnHihFq3bq0zZ85ozJgxKlasmEqUKKFTp06pevXq6t+/v7p165baYQKmQLILZCCxie5vv/2mr776SkeOHFG5cuVUoUIF9enTR5J0/fp1/f7775o4caKOHz+u69ev69ixYwoICEjl6AHzuHfvnk6cOKEvv/xSa9askWEYatOmjVq2bKnPP/9c0dHRGjNmjNzc3HhRBPCYSHYBk4tNcGMtXLhQL730kj788EOVKFFCixcv1hdffKEVK1aobt269nlnz57VoUOHlDdvXhUoUCA1QgdMI/b78OjRo7p27Zqio6NVpUo
VSdLWrVu1ZcsWDRw4UPXq1dOhQ4e0f/9+bdy4UZUrV07lyIH0j2QXyABiYmLk5OSkO3fuKCQkRGXLllW/fv0UHh6usmXL6vnnn9fYsWNTO0zAlGIT3QULFmjgwIGKiYmRxWJRtmzZtGDBAvtfTY4fP6758+dryZIlWrlypQ4ePKinnnoqlaMH0j8eUANMavLkyWrZsqUkycnJyf6/R48eVdGiRXX+/HmVKVNGzz77rD3RnTdvnjZu3JhqMQNmZLFYtGrVKrVv3149e/bU9u3b9eWXX2rjxo1atGiRJMlmsyl//vx67733tGLFCp07d45EF0gmJLuACd27d09Xr17VgQMHFBoaah+Pjo5W0aJFtX37dlWvXl2NGzfWpEmTJEnh4eFasmSJDh48KJvNllqhA+le7MtY/v19tHnzZnXs2FGdO3fWpUuX1LlzZ3Xp0kUdO3aUJFmt938cx656EhgY+ISjBsyLZBcwIRcXF3Xp0kVdu3bV1q1b7Wt2Zs6cWdWqVdMHH3yg3Llza+zYsfZ+3s8//1xr1qxRnTp17D94ASTO3Llz5efnp4MHD8pqtdoT3p07d9p/Ca1Zs6YaNmyo8ePHS5KmTp1q/3dnZ+dUix0wK76rABMyDEOenp5q3769bDabvv32W3Xs2FFTp05V165ddenSJQ0fPlzvvPOOnJycdOvWLf38889atWoVb0YDHkOVKlXUoEED1atXT3/++aeKFCkiSXrxxRc1ZcoUFS5cWC1atNCkSZNks9lks9m0bds2Wa1W3b17V+7u7ql8B4D5UL4BTCi2Whub8IaGhmr79u0KCQmRJA0ePFjjx4/XjRs3dPDgQWXLlk0bNmxQmTJlUi9owATy5MmjyZMnq2zZsqpVq5YOHjwo6f4LI65duyYfHx+1bt1aknTz5k0NHjxYCxYsUPfu3Ul0gRTCagyAicQ+9X38+HHduXNH0dHRKl26tO7du6dvv/1WEydOVLly5TR16lRJ0u3bt5U5c2bZbDZaF4Bk9Pfff+v111/Xtm3btGrVKhUrVkzr1q1Tt27dZLFYFBMToxw5cmjfvn1auHAhb0YDUhDJLmASsYnuTz/9pPfee0/e3t46ceKEnnvuOXXr1k2lSpXSt99+q2+++UYVKlTQt99+m9ohA6Z24cIFdejQQdu3b9fq1atVrFgx7du3T4cPH9b69etVtmxZVatWTfnz50/tUAFTI9kFTGTNmjVq1qyZRo0apS5duuj7779X+/btNWXKFIWEhOjmzZv6/vvvNXLkSDVv3lzjxo1L7ZCBdC/2F81t27bpr7/+0vXr11WlShVVrFhRV65cUbt27bRt2zZ7wgvgySLZBUwg9oftoEGDdPz4cX3//fc6ceKEGjZsqLp16+rrr7+WdH9Zo7t372rWrFmqV68eFSUgmcyfP1+vv/66atasqdOnT8tqtaphw4YaMWKE/v77b73xxhvatWuXli5dquLFi6d2uECGQpMekM7EtwZu7ANpYWFhKl26tGJiYlSjRg09/fTT9nV058yZo59++kkeHh7q1KkTiS6QTPbu3asePXpoxIgR+vnnnzV58mTt37/f/n2ZO3duTZ48WcHBwXr++ed17969VI4YyFhIdoF0JPZBslOnTunbb7/V559/ri1bttj3Fy9eXB9//LFy5sypli1b6ssvv5TFYpFhGFq0aJHWrFmjyMhI+w9hAAn3oJetHD58WHny5NEbb7yhEydO6Pnnn1f79u314YcfSpL279+vwMBAzZ8/XytWrJCLi8uTDBvI8FhnF0gnYhPdPXv2qEmTJgoODtamTZvsrxgNDQ1Vy5YttW7dOq1Zs0ZvvvmmnJ2ddefOHQ0fPlzLli3TypUr5ebmltq3AqQ7sd9/Z86c0dKlS2Wz2VSkSBHVrFlTLi4uCggI0JkzZ1SrVi01btxYX331lSRp7dq1+uOPP/TWW2/xVjQglZDsAulA7A/avXv3qkqVKurTp4969+6tq1evqkqVKpo/f75
CQ0OVM2dOvfbaa7p69aoqVKigihUryjAMHTp0SAsXLlThwoVT+1aAdOffv2g2b95cAQEBOnbsmHx8fDR69GiVKlVKixYt0uLFi9WlSxd98cUX9mPnzp2rkydPsoYukIpoYwDSgdjWhapVq+q5557TkCFD5OHhody5c6tYsWLatWuXzpw5I0l69tlnNX/+fH388ceqUKGCXn75ZfsyRwAS59+JbtWqVdWmTRutXLlSs2fP1p07dzRx4kQFBwdrwoQJMgxDuXPn1unTp3Xs2DH16dNHM2fO1EcffSRvb+/UvhUgw2I1BiCdOHnypGrWrKkKFSqoV69eqlmzpj755BP17dtXBQsWVIkSJSRJZcqUUZcuXZQ1a1Z6A4FkcObMGZUrV05169bV3Llz7eOVKlXStWvXtHXrVjk7O2vOnDnq1q2bAgIClDlzZlksFn3//ff8ogmkMpJdIB2IrS4dOnRIL774ogoXLiw/Pz/NmzdPs2bNUnBwsAzD0NSpU7V69Wrt2rVLDRs21KxZs+Tu7s7b0YDHcPLkSb388svKkSOH+vTpo+rVq2vkyJEaMGCAKlSooBw5cih79uxq2rSpfHx8dOfOHeXNm1d+fn4KCAhI7fCBDI9kF0gnYhPegwcPqlWrVtq7d68+/fRT9erVyz4ndr3dmTNnqlq1asqXL18qRgyYx5EjR9SjRw+5urrK399fv/zyi7766itVqlRJ27dv1759+zRu3Dh5eHioXLlymj9/fmqHDOD/kewC6Uhswnvs2DG1aNFCwcHB6t27t2rVqiVJio6OlrMzz50CKeHw4cPq3r271q5dq+HDh+u9995z2H/58mWtXLlSpUuXVqFChVIpSgD/RbILpFGxa3parVZ7khs7Hlvhfemll5Q3b171799fNWrUSM1wgQzh2LFj6tq1q5ycnPT+++/bv+/u3btHjzyQRtHIB6QRscnt3bt3Jd1Pco8cOWL/91ixyW+RIkX0448/6uzZs+rXr582btz45IMGMpgCBQroyy+/lGEY+uCDD7R+/XpJItEF0jCSXSCNsFqtOn78uN555x2dPXtWP/74o4oWLar9+/fHOzc24Z05c6ZsNpty586dClEDGU+hQoU0duxYubi46L333tOmTZtSOyQAD0EbA5CGrFmzRi1atFDp0qW1ceNGff3112rfvr39wbP/iomJkZOTE39CBVLBwYMHNXDgQH322WfKkydPaocD4AFIdoE0IjahHTVqlPr3768qVarou+++U8GCBR32P+xYAE9WVFSUXF1dUzsMAA9BGwOQRsTExEiS3N3dNWjQIF24cEFDhgzRzp07JUkWi0X//t00tsc3dh+AJ49EF0j7qOwCqSy2KvvfZcOWLl2qN954Q9WqVVOfPn1UunRpSdLGjRtVtWrV1AoXAIB0hWQXSEWxie6KFSv0008/6erVqypWrJhCQ0Pl7++vpUuXqkuXLqpevbpat26tHTt2aPDgwQoLC5Ofnx8VXQAAHoFkF0hlP//8s9q0aaNXXnlFp06d0tWrV3Xp0iWtWbNGefLk0YoVK/Tee+/JZrMpIiJCP/74o8qXL5/aYQMAkC6Q7AJP0H8fJAsPD1eDBg3Utm1b9e7dW5K0b98+vfvuuzpy5Ii2bNkiX19fnTx5UhEREfLz81OOHDlSK3wAANIdHlADnoDY3ylv374t6Z+Hy27evKnz58+rTJky9rlFixbVxx9/rKxZs2r27NmSpODgYJUqVYpEFwCARCLZBZ4Ai8WiixcvKjg4WHPnzrW/ES0wMFBBQUFavXq1fa6Tk5NKlSolZ2dnHTp0KLVCBgDAFEh2gSfEarWqefPmevXVV/XLL7/YxypXrqw///xTCxYssM+1WCzKlSuXfHx8ZBiG6DYCACBp6NkFUkh8L3q4ePGiPvzwQ40bN07z58/X888/r8uXL6tdu3a6fv26KleurOrVq2vNmjX67rvvtHnzZhUpUiSV7gAAgPSPZBdIATabTVarVbdu3VJ
MTIy8vLzs+86fP68RI0Zo/Pjxmjdvnl588UVdvnxZH330kdavX6/w8HAFBgZq7NixDr28AAAg8Uh2gRRy5MgRvfzyy/L09FRoaKgCAwPVsGFDSVJkZKTeffddffXVV5ozZ45atmyp6OhoWSwWXblyRZkzZ5aHh0cq3wEAAOmf86OnAEgsm82madOmaffu3XJ3d9e1a9d0+/ZtZcuWTZUqVdJrr72mjh07Knv27GrVqpW8vLzUqFEjSZKfn18qRw8AgHlQ2QVSSFhYmEaNGqVjx46pYMGC6tatm2bOnKm1a9dqz549ypYtm/Lnz6/t27fr4sWLWrVqlWrVqpXaYQMAYCpUdoEUEhgYqN69e2vEiBFat26dChUqpEGDBkmSNm/erHPnzunrr7+Wv7+/Ll68KF9f31SOGAAA86GyC6Sw2AfSNm/erBYtWuj999+377t3755sNpuuX78uf3//VIwSAABzItkFnoCwsDB9+OGH2rp1q1q0aKF+/fpJkqKjo+XszB9YAABIKSS7wBMSm/Du3LlTTz/9tIYOHZraIQEAYHq8QQ14QgIDAzVgwAAVKlRIGzZs0OXLl1M7JAAATI/KLvCEXbhwQZIUEBCQypEAAGB+JLsAAAAwLdoYAAAAYFokuwAAADAtkl0AAACYFskuAAAATItkFwAAAKZFsgsAAADTItkFAACAaZHsAkgzLBaLw2a1WuXj46OaNWvq22+/VWovCz5t2jRZLBYNGTLEYTwkJEQWi0WrVq1KlbiSqk6dOrJYLDp58mSC5j/o/pMiODhYFovlsc/zKOn1awMg+ZDsAkhzOnTooA4dOqhdu3YqVqyY1q9fr9DQULVt2za1Q0sxyZlIAgD+4ZzaAQDAf02bNs3h42XLlqlx48aaPXu22rVrp6ZNm6ZOYA8wcuRI9evXT3ny5EntUAAA/0FlF0Ca16BBA7366quSpJ9//jl1g4lHjhw5VKRIEWXOnDm1QwEA/AfJLoB0oWzZspKkM2fO2McsFouCg4MVFRWlYcOGqUiRInJzc1OLFi3sc27fvq2RI0eqbNmy8vT0lKenp6pUqaLp06c/8Frr169X/fr1lSVLFvn4+KhRo0bavHnzA+c/rC/01q1bGjVqlCpUqCAvLy95eHioSJEi6tatmw4fPizpfu9sx44dJUlDhw516Fv+b5X7wIEDCgkJUVBQkNzc3BQQEKDWrVtr//798cYWExOjTz/9VEWKFJG7u7uCgoL09ttvKyIi4oH3k1jnz5/Xxx9/rNq1aytXrlxydXVVYGCgXnjhBW3duvWhxxqGoS+++ELFihWTu7u7cuXKpR49eujatWsPnD9r1izVq1dPWbNmlbu7u4oWLaohQ4bo9u3byXZPAMyDNgYA6cKNGzckSW5ubg7jNptNLVq00Jo1a1S7dm2VKlVK2bNnlyRdvHhRDRo00J49exQYGKjatWvLMAxt2LBBISEh2rZtm8aNG+dwvt9//13PP/+8oqOjValSJeXPn1+7d+9WrVq1FBISkqiYz58/rwYNGmj//v3KmjWr6tSpIzc3Nx0/flwTJ05UoUKF9NRTT+mZZ55RdHS01q9fr9KlS6tMmTL2cxQsWND+7z///LNat26tyMhIlSlTRlWqVNGZM2c0d+5c/fbbb1q8eLFq1arlEMMrr7yi2bNnK3PmzGrYsKGcnZ01ffp0rV+/Xi4uLom6nwf55Zdf1LdvXxUuXFilSpWSl5eXjhw5op9++km///67fv/9dzVs2DDeY9966y19/fXXqlOnjkqWLKnVq1dr3LhxWr16tdauXSsvLy/7XJvNpldeeUWzZs2Sp6enKlSooKxZs2rbtm0aOnSoFi9erFWrVilTpkzJcl8ATMIAgDRCkhHf/y3ZbDajatWqhiRjwIABceYXLFjQ+Pvvv+Mc17hxY0OS8fbbbxt37961j4eFhRkVKlQwJBmLFy+2j0dERBh+fn6GJGPKlCkO1+/bt6/9eoMHD3a4TocOHQxJxsq
VKx3Gn376aUOS8fLLLxs3btxw2HfixAlj9+7d9o+nTp0a77n/Pd/Dw8Pw9PQ0li1b5rBv8eLFhouLixEUFGRERkbax2fPnm1IMvLkyWOcOHHCPn7hwgWjRIkS9vv5976HeVCMe/bsMfbt2xdn/pIlSwxXV1ejQIEChs1mc9iXN29eQ5Lh5eVlbNu2zT5+48YNo169evav2799/PHHhiSjTp06xvnz5+3jkZGRRqdOnQxJRt++fR2OedDXBkDGQbILIM34b7IbHR1tHD582AgJCTEkGW5ubsbRo0fjzJ83b16cc+3cudOQZFSsWNGIiYmJs3/Hjh2GJKN58+b2sSlTphiSjFq1asWZHxUVZeTOnTvBye7mzZsNSYa/v78RERHxyHt/VLL79ttvG5KMcePGxbu/R48ehiRjwYIF9rFatWrFSdxjLV68ONmS3Ydp166dIcnYs2ePw3hssvv+++/HOWb//v2GxWIxPD09jTt37hiGYRj37t0zfH19DQ8PDyMsLCzOMbdv3zYCAwONrFmzOny9SXYB0LMLIM2J7Vd1dnbWU089pWnTpilLliyaNWuWChQoEGdus2bN4pxj6dKlkqQWLVrIao37f3WxPbxbtmyxj61du1aS1Lp16zjzXVxc9NJLLyX4HpYvXy5JatOmjbJkyZLg4x4k9n5eeOGFePfXrFlTkuz3c+/ePW3atEmS1KpVqzjzn3nmGWXNmvWx44oVGRmpX375RQMGDNDrr7+ukJAQhYSEaO/evZKkI0eOxHtcfJ/rYsWKqXTp0rp586Z27twpSdqxY4fCw8NVrVo1BQQExDkmU6ZMKl++vK5evfrAawHImOjZBZDmdOjQQZJktVrl5eWlkiVL6oUXXog3OfP394/TxyvJ/qKEAQMGaMCAAQ+81t27d+3/fu7cOUlS3rx5450bHByc0FuwP0j33+Q8qWLvJ1euXA+dFx4eLkm6fPmyoqKi5Ofn98BVIvLmzaurV68+dmx79+5V8+bNH/pyitie6/hiiE9wcLB27dpl/5rEnnvZsmWPfBlFeHi4Chcu/OjAAWQIJLsA0pz/rkDwMO7u7vGO22w2SVKNGjWSLeFMTbH3E/uLwINUrlz5SYRjZxiGXn75ZZ08eVJdunRRly5dlD9/fnl6espisej999/XyJEjH/vtd7H3X7BgQVWvXv2hc2MfUAQAiWQXgEnlzp1b0v02hnfffTdBx+TIkUOSdOrUqXj3P2g8PkFBQZKkY8eOJfiYh8mdO7eOHTumzz77LEHJXPbs2eXq6qpLly7pzp078a5QcPr06ceO6+DBgzp48KAqVKigCRMmxNl//Pjxhx5/6tQplSxZMt5xScqZM6ekf76eRYoUSdQvQwBAzy4AU2rQoIEk6aeffkrwMbF9r3Pnzo2zLzo6WvPnz0/wuerXry9JmjVrlm7evPnI+a6urvbrxCex9+Pi4mKv8sZ3P0uXLtWVK1cSdK6HiW2DiE1G/7tv2bJlDz0+vtgOHjyoXbt2ydPT074MW8WKFeXt7a3Vq1cnS9wAMg6SXQCmVLlyZTVo0EDr169Xt27d4n2Jwu7du7VkyRL7xy1btlT27Nm1atUqh5dOGIahwYMHJ6oSWqlSJdWtW1cXL17U66+/rlu3bjnsP3nypP3hLemfCuahQ4fiPd+7776rTJky6b333tOCBQvi7I+MjNSPP/6ov//+2z725ptvSlKc2MPDw9W7d+8E38vDFCxYUFarVX/++afDg2F3795Vly5dHpmYjhs3zv4QmnT/JSBvvfWWDMNQx44d7RVpNzc39enTRzdu3NALL7wQb8X47NmzmjFjRrLcFwATSd3FIADgH3rAOrsPm583b94H7r9w4YJRtmxZQ5Lh4+Nj1KlTx2jbtq3RpEkTIygoKN61XH/++WfDycnJkGRUrlzZaNOmjVGsWDHDxcXFCA0NTdQ6u3///bdRuHBhQ5KRLVs2o3nz5kbLli2NcuXKGVar1fj888/tc+/cuWP
4+/sbkozatWsbHTt2NDp16mSsX7/eIbbMmTPb1xZu1qyZ0bp1a6NmzZqGh4eHIcnYuXOnQwwtW7Y0JBkeHh5G8+bNjRdeeMHw8fExypUrZ1SpUiVZlh6L/bxkypTJaNKkifHSSy8ZAQEBhq+vr33ZuKlTpzocE7v0WLdu3QwXFxejUaNGxssvv2wEBgYakozixYsb165dczgmJibGePXVVw1Jhqurq1G5cmWjdevWxgsvvGAUL17csFgsRunSpRP0tQGQcVDZBWBa/v7+2rBhg8aOHatixYpp586d+vHHH7Vnzx7lz59fn3zyid577z2HY5577jmtXLlSdevW1b59+7Rw4ULlyJFDq1evVrVq1RJ1/Vy5cmnr1q0aNmyYcufOrWXLlmnx4sW6ffu2unbtqqZNm9rnuru7a+HChWrQoIF27dqladOmafLkyfZXCsfGtmfPHnXt2lUWi0XLli3TwoULdfHiRTVr1kxz585VsWLFHGL44YcfNGrUKOXKlUtLlizRpk2b1LZtW/3555/xrmKRFBMmTNBnn32mfPnyacWKFVq7dq3q16+vbdu2PXC1hVhjx47VyJEjderUKf3yyy+yWCzq1q2b1q5dK29vb4e5VqtV3333nX755Rc1aNBAJ06c0Pz587Vu3Tq5u7urd+/emjJlSrLcEwDzsBjGYz4iCwAAAKRRVHYBAABgWiS7AAAAMC2SXQAAAJgWyS4AAABMi2QXAAAApkWyCwAAANMi2QUAAIBpkewCAADAtEh2AQAAYFokuwAAADAtkl0AAACYFskuAAAATOv/AKtxm7dgqVLnAAAAAElFTkSuQmCC", - "text/plain": [ - "
" + "cell_type": "code", + "source": [], + "metadata": { + "id": "JAumX0BC2lFK" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fOKz8xQr5xXJ" + }, + "source": [ + "### Section 2: Text Pre-processing" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Step 4: Evaluate the classifier using various measures\n", - "\n", - "# Function to plot confusion matrix. \n", - "# Ref:http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", - "import itertools\n", - "from sklearn.metrics import roc_auc_score\n", - "\n", - "def plot_confusion_matrix(cm, classes,\n", - " normalize=False,\n", - " title='Confusion matrix',\n", - " cmap=plt.cm.Blues):\n", - " \"\"\"\n", - " This function prints and plots the confusion matrix.\n", - " Normalization can be applied by setting `normalize=True`.\n", - " \"\"\"\n", - " if normalize:\n", - " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", - "\n", - " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", - " plt.title(title)\n", - " plt.colorbar()\n", - " tick_marks = np.arange(len(classes))\n", - " plt.xticks(tick_marks, classes, rotation=45)\n", - " plt.yticks(tick_marks, classes)\n", - "\n", - " fmt = '.2f' if normalize else 'd'\n", - " thresh = cm.max() / 2.\n", - " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", - " plt.text(j, i, format(cm[i, j], fmt),\n", - " horizontalalignment=\"center\",\n", - " color=\"white\" if cm[i, j] > thresh else \"black\")\n", - "\n", - " plt.tight_layout()\n", - " plt.ylabel('True label',fontsize=15)\n", - " plt.xlabel('Predicted label',fontsize=15)\n", - " \n", - " \n", - "# Print accuracy:\n", - "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", - "\n", - " \n", - "# print the confusion matrix\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, 
classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with all features')\n", - "\n", - "# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:\n", - "# Further information: https://en.wikipedia.org/wiki/Receiver_operating_characteristic\n", - "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", - "print(\"ROC_AOC_Score: \", roc_auc_score(y_test, y_pred_prob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ga5-KhYN5xaD" - }, - "source": [ - "At this point, we can notice that the classifier is doing poorly with identifying relevant articles, while it is doing well with non-relevant ones. Our large feature vector could be creating a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us change the count vectorizer to take a certain number of features as maximum. " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 511 }, - "id": "ylOI4OsD5xaE", - "outputId": "0aea4279-84d2-49d3-e979-30e7c911f814" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms\n", - "Wall time: 14.6 ms\n", - "Accuracy: 0.6876876876876877\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "yhC5TZuL5xXK" + }, + "source": [ + "Typical steps involve tokenization, lower casing, removing, stop words, punctuation markers etc, and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK) or sometimes, custom stopword lists are needed depending on the task." + ] }, { - "data": { - "image/png": "", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "7MZSHdHZ5xXL" + }, + "outputs": [], + "source": [ + "stopwords = _stop_words.ENGLISH_STOP_WORDS\n", + "def clean(doc): # doc is a string of text\n", + " doc = doc.replace(\"
\", \" \") # This text contains a lot of
tags.\n", + " doc = \"\".join([char for char in doc if char not in string.punctuation and not char.isdigit()])\n", + " doc = \" \".join([token for token in doc.split() if token not in stopwords])\n", + " # remove punctuation and numbers\n", + " return doc" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vect = CountVectorizer(preprocessor=clean, max_features=5000) # Step-1\n", - "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", - "X_test_dtm = vect.transform(X_test)\n", - "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", - "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", - "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm\n", - "print(\"Accuracy: \", metrics.accuracy_score(y_test, y_pred_class))\n", - "# print the confusion matrix\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with max 5000 features')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2JzJ6k7g5xaL" - }, - "source": [ - "Clearly, the performance on relevance classification got better even though the overall accuracy fell by 10%. Let us try another classification algorithm and see if the performance changes. For this experiment, we have considered logistic regression, with class_weight attribute as \"balanced\", to address the problem of class imbalance in this dataset. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 }, - "id": "0v7pM9hB5xbA", - "outputId": "292bdf0c-924b-494b-ffae-c4914f2f5db9" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.7367367367367368\n", - "AUC: 0.6586769358985225\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "3CfVm42o5xXS" + }, + "source": [ + "### Section 3: Modeling\n", + "\n", + "Now we are ready for the modelling. We are going to use algorithms from sklearn package. We will go through the following steps:\n", + "\n", + "1 Split the data into training and test sets (75% train, 25% test) \n", + "2 Extract features from the training data using CountVectorizer, which is a bag of words feature implementation. We will use the pre-processing function above in conjunction with Count Vectorizer \n", + "3 Transform the test data into the same feature vector as the training data. \n", + "4 Train the classifier \n", + "5 Evaluate the classifier " + ] }, { - "data": { - "image/png": "", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GimJJHhg5xYl", + "outputId": "48f5d9f9-b0e3-4e65-b6cc-13cc21f874a1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(7991,) (7991,)\n", + "(5993,) (5993,)\n", + "(1998,) (1998,)\n" + ] + } + ], + "source": [ + "import sklearn\n", + "#from sklearn.cross_validation import train_test_split\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Step 1: train-test split\n", + "X = our_data.text # the column text contains textual data to extract features from\n", + "y = our_data.relevance # this is the column we are learning to predict.\n", + "print(X.shape, y.shape)\n", + "# split X and y into training and testing sets. By default, it splits 75% training and 25% test\n", + "# random_state=1 for reproducibility\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)\n", + "print(X_train.shape, y_train.shape)\n", + "print(X_test.shape, y_test.shape)" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sklearn.linear_model import LogisticRegression # import\n", - "\n", - "logreg = LogisticRegression(class_weight=\"balanced\") # instantiate a logistic regression model\n", - "logreg.fit(X_train_dtm, y_train) # fit the model with training data\n", - "\n", - "# Make predictions on test data\n", - "y_pred_class = logreg.predict(X_test_dtm)\n", - "y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]\n", - "\n", - "# calculate evaluation measures:\n", - "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", - "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with normalization')" - ] - }, - { - 
"cell_type": "markdown", - "metadata": { - "id": "6v1evQyy5xbe" - }, - "source": [ - "Let us wrap this notebook by trying with one more classifier, but reducing the feature vector size to 1000." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 494 }, - "id": "XJLKusAQ5xbf", - "outputId": "4dcdc0d5-4f4f-487a-ac44-2bc6778a0876" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.6841841841841841\n", - "AUC: 0.6732650365850213\n" - ] + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gsUyIBUD5xZI", + "outputId": "6e17b2c2-d0ea-453a-e42e-308f33ed5bd2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(5993, 49753) (1998, 49753)\n" + ] + } + ], + "source": [ + "# Step 2-3: Preprocess and Vectorize train and test data\n", + "vect = CountVectorizer(preprocessor=clean) # instantiate a vectoriezer\n", + "X_train_dtm = vect.fit_transform(X_train)# use it to extract features from training data\n", + "# transform testing data (using training data's features)\n", + "X_test_dtm = vect.transform(X_test)\n", + "print(X_train_dtm.shape, X_test_dtm.shape)\n", + "# i.e., the dimension of our feature vector is 49753!" + ] }, { - "data": { - "image/png": "", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nDLwA4CL5xZq", + "outputId": "c374e0f2-2026-497d-b2c8-12ad9289b865" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 13.6 ms, sys: 0 ns, total: 13.6 ms\n", + "Wall time: 81.6 ms\n" + ] + } + ], + "source": [ + "# Step 3: Train the classifier and predict for test data\n", + "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", + "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", + "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm" ] - }, - "metadata": {}, - "output_type": "display_data" + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "LiCHjvc75xZ3", + "outputId": "db90135e-8645-4e2a-f2d3-5d3f3c2ecaa4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.7822822822822822\n", + "ROC_AOC_Score: 0.7251117679464362\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "# Step 4: Evaluate the classifier using various measures\n", + "\n", + "# Function to plot confusion matrix.\n", + "# Ref:http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html\n", + "import itertools\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "def plot_confusion_matrix(cm, classes,\n", + " normalize=False,\n", + " title='Confusion matrix',\n", + " cmap=plt.cm.Blues):\n", + " \"\"\"\n", + " This function prints and plots the confusion matrix.\n", + " Normalization can be applied by setting `normalize=True`.\n", + " \"\"\"\n", + " if normalize:\n", + " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", + "\n", + " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", + " plt.title(title)\n", + " plt.colorbar()\n", + " tick_marks = np.arange(len(classes))\n", + " plt.xticks(tick_marks, classes, rotation=45)\n", + " plt.yticks(tick_marks, classes)\n", + "\n", + " fmt = '.2f' if normalize else 'd'\n", + " thresh = cm.max() / 2.\n", + " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n", + " plt.text(j, i, format(cm[i, j], fmt),\n", + " horizontalalignment=\"center\",\n", + " color=\"white\" if cm[i, j] > thresh else \"black\")\n", + "\n", + " plt.tight_layout()\n", + " plt.ylabel('True label',fontsize=15)\n", + " plt.xlabel('Predicted label',fontsize=15)\n", + "\n", + "\n", + "# Print accuracy:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "\n", + "\n", + "# print the confusion matrix\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with all features')\n", + "\n", + "# calculate AUC: Area under the curve(AUC) gives idea about the model efficiency:\n", + "# Further information: 
https://en.wikipedia.org/wiki/Receiver_operating_characteristic\n", + "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", + "print(\"ROC_AOC_Score: \", roc_auc_score(y_test, y_pred_prob))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ga5-KhYN5xaD" + }, + "source": [ + "At this point, we can notice that the classifier is doing poorly with identifying relevant articles, while it is doing well with non-relevant ones. Our large feature vector could be creating a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us change the count vectorizer to take a certain number of features as maximum." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 686 + }, + "id": "ylOI4OsD5xaE", + "outputId": "cb0303c1-140f-4990-aa1c-e9ba1e3cc05b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 7.24 ms, sys: 0 ns, total: 7.24 ms\n", + "Wall time: 35.2 ms\n", + "Accuracy: 0.6876876876876877\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "vect = CountVectorizer(preprocessor=clean, max_features=5000) # Step-1\n", + "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", + "X_test_dtm = vect.transform(X_test)\n", + "nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model\n", + "%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython \"magic command\")\n", + "y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm\n", + "print(\"Accuracy: \", metrics.accuracy_score(y_test, y_pred_class))\n", + "# print the confusion matrix\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with max 5000 features')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2JzJ6k7g5xaL" + }, + "source": [ + "Clearly, the performance on relevance classification got better even though the overall accuracy fell by 10%. Let us try another classification algorithm and see if the performance changes. For this experiment, we have considered logistic regression, with class_weight attribute as \"balanced\", to address the problem of class imbalance in this dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "0v7pM9hB5xbA", + "outputId": "e86c81ce-cb3f-4268-8ccd-7daa7c4d3a66" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.7367367367367368\n", + "AUC: 0.6584385682402464\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression # import\n", + "\n", + "logreg = LogisticRegression(class_weight=\"balanced\") # instantiate a logistic regression model\n", + "logreg.fit(X_train_dtm, y_train) # fit the model with training data\n", + "\n", + "# Make predictions on test data\n", + "y_pred_class = logreg.predict(X_test_dtm)\n", + "y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]\n", + "\n", + "# calculate evaluation measures:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with normalization')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6v1evQyy5xbe" + }, + "source": [ + "Let us wrap this notebook by trying with one more classifier, but reducing the feature vector size to 1000." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 668 + }, + "id": "XJLKusAQ5xbf", + "outputId": "cea4494e-a06d-41d0-c2a9-cdad88ff4a89" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.6926926926926927\n", + "AUC: 0.6742856032997147\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "from sklearn.svm import LinearSVC\n", + "\n", + "vect = CountVectorizer(preprocessor=clean, max_features=1000) # Step-1\n", + "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", + "X_test_dtm = vect.transform(X_test)\n", + "\n", + "classifier = LinearSVC(class_weight='balanced') # instantiate a Linear Support Vector Machine model\n", + "classifier.fit(X_train_dtm, y_train) # fit the model with training data\n", + "\n", + "# Make predictions on test data\n", + "y_pred_class = classifier.predict(X_test_dtm)\n", + "\n", + "# Like other Sklearn models, LinearSVC doesn't have implement .predict_proba, but we can get the same results\n", + "# by using .decision_function (predicts the confidence scores) and then applying softmax on the output\n", + "\n", + "# Softmax Function\n", + "def softmax(x):\n", + " e_x = np.exp(x - np.max(x))\n", + " return e_x / e_x.sum(axis=0)\n", + "\n", + "y_prob_intermediate = classifier.decision_function(X_test_dtm) ## Predicts the Confidence Scores\n", + "y_pred_prob = softmax(y_prob_intermediate)\n", + "\n", + "# calculate evaluation measures:\n", + "print(\"Accuracy: \", accuracy_score(y_test, y_pred_class))\n", + "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", + "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", + "plt.figure(figsize=(8,6))\n", + "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", + " title='Confusion matrix with normalization')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fd_-M70F5xbl" + }, + "source": [ + "So, how do we choose whats the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how good it is doing with \"relevant\" category, we should choose the second one we built. 
If we choose purely based on how well it is doing with the \"irrelevant\" category, surely nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our use case!" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "iMJlTrJvLrS2" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "from sklearn.svm import LinearSVC\n", - "\n", - "vect = CountVectorizer(preprocessor=clean, max_features=1000) # Step-1\n", - "X_train_dtm = vect.fit_transform(X_train) # combined step 2 and 3\n", - "X_test_dtm = vect.transform(X_test)\n", - "\n", - "classifier = LinearSVC(class_weight='balanced') # instantiate a Linear Support Vector Machine model\n", - "classifier.fit(X_train_dtm, y_train) # fit the model with training data\n", - "\n", - "# Make predictions on test data\n", - "y_pred_class = classifier.predict(X_test_dtm)\n", - "\n", - "# Like other Sklearn models, LinearSVC doesn't have implement .predict_proba, but we can get the same results\n", - "# by using .decision_function (predicts the confidence scores) and then applying softmax on the output\n", - "\n", - "# Softmax Function\n", - "def softmax(x):\n", - " e_x = np.exp(x - np.max(x))\n", - " return e_x / e_x.sum(axis=0)\n", - "\n", - "y_prob_intermediate = classifier.decision_function(X_test_dtm) ## Predicts the Confidence Scores\n", - "y_pred_prob = softmax(y_prob_intermediate)\n", - "\n", - "# calculate evaluation measures:\n", - "print(\"Accuracy: \", 
accuracy_score(y_test, y_pred_class))\n", - "print(\"AUC: \", roc_auc_score(y_test, y_pred_prob))\n", - "cnf_matrix = confusion_matrix(y_test, y_pred_class)\n", - "plt.figure(figsize=(8,6))\n", - "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", - " title='Confusion matrix with normalization')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Fd_-M70F5xbl" - }, - "source": [ - "So, how do we choose whats the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how good it is doing with \"relevant\" category, we should choose the second one we built. If we choose purely based on how good it is doing with \"irrelevant\" category, surely, nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our usecase! 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "01_OnePipeline_ManyClassifiers.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 3092194fc0f1775704b186e4ad2a22b1ae71f873 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Date: Tue, 22 Aug 2023 23:51:24 +0530 Subject: [PATCH 11/14] fix: Added Freezed Library Versions to Ch4/02_Doc2Vec_Example.ipynb --- Ch4/02_Doc2Vec_Example.ipynb | 1087 ++++++++++++++++++++-------------- 1 file changed, 626 insertions(+), 461 deletions(-) diff --git a/Ch4/02_Doc2Vec_Example.ipynb b/Ch4/02_Doc2Vec_Example.ipynb index cd54b8c..c6c63c7 100644 --- a/Ch4/02_Doc2Vec_Example.ipynb +++ b/Ch4/02_Doc2Vec_Example.ipynb @@ -1,488 +1,653 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "LCgVnQopb6TI" - }, - "source": [ - "# Doc2Vec demonstration \n", - "\n", - "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", - "\n", - "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. 
A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "LCgVnQopb6TI" + }, + "source": [ + "# Doc2Vec demonstration\n", + "\n", + "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", + "\n", + "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" + ] }, - "id": "KX5dKXdcaENd", - "outputId": "956f503d-1a2c-4af1-aad5-a5da021ae29b" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: nltk in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (3.8.1)\n", - "Requirement already satisfied: tqdm in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (4.66.1)\n", - "Requirement already satisfied: joblib in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (1.3.2)\n", - "Requirement already satisfied: click in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (8.1.6)\n", - "Requirement already satisfied: regex>=2021.8.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from nltk) (2023.8.8)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> 
\u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: pandas in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (2.0.3)\n", - "Requirement already satisfied: tzdata>=2022.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", - "Requirement already satisfied: pytz>=2020.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2023.3)\n", - "Requirement already satisfied: numpy>=1.20.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (1.24.3)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: six>=1.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: gensim in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (4.3.1)\n", - "Requirement already satisfied: numpy>=1.18.5 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from gensim) (1.24.3)\n", - "Requirement already satisfied: scipy>=1.7.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from gensim) (1.11.1)\n", - "Requirement already satisfied: smart-open>=1.8.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from 
gensim) (6.3.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "Requirement already satisfied: scikit-learn in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (1.3.0)\n", - "Requirement already satisfied: numpy>=1.17.3 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.24.3)\n", - "Requirement already satisfied: scipy>=1.5.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.11.1)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (3.2.0)\n", - "Requirement already satisfied: joblib>=1.1.1 in /root/Working/practical-nlp-code/env/lib/python3.9/site-packages (from scikit-learn) (1.3.2)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "!pip install nltk\n", - "!pip install pandas\n", - "!pip install gensim\n", - "!pip install scikit-learn\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "CIlwQe1S4EpL" - }, - "outputs": [], - 
"source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KX5dKXdcaENd", + "outputId": "c18e98fa-df2d-49a9-baf5-4ac23d66297c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: nltk==3.8.1 in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (2023.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (4.66.1)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in 
/usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gensim==4.3.1 in /usr/local/lib/python3.10/dist-packages (4.3.1)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.10.1)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (6.3.0)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install nltk==3.8.1\n", + "!pip install pandas==1.5.3\n", + "!pip install gensim==4.3.1\n", + "!pip install scikit-learn==1.2.2\n", + "\n", + "# ===========================" + ] }, - "id": "hSB6W1seb6TJ", - "outputId": "e93459c9-fd82-4d22-852b-819faeb430a6" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import pandas as pd\n", - 
"import nltk\n", - "nltk.download('stopwords')\n", - "from nltk.tokenize import TweetTokenizer\n", - "from nltk.corpus import stopwords\n", - "from sklearn.model_selection import train_test_split\n", - "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "CIlwQe1S4EpL" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] }, - "id": "NGAFbmrA4EpM", - "outputId": "f78def1c-c291-4fba-dd41-f24f1456757c" - }, - "outputs": [], - "source": [ - "#Load the dataset and explore.\n", - "try:\n", - " from google.colab import files\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", - " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", - " !ls -lah DATAPATH\n", - " filepath = \"DATAPATH/train_data.csv\"\n", - "except ModuleNotFoundError:\n", - " filepath = \"Data/Sentiment and Emotion in Text/train_data.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hSB6W1seb6TJ", + "outputId": 
"9e34f468-dc75-4555-9522-4fea208d6a00" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import pandas as pd\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.corpus import stopwords\n", + "from sklearn.model_selection import train_test_split\n", + "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" + ] }, - "id": "lSvnHBYPb6TQ", - "outputId": "b992755a-470e-470b-eb59-e4225711f252" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(30000, 2)\n" - ] + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGAFbmrA4EpM", + "outputId": "947b9250-7fd2-4cc7-c74c-4c88ecfdd4fa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "--2023-08-22 16:12:41-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2479133 (2.4M) [text/plain]\n", + "Saving to: ‘DATAPATH/train_data.csv.1’\n", + "\n", + "\rtrain_data.csv.1 0%[ ] 0 --.-KB/s \rtrain_data.csv.1 100%[===================>] 2.36M --.-KB/s in 0.02s \n", + "\n", + "2023-08-22 16:12:42 (131 MB/s) - ‘DATAPATH/train_data.csv.1’ saved [2479133/2479133]\n", + "\n", + "--2023-08-22 16:12:42-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 783640 (765K) [text/plain]\n", + "Saving to: ‘DATAPATH/test_data.csv.1’\n", + "\n", + "test_data.csv.1 100%[===================>] 765.27K --.-KB/s in 0.009s \n", + "\n", + "2023-08-22 16:12:42 (78.7 MB/s) - ‘DATAPATH/test_data.csv.1’ saved [783640/783640]\n", + "\n", + "total 6.3M\n", + "drwxr-xr-x 2 root root 4.0K Aug 22 16:12 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 16:10 ..\n", + "-rw-r--r-- 1 root root 766K Aug 22 16:08 test_data.csv\n", + "-rw-r--r-- 1 root root 766K Aug 22 16:12 test_data.csv.1\n", + "-rw-r--r-- 1 root root 2.4M Aug 22 16:08 train_data.csv\n", + "-rw-r--r-- 1 root root 2.4M Aug 22 16:12 train_data.csv.1\n" + ] + } + ], + "source": [ + "#Load the dataset and explore.\n", + "try:\n", + " from google.colab import files\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + " !wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + " !ls -lah DATAPATH\n", + " filepath = \"DATAPATH/train_data.csv\"\n", + "except ModuleNotFoundError:\n", + " filepath = 
\"Data/Sentiment and Emotion in Text/train_data.csv\"" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentimentcontent
0empty@tiffanylue i know i was listenin to bad habi...
1sadnessLayin n bed with a headache ughhhh...waitin o...
2sadnessFuneral ceremony...gloomy friday...
3enthusiasmwants to hang out with friends SOON!
4neutral@dannycastillo We want to trade with someone w...
\n", - "
" + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "id": "lSvnHBYPb6TQ", + "outputId": "aaff8fc3-c5fd-457e-f0dc-19d4b412a8ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(30000, 2)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " sentiment content\n", + "0 empty @tiffanylue i know i was listenin to bad habi...\n", + "1 sadness Layin n bed with a headache ughhhh...waitin o...\n", + "2 sadness Funeral ceremony...gloomy friday...\n", + "3 enthusiasm wants to hang out with friends SOON!\n", + "4 neutral @dannycastillo We want to trade with someone w..." + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentimentcontent
0empty@tiffanylue i know i was listenin to bad habi...
1sadnessLayin n bed with a headache ughhhh...waitin o...
2sadnessFuneral ceremony...gloomy friday...
3enthusiasmwants to hang out with friends SOON!
4neutral@dannycastillo We want to trade with someone w...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 15 + } ], - "text/plain": [ - " sentiment content\n", - "0 empty @tiffanylue i know i was listenin to bad habi...\n", - "1 sadness Layin n bed with a headache ughhhh...waitin o...\n", - "2 sadness Funeral ceremony...gloomy friday...\n", - "3 enthusiasm wants to hang out with friends SOON!\n", - "4 neutral @dannycastillo We want to trade with someone w..." + "source": [ + "df = pd.read_csv(filepath)\n", + "print(df.shape)\n", + "df.head()" ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(filepath)\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "5JEI6SH7b6TU", - "outputId": "7c4bccf9-3c39-4e43-cde8-3989a7a002d0" - }, - "outputs": [ { - "data": { - "text/plain": [ - "sentiment\n", - "worry 7433\n", - "neutral 6340\n", - "sadness 4828\n", - "happiness 2986\n", - "love 2068\n", - "surprise 1613\n", - "hate 1187\n", - "fun 1088\n", - "relief 1021\n", - "empty 659\n", - "enthusiasm 522\n", - "boredom 157\n", - "anger 98\n", - "Name: count, dtype: int64" + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5JEI6SH7b6TU", + "outputId": "c3c1034c-6a1c-4e87-9d21-3fdc9f58c9dd" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "worry 7433\n", + "neutral 6340\n", + "sadness 4828\n", + "happiness 2986\n", + "love 2068\n", + "surprise 1613\n", + "hate 1187\n", + "fun 1088\n", + "relief 1021\n", + "empty 659\n", + "enthusiasm 522\n", + "boredom 157\n", + "anger 98\n", + "Name: sentiment, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "df['sentiment'].value_counts()" ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "df['sentiment'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "CHajyKpmb6TY", - "outputId": "bbb05164-f107-4b7c-fedb-145a3b2d1ca3" - }, - "outputs": [ { - "data": { - "text/plain": [ - "(16759, 2)" + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CHajyKpmb6TY", + "outputId": "9c28cf4f-87f4-4261-bd32-4e2abfd9435f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(16759, 2)" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ], + "source": [ + "#Let us take the top 3 categories and leave out the rest.\n", + "shortlist = ['neutral', \"happiness\", \"worry\"]\n", + "df_subset = df[df['sentiment'].isin(shortlist)]\n", + "df_subset.shape" ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Let us take the top 3 categories and leave out the rest.\n", - "shortlist = ['neutral', \"happiness\", \"worry\"]\n", - "df_subset = df[df['sentiment'].isin(shortlist)]\n", - "df_subset.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m2oiZzU5b6Tf" - }, - "source": [ - "# Text pre-processing:\n", - "Tweets are different. Somethings to consider:\n", - "- Removing @mentions, and urls perhaps?\n", - "- using NLTK Tweet tokenizer instead of a regular one\n", - "- stopwords, numbers as usual." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "Rl-FfMdLb6Th", - "outputId": "818e0510-afdb-4732-fe69-c6119ca695c1" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "16759 16759\n" - ] - } - ], - "source": [ - "#strip_handles removes personal information such as twitter handles, which don't\n", - "#contribute to emotion in the tweet. 
preserve_case=False converts everything to lowercase.\n", - "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", - "mystopwords = set(stopwords.words(\"english\"))\n", - "\n", - "#Function to tokenize tweets, remove stopwords and numbers. \n", - "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", - "def preprocess_corpus(texts):\n", - " def remove_stops_digits(tokens):\n", - " #Nested function that removes stopwords and digits from a list of tokens\n", - " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. \n", - " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", - "\n", - "#df_subset contains only the three categories we chose. \n", - "mydata = preprocess_corpus(df_subset['content'])\n", - "mycats = df_subset['sentiment']\n", - "print(len(mydata), len(mycats))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "m2oiZzU5b6Tf" + }, + "source": [ + "# Text pre-processing:\n", + "Tweets are different. Somethings to consider:\n", + "- Removing @mentions, and urls perhaps?\n", + "- using NLTK Tweet tokenizer instead of a regular one\n", + "- stopwords, numbers as usual." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Rl-FfMdLb6Th", + "outputId": "df2382f7-5823-4831-f356-85cf93d23ab6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "16759 16759\n" + ] + } + ], + "source": [ + "#strip_handles removes personal information such as twitter handles, which don't\n", + "#contribute to emotion in the tweet. 
preserve_case=False converts everything to lowercase.\n", + "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", + "mystopwords = set(stopwords.words(\"english\"))\n", + "\n", + "#Function to tokenize tweets, remove stopwords and numbers.\n", + "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", + "def preprocess_corpus(texts):\n", + " def remove_stops_digits(tokens):\n", + " #Nested function that removes stopwords and digits from a list of tokens\n", + " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", + " #This return statement below uses the above function to process twitter tokenizer output further.\n", + " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", + "\n", + "#df_subset contains only the three categories we chose.\n", + "mydata = preprocess_corpus(df_subset['content'])\n", + "mycats = df_subset['sentiment']\n", + "print(len(mydata), len(mycats))" + ] }, - "id": "rsGwfVebb6Tl", - "outputId": "c19bc96f-513c-45b6-d476-b95899ab7eca" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Saved\n" - ] + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rsGwfVebb6Tl", + "outputId": "0329a297-b6d9-4be1-d4c7-afa61ef786d3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Model Saved\n" + ] + } + ], + "source": [ + "#Split data into train and test, following the usual process\n", + "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", + "\n", + "#prepare training data in doc2vec format:\n", + "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", + "\n", + "#Train a doc2vec model to learn tweet representations. 
Use only training data!!\n", + "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", + "model.build_vocab(train_doc2vec)\n", + "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", + "model.save(\"d2v.model\")\n", + "print(\"Model Saved\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hTqo26Vsb6Ts", + "outputId": "5308af1c-a3a4-45d4-cf3d-b442fef129b7" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " happiness 0.32 0.51 0.39 713\n", + " neutral 0.45 0.53 0.49 1595\n", + " worry 0.60 0.38 0.47 1882\n", + "\n", + " accuracy 0.46 4190\n", + " macro avg 0.46 0.47 0.45 4190\n", + "weighted avg 0.50 0.46 0.46 4190\n", + "\n" + ] + } + ], + "source": [ + "#Infer the feature representation for training and test data using the trained model\n", + "model= Doc2Vec.load(\"d2v.model\")\n", + "\n", + "#infer in multiple steps to get a stable representation.\n", + "train_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]\n", + "test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]\n", + "\n", + "#Use any regular classifier like logistic regression\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced.\n", + "myclass.fit(train_vectors, train_cats)\n", + "\n", + "preds = myclass.predict(test_vectors)\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "print(classification_report(test_cats, preds))\n", + "\n", + "#print(confusion_matrix(test_cats,preds))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "qcRNGUJFAA1w" + }, + "outputs": [], + "source": [] } - ], - "source": [ - "#Split 
data into train and test, following the usual process\n", - "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", - "\n", - "#prepare training data in doc2vec format:\n", - "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", - "\n", - "#Train a doc2vec model to learn tweet representations. Use only training data!!\n", - "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", - "model.build_vocab(train_doc2vec)\n", - "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", - "model.save(\"d2v.model\")\n", - "print(\"Model Saved\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { + ], + "metadata": { + "accelerator": "GPU", "colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "hTqo26Vsb6Ts", - "outputId": "cd16346c-ca81-4dc7-c269-d9ccf83a774d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " happiness 0.34 0.54 0.41 713\n", - " neutral 0.47 0.54 0.50 1595\n", - " worry 0.61 0.39 0.48 1882\n", - "\n", - " accuracy 0.47 4190\n", - " macro avg 0.47 0.49 0.46 4190\n", - "weighted avg 0.51 0.47 0.47 4190\n", - "\n" - ] + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "#Infer the feature representation for training and test data using the trained model\n", - "model= Doc2Vec.load(\"d2v.model\")\n", - "\n", - "#infer in multiple steps to get a stable representation. 
\n", - "train_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in train_data]\n", - "test_vectors = [model.infer_vector(list_of_tokens, epochs=50) for list_of_tokens in test_data]\n", - "\n", - "#Use any regular classifier like logistic regression\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced. \n", - "myclass.fit(train_vectors, train_cats)\n", - "\n", - "preds = myclass.predict(test_vectors)\n", - "from sklearn.metrics import classification_report, confusion_matrix\n", - "print(classification_report(test_cats, preds))\n", - "\n", - "#print(confusion_matrix(test_cats,preds))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "02_Doc2Vec_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From b0dba5ddfe74a06d044ce2377a2e3ee82c8a5f10 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Date: Tue, 22 Aug 2023 23:52:02 +0530 Subject: [PATCH 12/14] fix: Added Freezed Library Versions to Ch4/03_Word2Vec_Example.ipynb --- Ch4/03_Word2Vec_Example.ipynb | 1035 ++++++++++++++++++--------------- 1 file changed, 551 insertions(+), 484 deletions(-) diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index 19b5e87..7ac82f1 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -1,505 +1,572 @@ { - 
"cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "sVtvH58nb_Hp" - }, - "source": [ - "# Word2Vec for Text Classification\n", - "\n", - "In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.\n", - "\n", - "We will use the sentiment labelled sentences dataset from UCI repository\n", - "http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences\n", - "\n", - "The dataset consists of 1500 positive, and 1500 negative sentiment sentences from Amazon, Yelp, IMDB. Let us first combine all the three separate data files into one using the following unix command:\n", - "\n", - "```cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt```\n", - "\n", - "For a pre-trained embedding model, we will use the Google News vectors.\n", - "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", - "\n", - "Let us get started!" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sVtvH58nb_Hp" + }, + "source": [ + "# Word2Vec for Text Classification\n", + "\n", + "In this short notebook, we will see an example of how to use a pre-trained Word2vec model for doing feature extraction and performing text classification.\n", + "\n", + "We will use the sentiment labelled sentences dataset from UCI repository\n", + "http://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences\n", + "\n", + "The dataset consists of 1500 positive, and 1500 negative sentiment sentences from Amazon, Yelp, IMDB. 
Let us first combine all the three separate data files into one using the following unix command:\n", + "\n", + "```cat amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt > sentiment_sentences.txt```\n", + "\n", + "For a pre-trained embedding model, we will use the Google News vectors.\n", + "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", + "\n", + "Let us get started!" + ] }, - "id": "77UP8YyEdS2W", - "outputId": "1bb0a097-0232-42fd-ec29-b2e96ce857f5" - }, - "outputs": [], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# !pip install numpy\n", - "# !pip install pandas\n", - "# !pip install gensim\n", - "# !pip install wget\n", - "# !pip install nltk\n", - "# !pip install scikit-learn\n", - "# !pip install gdown\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "URLGvBLv9T0M" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "code", + "source": [ + "import pkg_resources\n", + "\n", + "def get_library_versions(library_list):\n", + " frozen_list = []\n", + "\n", + " for library in library_list:\n", + " try:\n", + " version = pkg_resources.get_distribution(library).version\n", + " frozen_list.append(f\"{library}=={version}\")\n", + " except 
pkg_resources.DistributionNotFound:\n", + " print(f\"Error: {library} not found or could not retrieve version.\")\n", + "\n", + " return frozen_list\n", + "\n", + "# List of library names\n", + "libraries = [\"numpy\", \"pandas\", \"gensim\", \"nltk\", \"scikit-learn\", \"gdown\"]\n", + "\n", + "# Get frozen list of library versions\n", + "frozen_versions = get_library_versions(libraries)\n", + "\n", + "# Print the frozen list\n", + "for item in frozen_versions:\n", + " print(item)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KN4IKdaFCH7c", + "outputId": "ad4e3a11-17ce-4049-8ce9-53972fe41bfb" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "numpy==1.23.5\n", + "pandas==1.5.3\n", + "gensim==4.3.1\n", + "nltk==3.8.1\n", + "scikit-learn==1.2.2\n", + "gdown==4.6.6\n" + ] + } + ] }, - "id": "JQX8DAmBb_Hr", - "outputId": "a89dcee7-f76f-4bd9-ba60-8642b88ab50c" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package punkt to /root/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - } - ], - "source": [ - "#basic imports\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import os\n", - "import gzip\n", - "import shutil\n", - "from time import time\n", - "\n", - "#pre-processing imports\n", - "import nltk\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt')\n", - "from nltk.tokenize import word_tokenize\n", - "from nltk.corpus import stopwords\n", - "from string import punctuation\n", - "\n", - "#imports related to modeling\n", - "import numpy as np\n", - "from gensim.models import Word2Vec, KeyedVectors\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import train_test_split\n", - 
"from sklearn.metrics import classification_report\n", - "\n", - "#google-drive download imports\n", - "import gdown" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 140, - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3I
gdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGV
tZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTo
gZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "ok": true, - "status": 200, - "status_text": "" - } - } + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "77UP8YyEdS2W", + "outputId": "c7203e6a-e19d-4e9a-f577-ae936a2e1a4d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: numpy==1.23.5 in /usr/local/lib/python3.10/dist-packages (1.23.5)\n", + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages 
(1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gensim==4.3.1 in /usr/local/lib/python3.10/dist-packages (4.3.1)\n", + "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (1.10.1)\n", + "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim==4.3.1) (6.3.0)\n", + "Requirement already satisfied: nltk==3.8.1 in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (8.1.7)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (2023.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk==3.8.1) (4.66.1)\n", + "Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.23.5)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.10.1)\n", + "Requirement already satisfied: 
joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2) (3.2.0)\n", + "Requirement already satisfied: gdown==4.6.6 in /usr/local/lib/python3.10/dist-packages (4.6.6)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (3.12.2)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (2.31.0)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (1.16.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.66.1)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.11.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown==4.6.6) (2.4.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2023.7.22)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (1.7.1)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip 
install numpy==1.23.5\n", + "!pip install pandas==1.5.3\n", + "!pip install gensim==4.3.1\n", + "!pip install nltk==3.8.1\n", + "!pip install scikit-learn==1.2.2\n", + "!pip install gdown==4.6.6\n", + "\n", + "# ===========================" + ] }, - "id": "S8RM8c6AS8AX", - "outputId": "0b366a76-49b0-4170-dce6-33572a37a929" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "File already exists\n" - ] - } - ], - "source": [ - "try:\n", - " from google.colab import files\n", - " \n", - " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in \"sentiment labelled sentences\" folder\n", - " uploaded = files.upload()\n", - " \n", - " !mkdir DATAPATH\n", - " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", - " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", - " \n", - "except ModuleNotFoundError:\n", - "\n", - " fil = 'sentiment_sentences.txt'\n", - "\n", - " if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n", - " file = open(os.path.join(path, fil), 'w')\n", - " file.close()\n", - " \n", - " # combined the three files to make sentiment_sentences.txt\n", - " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", - "\n", - " with open('Data/sentiment_sentences.txt', 'w') as outfile:\n", - " for fname in filenames:\n", - " with open('Data/sentiment labelled sentences/' + fname) as infile:\n", - " outfile.write(infile.read())\n", - " print(\"File created\")\n", - " else:\n", - " print(\"File already exists\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "URLGvBLv9T0M" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run 
this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] }, - "id": "COUGXAxcb_H5", - "outputId": "b88ee64f-6c36-412e-ce57-f9387eec3051", - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data Present at location : ./Data/GoogleNews-vectors-negative300.bin\n", - "CPU times: user 46.2 s, sys: 2 s, total: 48.2 s\n", - "Wall time: 49.1 s\n", - "done loading Word2Vec\n" - ] - } - ], - "source": [ - "#Load the pre-trained word2vec model and the dataset\n", - "\n", - "def check_if_file_exists(filename: str, locations: list) -> str :\n", - " for location in locations:\n", - " if os.path.exists(os.path.join(location, filename)):\n", - " return location\n", - " return None\n", - "\n", - "def extract_data(location: str) -> None:\n", - " with gzip.open(os.path.join(location, 'GoogleNews-vectors-negative300.bin.gz'), 'rb') as f_in:\n", - " with open(os.path.join('./Data', './GoogleNews-vectors-negative300.bin'), 'wb') as f_out:\n", - " shutil.copyfileobj(f_in, f_out)\n", - "\n", - "try:\n", - " from google.colab import files\n", - " data_path= \"DATAPATH\"\n", - " !gdown -O DATAPATH https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\n", - " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n", - " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", - " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", - " \n", - "except ModuleNotFoundError:\n", - "\n", - " data_path = './Data/'\n", - " compressed_file_name = 'GoogleNews-vectors-negative300.bin.gz'\n", - " extracted_file_name = 'GoogleNews-vectors-negative300.bin'\n", - " \n", - " # Check if 
Extracted File exists\n", - " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", - " \n", - " if location_of_extracted_file:\n", - " # Extracted File exists\n", - " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", - " \n", - " else:\n", - " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", - " \n", - " if location_of_compressed_file:\n", - " # Compressed File exists\n", - " extract_data(os.path.join(location_of_compressed_file))\n", - " path_to_model = os.path.join(data_path, extracted_file_name)\n", - " \n", - " else:\n", - " # Download File\n", - " output_path = './Data/'\n", - " gdown.download(\"https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\", output=output_path)\n", - "\n", - " # Extract File\n", - " extract_data(output_path)\n", - "\n", - " path_to_model = os.path.join(data_path, extracted_file_name)\n", - "\n", - " print(f\"Data Present at location : {path_to_model}\")\n", - " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", - " \n", - " \n", - "#Load W2V model. This will take some time. 
\n", - "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", - "print('done loading Word2Vec')\n", - "\n", - "#Read text data, cats.\n", - "#the file path consists of tab separated sentences and cats.\n", - "texts = []\n", - "cats = []\n", - "fh = open(training_data_path)\n", - "for line in fh:\n", - " text, sentiment = line.split(\"\\t\")\n", - " texts.append(text)\n", - " cats.append(sentiment)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JQX8DAmBb_Hr", + "outputId": "65dc3618-5f7b-41a9-88bf-a5090b32f270" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ], + "source": [ + "#basic imports\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import os\n", + "import gzip\n", + "import shutil\n", + "from time import time\n", + "\n", + "#pre-processing imports\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.corpus import stopwords\n", + "from string import punctuation\n", + "\n", + "#imports related to modeling\n", + "import numpy as np\n", + "from gensim.models import Word2Vec, KeyedVectors\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "\n", + "#google-drive download imports\n", + "import gdown" + ] }, - "id": "m-WjFyC6b_IE", - "outputId": "87270b42-96b9-4420-f22a-6f13160e5cbe" - }, - "outputs": [ { 
- "name": "stdout", - "output_type": "stream", - "text": [ - "3000000\n" - ] - } - ], - "source": [ - "#Inspect the model\n", - "word2vec_vocab = w2v_model.key_to_index.keys()\n", - "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", - "print(len(word2vec_vocab))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "S8RM8c6AS8AX" + }, + "outputs": [], + "source": [ + "try:\n", + " from google.colab import files\n", + "\n", + " # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt' present in \"sentiment labelled sentences\" folder\n", + " uploaded = files.upload()\n", + "\n", + " !mkdir DATAPATH\n", + " !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n", + " !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n", + "\n", + "except ModuleNotFoundError:\n", + "\n", + " fil = 'sentiment_sentences.txt'\n", + "\n", + " if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n", + " file = open(os.path.join(path, fil), 'w')\n", + " file.close()\n", + "\n", + " # combined the three files to make sentiment_sentences.txt\n", + " filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n", + "\n", + " with open('Data/sentiment_sentences.txt', 'w') as outfile:\n", + " for fname in filenames:\n", + " with open('Data/sentiment labelled sentences/' + fname) as infile:\n", + " outfile.write(infile.read())\n", + " print(\"File created\")\n", + " else:\n", + " print(\"File already exists\")" + ] }, - "id": "XEz30Jztb_IP", - "outputId": "18794f4b-828f-4c7c-9708-b9af3143d700" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "3000 3000\n", - "Good case, Excellent value.\n", - "1\n", - "\n" - ] - } - ], - "source": [ - "#Inspect 
the dataset\n", - "print(len(cats), len(texts))\n", - "print(texts[1])\n", - "print(cats[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "COUGXAxcb_H5", + "outputId": "640ac771-0389-4640-ab79-a11f26bd2c29", + "scrolled": true + }, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM\n", + "To: /content/DATAPATH/GoogleNews-vectors-negative300.bin.gz\n", + "100% 1.65G/1.65G [00:15<00:00, 103MB/s]\n", + "CPU times: user 28.1 s, sys: 3.92 s, total: 32.1 s\n", + "Wall time: 34.4 s\n", + "done loading Word2Vec\n" + ] + } + ], + "source": [ + "#Load the pre-trained word2vec model and the dataset\n", + "\n", + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_data(location: str) -> None:\n", + " with gzip.open(os.path.join(location, 'GoogleNews-vectors-negative300.bin.gz'), 'rb') as f_in:\n", + " with open(os.path.join('./Data', './GoogleNews-vectors-negative300.bin'), 'wb') as f_out:\n", + " shutil.copyfileobj(f_in, f_out)\n", + "\n", + "try:\n", + " from google.colab import files\n", + " data_path= \"DATAPATH\"\n", + " !gdown -O DATAPATH/ https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\n", + " !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz\n", + " path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n", + " training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n", + "\n", + "except ModuleNotFoundError:\n", + "\n", + " data_path = './Data/'\n", + " compressed_file_name = 
'GoogleNews-vectors-negative300.bin.gz'\n", + " extracted_file_name = 'GoogleNews-vectors-negative300.bin'\n", + "\n", + " # Check if Extracted File exists\n", + " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + "\n", + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + "\n", + " else:\n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data','../Ch2/Data','../Ch3/Data'])\n", + "\n", + " if location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_data(os.path.join(location_of_compressed_file))\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " else:\n", + " # Download File\n", + " output_path = './Data/'\n", + " gdown.download(\"https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_data(output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")\n", + " training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n", + "\n", + "\n", + "#Load W2V model. 
This will take some time.\n", + "%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n", + "print('done loading Word2Vec')\n", + "\n", + "#Read text data, cats.\n", + "#the file path consists of tab separated sentences and cats.\n", + "texts = []\n", + "cats = []\n", + "fh = open(training_data_path)\n", + "for line in fh:\n", + " text, sentiment = line.split(\"\\t\")\n", + " texts.append(text)\n", + " cats.append(sentiment)" + ] }, - "id": "MFOGaDTwb_Ig", - "outputId": "b9983e21-f00e-4c3e-ebe4-e2c8be738398" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "3000 3000\n", - "['good', 'case', 'excellent', 'value']\n", - "1\n", - "\n" - ] - } - ], - "source": [ - "#preprocess the text.\n", - "def preprocess_corpus(texts):\n", - " mystopwords = set(stopwords.words(\"english\"))\n", - " def remove_stops_digits(tokens):\n", - " #Nested function that lowercases, removes stopwords and digits from a list of tokens\n", - " return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n", - " and token not in punctuation]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. 
\n", - " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n", - "\n", - "texts_processed = preprocess_corpus(texts)\n", - "print(len(cats), len(texts_processed))\n", - "print(texts_processed[1])\n", - "print(cats[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "m-WjFyC6b_IE", + "outputId": "7cb1a092-d3fa-4bf4-e437-6d079da7ed74" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000000\n" + ] + } + ], + "source": [ + "#Inspect the model\n", + "word2vec_vocab = w2v_model.key_to_index.keys()\n", + "word2vec_vocab_lower = [item.lower() for item in word2vec_vocab]\n", + "print(len(word2vec_vocab))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XEz30Jztb_IP", + "outputId": "7c37e0e1-9f2e-411b-cdac-b89ecc39a0ea" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000 3000\n", + "Good case, Excellent value.\n", + "1\n", + "\n" + ] + } + ], + "source": [ + "#Inspect the dataset\n", + "print(len(cats), len(texts))\n", + "print(texts[1])\n", + "print(cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MFOGaDTwb_Ig", + "outputId": "4e50a4a9-1f40-429c-c7b3-e445e42cae6f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000 3000\n", + "['good', 'case', 'excellent', 'value']\n", + "1\n", + "\n" + ] + } + ], + "source": [ + "#preprocess the text.\n", + "def preprocess_corpus(texts):\n", + " mystopwords = set(stopwords.words(\"english\"))\n", + " def remove_stops_digits(tokens):\n", + " #Nested function that lowercases, removes stopwords and digits 
from a list of tokens\n", + " return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n", + " and token not in punctuation]\n", + " #This return statement below uses the above function to process twitter tokenizer output further.\n", + " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n", + "\n", + "texts_processed = preprocess_corpus(texts)\n", + "print(len(cats), len(texts_processed))\n", + "print(texts_processed[1])\n", + "print(cats[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fXRiGtY1b_Iq", + "outputId": "1f5eaad6-939e-46fc-cf27-21798f78e18f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3000\n" + ] + } + ], + "source": [ + "# Creating a feature vector by averaging all embeddings for all sentences\n", + "def embedding_feats(list_of_lists):\n", + " DIMENSION = 300\n", + " zero_vector = np.zeros(DIMENSION)\n", + " feats = []\n", + " for tokens in list_of_lists:\n", + " feat_for_this = np.zeros(DIMENSION)\n", + " count_for_this = 0 + 1e-5 # to avoid divide-by-zero\n", + " for token in tokens:\n", + " if token in w2v_model:\n", + " feat_for_this += w2v_model[token]\n", + " count_for_this +=1\n", + " if(count_for_this!=0):\n", + " feats.append(feat_for_this/count_for_this)\n", + " else:\n", + " feats.append(zero_vector)\n", + " return feats\n", + "\n", + "\n", + "train_vectors = embedding_feats(texts_processed)\n", + "print(len(train_vectors))" + ] }, - "id": "fXRiGtY1b_Iq", - "outputId": "fdba211b-e6bd-453e-b70d-79546d6ef005" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "3000\n" - ] + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mr9IaQppb_Ix", + "outputId": "c74d84ea-6586-4d68-c8f7-e2e36b7f915d" + }, + "outputs": [ + { + "output_type": 
"stream", + "name": "stdout", + "text": [ + "Accuracy: 0.8013333333333333\n", + " precision recall f1-score support\n", + "\n", + " 0\n", + " 0.77 0.83 0.80 353\n", + " 1\n", + " 0.84 0.78 0.81 397\n", + "\n", + " accuracy 0.80 750\n", + " macro avg 0.80 0.80 0.80 750\n", + "weighted avg 0.80 0.80 0.80 750\n", + "\n" + ] + } + ], + "source": [ + "#Take any classifier (LogisticRegression here, and train/test it like before.\n", + "classifier = LogisticRegression(random_state=1234)\n", + "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", + "classifier.fit(train_data, train_cats)\n", + "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", + "preds = classifier.predict(test_data)\n", + "print(classification_report(test_cats, preds))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k7wjLB8rb_JB" + }, + "source": [ + "Not bad. With little efforts we got 80% accuracy. Thats a great starting model to have!!" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "h4lF7mkPCAuy" + }, + "outputs": [], + "source": [] } - ], - "source": [ - "# Creating a feature vector by averaging all embeddings for all sentences\n", - "def embedding_feats(list_of_lists):\n", - " DIMENSION = 300\n", - " zero_vector = np.zeros(DIMENSION)\n", - " feats = []\n", - " for tokens in list_of_lists:\n", - " feat_for_this = np.zeros(DIMENSION)\n", - " count_for_this = 0 + 1e-5 # to avoid divide-by-zero \n", - " for token in tokens:\n", - " if token in w2v_model:\n", - " feat_for_this += w2v_model[token]\n", - " count_for_this +=1\n", - " if(count_for_this!=0):\n", - " feats.append(feat_for_this/count_for_this) \n", - " else:\n", - " feats.append(zero_vector)\n", - " return feats\n", - "\n", - "\n", - "train_vectors = embedding_feats(texts_processed)\n", - "print(len(train_vectors))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { + ], + "metadata": { + "accelerator": "GPU", 
"colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "mr9IaQppb_Ix", - "outputId": "2c372ab4-38d8-4884-99dc-9bb3bbba16d0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.796\n", - " precision recall f1-score support\n", - "\n", - " 0\n", - " 0.82 0.77 0.79 383\n", - " 1\n", - " 0.78 0.82 0.80 367\n", - "\n", - " accuracy 0.80 750\n", - " macro avg 0.80 0.80 0.80 750\n", - "weighted avg 0.80 0.80 0.80 750\n", - "\n" - ] + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "#Take any classifier (LogisticRegression here, and train/test it like before.\n", - "classifier = LogisticRegression(random_state=1234)\n", - "train_data, test_data, train_cats, test_cats = train_test_split(train_vectors, cats)\n", - "classifier.fit(train_data, train_cats)\n", - "print(\"Accuracy: \", classifier.score(test_data, test_cats))\n", - "preds = classifier.predict(test_data)\n", - "print(classification_report(test_cats, preds))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k7wjLB8rb_JB" - }, - "source": [ - "Not bad. With little efforts we got 80% accuracy. Thats a great starting model to have!!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "03_Word2Vec_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 29b8e6445e7a455139895a6f304f5431e5aae965 Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Date: Wed, 23 Aug 2023 00:59:42 +0530 Subject: [PATCH 13/14] fix: Added Freezed Library Versions to Ch4/04_FastText_Example.ipynb --- Ch4/04_FastText_Example.ipynb | 1245 +++++++++++++++++++-------------- 1 file changed, 701 insertions(+), 544 deletions(-) diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb index 3a57ee4..219a8ae 100644 --- a/Ch4/04_FastText_Example.ipynb +++ b/Ch4/04_FastText_Example.ipynb @@ -1,572 +1,729 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "6FIToZHAhz2O" - }, - "source": [ - "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", - "**Note**: This notebook uses an older version of fasttext." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6FIToZHAhz2O" + }, + "source": [ + "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", + "**Note**: This notebook uses an older version of fasttext." + ] }, - "id": "UBnT5t_LiCU2", - "outputId": "ca0bcea9-75a7-4237-e58e-154c3d72e89f" - }, - "outputs": [], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# !pip install pandas==1.1.5\n", - "# !pip install wget==3.2\n", - "# !pip install fasttext==0.9.2\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "zrBi6bvbiCU4" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "YKgZXvTGb61z" - }, - "outputs": [], - "source": [ - "#necessary imports\n", - "import os\n", - "import pandas as pd\n", - "import wget\n", - "import tarfile\n", - "import gdown" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UBnT5t_LiCU2", + "outputId": "c63a8ae7-5816-486c-b161-1a597cff909f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas==1.5.3 in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) 
(2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (2023.3)\n", + "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas==1.5.3) (1.23.5)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas==1.5.3) (1.16.0)\n", + "Requirement already satisfied: gdown==4.6.6 in /usr/local/lib/python3.10/dist-packages (4.6.6)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (3.12.2)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (2.31.0)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (1.16.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.66.1)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from gdown==4.6.6) (4.11.2)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->gdown==4.6.6) (2.4.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (2023.7.22)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from requests[socks]->gdown==4.6.6) (1.7.1)\n", + "Requirement already 
satisfied: fasttext==0.9.2 in /usr/local/lib/python3.10/dist-packages (0.9.2)\n", + "Requirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (2.11.1)\n", + "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (67.7.2)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from fasttext==0.9.2) (1.23.5)\n" + ] + } + ], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "!pip install pandas==1.5.3\n", + "!pip install gdown==4.6.6\n", + "!pip install fasttext==0.9.2\n", + "\n", + "# ===========================" + ] }, - "id": "l6CfW7C3L4EB", - "outputId": "debf3639-77d2-4a2c-8aa1-3ff8438b9585" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading...\n", - "From (uriginal): https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\n", - "From (redirected): https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k&confirm=t&uuid=a7accc21-6f49-4d9d-b323-f6fedaec8e8f\n", - "To: /root/Working/Working/practical-nlp-code/Ch4/Data/dbpedia_csv.tar.gz\n", - "100%|█████████████████████████████████████████████████████████████| 68.3M/68.3M [00:09<00:00, 7.58MB/s]\n" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "zrBi6bvbiCU4" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# 
===========================" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data Present at location : ./Data/dbpedia_csv\n" - ] - } - ], - "source": [ - "def check_if_file_exists(filename: str, locations: list) -> str :\n", - " for location in locations:\n", - " if os.path.exists(os.path.join(location, filename)):\n", - " return location\n", - " return None\n", - "\n", - "def extract_tar_file(file_path: str, extraction_path: str) -> None:\n", - " tar = tarfile.open(file_path, \"r:gz\")\n", - " tar.extractall(extraction_path)\n", - " tar.close()\n", - "\n", - "try :\n", - " \n", - " from google.colab import files\n", - " \n", - " # downloading the data\n", - " !wget -P DATAPATH https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", - "\n", - " # untaring the required file\n", - " !tar -xvf DATAPATH/dbpedia_csv.tar.gz -C DATAPATH\n", - "\n", - " # sneek peek in the folder structure\n", - " !ls -lah DATAPATH\n", - " \n", - " # specifying the data_path\n", - " data_path = 'DATAPATH'\n", - " \n", - "except ModuleNotFoundError:\n", - " data_path = './Data/'\n", - " compressed_file_name = 'dbpedia_csv.tar.gz'\n", - " extracted_file_name = 'dbpedia_csv'\n", - " \n", - " # Check if Extracted File exists\n", - " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data'])\n", - " \n", - " if location_of_extracted_file:\n", - " # Extracted File exists\n", - " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", - " \n", - " else:\n", - " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data'])\n", - " \n", - " if location_of_compressed_file:\n", - " # Compressed File exists\n", - " extract_tar_file(os.path.join(location_of_compressed_file, compressed_file_name), data_path)\n", - " path_to_model = os.path.join(data_path, extracted_file_name)\n", - " \n", - " else:\n", - " # Download File\n", - " output_path = './Data/'\n", - " 
gdown.download(\"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\", output=output_path)\n", - "\n", - " # Extract File\n", - " extract_data(output_path+compressed_file_name, output_path)\n", - "\n", - " path_to_model = os.path.join(data_path, extracted_file_name)\n", - "\n", - " print(f\"Data Present at location : {path_to_model}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "YKgZXvTGb61z" + }, + "outputs": [], + "source": [ + "#necessary imports\n", + "import os\n", + "import pandas as pd\n", + "import tarfile\n", + "import gdown" + ] }, - "id": "lMoRw3oQb62I", - "outputId": "744d1cb7-4966-4db1-b176-c2020975ed94" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train:(560000, 3) Test:(70000, 3)\n" - ] - } - ], - "source": [ - "# Loading train data\n", - "train_file = data_path + '/dbpedia_csv/train.csv'\n", - "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", - "# Loading test data\n", - "test_file = data_path + '/dbpedia_csv/test.csv'\n", - "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", - "# Data we have\n", - "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 204 + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "l6CfW7C3L4EB", + "outputId": "6e30eb61-0cdc-4616-d14e-46888017cac9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\n", + "To: /content/DATAPATH/dbpedia_csv.tar.gz\n", + "100% 68.3M/68.3M 
[00:00<00:00, 164MB/s]\n", + "dbpedia_csv/\n", + "dbpedia_csv/classes.txt\n", + "dbpedia_csv/test.csv\n", + "dbpedia_csv/train.csv\n", + "dbpedia_csv/readme.txt\n", + "total 66M\n", + "drwxr-xr-x 3 root root 4.0K Aug 22 18:17 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 22 18:17 ..\n", + "drwxrwxr-x 2 3666 11555 4.0K Sep 9 2015 dbpedia_csv\n", + "-rw-r--r-- 1 root root 66M Aug 22 18:17 dbpedia_csv.tar.gz\n" + ] + } + ], + "source": [ + "def check_if_file_exists(filename: str, locations: list) -> str :\n", + " for location in locations:\n", + " if os.path.exists(os.path.join(location, filename)):\n", + " return location\n", + " return None\n", + "\n", + "def extract_tar_file(file_path: str, extraction_path: str) -> None:\n", + " tar = tarfile.open(file_path, \"r:gz\")\n", + " tar.extractall(extraction_path)\n", + " tar.close()\n", + "\n", + "try :\n", + "\n", + " from google.colab import files\n", + "\n", + " # specifying the data_path\n", + " data_path = \"./DATAPATH\"\n", + "\n", + " !mkdir ./DATAPATH\n", + "\n", + " # downloading the data\n", + " !gdown -O ./DATAPATH/dbpedia_csv.tar.gz \"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\"\n", + "\n", + " # untaring the required file\n", + " !tar -xvf ./DATAPATH/dbpedia_csv.tar.gz --directory ./DATAPATH\n", + "\n", + " # sneek peek in the folder structure\n", + " !ls -lah ./DATAPATH\n", + "\n", + "except ModuleNotFoundError:\n", + " data_path = './Data/'\n", + " compressed_file_name = 'dbpedia_csv.tar.gz'\n", + " extracted_file_name = 'dbpedia_csv'\n", + "\n", + " # Check if Extracted File exists\n", + " location_of_extracted_file = check_if_file_exists(extracted_file_name, ['./Data'])\n", + "\n", + " if location_of_extracted_file:\n", + " # Extracted File exists\n", + " path_to_model = os.path.join(location_of_extracted_file, extracted_file_name)\n", + "\n", + " else:\n", + " location_of_compressed_file = check_if_file_exists(compressed_file_name, ['./Data'])\n", + "\n", + " if 
location_of_compressed_file:\n", + " # Compressed File exists\n", + " extract_tar_file(os.path.join(location_of_compressed_file, compressed_file_name), data_path)\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " else:\n", + " # Download File\n", + " os.makedirs(\"./Data\", exist_ok=True)\n", + " output_path = './Data/'\n", + " gdown.download(\"https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k\", output=output_path)\n", + "\n", + " # Extract File\n", + " extract_tar_file(os.path.join(data_path, compressed_file_name), output_path)\n", + "\n", + " path_to_model = os.path.join(data_path, extracted_file_name)\n", + "\n", + " print(f\"Data Present at location : {path_to_model}\")" + ] }, - "id": "gaz226vXb62W", - "outputId": "a7e5ab41-732e-4a94-def6-5e62124d6bd5" - }, - "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", - "
" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lMoRw3oQb62I", + "outputId": "61ef0d52-044d-4829-db25-796f9ae2d562" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Train:(560000, 3) Test:(70000, 3)\n" + ] + } ], - "text/plain": [ - " class name \\\n", - "0 1 E. D. Abbott Ltd \n", - "1 1 Schwan-Stabilo \n", - "2 1 Q-workshop \n", - "3 1 Marvell Software Solutions Israel \n", - "4 1 Bergan Mercy Medical Center \n", - "\n", - " description class_name \n", - "0 Abbott of Farnham E D Abbott Limited was a Br... Company \n", - "1 Schwan-STABILO is a German maker of pens for ... Company \n", - "2 Q-workshop is a Polish company located in Poz... Company \n", - "3 Marvell Software Solutions Israel known as RA... Company \n", - "4 Bergan Mercy Medical Center is a hospital loc... Company " + "source": [ + "# Loading train data\n", + "train_file = data_path + '/dbpedia_csv/train.csv'\n", + "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", + "# Loading test data\n", + "test_file = data_path + '/dbpedia_csv/test.csv'\n", + "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", + "# Data we have\n", + "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))" ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Since we have no clue about the classes lets build one\n", - "# Mapping from class number to class name\n", - "class_dict={\n", - " 1:'Company',\n", - " 2:'EducationalInstitution',\n", - " 3:'Artist',\n", - " 4:'Athlete',\n", - " 5:'OfficeHolder',\n", - " 6:'MeanOfTransportation',\n", - " 7:'Building',\n", - " 8:'NaturalPlace',\n", - " 9:'Village',\n", - " 10:'Animal',\n", - " 11:'Plant',\n", - " 12:'Album',\n", - " 13:'Film',\n", - " 14:'WrittenWork'\n", - " }\n", - "\n", - "# Mapping the classes\n", - "df['class_name'] = 
df['class'].map(class_dict)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "si7VC_Rub62a", - "outputId": "a1f7d406-0e9c-4adf-eaee-fc09572f27bf" - }, - "outputs": [ { - "data": { - "text/plain": [ - "class_name\n", - "Company 40000\n", - "EducationalInstitution 40000\n", - "Artist 40000\n", - "Athlete 40000\n", - "OfficeHolder 40000\n", - "MeanOfTransportation 40000\n", - "Building 40000\n", - "NaturalPlace 40000\n", - "Village 40000\n", - "Animal 40000\n", - "Plant 40000\n", - "Album 40000\n", - "Film 40000\n", - "WrittenWork 40000\n", - "Name: count, dtype: int64" + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "gaz226vXb62W", + "outputId": "ba8cecca-4c1b-41ae-f726-27f6bbef243f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " class name \\\n", + "0 1 E. D. Abbott Ltd \n", + "1 1 Schwan-Stabilo \n", + "2 1 Q-workshop \n", + "3 1 Marvell Software Solutions Israel \n", + "4 1 Bergan Mercy Medical Center \n", + "\n", + " description class_name \n", + "0 Abbott of Farnham E D Abbott Limited was a Br... Company \n", + "1 Schwan-STABILO is a German maker of pens for ... Company \n", + "2 Q-workshop is a Polish company located in Poz... Company \n", + "3 Marvell Software Solutions Israel known as RA... Company \n", + "4 Bergan Mercy Medical Center is a hospital loc... Company " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# Since we have no clue about the classes lets build one\n", + "# Mapping from class number to class name\n", + "class_dict={\n", + " 1:'Company',\n", + " 2:'EducationalInstitution',\n", + " 3:'Artist',\n", + " 4:'Athlete',\n", + " 5:'OfficeHolder',\n", + " 6:'MeanOfTransportation',\n", + " 7:'Building',\n", + " 8:'NaturalPlace',\n", + " 9:'Village',\n", + " 10:'Animal',\n", + " 11:'Plant',\n", + " 12:'Album',\n", + " 13:'Film',\n", + " 14:'WrittenWork'\n", + " }\n", + "\n", + "# Mapping the classes\n", + "df['class_name'] = df['class'].map(class_dict)\n", + "df.head()" ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"class_name\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "Sn-3kIqMb62d" - }, - "outputs": [], - "source": [ - "# Lets do some cleaning of this text\n", - "def clean_it(text,normalize=True):\n", - " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n", - " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n", - " replace(')',' ) ').replace('!',' ! ').replace('?',' ? 
').replace(':',' ').replace(';',' ').lower()\n", - " \n", - " # normalizing / encoding the text\n", - " if normalize:\n", - " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n", - " \n", - " return s\n", - "\n", - "# Now lets define a small function where we can use above cleaning on datasets\n", - "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n", - " # Defining the new data\n", - " df = data[['name','description']].copy(deep=True)\n", - " df['class'] = label_prefix + data['class'].astype(str) + ' '\n", - " \n", - " # cleaning it\n", - " if cleanit:\n", - " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n", - " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n", - " \n", - " # shuffling it\n", - " if shuffleit:\n", - " df.sample(frac=1).reset_index(drop=True)\n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "r_DRvdFcb62m", - "outputId": "d3fc1348-fcb2-4f50-c090-067e5ca66301" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3.98 s, sys: 206 ms, total: 4.19 s\n", - "Wall time: 4.26 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Transform the datasets using the above clean functions\n", - "df_train_cleaned = clean_df(df, True, True)\n", - "df_test_cleaned = clean_df(df_test, True, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "imMZ9-Bkb62t" - }, - "outputs": [], - "source": [ - "# Write files to disk as fastText classifier API reads files from disk.\n", - "train_file = data_path + '/dbpedia_train.csv'\n", - "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n", - "\n", - "test_file = data_path + '/dbpedia_test.csv'\n", - "df_test_cleaned.to_csv(test_file, header=None, index=False, 
columns=['class','name','description'] )\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bWZTSzd9b62x" - }, - "source": [ - "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "si7VC_Rub62a", + "outputId": "dd8c0c6e-fce9-4362-abab-45d9c751858c" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Company 40000\n", + "EducationalInstitution 40000\n", + "Artist 40000\n", + "Athlete 40000\n", + "OfficeHolder 40000\n", + "MeanOfTransportation 40000\n", + "Building 40000\n", + "NaturalPlace 40000\n", + "Village 40000\n", + "Animal 40000\n", + "Plant 40000\n", + "Album 40000\n", + "Film 40000\n", + "WrittenWork 40000\n", + "Name: class_name, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "df[\"class_name\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Sn-3kIqMb62d" + }, + "outputs": [], + "source": [ + "# Lets do some cleaning of this text\n", + "def clean_it(text,normalize=True):\n", + " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n", + " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n", + " replace(')',' ) ').replace('!',' ! ').replace('?',' ? 
').replace(':',' ').replace(';',' ').lower()\n", + "\n", + " # normalizing / encoding the text\n", + " if normalize:\n", + " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n", + "\n", + " return s\n", + "\n", + "# Now lets define a small function where we can use above cleaning on datasets\n", + "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n", + " # Defining the new data\n", + " df = data[['name','description']].copy(deep=True)\n", + " df['class'] = label_prefix + data['class'].astype(str) + ' '\n", + "\n", + " # cleaning it\n", + " if cleanit:\n", + " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n", + " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n", + "\n", + " # shuffling it\n", + " if shuffleit:\n", + " df.sample(frac=1).reset_index(drop=True)\n", + "\n", + " return df" + ] }, - "id": "a-H1wouCb62x", - "outputId": "3d7c130a-fd3b-472c-8585-2e965017763f" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Read 31M words\n", - "Number of words: 1116962\n", - "Number of labels: 14\n", - "Progress: 100.0% words/sec/thread: 1168450 lr: 0.000174 avg.loss: 0.003673 ETA: 0h 0m 0s 23.8% words/sec/thread: 1198254 lr: 0.762147 avg.loss: 0.009301 ETA: 0h12m38s% words/sec/thread: 1173386 lr: 0.577732 avg.loss: 0.006055 ETA: 0h 9m47s 0.419437 avg.loss: 0.004894 ETA: 0h 7m 5s 73.1% words/sec/thread: 1159819 lr: 0.268968 avg.loss: 0.004059 ETA: 0h 4m36s 0.003774 ETA: 0h 3m23s 81.4% words/sec/thread: 1157751 lr: 0.186418 avg.loss: 0.003744 ETA: 0h 3m12s 0.003730 ETA: 0h 3m 7s 82.8% words/sec/thread: 1155522 lr: 0.172032 avg.loss: 0.003694 ETA: 0h 2m57s 96.3% words/sec/thread: 1169432 lr: 0.036623 avg.loss: 0.003380 ETA: 0h 0m37s" - ] + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r_DRvdFcb62m", + "outputId": 
"75d06ea5-e04f-4c03-a6c4-37fdc4e10442" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 3.78 s, sys: 220 ms, total: 4 s\n", + "Wall time: 4.15 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Transform the datasets using the above clean functions\n", + "df_train_cleaned = clean_df(df, True, True)\n", + "df_test_cleaned = clean_df(df_test, True, True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "imMZ9-Bkb62t" + }, + "outputs": [], + "source": [ + "# Write files to disk as fastText classifier API reads files from disk.\n", + "train_file = data_path + '/dbpedia_train.csv'\n", + "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n", + "\n", + "test_file = data_path + '/dbpedia_test.csv'\n", + "df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )\n" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 33min 7s, sys: 19 s, total: 33min 26s\n", - "Wall time: 17min 5s\n" - ] + "cell_type": "markdown", + "metadata": { + "id": "bWZTSzd9b62x" + }, + "source": [ + "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" 
+ ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Progress: 100.0% words/sec/thread: 1168425 lr: 0.000000 avg.loss: 0.003688 ETA: 0h 0m 0s\n" - ] + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "a-H1wouCb62x", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "bdb18719-1b63-4c76-f1cb-58e93717fbd2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 1h, sys: 19 s, total: 1h 19s\n", + "Wall time: 33min 36s\n" + ] + } + ], + "source": [ + "%%time\n", + "## Using fastText for feature extraction and training\n", + "from fasttext import train_supervised\n", + "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", + "label_prefix refers to the prefix before label string in the dataset.\n", + "default is __label__. In our dataset, it is __class__.\n", + "There are several other parameters which can be seen in:\n", + "https://pypi.org/project/fasttext/\n", + "\"\"\"\n", + "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "sAyN3ZDbQFq-", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "7aec83c4-251d-402e-e10d-1f926de4c64f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test Samples: 70000 Precision@1 : 91.5214 Recall@1 : 91.5214\n", + "Test Samples: 70000 Precision@2 : 47.6493 Recall@2 : 95.2986\n", + "Test Samples: 70000 Precision@3 : 31.9848 Recall@3 : 95.9543\n", + "Test Samples: 70000 Precision@4 : 24.2014 Recall@4 : 96.8057\n", + "Test Samples: 70000 Precision@5 : 19.4149 Recall@5 : 97.0743\n" + ] + } + ], + "source": [ + "for k in range(1,6):\n", + " results = model.test(test_file,k=k)\n", + " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : 
{results[2]*100:2.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nrxSYRs3b621" + }, + "source": [ + "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 90% Precision and Recall are hard numbers to beat, too!" + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "AHSqt1rLd-R0" + }, + "execution_count": null, + "outputs": [] } - ], - "source": [ - "%%time\n", - "## Using fastText for feature extraction and training\n", - "from fasttext import train_supervised \n", - "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", - "label_prefix refers to the prefix before label string in the dataset.\n", - "default is __label__. In our dataset, it is __class__. \n", - "There are several other parameters which can be seen in: \n", - "https://pypi.org/project/fasttext/\n", - "\"\"\"\n", - "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { + ], + "metadata": { + "accelerator": "GPU", "colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "sAyN3ZDbQFq-", - "outputId": "13acbc62-48d9-469c-dfb1-d3e5446b8530" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Samples: 70000 Precision@1 : 90.7343 Recall@1 : 90.7343\n", - "Test Samples: 70000 Precision@2 : 48.0407 Recall@2 : 96.0814\n", - "Test Samples: 70000 Precision@3 : 32.3319 Recall@3 : 96.9957\n", - "Test Samples: 70000 Precision@4 : 24.3021 Recall@4 : 97.2086\n", - "Test Samples: 70000 Precision@5 : 19.4711 Recall@5 : 97.3557\n" - ] + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "for k in range(1,6):\n", - " results = model.test(test_file,k=k)\n", - " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nrxSYRs3b621" - }, - "source": [ - "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 90% Precision and Recall are hard numbers to beat, too!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "04_FastText_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 695cad433d81e059e38a8e9f79e9ed8c567f28ff Mon Sep 17 00:00:00 2001 From: Abhijeetsingh Date: Wed, 23 Aug 2023 01:00:06 +0530 Subject: [PATCH 14/14] fix: Added Freezed Library Versions to Ch4/05_DeepNN_Example.ipynb --- Ch4/05_DeepNN_Example.ipynb | 1175 +++++++++++++++++------------------ 1 file changed, 571 insertions(+), 604 deletions(-) diff --git a/Ch4/05_DeepNN_Example.ipynb b/Ch4/05_DeepNN_Example.ipynb index c8b184b..7cbdaee 100644 --- a/Ch4/05_DeepNN_Example.ipynb +++ b/Ch4/05_DeepNN_Example.ipynb @@ -1,628 +1,595 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aLNg_Puse6EX" - }, - "source": [ - "In this notebook we will demonstrate 
different text classification models trained using the IMDB reviews dataset. " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "aLNg_Puse6EX" + }, + "source": [ + "In this notebook we will demonstrate different text classification models trained using the IMDB reviews dataset." + ] }, - "id": "eOJLveJqtEO3", - "outputId": "067a74b2-c5df-464d-a3fa-3f4517a9090a" - }, - "outputs": [], - "source": [ - "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# !pip install numpy==1.19.5\n", - "# !pip install wget==3.2\n", - "# !pip install tensorflow==1.14.0\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "Ixb_5zcYtEO5" - }, - "outputs": [], - "source": [ - "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", - "\n", - "# ===========================\n", - "\n", - "# try:\n", - "# import google.colab\n", - "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", - "# except ModuleNotFoundError:\n", - "# !pip install -r \"ch4-requirements.txt\"\n", - "\n", - "# ===========================" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "xqUcb7NBb5--" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-15 09:43:18.128696: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-08-15 09:43:18.575098: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-08-15 09:43:20.130588: W 
tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], - "source": [ - "#Make the necessary imports\n", - "import os\n", - "import sys\n", - "import numpy as np\n", - "import tarfile\n", - "import wget\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\") \n", - "from zipfile import ZipFile\n", - "from tensorflow.keras.preprocessing.text import Tokenizer\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", - "from tensorflow.keras.utils import to_categorical\n", - "from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D\n", - "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n", - "from tensorflow.keras.models import Model, Sequential\n", - "from tensorflow.keras.initializers import Constant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0MqW5vWwfiCP" - }, - "source": [ - "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "HUKTqLHud7fo" - }, - "outputs": [], - "source": [ - "%%capture\n", - "try:\n", - " \n", - " from google.colab import files\n", - " \n", - " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n", - " !unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B\n", - " \n", - " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", - " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n", - " \n", - " BASE_DIR = 'DATAPATH'\n", - " \n", - "except ModuleNotFoundError:\n", - " \n", - " if not os.path.exists('Data/glove.6B'):\n", - " os.mkdir('Data/glove.6B')\n", - " \n", - " url='http://nlp.stanford.edu/data/glove.6B.zip' \n", - " wget.download(url,'Data') \n", - " \n", - " temp='Data/glove.6B.zip' \n", - " file = ZipFile(temp) \n", - " file.extractall('Data/glove.6B') \n", - " file.close()\n", - " \n", - " \n", - " \n", - " if not os.path.exists('Data/aclImdb'):\n", - " \n", - " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' \n", - " wget.download(url,'Data')\n", - " \n", - " temp='Data/aclImdb_v1.tar.gz' \n", - " tar = tarfile.open(temp, \"r:gz\")\n", - " tar.extractall('Data') \n", - " tar.close()\n", - " \n", - " BASE_DIR = 'Data'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "qvl1qb78fUib" - }, - "outputs": [], - "source": [ - "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n", - "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')\n", - "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "Yu9xmAZEd7fp" - }, - "outputs": [], - "source": [ - "#Within these, I only have a pos/ and a neg/ folder containing text files \n", - "MAX_SEQUENCE_LENGTH = 1000\n", - "MAX_NUM_WORDS = 20000 \n", - "EMBEDDING_DIM = 100 \n", - "VALIDATION_SPLIT = 0.2\n", - "\n", - "#started off from: 
https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n", - "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EmifkoA8b5_N" - }, - "source": [ - "### Loading and Preprocessing\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "WI4O1usEb5_O" - }, - "outputs": [], - "source": [ - "#Function to load the data from the dataset into the notebook. Will be called twice - for train and test.\n", - "def get_data(data_dir):\n", - " texts = [] # list of text samples\n", - " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n", - " labels = [] # list of label ids\n", - " for name in sorted(os.listdir(data_dir)):\n", - " path = os.path.join(data_dir, name)\n", - " if os.path.isdir(path):\n", - " if name=='pos' or name=='neg':\n", - " label_id = labels_index[name]\n", - " for fname in sorted(os.listdir(path)):\n", - " fpath = os.path.join(path, fname)\n", - " text = open(fpath,encoding='utf8').read()\n", - " texts.append(text)\n", - " labels.append(label_id)\n", - " return texts, labels\n", - "\n", - "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n", - "test_texts, test_labels = get_data(TEST_DATA_DIR)\n", - "labels_index = {'pos':1, 'neg':0} \n", - "\n", - "#Just to see how the data looks like. 
\n", - "#print(train_texts[0])\n", - "#print(train_labels[0])\n", - "#print(test_texts[24999])\n", - "#print(test_labels[24999])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "eOJLveJqtEO3" + }, + "outputs": [], + "source": [ + "# To install only the requirements of this notebook, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# !pip install numpy==1.23.5\n", + "# !pip install wget==3.2\n", + "# !pip install tensorflow==2.12.0\n", + "\n", + "# ===========================" + ] }, - "id": "QhhqM0Jdd7fs", - "outputId": "9b5b394e-bc52-4779-d85d-a0383446051d" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 88582 unique tokens.\n" - ] - } - ], - "source": [ - "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer \n", - "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data. \n", - "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) \n", - "tokenizer.fit_on_texts(train_texts) \n", - "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes \n", - "test_sequences = tokenizer.texts_to_sequences(test_texts) \n", - "word_index = tokenizer.word_index \n", - "print('Found %s unique tokens.' 
% len(word_index))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "Ixb_5zcYtEO5" + }, + "outputs": [], + "source": [ + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", + "\n", + "# ===========================\n", + "\n", + "# try:\n", + "# import google.colab\n", + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/ch4-requirements.txt | xargs -n 1 -L 1 pip install\n", + "# except ModuleNotFoundError:\n", + "# !pip install -r \"ch4-requirements.txt\"\n", + "\n", + "# ===========================" + ] }, - "id": "_e0V1-bBb5_d", - "outputId": "d866429d-5bb6-43a7-c66e-ed5abbafc4cd" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Splitting the train data into train and valid is done\n" - ] - } - ], - "source": [ - "#Converting this to sequences to be fed into neural network. Max seq. 
len is 1000 as set earlier\n", - "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n", - "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", - "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", - "trainvalid_labels = to_categorical(np.asarray(train_labels))\n", - "test_labels = to_categorical(np.asarray(test_labels))\n", - "\n", - "# split the training data into a training set and a validation set\n", - "indices = np.arange(trainvalid_data.shape[0])\n", - "np.random.shuffle(indices)\n", - "trainvalid_data = trainvalid_data[indices]\n", - "trainvalid_labels = trainvalid_labels[indices]\n", - "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n", - "x_train = trainvalid_data[:-num_validation_samples]\n", - "y_train = trainvalid_labels[:-num_validation_samples]\n", - "x_val = trainvalid_data[-num_validation_samples:]\n", - "y_val = trainvalid_labels[-num_validation_samples:]\n", - "#This is the data we will use for CNN and RNN training\n", - "print('Splitting the train data into train and valid is done')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "xqUcb7NBb5--" + }, + "outputs": [], + "source": [ + "#Make the necessary imports\n", + "import os\n", + "import sys\n", + "import numpy as np\n", + "import tarfile\n", + "import wget\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "from zipfile import ZipFile\n", + "from tensorflow.keras.preprocessing.text import Tokenizer\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.utils import to_categorical\n", + "from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D\n", + "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM\n", + "from tensorflow.keras.models import 
Model, Sequential\n", + "from tensorflow.keras.initializers import Constant" + ] }, - "id": "WUHqg2vvb5_l", - "outputId": "8387eda1-18f0-4254-9819-e63191b8fc04" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Preparing embedding matrix.\n", - "Found 400000 word vectors in Glove embeddings.\n", - "Preparing of embedding matrix is done\n" - ] - } - ], - "source": [ - "print('Preparing embedding matrix.')\n", - "\n", - "# first, build index mapping words in the embeddings set\n", - "# to their embedding vector\n", - "embeddings_index = {}\n", - "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:\n", - " for line in f:\n", - " values = line.split()\n", - " word = values[0]\n", - " coefs = np.asarray(values[1:], dtype='float32')\n", - " embeddings_index[word] = coefs\n", - "\n", - "print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))\n", - "#print(embeddings_index[\"google\"])\n", - "\n", - "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n", - "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", - "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", - "for word, i in word_index.items():\n", - " if i > MAX_NUM_WORDS:\n", - " continue\n", - " embedding_vector = embeddings_index.get(word)\n", - " if embedding_vector is not None:\n", - " # words not found in embedding index will be all-zeros.\n", - " embedding_matrix[i] = embedding_vector\n", - "\n", - "# load these pre-trained word embeddings into an Embedding layer\n", - "# note that we set trainable = False so as to keep the embeddings fixed\n", - "embedding_layer = Embedding(num_words,\n", - " EMBEDDING_DIM,\n", - " embeddings_initializer=Constant(embedding_matrix),\n", - " input_length=MAX_SEQUENCE_LENGTH,\n", - " trainable=False)\n", - "print(\"Preparing of embedding matrix is done\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"vEastnX8gdxR" - }, - "source": [ - "### 1D CNN Model with pre-trained embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "0MqW5vWwfiCP" + }, + "source": [ + "Here we set all the paths of all the external datasets and models such as [glove](https://nlp.stanford.edu/projects/glove/) and [IMDB reviews dataset](http://ai.stanford.edu/~amaas/data/sentiment/)." + ] }, - "id": "TTY-4K-Ob5_t", - "outputId": "836681ca-936e-400a-8973-0754759bb7cd" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Define a 1D CNN model.\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "HUKTqLHud7fo" + }, + "outputs": [], + "source": [ + "%%capture\n", + "try:\n", + "\n", + " from google.colab import files\n", + "\n", + " !wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip\n", + " !unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B\n", + "\n", + " !wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + " !tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH\n", + "\n", + " BASE_DIR = 'DATAPATH'\n", + "\n", + "except ModuleNotFoundError:\n", + "\n", + " if not os.path.exists('Data/glove.6B'):\n", + " os.mkdir('Data/glove.6B')\n", + "\n", + " url='http://nlp.stanford.edu/data/glove.6B.zip'\n", + " wget.download(url,'Data')\n", + "\n", + " temp='Data/glove.6B.zip'\n", + " file = ZipFile(temp)\n", + " file.extractall('Data/glove.6B')\n", + " file.close()\n", + "\n", + "\n", + "\n", + " if not os.path.exists('Data/aclImdb'):\n", + "\n", + " url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'\n", + " wget.download(url,'Data')\n", + "\n", + " temp='Data/aclImdb_v1.tar.gz'\n", + " tar = tarfile.open(temp, \"r:gz\")\n", + " tar.extractall('Data')\n", + " tar.close()\n", + "\n", + " BASE_DIR = 'Data'" + ] }, { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "2023-08-15 09:55:06.494956: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 80000000 exceeds 10% of free system memory.\n", - "2023-08-15 09:55:07.639684: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 51200000 exceeds 10% of free system memory.\n", - "2023-08-15 09:55:07.663242: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65273856 exceeds 10% of free system memory.\n", - "2023-08-15 09:55:08.392478: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 26083328 exceeds 10% of free system memory.\n", - "2023-08-15 09:55:08.392575: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 65273856 exceeds 10% of free system memory.\n" - ] + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "qvl1qb78fUib" + }, + "outputs": [], + "source": [ + "GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')\n", + "TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')\n", + "TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "157/157 [==============================] - 122s 768ms/step - loss: 0.6612 - acc: 0.6183 - val_loss: 0.5199 - val_acc: 0.7728\n", - "782/782 [==============================] - 46s 59ms/step - loss: 0.5269 - acc: 0.7656\n", - "Test accuracy with CNN: 0.76555997133255\n" - ] - } - ], - "source": [ - "print('Define a 1D CNN model.')\n", - "\n", - "cnnmodel = Sequential()\n", - "cnnmodel.add(embedding_layer)\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(GlobalMaxPooling1D())\n", - "cnnmodel.add(Dense(128, activation='relu'))\n", - "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", - "\n", - 
"cnnmodel.compile(loss='categorical_crossentropy',\n", - " optimizer='rmsprop',\n", - " metrics=['acc'])\n", - "#Train the model. Tune to validation set. \n", - "cnnmodel.fit(x_train, y_train,\n", - " batch_size=128,\n", - " epochs=1, validation_data=(x_val, y_val))\n", - "#Evaluate on test set:\n", - "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", - "print('Test accuracy with CNN:', acc)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VdDj2FJzgi_W" - }, - "source": [ - "### 1D CNN model with training your own embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "Yu9xmAZEd7fp" + }, + "outputs": [], + "source": [ + "#Within these, I only have a pos/ and a neg/ folder containing text files\n", + "MAX_SEQUENCE_LENGTH = 1000\n", + "MAX_NUM_WORDS = 20000\n", + "EMBEDDING_DIM = 100\n", + "VALIDATION_SPLIT = 0.2\n", + "\n", + "#started off from: https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py\n", + "#and from: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py" + ] }, - "id": "zI0bISwRb5_w", - "outputId": "d7697504-dacb-415c-b131-b89d6b10c771" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", - "157/157 [==============================] - 200s 1s/step - loss: 0.6526 - acc: 0.5850 - val_loss: 0.5804 - val_acc: 0.7002\n", - "782/782 [==============================] - 55s 70ms/step - loss: 0.5655 - acc: 0.7103\n", - "Test accuracy with CNN: 0.7102800011634827\n" - ] - } - ], - "source": [ - "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n", - "cnnmodel = Sequential()\n", - "cnnmodel.add(Embedding(MAX_NUM_WORDS, 
128))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(MaxPooling1D(5))\n", - "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", - "cnnmodel.add(GlobalMaxPooling1D())\n", - "cnnmodel.add(Dense(128, activation='relu'))\n", - "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", - "\n", - "cnnmodel.compile(loss='categorical_crossentropy',\n", - " optimizer='rmsprop',\n", - " metrics=['acc'])\n", - "#Train the model. Tune to validation set. \n", - "cnnmodel.fit(x_train, y_train,\n", - " batch_size=128,\n", - " epochs=1, validation_data=(x_val, y_val))\n", - "#Evaluate on test set:\n", - "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", - "print('Test accuracy with CNN:', acc)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6GwhXpmSgt4H" - }, - "source": [ - "### LSTM Model with training your own embedding " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cell_type": "markdown", + "metadata": { + "id": "EmifkoA8b5_N" + }, + "source": [ + "### Loading and Preprocessing\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "WI4O1usEb5_O" + }, + "outputs": [], + "source": [ + "#Function to load the data from the dataset into the notebook. 
Will be called twice - for train and test.\n", + "def get_data(data_dir):\n", + " texts = [] # list of text samples\n", + " labels_index = {'pos':1, 'neg':0} # dictionary mapping label name to numeric id\n", + " labels = [] # list of label ids\n", + " for name in sorted(os.listdir(data_dir)):\n", + " path = os.path.join(data_dir, name)\n", + " if os.path.isdir(path):\n", + " if name=='pos' or name=='neg':\n", + " label_id = labels_index[name]\n", + " for fname in sorted(os.listdir(path)):\n", + " fpath = os.path.join(path, fname)\n", + " text = open(fpath,encoding='utf8').read()\n", + " texts.append(text)\n", + " labels.append(label_id)\n", + " return texts, labels\n", + "\n", + "train_texts, train_labels = get_data(TRAIN_DATA_DIR)\n", + "test_texts, test_labels = get_data(TEST_DATA_DIR)\n", + "labels_index = {'pos':1, 'neg':0}\n", + "\n", + "#Just to see how the data looks like.\n", + "#print(train_texts[0])\n", + "#print(train_labels[0])\n", + "#print(test_texts[24999])\n", + "#print(test_labels[24999])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QhhqM0Jdd7fs", + "outputId": "9e16478c-8111-4aaf-e73b-b89359dd114f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found 88582 unique tokens.\n" + ] + } + ], + "source": [ + "#Vectorize these text samples into a 2D integer tensor using Keras Tokenizer\n", + "#Tokenizer is fit on training data only, and that is used to tokenize both train and test data.\n", + "tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)\n", + "tokenizer.fit_on_texts(train_texts)\n", + "train_sequences = tokenizer.texts_to_sequences(train_texts) #Converting text to a vector of word indexes\n", + "test_sequences = tokenizer.texts_to_sequences(test_texts)\n", + "word_index = tokenizer.word_index\n", + "print('Found %s unique tokens.' 
% len(word_index))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_e0V1-bBb5_d", + "outputId": "94d409aa-5ac2-4b4a-809d-fc91d4563285" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Splitting the train data into train and valid is done\n" + ] + } + ], + "source": [ + "#Converting this to sequences to be fed into neural network. Max seq. len is 1000 as set earlier\n", + "#initial padding of 0s, until vector is of size MAX_SEQUENCE_LENGTH\n", + "trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", + "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", + "trainvalid_labels = to_categorical(np.asarray(train_labels))\n", + "test_labels = to_categorical(np.asarray(test_labels))\n", + "\n", + "# split the training data into a training set and a validation set\n", + "indices = np.arange(trainvalid_data.shape[0])\n", + "np.random.shuffle(indices)\n", + "trainvalid_data = trainvalid_data[indices]\n", + "trainvalid_labels = trainvalid_labels[indices]\n", + "num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])\n", + "x_train = trainvalid_data[:-num_validation_samples]\n", + "y_train = trainvalid_labels[:-num_validation_samples]\n", + "x_val = trainvalid_data[-num_validation_samples:]\n", + "y_val = trainvalid_labels[-num_validation_samples:]\n", + "#This is the data we will use for CNN and RNN training\n", + "print('Splitting the train data into train and valid is done')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WUHqg2vvb5_l", + "outputId": "0b0bc141-184a-4d99-bf55-360d0e6a0212" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Preparing embedding matrix.\n", + "Found 400000 word vectors in Glove embeddings.\n", + "Preparing of embedding 
matrix is done\n" + ] + } + ], + "source": [ + "print('Preparing embedding matrix.')\n", + "\n", + "# first, build index mapping words in the embeddings set\n", + "# to their embedding vector\n", + "embeddings_index = {}\n", + "with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf8') as f:\n", + " for line in f:\n", + " values = line.split()\n", + " word = values[0]\n", + " coefs = np.asarray(values[1:], dtype='float32')\n", + " embeddings_index[word] = coefs\n", + "\n", + "print('Found %s word vectors in Glove embeddings.' % len(embeddings_index))\n", + "#print(embeddings_index[\"google\"])\n", + "\n", + "# prepare embedding matrix - rows are the words from word_index, columns are the embeddings of that word from glove.\n", + "num_words = min(MAX_NUM_WORDS, len(word_index)) + 1\n", + "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n", + "for word, i in word_index.items():\n", + " if i > MAX_NUM_WORDS:\n", + " continue\n", + " embedding_vector = embeddings_index.get(word)\n", + " if embedding_vector is not None:\n", + " # words not found in embedding index will be all-zeros.\n", + " embedding_matrix[i] = embedding_vector\n", + "\n", + "# load these pre-trained word embeddings into an Embedding layer\n", + "# note that we set trainable = False so as to keep the embeddings fixed\n", + "embedding_layer = Embedding(num_words,\n", + " EMBEDDING_DIM,\n", + " embeddings_initializer=Constant(embedding_matrix),\n", + " input_length=MAX_SEQUENCE_LENGTH,\n", + " trainable=False)\n", + "print(\"Preparing of embedding matrix is done\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vEastnX8gdxR" + }, + "source": [ + "### 1D CNN Model with pre-trained embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TTY-4K-Ob5_t", + "outputId": "834682a9-9371-4769-a967-cc2b2c24eaa7" + }, + "outputs": [ + { + "output_type": "stream", + "name": 
"stdout", + "text": [ + "Define a 1D CNN model.\n", + "157/157 [==============================] - 139s 878ms/step - loss: 0.6694 - acc: 0.6158 - val_loss: 0.5212 - val_acc: 0.7606\n", + "782/782 [==============================] - 56s 71ms/step - loss: 0.5251 - acc: 0.7537\n", + "Test accuracy with CNN: 0.7536799907684326\n" + ] + } + ], + "source": [ + "print('Define a 1D CNN model.')\n", + "\n", + "cnnmodel = Sequential()\n", + "cnnmodel.add(embedding_layer)\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(GlobalMaxPooling1D())\n", + "cnnmodel.add(Dense(128, activation='relu'))\n", + "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", + "\n", + "cnnmodel.compile(loss='categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['acc'])\n", + "#Train the model. 
Tune to validation set.\n", + "cnnmodel.fit(x_train, y_train,\n", + " batch_size=128,\n", + " epochs=1, validation_data=(x_val, y_val))\n", + "#Evaluate on test set:\n", + "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", + "print('Test accuracy with CNN:', acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VdDj2FJzgi_W" + }, + "source": [ + "### 1D CNN model with training your own embedding" + ] }, - "id": "SvBt2Brib5_4", - "outputId": "008fe9fa-13bf-4127-ba46-67916426ddbe" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, training embedding layer on the fly\n", - "Training the RNN\n", - "625/625 [==============================] - 709s 1s/step - loss: 0.5129 - accuracy: 0.7477 - val_loss: 0.4001 - val_accuracy: 0.8254\n", - "782/782 [==============================] - 156s 199ms/step - loss: 0.3995 - accuracy: 0.8270\n", - "Test accuracy with RNN: 0.8270000219345093\n" - ] + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zI0bISwRb5_w", + "outputId": "1c2285f4-edd4-4142-8f9e-7da2f9a91dc7" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\n", + "157/157 [==============================] - 216s 1s/step - loss: 0.6921 - acc: 0.5152 - val_loss: 0.6671 - val_acc: 0.6168\n", + "782/782 [==============================] - 66s 85ms/step - loss: 0.6667 - acc: 0.6200\n", + "Test accuracy with CNN: 0.6200399994850159\n" + ] + } + ], + "source": [ + "print(\"Defining and training a CNN model, training embedding layer on the fly instead of using pre-trained embeddings\")\n", + "cnnmodel = Sequential()\n", + "cnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + 
"cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(MaxPooling1D(5))\n", + "cnnmodel.add(Conv1D(128, 5, activation='relu'))\n", + "cnnmodel.add(GlobalMaxPooling1D())\n", + "cnnmodel.add(Dense(128, activation='relu'))\n", + "cnnmodel.add(Dense(len(labels_index), activation='softmax'))\n", + "\n", + "cnnmodel.compile(loss='categorical_crossentropy',\n", + " optimizer='rmsprop',\n", + " metrics=['acc'])\n", + "#Train the model. Tune to validation set.\n", + "cnnmodel.fit(x_train, y_train,\n", + " batch_size=128,\n", + " epochs=1, validation_data=(x_val, y_val))\n", + "#Evaluate on test set:\n", + "score, acc = cnnmodel.evaluate(test_data, test_labels)\n", + "print('Test accuracy with CNN:', acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6GwhXpmSgt4H" + }, + "source": [ + "### LSTM Model with training your own embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SvBt2Brib5_4", + "outputId": "434183cf-f713-4911-b403-100223907162" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training an LSTM model, training embedding layer on the fly\n", + "Training the RNN\n", + "625/625 [==============================] - 1315s 2s/step - loss: 0.4609 - accuracy: 0.7807 - val_loss: 0.3932 - val_accuracy: 0.8286\n", + "782/782 [==============================] - 191s 245ms/step - loss: 0.4004 - accuracy: 0.8236\n", + "Test accuracy with RNN: 0.8235999941825867\n" + ] + } + ], + "source": [ + "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n", + "\n", + "#model\n", + "rnnmodel = Sequential()\n", + "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", + "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", + "rnnmodel.add(Dense(2, activation='sigmoid'))\n", + "rnnmodel.compile(loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " 
metrics=['accuracy'])\n", + "print('Training the RNN')\n", + "\n", + "rnnmodel.fit(x_train, y_train,\n", + " batch_size=32,\n", + " epochs=1,\n", + " validation_data=(x_val, y_val))\n", + "score, acc = rnnmodel.evaluate(test_data, test_labels,\n", + " batch_size=32)\n", + "print('Test accuracy with RNN:', acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tJYzsZFSg9z-" + }, + "source": [ + "### LSTM Model using pre-trained Embedding Layer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Eymx0IyCb5_-", + "outputId": "2c6c182a-0dee-442c-f978-ac16e840b51f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Defining and training an LSTM model, using pre-trained embedding layer\n", + "Training the RNN\n", + "625/625 [==============================] - 1075s 2s/step - loss: 0.6050 - accuracy: 0.6728 - val_loss: 0.4578 - val_accuracy: 0.7916\n", + "782/782 [==============================] - 183s 234ms/step - loss: 0.4554 - accuracy: 0.7917\n", + "Test accuracy with RNN: 0.7916799783706665\n" + ] + } + ], + "source": [ + "print(\"Defining and training an LSTM model, using pre-trained embedding layer\")\n", + "\n", + "rnnmodel2 = Sequential()\n", + "rnnmodel2.add(embedding_layer)\n", + "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", + "rnnmodel2.add(Dense(2, activation='sigmoid'))\n", + "rnnmodel2.compile(loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])\n", + "print('Training the RNN')\n", + "\n", + "rnnmodel2.fit(x_train, y_train,\n", + " batch_size=32,\n", + " epochs=1,\n", + " validation_data=(x_val, y_val))\n", + "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n", + " batch_size=32)\n", + "print('Test accuracy with RNN:', acc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tb81rafef3Wl" + }, + "outputs": [], + "source": 
[] } - ], - "source": [ - "print(\"Defining and training an LSTM model, training embedding layer on the fly\")\n", - "\n", - "#model\n", - "rnnmodel = Sequential()\n", - "rnnmodel.add(Embedding(MAX_NUM_WORDS, 128))\n", - "rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", - "rnnmodel.add(Dense(2, activation='sigmoid'))\n", - "rnnmodel.compile(loss='binary_crossentropy',\n", - " optimizer='adam',\n", - " metrics=['accuracy'])\n", - "print('Training the RNN')\n", - "\n", - "rnnmodel.fit(x_train, y_train,\n", - " batch_size=32,\n", - " epochs=1,\n", - " validation_data=(x_val, y_val))\n", - "score, acc = rnnmodel.evaluate(test_data, test_labels,\n", - " batch_size=32)\n", - "print('Test accuracy with RNN:', acc)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tJYzsZFSg9z-" - }, - "source": [ - "### LSTM Model using pre-trained Embedding Layer" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { + ], + "metadata": { "colab": { - "base_uri": "https://localhost:8080/" + "provenance": [] }, - "id": "Eymx0IyCb5_-", - "outputId": "da0fa303-a4c4-4b92-ff42-54f1a1d51e45" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defining and training an LSTM model, using pre-trained embedding layer\n", - "Training the RNN\n", - "625/625 [==============================] - 548s 875ms/step - loss: 0.6192 - accuracy: 0.6578 - val_loss: 0.4754 - val_accuracy: 0.7876\n", - "782/782 [==============================] - 140s 179ms/step - loss: 0.4757 - accuracy: 0.7847\n", - "Test accuracy with RNN: 0.7847200036048889\n" - ] + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" } - ], - "source": [ - "print(\"Defining and training an LSTM 
model, using pre-trained embedding layer\")\n", - "\n", - "rnnmodel2 = Sequential()\n", - "rnnmodel2.add(embedding_layer)\n", - "rnnmodel2.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))\n", - "rnnmodel2.add(Dense(2, activation='sigmoid'))\n", - "rnnmodel2.compile(loss='binary_crossentropy',\n", - " optimizer='adam',\n", - " metrics=['accuracy'])\n", - "print('Training the RNN')\n", - "\n", - "rnnmodel2.fit(x_train, y_train,\n", - " batch_size=32,\n", - " epochs=1,\n", - " validation_data=(x_val, y_val))\n", - "score, acc = rnnmodel2.evaluate(test_data, test_labels,\n", - " batch_size=32)\n", - "print('Test accuracy with RNN:', acc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "05_DeepNN_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.17" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file