Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 88 additions & 45 deletions Ch3/09_Visualizing_Embeddings_Using_TSNE.ipynb

Large diffs are not rendered by default.

247 changes: 171 additions & 76 deletions Ch4/03_Word2Vec_Example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sVtvH58nb_Hp"
},
"source": [
Expand All @@ -28,17 +27,36 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "JQX8DAmBb_Hr"
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JQX8DAmBb_Hr",
"outputId": "3e55c7d1-be7c-44bf-caf8-d00cf9dab00b"
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
}
],
"source": [
"#basic imports\n",
"import os\n",
"import wget\n",
"import gzip\n",
"import shutil\n",
"from time import time\n",
"\n",
"#pre-processing imports\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from string import punctuation\n",
Expand All @@ -54,91 +72,163 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"path = os.getcwd()\n",
"path = path + '\\Data'\n",
"\n",
"fil = 'sentiment_sentences.txt'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 140,
"resources": {
"http://localhost:8080/nbextensions/google.colab/files.js": {
"data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgZG8gewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwoKICAgICAgbGV0IHBlcmNlbnREb25lID0gZmlsZURhdGEuYnl0ZUxlbmd0aCA9PT0gMCA/CiAgICAgICAgICAxMDAgOgogICAgICAgICAgTWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCk7CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPSBgJHtwZXJjZW50RG9uZX0lIGRvbmVgOwoKICAgIH0gd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCk7CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK",
"headers": [
[
"content-type",
"application/javascript"
]
],
"ok": true,
"status": 200,
"status_text": ""
}
}
},
"id": "S8RM8c6AS8AX",
"outputId": "e6e3d6b7-bd49-4dd9-a28e-59a4193187aa"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <input type=\"file\" id=\"files-d0856459-a7c7-4318-a2a4-6c118d5723ab\" name=\"files[]\" multiple disabled\n",
" style=\"border:none\" />\n",
" <output id=\"result-d0856459-a7c7-4318-a2a4-6c118d5723ab\">\n",
" Upload widget is only available when the cell has been executed in the\n",
" current browser session. Please rerun this cell to enable.\n",
" </output>\n",
" <script src=\"/nbextensions/google.colab/files.js\"></script> "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {
"tags": []
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File already exists\n"
"Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt\n",
"Saving imdb_labelled.txt to imdb_labelled.txt\n",
"Saving yelp_labelled.txt to yelp_labelled.txt\n"
]
}
],
"source": [
"if not os.path.exists(path+\"\\sentiment_sentences.txt\"):\n",
" file = open(os.path.join(path, fil), 'w')\n",
" file.close()\n",
"try:\n",
" from google.colab import files\n",
" \n",
" # combined the three files to make sentiment_sentences.txt\n",
" filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n",
" # upload 'amazon_cells_labelled.txt', 'imdb_labelled.txt' and 'yelp_labelled.txt'\n",
" uploaded = files.upload()\n",
" \n",
" !mkdir DATAPATH\n",
" !mv -t DATAPATH amazon_cells_labelled.txt imdb_labelled.txt yelp_labelled.txt\n",
" !cat DATAPATH/amazon_cells_labelled.txt DATAPATH/imdb_labelled.txt DATAPATH/yelp_labelled.txt > DATAPATH/sentiment_sentences.txt\n",
" \n",
"except ModuleNotFoundError:\n",
"\n",
" fil = 'sentiment_sentences.txt'\n",
"\n",
" if not os.path.exists(\"Data/sentiment_sentences.txt\"):\n",
" file = open(os.path.join(path, fil), 'w')\n",
" file.close()\n",
" \n",
" # combined the three files to make sentiment_sentences.txt\n",
" filenames = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']\n",
"\n",
" with open(path+'\\sentiment_sentences.txt', 'w') as outfile:\n",
" for fname in filenames:\n",
" with open(path + '\\sentiment labelled sentences\\\\' + fname) as infile:\n",
" outfile.write(infile.read())\n",
" print(\"File created\")\n",
"else:\n",
" print(\"File already exists\")"
" with open('Data/sentiment_sentences.txt', 'w') as outfile:\n",
" for fname in filenames:\n",
" with open('Data/sentiment labelled sentences/' + fname) as infile:\n",
" outfile.write(infile.read())\n",
" print(\"File created\")\n",
" else:\n",
" print(\"File already exists\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "COUGXAxcb_H5",
"outputId": "f1b6d8ad-e22b-4126-d2ea-862697c4158b",
"outputId": "dd37e92a-942a-49c6-aa2e-8c48e542ec1b",
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 15.7 s\n",
"--2021-07-04 11:24:51-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.225.99\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.225.99|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1647046227 (1.5G) [application/x-gzip]\n",
"Saving to: ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’\n",
"\n",
"GoogleNews-vectors- 100%[===================>] 1.53G 75.3MB/s in 26s \n",
"\n",
"2021-07-04 11:25:17 (60.2 MB/s) - ‘DATAPATH/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n",
"\n",
"CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs\n",
"Wall time: 9.06 µs\n",
"done loading Word2Vec\n"
]
}
],
"source": [
"#Load the pre-trained word2vec model and the dataset\n",
"try:\n",
" \n",
" from google.colab import files\n",
" data_path= \"DATAPATH\" \n",
" data_path= \"DATAPATH\"\n",
" !wget -P DATAPATH https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
" !gunzip DATAPATH/GoogleNews-vectors-negative300.bin.gz \n",
" path_to_model = 'DATAPATH/GoogleNews-vectors-negative300.bin'\n",
" training_data_path = \"DATAPATH/sentiment_sentences.txt\"\n",
" \n",
"except ModuleNotFoundError:\n",
" data_path= \"Data\" \n",
" \n",
" if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n",
" if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n",
" wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n",
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
" data_path= \"Data\"\n",
" \n",
" if not os.path.exists('GoogleNews-vectors-negative300.bin'):\n",
" if not os.path.exists('../Ch2/GoogleNews-vectors-negative300.bin'):\n",
" if not os.path.exists('../Ch3/GoogleNews-vectors-negative300.bin'):\n",
" wget.download(\"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\")\n",
"\n",
" with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:\n",
" with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:\n",
" shutil.copyfileobj(f_in, f_out)\n",
"\n",
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
" else:\n",
" path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n",
"\n",
" else:\n",
" path_to_model = '../Ch3/GoogleNews-vectors-negative300.bin'\n",
" \n",
" path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n",
" else:\n",
" path_to_model = '../Ch2/GoogleNews-vectors-negative300.bin'\n",
" path_to_model = 'GoogleNews-vectors-negative300.bin'\n",
" \n",
" training_data_path = os.path.join(data_path, \"sentiment_sentences.txt\")\n",
"\n",
"\n",
"\n",
" \n",
" \n",
"#Load W2V model. This will take some time. \n",
"%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n",
"%time \n",
"w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)\n",
"print('done loading Word2Vec')\n",
"\n",
"#Read text data, cats.\n",
Expand All @@ -154,12 +244,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "m-WjFyC6b_IE",
"outputId": "5df9e11b-6f8e-42b8-e198-6fe343293cc3"
"outputId": "ce75ae49-eaf7-4af9-fc70-28ad6fb984eb"
},
"outputs": [
{
Expand All @@ -179,12 +270,13 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XEz30Jztb_IP",
"outputId": "2169b2c9-e89f-439a-a23f-d322fb856841"
"outputId": "321faadb-db0a-4d2b-9c4f-a504f29accf7"
},
"outputs": [
{
Expand All @@ -207,12 +299,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MFOGaDTwb_Ig",
"outputId": "7603e297-9167-43ec-c7da-46d82dc850ad"
"outputId": "ccaf5749-fff8-440e-8709-026b1394afc1"
},
"outputs": [
{
Expand Down Expand Up @@ -245,12 +338,13 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fXRiGtY1b_Iq",
"outputId": "2d57a96f-8da8-4285-ca1e-2c617578b9e1"
"outputId": "2edbe27b-0400-4df7-f1f0-8b0d20549892"
},
"outputs": [
{
Expand Down Expand Up @@ -287,28 +381,29 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {
"colab": {},
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mr9IaQppb_Ix",
"outputId": "13a84b5c-fde3-49f4-b156-5c2f36592b19"
"outputId": "0d1c168d-daac-40c9-b725-98a6406bc309"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.8173333333333334\n",
"Accuracy: 0.816\n",
" precision recall f1-score support\n",
"\n",
" 0\n",
" 0.79 0.82 0.81 350\n",
" 0.84 0.81 0.83 404\n",
" 1\n",
" 0.84 0.81 0.83 400\n",
" 0.79 0.82 0.80 346\n",
"\n",
" accuracy 0.82 750\n",
" macro avg 0.82 0.82 0.82 750\n",
" macro avg 0.81 0.82 0.82 750\n",
"weighted avg 0.82 0.82 0.82 750\n",
"\n"
]
Expand All @@ -327,7 +422,6 @@
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "k7wjLB8rb_JB"
},
"source": [
Expand All @@ -337,7 +431,8 @@
],
"metadata": {
"colab": {
"name": "Word2Vec_Example.ipynb",
"collapsed_sections": [],
"name": "03_Word2Vec_Example.ipynb",
"provenance": []
},
"kernelspec": {
Expand All @@ -355,7 +450,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.7.4"
}
},
"nbformat": 4,
Expand Down
Loading