From 4a6b3c6a9dc8ddcb6096403248f496262f940c1b Mon Sep 17 00:00:00 2001
From: Faisal Sharji <faisal.sharji@gmail.com>
Date: Fri, 21 May 2021 17:16:28 -0400
Subject: [PATCH] Update 03_Word2Vec_Example.ipynb

Check token.lower() (rather than the raw token) for membership in mystopwords; otherwise capitalized stopwords such as 'The' pass the filter and are kept in the output.
---
 Ch4/03_Word2Vec_Example.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb
index 03a5016..a163318 100644
--- a/Ch4/03_Word2Vec_Example.ipynb
+++ b/Ch4/03_Word2Vec_Example.ipynb
@@ -173,7 +173,7 @@
     "    mystopwords = set(stopwords.words(\"english\"))\n",
     "    def remove_stops_digits(tokens):\n",
     "        #Nested function that lowercases, removes stopwords and digits from a list of tokens\n",
-    "        return [token.lower() for token in tokens if token not in mystopwords and not token.isdigit()\n",
+    "        return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n",
     "               and token not in punctuation]\n",
     "    #This return statement below uses the above function to process twitter tokenizer output further. \n",
     "    return [remove_stops_digits(word_tokenize(text)) for text in texts]\n",