From 4a6b3c6a9dc8ddcb6096403248f496262f940c1b Mon Sep 17 00:00:00 2001 From: Faisal Sharji Date: Fri, 21 May 2021 17:16:28 -0400 Subject: [PATCH] Update 03_Word2Vec_Example.ipynb Check token.lower(), not the raw token, for membership in mystopwords. The stopword list is all lowercase, so otherwise capitalized stopwords such as 'The' pass the filter and are kept. --- Ch4/03_Word2Vec_Example.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ch4/03_Word2Vec_Example.ipynb b/Ch4/03_Word2Vec_Example.ipynb index 03a5016..a163318 100644 --- a/Ch4/03_Word2Vec_Example.ipynb +++ b/Ch4/03_Word2Vec_Example.ipynb @@ -173,7 +173,7 @@ " mystopwords = set(stopwords.words(\"english\"))\n", " def remove_stops_digits(tokens):\n", " #Nested function that lowercases, removes stopwords and digits from a list of tokens\n", - " return [token.lower() for token in tokens if token not in mystopwords and not token.isdigit()\n", + " return [token.lower() for token in tokens if token.lower() not in mystopwords and not token.isdigit()\n", " and token not in punctuation]\n", " #This return statement below uses the above function to process twitter tokenizer output further. \n", " return [remove_stops_digits(word_tokenize(text)) for text in texts]\n",