#### [`Chapter-02_Tokens-of-thought-natural-language-words`](/home/hobs/code/hobs/nlpia-manuscript/manuscript/adoc/Chapter-02_Tokens-of-thought-natural-language-words.adoc)

#### .Example quote from _The Book Thief_ split into tokens

In [1]:
text = ("Trust me, though, the words were on their way, and when "
        "they arrived, Liesel would hold them in her hands like "
        "the clouds, and she would wring them out, like the rain.")
tokens = text.split()  # <1>
tokens[:8]

['Trust', 'me,', 'though,', 'the', 'words', 'were', 'on', 'their']

#### .Example quote from _The Book Thief_ split into tokens

In [2]:
import re
pattern = r'\w+(?:\'\w+)?|[^\w\s]'  # <1>
texts = [text]
texts.append("There's no such thing as survival of the fittest. "
             "Survival of the most adequate, maybe.")
tokens = list(re.findall(pattern, texts[-1]))
tokens[:8]

["There's", 'no', 'such', 'thing', 'as', 'survival', 'of', 'the']

#### .Example quote from _The Book Thief_ split into tokens

In [3]:
tokens[8:16]

['fittest', '.', 'Survival', 'of', 'the', 'most', 'adequate', ',']

#### .Example quote from _The Book Thief_ split into tokens

In [4]:
tokens[16:]

['maybe', '.']

#### 

In [5]:
import numpy as np
vocab = sorted(set(tokens))  # <1>
' '.join(vocab[:12])  # <2>

", . Survival There's adequate as fittest maybe most no of such"

#### 

In [6]:
num_tokens = len(tokens)
num_tokens

18

#### 

In [7]:
vocab_size = len(vocab)
vocab_size

15

#### 

In [8]:
import spacy  # <1>
# spacy.cli.download('en_core_web_sm')  # <2>
nlp = spacy.load('en_core_web_sm')  # <3>
doc = nlp(texts[-1])
type(doc)

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


spacy.tokens.doc.Doc

#### 

In [9]:
tokens = [tok.text for tok in doc]
tokens[:9]

['There', "'s", 'no', 'such', 'thing', 'as', 'survival', 'of', 'the']

#### 

In [10]:
tokens[9:17]

['fittest', '.', 'Survival', 'of', 'the', 'most', 'adequate', ',']

#### 

In [11]:
from spacy import displacy
sentence = list(doc.sents)[0]  # <1>
svg = displacy.render(sentence, style="dep",
    jupyter=False)  # <2>
open('sentence_diagram.svg', 'w').write(svg)  # <3>
displacy.render(sentence, style="dep")  # <5>

#### 

In [12]:
import requests
text = requests.get('https://proai.org/nlpia2-ch2.adoc').text
f'{round(len(text) / 10_000)}0k'  # <1>

'190k'

#### 

In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')
%timeit nlp(text)  # <1>

11.2 s ± 512 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### 

In [14]:
f'{round(len(text) / 10_000)}0k'

'190k'

#### 

In [15]:
doc = nlp(text)
f'{round(len(list(doc)) / 10_000)}0k'

'40k'

#### 

In [16]:
f'{round(len(doc) / 1_000 / 4.67)}kWPS'  # <2>

'9kWPS'

#### 

In [17]:
nlp.pipe_names  # <1>

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

#### 

In [18]:
nlp = spacy.load('en_core_web_sm', disable=nlp.pipe_names)
%timeit nlp(text)

573 ms ± 4.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### 

In [19]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/hobs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### 

In [20]:
from nltk.tokenize import word_tokenize
%timeit word_tokenize(text)

349 ms ± 17.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### 

In [21]:
tokens = word_tokenize(text)
f'{round(len(tokens) / 10_000)}0k'

'40k'

#### 

In [22]:
pattern = r'\w+(?:\'\w+)?|[^\w\s]'
tokens = re.findall(pattern, text)  # <1>
f'{round(len(tokens) / 10_000)}0k'

'40k'

#### 

In [23]:
%timeit re.findall(pattern, text)

24.1 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### 

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char')
vectorizer.fit(texts)

#### 

In [25]:
bpevocab_list = [
   sorted((i, s) for s, i in vectorizer.vocabulary_.items())]
bpevocab_dict = dict(bpevocab_list[0])
list(bpevocab_dict.values())[:7]

[' ', ' a', ' c', ' f', ' h', ' i', ' l']

#### 

In [26]:
vectors = vectorizer.transform(texts)
df = pd.DataFrame(
    vectors.todense(), 
    columns=vectorizer.vocabulary_)
df.index = [t[:8] + '...' for t in texts]
df = df.T
df['total'] = df.T.sum()
df

Unnamed: 0,Trust me...,There's ...,total
t,31,14,45
r,3,2,5
u,1,0,1
s,0,1,1
,3,0,3
...,...,...,...
at,1,0,1
ma,2,1,3
yb,1,0,1
be,1,0,1


#### 

In [27]:
df.sort_values('total').tail()

Unnamed: 0,Trust me...,There's ...,total
en,10,3,13
an,14,5,19
uc,11,9,20
e,18,8,26
t,31,14,45


#### 

In [28]:
df['n'] = [len(tok) for tok in vectorizer.vocabulary_]
df[df['n'] > 1].sort_values('total').tail()

Unnamed: 0,Trust me...,There's ...,total,n
c,8,4,12,2
en,10,3,13,2
an,14,5,19,2
uc,11,9,20,2
e,18,8,26,2


#### 

In [29]:
hi_text = 'Hiking home now'
hi_text.startswith('Hi')

True

#### 

In [30]:
pattern = r'\w+(?:\'\w+)?|[^\w\s]'  # <1>
'Hi' in re.findall(pattern, hi_text)  # <2>

False

#### 

In [31]:
'Hi' == re.findall(pattern, hi_text)[0]  # <3>

False

#### 

In [32]:
import pandas as pd
onehot_vectors = np.zeros(
    (len(tokens), vocab_size), int)  # <1>
for i, tok in enumerate(tokens):
    if tok not in vocab:
        continue
    onehot_vectors[i, vocab.index(tok)] = 1  # <2>
df_onehot = pd.DataFrame(onehot_vectors, columns=vocab)
df_onehot.shape

(43249, 15)

#### 

In [33]:
df_onehot.iloc[:,:8].replace(0, '')  # <3>

Unnamed: 0,",",.,Survival,There's,adequate,as,fittest,maybe
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
...,...,...,...,...,...,...,...,...
43244,,,,,,,,
43245,,,,,,,,
43246,,,,,,,,
43247,,,,,,,,


#### 

In [34]:
import spacy  # <1>
from nlpia2.spacy_language_model import load  # <2>
nlp = load('en_core_web_sm')  # <3>
nlp

<spacy.lang.en.English at 0x7feb8202ae60>

#### 

In [35]:
doc = nlp(texts[-1])
type(doc)

spacy.tokens.doc.Doc

#### 

In [36]:
tokens = [tok.text for tok in doc]  # <1>
tokens[:9]  # <2>

['There', "'s", 'no', 'such', 'thing', 'as', 'survival', 'of', 'the']

#### 

In [37]:
tokens[9:17]

['fittest', '.', 'Survival', 'of', 'the', 'most', 'adequate', ',']

#### 

In [None]:
from spacy import displacy
sentence = list(doc.sents)[0] # <1>
displacy.render(sentence, style="dep")
# displacy.serve(sentence, style="dep")
# !firefox 127.0.0.1:5000

#### 

In [None]:
import requests
text = requests.get('https://proai.org/nlpia2-ch2.adoc').text
f'{round(len(text) / 10_000)}0k'  # <1>

#### 

In [None]:
from nlpia2.spacy_language_model import load
nlp = load('en_core_web_sm')
%timeit nlp(text)  # <1>

#### 

In [None]:
f'{round(len(text) / 10_000)}0k'

#### 

In [None]:
doc = nlp(text)
f'{round(len(list(doc)) / 10_000)}0k'

#### 

In [None]:
f'{round(len(doc) / 1_000 / 4.67)}kWPS'  # <2>

#### 

In [None]:
nlp.pipe_names  # <1>

#### 

In [None]:
nlp = load('en_core_web_sm', disable=['tok2vec', 'tagger', 'parser'])
nlp.pipe_names

#### 

In [None]:
%timeit nlp(text)

#### 

In [None]:
import nltk

#### 

In [None]:
pattern = r'\w+(?:\'\w+)?|[^\w\s]'
tokens = re.findall(pattern, text)  # <1>
f'{round(len(tokens) / 10_000)}0k'

#### 

In [None]:
%timeit re.findall(pattern, text)

#### 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char')
vectorizer.fit(texts)

#### 

In [None]:
bpevocab_list = [
   sorted((i, s) for s, i in vectorizer.vocabulary_.items())]
bpevocab_dict = dict(bpevocab_list[0])
list(bpevocab_dict.values())[:7]

#### 

In [None]:
vectors = vectorizer.transform(texts)
df = pd.DataFrame(
    vectors.todense(),
    columns=vectorizer.vocabulary_)
df.index = [t[:8] + '...' for t in texts]
df = df.T
df['total'] = df.T.sum()
df

#### 

In [None]:
df.sort_values('total').tail(3)

#### 

In [None]:
df['n'] = [len(tok) for tok in vectorizer.vocabulary_]
df[df['n'] > 1].sort_values('total').tail()

#### 

In [None]:
hi_text = 'Hiking home now'
hi_text.startswith('Hi')

#### 

In [None]:
pattern = r'\w+(?:\'\w+)?|[^\w\s]'  # <1>
'Hi' in re.findall(pattern, hi_text)  # <2>

#### 

In [None]:
'Hi' == re.findall(pattern, hi_text)[0]  # <3>

#### 

In [None]:
bow = sorted(set(re.findall(pattern, text)))
bow[:9]

#### 

In [None]:
bow[9:19]

#### 

In [None]:
bow[19:27]

#### .Example dot product calculation

In [None]:
v1 = np.array([1, 2, 3])
v2 = np.array([2, 3, 4])
v1.dot(v2)

#### .Example dot product calculation

In [None]:
(v1 * v2).sum()  # <1>

#### .Example dot product calculation

In [None]:
sum([x1 * x2 for x1, x2 in zip(v1, v2)])  # <2>

#### .Example dot product calculation

In [None]:
from nltk.tokenize import TreebankWordTokenizer
texts.append(
  "If conscience and empathy were impediments to the advancement of "
  "self-interest, then we would have evolved to be amoral sociopaths."
  )  # <1>
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(texts[-1])[:6]
tokens[:8]

#### .Example dot product calculation

In [None]:
tokens[8:16]

#### .Example dot product calculation

In [None]:
tokens[16:]

#### 

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = "Nice guys finish first."  # <1>
doc = nlp(text)
for token in doc:
    print(f"{token.text:<11}{token.pos_:<10}{token.dep:<10}")

#### 

In [None]:
import jieba
seg_list = jieba.cut("西安是一座举世闻名的文化古城")  # <1>
list(seg_list)

#### 

In [None]:
import jieba
seg_list = jieba.cut("西安是一座举世闻名的文化古城", cut_all=True)  # <1>
list(seg_list)

#### 

In [None]:
import jieba
from jieba import posseg
words = posseg.cut("西安是一座举世闻名的文化古城")
jieba.enable_paddle()  # <1>
words = posseg.cut("西安是一座举世闻名的文化古城", use_paddle=True)
list(words)

#### 

In [None]:
import spacy
spacy.cli.download("zh_core_web_sm")  # <1>
nlpzh = spacy.load("zh_core_web_sm")
doc = nlpzh("西安是一座举世闻名的文化古城")
[(tok.text, tok.pos_) for tok in doc]

#### 

In [None]:
from nltk.tokenize.casual import casual_tokenize
texts.append("@rickrau mind BLOOOOOOOOWWWWWN by latest lex :*) !!!!!!!!")
casual_tokenize(texts[-1], reduce_len=True)

#### .Broad list of stop words

In [None]:
import requests
url = ("https://gitlab.com/tangibleai/nlpia/-/raw/master/"
       "src/nlpia/data/stopword_lists.json")
response = requests.get(url)
stopwords = response.json()['exhaustive']  # <1>
tokens = 'the words were just as I remembered them'.split()  # <2>
tokens_without_stopwords = [x for x in tokens if x not in stopwords]
print(tokens_without_stopwords)

#### .Broad list of stop words

In [None]:
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
len(stop_words)

#### .Broad list of stop words

In [None]:
stop_words[:7]

#### .Broad list of stop words

In [None]:
[sw for sw in stopwords if len(sw) == 1]

#### .Broad list of stop words

In [None]:
resp = requests.get(url)

#### 

In [None]:
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

#### 

In [None]:
def stem(phrase):
    return ' '.join([re.findall('^(.*ss|.*?)(s)?$',
        word)[0][0].strip("'") for word in phrase.lower().split()])
stem('houses')

#### 

In [None]:
stem("Doctor House's calls")

#### 

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in
  "dish washer's fairly washed dishes".split()])

#### 

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='english')
' '.join([stemmer.stem(w).strip("'") for w in
  "dish washer's fairly washed dishes".split()])

#### 

In [None]:
nltk.download('wordnet')

#### 

In [None]:
nltk.download('omw-1.4')

#### 

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("better")  # <1>

#### 

In [None]:
lemmatizer.lemmatize("better", pos="a")  # <2>

#### 

In [None]:
lemmatizer.lemmatize("good", pos="a")

#### 

In [None]:
stemmer.stem('goodness')

#### 

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("better good goods goodness best")
for token in doc:
    print(token.text, token.lemma_)

#### 

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
sa.lexicon  # <1>

#### 

In [None]:
[(tok, score) for tok, score in sa.lexicon.items()
  if " " in tok]  # <4>

#### 

In [None]:
sa.polarity_scores(text=\
  "Python is very readable and it's great for NLP.")

#### 

In [None]:
sa.polarity_scores(text=\
  "Python is not a bad choice for most applications.")

#### 

In [None]:
corpus = ["Absolutely perfect! Love it! :-) :-) :-)",
          "Horrible! Completely useless. :(",
          "It was OK. Some good and some bad things."]
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print('{:+}: {}'.format(scores['compound'], doc))

#### 

In [None]:
movies = pd.read_csv('https://proai.org/movie-reviews.csv.gz',
    index_col=0)
movies.head().round(2)

#### 

In [None]:
movies.describe().round(2)

#### 

In [None]:
import pandas as pd
pd.options.display.width = 75  # <1>
from nltk.tokenize import casual_tokenize  # <2>
bows = []
from collections import Counter  # <3>
for text in movies.text:
    bows.append(Counter(casual_tokenize(text)))
df_movies = pd.DataFrame.from_records(bows)  # <4>
df_movies = df_movies.fillna(0).astype(int)  # <5>
df_movies.shape  # <6>

#### 

In [None]:
df_movies.head()

#### 

In [None]:
df_movies.head()[list(bows[0].keys())]

#### 

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = nb.fit(df_movies, movies.sentiment > 0)  # <1>
movies['pred_senti'] = (
  nb.predict_proba(df_movies))[:, 1] * 8 - 4  # <2>
movies['error'] = movies.pred_senti - movies.sentiment
mae = movies['error'].abs().mean().round(1)  # <3>
mae

#### 

In [None]:
movies['senti_ispos'] = (movies['sentiment'] > 0).astype(int)
movies['pred_ispos'] = (movies['pred_senti'] > 0).astype(int)
columns = [c for c in movies.columns if 'senti' in c or 'pred' in c]
movies[columns].head(8)

#### 

In [None]:
(movies.pred_ispos ==
  movies.senti_ispos).sum() / len(movies)

#### 

In [None]:
products = pd.read_csv('https://proai.org/product-reviews.csv.gz')
products.columns

#### 

In [None]:
products.head()

#### 

In [None]:
bows = []
for text in products['text']:
    bows.append(Counter(casual_tokenize(text)))
df_products = pd.DataFrame.from_records(bows)
df_products = df_products.fillna(0).astype(int)
df_products.shape # <1>

#### 

In [None]:
df_all_bows = pd.concat([df_movies, df_products])
df_all_bows.columns  # <1>

#### 

In [None]:
vocab = list(df_movies.columns)  # <1>
df_products = df_all_bows.iloc[len(movies):]  # <2>
df_products = df_products[vocab]  # <3>
df_products.shape

#### 

In [None]:
df_movies.shape  # <4>

#### 

In [None]:
products['senti_ispos'] = (products['sentiment'] > 0).astype(int)
products['pred_ispos'] = nb.predict(df_products).astype(int)
correct = (products['pred_ispos']
        == products['senti_ispos'])  # <1>
correct.sum() / len(products)