#### [CH03_lane3](/home/hobs/code/hobs/nlpia-manuscript/manuscript/adoc/CH03_lane3.adoc)

#### 

In [None]:
import spacy

# Load the small English pipeline, downloading it only when it is missing so
# that re-running this cell does not hit the network every time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # raised by spacy.load when the model package is not installed
    spacy.cli.download("en_core_web_sm")  # <1>
    nlp = spacy.load("en_core_web_sm")
sentence = ('It has also arisen in criminal justice, healthcare, and '
    'hiring, compounding existing racial, economic, and gender biases'
    )
doc = nlp(sentence)
# One string per spaCy token, in document order.
tokens = [token.text for token in doc]
tokens

#### 

In [None]:
from collections import Counter
bag_of_words = Counter(tokens)
bag_of_words

#### 

In [None]:
import pandas as pd
most_common = dict(bag_of_words.most_common())  # <1>
counts = pd.Series(most_common)  # <2>
counts

#### 

In [None]:
len(counts)  # <1>

#### 

In [None]:
counts.sum()

#### 

In [None]:
len(tokens)  # <2>

#### 

In [None]:
counts / counts.sum()  # <3>

#### 

In [None]:
counts['justice']

#### 

In [None]:
counts['justice'] / counts.sum()

#### 

In [None]:
# Second example sentence; adjacent string literals are joined by Python.
sentence = ("Algorithmic bias has been cited in cases ranging from "
            "election outcomes to the spread of online hate speech.")
# Token strings from the spaCy pipeline, then their occurrence counts.
tokens = [tok.text for tok in nlp(sentence)]
counts = Counter(tokens)
dict(counts)

#### 

In [None]:
import requests
url = ('https://gitlab.com/tangibleai/nlpia2/'
       '-/raw/main/src/nlpia2/ch03/bias_intro.txt')
# A timeout keeps this cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
response

#### 

In [None]:
bias_intro_bytes = response.content  # <1>
bias_intro = response.text  # <2>
assert bias_intro_bytes.decode() == bias_intro    # <3>
bias_intro[:70]

#### 

In [None]:
tokens = [tok.text for tok in nlp(bias_intro)]
counts = Counter(tokens)
counts

#### 

In [None]:
counts.most_common(5)

#### .Short documents about bias

In [None]:
# Treat every non-blank line of the text as its own short document.  # <1>
nonblank_lines = (line for line in bias_intro.split('\n') if line.strip())
docs = [nlp(line) for line in nonblank_lines]
counts = []
for doc in docs:
    lowered = [t.text.lower() for t in doc]  # <2>
    counts.append(Counter(lowered))
# One row per document, one column per token; tokens absent from a document
# come out as NaN, so fill them with 0 and store integer counts.
df = pd.DataFrame(counts).fillna(0).astype(int)  # <3>
len(df)

#### .Short documents about bias

In [None]:
df.head()

#### .Short documents about bias

In [None]:
docs[10]

#### .Short documents about bias

In [None]:
df.iloc[10]  # <1>

#### 

In [None]:
# `docs_tokens` is not built by any earlier cell, so build it here: one list
# of lowercased token strings per spaCy Doc in `docs`.
# NOTE(review): lowercasing mirrors the Counter cell that builds `df` —
# confirm this is the tokenization the text intends.
docs_tokens = [[tok.text.lower() for tok in doc] for doc in docs]
# Flatten the per-document token lists into one long list of tokens.
all_doc_tokens = []
for tokens in docs_tokens:
    all_doc_tokens.extend(tokens)
len(all_doc_tokens)

#### 

In [None]:
vocab = set(all_doc_tokens)  # <1>
vocab = sorted(vocab)  # <2>
len(vocab)

#### 

In [None]:
len(all_doc_tokens) / len(vocab)  # <3>

#### 

In [None]:
vocab  # <1>

#### 

In [None]:
# Count lowercased tokens straight from the spaCy Docs; the `docs_tokens`
# list the original loop read is not defined by any earlier cell.
# NOTE(review): lowercasing mirrors the Counter cell that builds `df` — confirm.
count_vectors = [
    Counter(tok.text.lower() for tok in doc) for doc in docs]
tf = pd.DataFrame(count_vectors)  # <1>
tf = tf.sort_index(axis=1)  # order the token columns alphabetically
tf = tf.fillna(0).astype(int)  # absent tokens are NaN -> integer 0
tf

#### 

In [None]:
!pip install scikit-learn  # <1>

#### 

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

np.set_printoptions(edgeitems=8)  # <1>
# Plain-text version of every spaCy Doc.
corpus = [spacy_doc.text for spacy_doc in docs]
# fit() returns the vectorizer itself, so construction and fitting chain.
vectorizer = CountVectorizer().fit(corpus)  # <2> <3>
count_vectors = vectorizer.transform(corpus)  # <4>
count_vectors

#### 

In [None]:
count_vectors.toarray()  # <1>

#### .Increasing the vectorized math in your code

In [None]:
v1 = np.arange(5)  # <1>
v2 = pd.Series(reversed(range(5)))
# Pure-Python baseline: scale each pairwise product by 4.2 and accumulate the
# running total one element at a time.
slow_answer = 0
for x1, x2 in zip(v1, v2):
    slow_answer = slow_answer + 4.2 * (x1 * x2)
slow_answer

#### .Increasing the vectorized math in your code

In [None]:
faster_answer = sum(4.2 * v1 * v2)  # <2>
faster_answer

#### .Increasing the vectorized math in your code

In [None]:
fastest_answer = 4.2 * v1.dot(v2)  # <3>
fastest_answer

#### .Increasing the vectorized math in your code

In [None]:
!git clone git@gitlab.com/tangibleai/community/knowt
!cd knowt
mmvecs = np.memmap(
    '.knowt-data/hpr_vectors.memmap',
    shape=(41_531, 384),  # <1>
    dtype=np.float32,
    mode='r')
vecs = np.array(mmvecs.T.copy().tolist())
variables = dict(vecs=vecs, v=v)
dt_vectorized = timeit('v.dot(vecs)', globals=variables, number=20)
dt_vectorized

#### .A looping search of HPR episodes

In [None]:
def loops(query=None, doc_vectors=None):
    """Dot product of one query vector with every document vector, computed
    with plain Python loops (the slow baseline for the timing comparison).

    Args:
        query: a 1-D vector, or a 2-D array whose first row is the query.
            Defaults to the notebook global ``v``.
        doc_vectors: a sized sequence of vectors, one per document.
            Defaults to ``vecs.T`` (the notebook global ``vecs`` transposed
            so that iteration yields one document vector at a time).

    Returns:
        1-D float array holding one dot product per document vector.
    """
    # A function resolves free names in the namespace where it was defined,
    # not in the `globals=` dict handed to timeit, so the defaults are read
    # from the notebook globals explicitly.
    if query is None:
        query = v
    if doc_vectors is None:
        doc_vectors = vecs.T
    query = np.atleast_2d(query)[0]  # accept shape (n,) as well as (1, n)
    # Size the output from the data; the original read an undefined `shape`.
    answers = np.zeros(len(doc_vectors))
    for i, vec in enumerate(doc_vectors):
        answers[i] = sum(x1 * x2 for x1, x2 in zip(query, vec))
    return answers
from timeit import timeit  # timeit() is used below but was never imported

# Namespace in which timeit evaluates the statement string 'loops()'.
variables = dict(np=np, loops=loops, vecs=vecs.T, v=v)
# Total seconds for 20 calls of the looping implementation.
dt_loop = timeit('loops()', globals=variables, number=20)
dt_loop

#### .A looping search of HPR episodes

In [None]:
dt_loop / dt_vectorized

#### .A looping search of HPR episodes

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
palette = sns.color_palette("muted")  # <1>

#### .Example dot product calculation

In [None]:
# Two small example vectors and their dot product: 1*2 + 2*3 + 3*4.
v1, v2 = np.array([1, 2, 3]), np.array([2, 3, 4])
np.dot(v1, v2)

#### .Example dot product calculation

In [None]:
(v1 * v2).sum()  # <1>

#### .Example dot product calculation

In [None]:
sum([x1 * x2 for x1, x2 in zip(v1, v2)])  # <2>

#### .Example dot product calculation

In [None]:
# NOTE(review): `A`, `B` and `angle_between_A_and_B` were never defined, so
# the identity could not be evaluated. These are stand-in example vectors;
# substitute the vectors the text intends if they differ.
A = np.array([1, 2, 3])
B = np.array([2, 3, 4])
angle_between_A_and_B = np.arccos(
    A.dot(B) / (np.linalg.norm(A) * np.linalg.norm(B)))
# Identity: A . B == |A| * |B| * cos(angle). Floating-point round-off makes an
# exact `==` unreliable, so compare with a tolerance instead.
np.isclose(
    A.dot(B),
    (np.linalg.norm(A) * np.linalg.norm(B)) * np.cos(angle_between_A_and_B))

#### .Example dot product calculation

In [None]:
# NOTE(review): stand-in example vectors so this cell runs on its own;
# substitute the vectors the text intends if they differ.
A = np.array([1, 2, 3])
B = np.array([2, 3, 4])
# Cosine similarity is the dot product divided by the product of the norms.
# This ratio equals cos(angle_between_A_and_B); that call cannot be written as
# a second assignment target, because assigning to a call is a SyntaxError.
cos_similarity_between_A_and_B = \
    A.dot(B) / (np.linalg.norm(A) * np.linalg.norm(B))

#### .Example dot product calculation

In [None]:
import math


def cosine_sim(vec1, vec2):
    """Return the cosine of the angle between two equal-length vectors.

    A later cell calls ``cosine_sim(vec1[0], vec2[0])``, but no definition was
    present in the notebook, so a pure-Python one is provided here.

    Args:
        vec1, vec2: iterables of numbers with the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero magnitude (the cosine is undefined there).
    """
    dot_prod = sum(x1 * x2 for x1, x2 in zip(vec1, vec2))
    mag_1 = math.sqrt(sum(x1 ** 2 for x1 in vec1))
    mag_2 = math.sqrt(sum(x2 ** 2 for x2 in vec2))
    if mag_1 == 0 or mag_2 == 0:
        return 0.0  # avoid dividing by zero for an all-zero or empty vector
    return dot_prod / (mag_1 * mag_2)

#### .Calculating cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

tf = tf.fillna(0)  # <1>
# Keep each document's count vector 2-D (a single row) by slicing rather than
# indexing, so both can be passed to cosine_similarity as matrices.
tf_matrix = tf.values
vec1, vec2 = tf_matrix[:1, :], tf_matrix[1:2, :]  # <2>
cosine_similarity(vec1, vec2)  # <3>

#### .Calculating cosine similarity

In [None]:
cosine_sim(vec1[0], vec2[0])  # <1>

#### .Calculating cosine similarity

In [None]:
import copy
question = "What is algorithmic bias?"
# Shallow copy of `docs` with the question added as the last element, so
# `docs` itself is left unchanged.
ngram_docs = [*copy.copy(docs), question]

#### .Calculating cosine similarity

In [None]:
# Vectorize the question text with the already-fitted vectorizer. The text
# lives in `question`; `new_sentence` was never defined.
question_vec = vectorizer.transform([question])
question_vec

#### 

In [None]:
# The dense-conversion method of a scipy sparse matrix is `toarray()`;
# `to_array()` does not exist and raises AttributeError.
question_vec.toarray()

#### 

In [None]:
# Tokens ordered by their column index: sort (index, token) pairs, then keep
# only the tokens.
vocab = list(zip(*sorted((i, tok) for tok, i in
    vectorizer.vocabulary_.items())))[1]
# `toarray()` (not `to_array()`) converts the sparse row to a dense array.
pd.Series(question_vec.toarray()[0], index=vocab).head(8)

#### 

In [None]:
# Compare every document's count vector with the question's count vector.
# The question vector is stored in `question_vec`; `question_vector` is not
# defined at this point in the notebook.
cosine_similarity(count_vectors, question_vec)

#### 

In [None]:
docs[3]

#### 

In [None]:
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_vectors = ngram_vectorizer.fit_transform(corpus)
ngram_vectors

#### 

In [None]:
# Order the n-gram strings by the column index the vectorizer assigned them.
token_to_column = ngram_vectorizer.vocabulary_
vocab = tuple(sorted(token_to_column, key=token_to_column.get))
# Dense count table with one named column per n-gram; show one bigram column.
ngram_df = pd.DataFrame(ngram_vectors.toarray(), columns=vocab)
ngram_df['algorithmic bias']

#### 

In [None]:
from this import s as secret
print(secret)

#### 

In [None]:
!pip install nlpia2_wikipedia         # <1>
import wikipedia as wiki
page = wiki.page('machine learning')  # <2>
mlcounts = count_chars(page.content)
mlcounts

#### 

In [None]:
# `secretcounts` is not defined by any earlier cell: build the per-character
# counts of the secret message here, indexed by character.
# NOTE(review): assumes it should be the same kind of Series as `mlcounts`
# (raw character counts, case preserved) — confirm.
secretcounts = pd.Series(Counter(secret))
plt.subplot(2,1,1)
secretcounts /= secretcounts.sum()  # <1>
# Label-slice the sorted index to keep only the characters 'a' through 'z'.
secretcounts.sort_index()['a':'z'].plot(kind='bar', grid='on')
plt.title('Secret Message')
plt.subplot(2,1,2)
mlcounts /= mlcounts.sum()  # <2>
mlcounts.sort_index()['a':'z'].plot(kind='bar', grid='on')
plt.title('ML Article')
plt.show()

#### 

In [None]:
peak_distance = ord('R') - ord('E')
peak_distance

#### 

In [None]:
chr(ord('v') - peak_distance)  # <1>

#### 

In [None]:
chr(ord('n') - peak_distance)  # <2>

#### 

In [None]:
chr(ord('W') - peak_distance)

#### 

In [None]:
import codecs
print(codecs.decode(secret, 'rot-13'))

#### 

In [None]:
import nltk
nltk.download('brown')  # <1>
from nltk.corpus import brown
brown.words()[:10]  # <2>

#### 

In [None]:
brown.tagged_words()[:5]  # <3>

#### 

In [None]:
len(brown.words())

#### 

In [None]:
from collections import Counter

# Punctuation tokens to leave out of the word counts.
puncs = {',', '.', '--', '-', '!', '?',
         ':', ';', '``', "''", '(', ')', '[', ']'}
# Lazily lowercase every Brown corpus word that is not punctuation.
word_list = (x.lower() for x in brown.words() if x not in puncs)
token_counts = Counter(word_list)
token_counts.most_common(10)

#### 

In [None]:
DATA_DIR = ('https://gitlab.com/tangibleai/nlpia/'
            '-/raw/master/src/nlpia/data')
url = DATA_DIR + '/bias_discrimination.txt'
# Fail loudly on an HTTP error instead of silently tokenizing an error page,
# and bound the wait so the cell cannot hang forever.
disc_response = requests.get(url, timeout=30)
disc_response.raise_for_status()
bias_discrimination = disc_response.content.decode()
# Lowercase both texts before tokenizing so counts are case-insensitive.
intro_tokens = [t.text for t in nlp(bias_intro.lower())]
disc_tokens = [t.text for t in nlp(bias_discrimination.lower())]
intro_total = len(intro_tokens)
intro_total

#### 

In [None]:
disc_total = len(disc_tokens)
disc_total

#### 

In [None]:
intro_counts = Counter(intro_tokens)
disc_counts = Counter(disc_tokens)
# Term frequency = occurrences of the term / total tokens in that document.
intro_tf = {'bias': intro_counts['bias'] / intro_total}
disc_tf = {'bias': disc_counts['bias'] / disc_total}
'Term Frequency of "bias" in intro is:{:.4f}'.format(intro_tf['bias'])

#### 

In [None]:
'Term Frequency of "bias" in discrimination chapter is: {:.4f}'\
    .format(disc_tf['bias'])

#### 

In [None]:
intro_tf['and'] = intro_counts['and'] / intro_total
disc_tf['and'] = disc_counts['and'] / disc_total
print('Term Frequency of "and" in intro is: {:.4f}'\
    .format(intro_tf['and']))

#### 

In [None]:
print('Term Frequency of "and" in discrimination chapter is: {:.4f}'\
    .format(disc_tf['and']))

#### 

In [None]:
num_docs_containing_and = 0
for doc in [intro_tokens, disc_tokens]:
    if 'and' in doc:
        num_docs_containing_and += 1  # <1>

#### 

In [None]:
intro_tf['black'] = intro_counts['black'] / intro_total
disc_tf['black'] = disc_counts['black'] / disc_total

#### 

In [None]:
tokenized_docs = [intro_tokens, disc_tokens]
num_docs = len(tokenized_docs)
# Only the document count for 'and' is computed by an earlier cell; the
# counts for 'bias' and 'black' were used below without ever being defined.
num_docs_containing_bias = sum('bias' in toks for toks in tokenized_docs)
num_docs_containing_black = sum('black' in toks for toks in tokenized_docs)
# Inverse document frequency here is the plain ratio
# (number of documents) / (number of documents containing the term).
# A term that appears in neither document would make its division fail.
intro_idf = {}
disc_idf = {}
intro_idf['and'] = num_docs / num_docs_containing_and
disc_idf['and'] = num_docs / num_docs_containing_and
intro_idf['bias'] = num_docs / num_docs_containing_bias
disc_idf['bias'] = num_docs / num_docs_containing_bias
intro_idf['black'] = num_docs / num_docs_containing_black
disc_idf['black'] = num_docs / num_docs_containing_black

#### 

In [None]:
# TF-IDF for the intro document: term frequency times inverse document
# frequency, for each of the three example terms.
intro_tfidf = {
    term: intro_tf[term] * intro_idf[term]
    for term in ('and', 'bias', 'black')
}

#### 

In [None]:
# TF-IDF for the discrimination document: term frequency times inverse
# document frequency, for each of the three example terms.
disc_tfidf = {
    term: disc_tf[term] * disc_idf[term]
    for term in ('and', 'bias', 'black')
}

#### .Downloading Hacker Public Radio show notes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

url = ('https://gitlab.com/tangibleai/community/knowt/-/raw/main/'
       '.knowt-data/corpus_hpr/sentences.csv?inline=false')
df = pd.read_csv(url)
# Each value of the 'sentence' column is treated as one document.
docs = df['sentence']
# fit() returns the vectorizer itself, so construction and fitting chain.
vectorizer = TfidfVectorizer(min_df=1).fit(docs)  # <1> <2>
vectors = vectorizer.transform(docs)    # <3>

#### .Downloading Hacker Public Radio show notes

In [None]:
query_vec = vectorizer.transform(
    ['where is the lost audio'])  # <1>
query_vec  # <2>

#### .Downloading Hacker Public Radio show notes

In [None]:
dotproducts = query_vec.dot(vectors.T)  # <3>
# Index of the largest dot product. Computed once: the original cell called
# argmax() twice and discarded the first result.
idx = dotproducts.argmax()
idx

#### .Downloading Hacker Public Radio show notes

In [None]:
df.iloc[idx]

#### .Downloading Hacker Public Radio show notes

In [None]:
# The best-match row position is stored in `idx`; no variable `i` is defined.
df.iloc[idx]['sentence']

#### 

In [None]:
DS_FAQ_URL = ('https://gitlab.com/tangibleai/nlpia2/-/raw/main/'
    'src/nlpia2/data/faqbot.csv')
df = pd.read_csv(DS_FAQ_URL, index_col=0)

#### 

In [None]:
vectorizer = TfidfVectorizer()
faq_questions = df['question']
# fit() returns the fitted vectorizer, so fitting and transforming the same
# questions can be chained.
tfidfvectors_sparse = vectorizer.fit(faq_questions).transform(faq_questions)  # <1>
tfidfvectors = tfidfvectors_sparse.todense()  # <2>

#### 

In [None]:
def ask(question):
    """Print the FAQ entry whose question is most similar to ``question``.

    Vectorizes ``question`` with the notebook-global ``vectorizer``, takes its
    dot product with every row of ``tfidfvectors``, and prints the question
    and answer from ``df`` at the position of the largest dot product.

    Args:
        question: the user's question as a string.

    Returns:
        None; the result is printed.
    """
    question_vector = vectorizer.transform([question]).todense()
    idx = question_vector.dot(tfidfvectors.T).argmax() # <1>
    # `idx` is a row position, and `df` is loaded with index_col=0, so its
    # labels need not equal positions: select by position with .iloc.
    faq_question = df['question'].iloc[idx]
    faq_answer = df['answer'].iloc[idx]

    print(
        f"Your question:\n  {question}\n\n"
        f"Most similar FAQ question:\n  {faq_question}\n\n"
        f"Answer to that FAQ question:\n  {faq_answer}\n\n"
    )

#### 

In [None]:
ask("What's overfitting a model?")

#### 

In [None]:
ask('How do I decrease overfitting for Logistic Regression?')

#### 

In [None]:
question = 'LogisticRegression'
question_vector = vectorizer.transform([question])
# Sparse dot product with every FAQ question vector, converted to a flat
# dense array of scores.
dotproducts = question_vector.dot(tfidfvectors_sparse.T).toarray()[0]  # <1>
# argsort is ascending, so the last three entries are the three highest scores.
idx = dotproducts.argsort()[-3:]  # <2>
idx

#### 

In [None]:
dotproducts[idx]

#### 

In [None]:
# `idx` holds row positions from argsort, and `df` is loaded with index_col=0,
# so its labels need not equal positions: select by position with .iloc.
df['answer'].iloc[idx]