#### [`Chapter-03_Math-with-Words-TF-IDF-Vectors`](/home/hobs/code/hobs/nlpia-manuscript/manuscript/adoc/Chapter-03_Math-with-Words-TF-IDF-Vectors.adoc)

#### 

In [1]:
import spacy

# One-time model download; uncomment on a machine that lacks the model.
# spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

sentence = (
    'It has also arisen in criminal justice, healthcare, and '
    'hiring, compounding existing racial, economic, and gender biases.'
)
# Run the pipeline once, then keep only the surface text of every token.
doc = nlp(sentence)
tokens = [tok.text for tok in doc]
tokens

['It',
 'has',
 'also',
 'arisen',
 'in',
 'criminal',
 'justice',
 ',',
 'healthcare',
 ',',
 'and',
 'hiring',
 ',',
 'compounding',
 'existing',
 'racial',
 ',',
 'economic',
 ',',
 'and',
 'gender',
 'biases',
 '.']

#### 

In [2]:
from collections import Counter
# Counter maps each distinct token string to the number of times it occurs in `tokens`.
bag_of_words = Counter(tokens)
bag_of_words

Counter({',': 5,
         'and': 2,
         'It': 1,
         'has': 1,
         'also': 1,
         'arisen': 1,
         'in': 1,
         'criminal': 1,
         'justice': 1,
         'healthcare': 1,
         'hiring': 1,
         'compounding': 1,
         'existing': 1,
         'racial': 1,
         'economic': 1,
         'gender': 1,
         'biases': 1,
         '.': 1})

#### 

In [3]:
# The three most frequent tokens as (token, count) pairs, highest count first.
bag_of_words.most_common(3)  # <1>

[(',', 5), ('and', 2), ('It', 1)]

#### 

In [4]:
import pandas as pd
# most_common() without an argument returns every (token, count) pair, highest count first.
most_common = dict(bag_of_words.most_common())  # <1>
# A Series indexed by token allows the vectorized arithmetic used in the next cells.
counts = pd.Series(most_common)  # <2>
counts

,              5
and            2
It             1
has            1
also           1
arisen         1
in             1
criminal       1
justice        1
healthcare     1
hiring         1
compounding    1
existing       1
racial         1
economic       1
gender         1
biases         1
.              1
dtype: int64

#### 

In [5]:
# Number of distinct tokens in the sentence.
len(counts)  # <1>

18

#### 

In [6]:
# Total number of token occurrences (sum of all per-token counts).
counts.sum()

23

#### 

In [7]:
# The same total, read directly from the token list.
len(tokens)  # <2>

23

#### 

In [8]:
# Normalized term frequency: each token's count divided by the total token count.
counts / counts.sum()  # <3>

,              0.217391
and            0.086957
It             0.043478
has            0.043478
also           0.043478
arisen         0.043478
in             0.043478
criminal       0.043478
justice        0.043478
healthcare     0.043478
hiring         0.043478
compounding    0.043478
existing       0.043478
racial         0.043478
economic       0.043478
gender         0.043478
biases         0.043478
.              0.043478
dtype: float64

#### 

In [9]:
# Raw count of a single token, looked up by its label in the Series index.
counts['justice']

1

#### 

In [10]:
# Normalized term frequency of that one token.
counts['justice'] / counts.sum()

0.043478260869565216

#### 

In [11]:
sentence = (
    "Algorithmic bias has been cited in cases ranging from "
    "election outcomes to the spread of online hate speech."
)
# Note: `tokens` and `counts` are rebound here; `counts` is now a Counter, not a Series.
tokens = [token.text for token in nlp(sentence)]
counts = Counter(tokens)
dict(counts)

{'Algorithmic': 1,
 'bias': 1,
 'has': 1,
 'been': 1,
 'cited': 1,
 'in': 1,
 'cases': 1,
 'ranging': 1,
 'from': 1,
 'election': 1,
 'outcomes': 1,
 'to': 1,
 'the': 1,
 'spread': 1,
 'of': 1,
 'online': 1,
 'hate': 1,
 'speech': 1,
 '.': 1}

#### 

In [12]:
import requests
url = ('https://gitlab.com/tangibleai/nlpia2/'
       '-/raw/main/src/nlpia2/ch03/bias_intro.txt')
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of silently treating an error page
# as the corpus in the cells that read response.text.
response.raise_for_status()
response

<Response [200]>

In [27]:
# Inspect the HTTP response headers (Content-Type, Content-Length, ...).
# NOTE(review): this cell carries execution count 27, so it was run out of order
# relative to its neighbours; re-run the notebook top to bottom before publishing.
response.headers

{'Date': 'Sat, 04 Nov 2023 18:31:28 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '2849', 'Connection': 'keep-alive', 'cache-control': 'max-age=60, public, must-revalidate, stale-while-revalidate=60, stale-if-error=300, s-maxage=60', 'content-disposition': 'inline', 'content-security-policy': "base-uri 'self'; child-src https://www.google.com/recaptcha/ https://www.recaptcha.net/ https://content.googleapis.com https://content-compute.googleapis.com https://content-cloudbilling.googleapis.com https://content-cloudresourcemanager.googleapis.com https://www.googletagmanager.com/ns.html https://*.zuora.com/apps/PublicHostedPageLite.do https://gitlab.com/admin/ https://gitlab.com/assets/ https://gitlab.com/-/speedscope/index.html https://gitlab.com/-/sandbox/ https://gitlab.com/assets/ blob: data:; connect-src 'self' https://gitlab.com wss://gitlab.com https://sentry.gitlab.net https://new-sentry.gitlab.net https://customers.gitlab.com https://snowplow.trx.gitlab.net 

#### 

In [13]:
# Raw payload as bytes.
bias_intro_bytes = response.content  # <1>
# The same payload decoded to a str by requests.
bias_intro = response.text  # <2>
# Sanity check: decoding the bytes with Python's default codec (UTF-8) yields the same text.
assert bias_intro_bytes.decode() == bias_intro    # <3>
bias_intro[:70]

'Algorithmic bias describes systematic and repeatable errors in a compu'

#### 

In [14]:
# Tokenize the whole downloaded text as one document and count every token.
intro_doc = nlp(bias_intro)
tokens = [token.text for token in intro_doc]
counts = Counter(tokens)
counts

Counter({',': 35,
         'of': 16,
         '.': 16,
         'to': 15,
         'and': 14,
         '\n': 14,
         'the': 13,
         'or': 11,
         'in': 10,
         'can': 7,
         'algorithms': 7,
         'bias': 6,
         'is': 6,
         'a': 5,
         'as': 5,
         'not': 4,
         '"': 4,
         'has': 4,
         'their': 4,
         'Algorithmic': 3,
         'that': 3,
         'outcomes': 3,
         'many': 3,
         'but': 3,
         'design': 3,
         'algorithm': 3,
         'unanticipated': 3,
         'data': 3,
         'social': 3,
         'from': 3,
         'algorithmic': 3,
         'been': 3,
         'are': 3,
         'cases': 3,
         'systematic': 2,
         'unfair': 2,
         'such': 2,
         'users': 2,
         'Bias': 2,
         'due': 2,
         'including': 2,
         'limited': 2,
         'used': 2,
         'platforms': 2,
         'have': 2,
         'ranging': 2,
         'biases': 2,
         'gend

#### 

In [15]:
# Five most frequent tokens in the whole text.
counts.most_common(5)

[(',', 35), ('of', 16), ('.', 16), ('to', 15), ('and', 14)]

#### 

In [16]:
# The last four entries of the frequency-sorted list, i.e. four of the least frequent tokens.
counts.most_common()[-4:]

[('inputs', 1), ('between', 1), ('same', 1), ('service', 1)]

#### 

In [17]:
# Each non-blank line of the text is treated as its own document.
paragraphs = [s for s in bias_intro.split('\n') if s.strip()]
docs = [nlp(paragraph) for paragraph in paragraphs]  # <1>
counts = []
for doc in docs:
    lowered = [t.text.lower() for t in doc]  # <2>
    counts.append(Counter(lowered))
# One row per document, one column per token; tokens absent from a document become 0.
df = pd.DataFrame(counts).fillna(0).astype(int)  # <3>
len(df)

16

#### 

In [18]:
# First five document rows of the token-count table.
df.head()

Unnamed: 0,algorithmic,bias,describes,systematic,and,repeatable,errors,in,a,computer,...,there,no,examine,network,interrelated,programs,inputs,between,same,service
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 

In [19]:
# Token counts for the document at row position 10.
df.iloc[10]  # <1>

algorithmic    0
bias           0
describes      0
systematic     0
and            2
              ..
programs       0
inputs         0
between        0
same           0
service        0
Name: 10, Length: 246, dtype: int64

#### 

In [20]:
# `docs` already holds parsed spaCy Doc objects, so iterate their tokens
# directly instead of running the whole pipeline again on doc.text.
docs_tokens = []
for doc in docs:
    docs_tokens.append([
        tok.text.lower() for tok in doc])  # <1>
len(docs_tokens[0])

27

#### 

In [21]:
# Flatten the per-document token lists into one long list.
all_doc_tokens = []
for tokens in docs_tokens:
    all_doc_tokens += tokens
len(all_doc_tokens)

482

#### 

In [22]:
# The vocabulary is the alphabetically sorted set of distinct lower-cased tokens.
vocab = sorted(  # <1>
    set(all_doc_tokens))  # <2>
len(vocab)

246

#### 

In [23]:
# Average number of occurrences per distinct token.
len(all_doc_tokens) / len(vocab)  # <3>

1.9593495934959348

#### 

In [24]:
# Display the full sorted vocabulary (punctuation sorts ahead of letters).
vocab  # <1>

['"',
 "'s",
 ',',
 '-',
 '.',
 '2018',
 ';',
 'a',
 'ability',
 'accurately',
 'across',
 'addressed',
 'advanced',
 'algorithm',
 'algorithmic',
 'algorithms',
 'also',
 'an',
 'analysis',
 'and',
 'anticipated',
 'application',
 'arbitrary',
 'are',
 'arisen',
 'arrests',
 'as',
 'audiences',
 'authority',
 'barrier',
 'be',
 'because',
 'become',
 'been',
 'behavior',
 'being',
 'between',
 'bias',
 'biases',
 'but',
 'by',
 'can',
 'cases',
 'certain',
 'change',
 'cited',
 'coded',
 'collected',
 'color',
 'complexity',
 'compounding',
 'comprehensive',
 'computer',
 'concerned',
 'considered',
 'contexts',
 'create',
 'criminal',
 'cultural',
 'darker',
 'data',
 'datasets',
 'decisions',
 'describes',
 'design',
 'discovering',
 'discrimination',
 'displace',
 'due',
 'easily',
 'economic',
 'election',
 'emerge',
 'emerging',
 'engine',
 'enter',
 'errors',
 'ethnicity',
 'european',
 'even',
 'examine',
 'existing',
 'expand',
 'expectations',
 'expertise',
 'faces',
 'facial

#### 

In [25]:
# Empty placeholder; the next cell rebinds count_vectors to the CountVectorizer output.
count_vectors = []

#### 

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# `docs` holds spaCy Doc objects; the vectorizer is fed their raw text.
corpus = [doc.text for doc in docs]
vectorizer = CountVectorizer()
# Learn the vocabulary and build the document-by-term count matrix in one call.
count_vectors = vectorizer.fit_transform(corpus)  # <1>
# toarray() converts the sparse result to a dense array for display.
print(count_vectors.toarray()) # <2>

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]]


#### 

In [None]:
# numpy is not imported anywhere above this cell, so bring it in here.
import numpy as np
v1 = np.array(list(range(5)))
v2 = pd.Series(reversed(range(5)))
# Element-by-element Python loop: the slow baseline for a scaled dot product.
slow_answer = sum([4.2 * (x1 * x2) for x1, x2 in zip(v1, v2)])
slow_answer

#### 

In [None]:
# Vectorized element-wise products, then a Python-level sum over them.
scaled_products = 4.2 * v1 * v2
faster_answer = sum(scaled_products)  # <1>
faster_answer

#### 

In [None]:
# A single dot-product call replaces both the multiplication and the summation.
fastest_answer = 4.2 * v1.dot(v2)  # <2>
fastest_answer

#### 

In [None]:
import numpy as np
# Worked example of the identity  A . B == |A| * |B| * cos(angle between A and B).
# Small concrete vectors so the cell runs on its own.
A = np.array([3., 0.])
B = np.array([2., 2.])
angle_between_A_and_B = np.pi / 4  # A lies on the x axis, B on the diagonal
# np.isclose rather than ==: the two sides agree only up to floating-point rounding.
dot_identity_holds = np.isclose(
    A.dot(B),
    (np.linalg.norm(A) * np.linalg.norm(B)) *
    np.cos(angle_between_A_and_B))
dot_identity_holds

#### 

In [None]:
import numpy as np
# Small concrete vectors so the cell runs on its own.
A = np.array([3., 0.])
B = np.array([2., 2.])
# Cosine similarity is the dot product normalized by both vector lengths.
# (The original chained "x = np.cos(...) = ..." is not valid Python: a function
# call cannot be an assignment target.)
cos_similarity_between_A_and_B = \
    A.dot(B) / (np.linalg.norm(A) * np.linalg.norm(B))
# The angle follows from the similarity, not the other way around.
angle_between_A_and_B = np.arccos(cos_similarity_between_A_and_B)
cos_similarity_between_A_and_B

#### 

In [None]:
import math
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two sparse vectors stored as dicts.

    Args:
        vec1, vec2: mappings of dimension key (e.g. a token) to numeric weight.
            A key missing from one mapping counts as weight 0 there.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|) as a float, or 0.0 when either
        vector has zero length (the angle is undefined in that case).
    """
    # Pair components by key rather than by position, so differing key order
    # or a key present in only one dict cannot misalign the two vectors.
    keys = list(vec1)
    keys.extend(k for k in vec2 if k not in vec1)
    vals1 = [vec1.get(k, 0) for k in keys]  # <1>
    vals2 = [vec2.get(k, 0) for k in keys]

    dot_prod = 0
    for a, b in zip(vals1, vals2):
        dot_prod += a * b

    mag_1 = math.sqrt(sum([x**2 for x in vals1]))
    mag_2 = math.sqrt(sum([x**2 for x in vals2]))

    if mag_1 == 0 or mag_2 == 0:
        return 0.0  # avoid ZeroDivisionError for an all-zero vector
    return dot_prod / (mag_1 * mag_2)

#### .Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Rows 1 and 2 of the count matrix, i.e. the second and third documents.
vec1, vec2 = count_vectors[1,:], count_vectors[2,:]
cosine_similarity(vec1, vec2)

#### .Cosine similarity

In [None]:
import copy
question = "What is algorithmic bias?"
# New list: every existing document followed by the question; `docs` itself is untouched.
ngram_docs = [*docs, question]

#### .Cosine similarity

In [None]:
# Vectorize the question with the vocabulary already learned from the corpus.
# (`new_sentence` was never defined; the text lives in `question`.)
question_vec = vectorizer.transform([question])
question_vec

#### .Cosine similarity

In [None]:
# SciPy sparse matrices expose toarray(); there is no to_array() method.
question_vec.toarray()

#### .Cosine similarity

In [None]:
# vocabulary_ maps token -> column index; sorting the (index, token) pairs puts
# the tokens in column order so they can label the dense vector.
vocab = list(zip(*sorted((i, tok) for tok, i in
    vectorizer.vocabulary_.items())))[1]
# SciPy sparse matrices expose toarray(); there is no to_array() method.
pd.Series(question_vec.toarray()[0], index=vocab).head(8)

#### 

In [None]:
# Similarity of every document row to the question.
# (The vector was stored as `question_vec`; `question_vector` is not defined at this point.)
cosine_similarity(count_vectors, question_vec)

#### 

In [None]:
# Display the document at index 3 of the corpus.
docs[3]

#### 

In [None]:
# ngram_range=(1, 2) counts single words and two-word sequences.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_vectors = ngram_vectorizer.fit_transform(corpus)
ngram_vectors

#### 

In [None]:
# Order the n-grams by their column index so they can serve as column labels.
index_token_pairs = sorted(
    (i, tok) for tok, i in ngram_vectorizer.vocabulary_.items())
vocab = tuple(tok for _, tok in index_token_pairs)
# Per-document count of the bigram 'algorithmic bias'.
pd.DataFrame(ngram_vectors.toarray(),
    columns=vocab)['algorithmic bias']

#### 

In [None]:
# `this.s` is the text that a later cell decodes with the 'rot-13' codec.
# Note that importing the `this` module also prints its message as a side effect.
from this import s
print(s)

#### 

In [None]:
char_vectorizer = CountVectorizer(
    ngram_range=(1,1), analyzer='char')  # <1>
# fit_transform expects an iterable of documents and rejects a bare str,
# so wrap the single text in a list.
s_char_frequencies = char_vectorizer.fit_transform([s])
# Pass the vectorizer that was actually fitted (`s_char_vectorizer` never existed).
# NOTE(review): generate_histogram is not defined in the cells shown here;
# confirm where it is supposed to come from.
generate_histogram(
    s_char_frequencies, char_vectorizer)  # <2>

#### 

In [None]:
DATA_DIR = ('https://gitlab.com/tangibleai/nlpia/'
            '-/raw/master/src/nlpia/data')
url = DATA_DIR + '/machine_learning_full_article.txt'
ml_text = requests.get(url).content.decode()
# A separate vectorizer keeps the character vocabulary fitted on `s` intact.
ml_char_vectorizer = CountVectorizer(
    ngram_range=(1,1), analyzer='char')
# fit_transform expects an iterable of documents, not a bare str.
ml_char_frequencies = ml_char_vectorizer.fit_transform([ml_text])
# Plot the frequencies just computed for the article (the original re-plotted
# the `s` frequencies and named an undefined vectorizer).
# NOTE(review): generate_histogram is not defined in the cells shown here.
generate_histogram(ml_char_frequencies, ml_char_vectorizer)

#### 

In [None]:
# `peak_distance` was never defined. 13 is the shift of the 'rot-13' codec that
# decodes `s` in the next cell.
# NOTE(review): presumably meant to be read off the two histograms; confirm.
peak_distance = 13
chr(ord('W') - peak_distance)

#### 

In [None]:
import codecs
# Undo the ROT13 letter substitution and show the plain text.
decoded_text = codecs.decode(s, 'rot-13')
print(decoded_text)

#### 

In [None]:
# nltk itself must be imported before nltk.download can be called.
import nltk
nltk.download('brown')  # <1>
from nltk.corpus import brown
brown.words()[:10]  # <2>

#### 

In [None]:
# First five entries of the part-of-speech-tagged view of the corpus.
brown.tagged_words()[:5]  # <3>

#### 

In [None]:
# Total number of tokens in the Brown corpus.
len(brown.words())

#### 

In [None]:
from collections import Counter
# Punctuation tokens to leave out of the word counts.
puncs = {',', '.', '--', '-', '!', '?',
    ':', ';', '``', "''", '(', ')', '[', ']'}
# Lazy stream of lower-cased words; the punctuation filter is applied before lower-casing.
word_list = (
    word.lower()
    for word in brown.words()
    if word not in puncs
)
token_counts = Counter(word_list)
token_counts.most_common(10)

#### 

In [None]:
DATA_DIR = ('https://gitlab.com/tangibleai/nlpia/'
            '-/raw/master/src/nlpia/data')
url = DATA_DIR + '/bias_discrimination.txt'
bias_discrimination = requests.get(url).content.decode()

# Lower-case each text before tokenizing so that counts are case-insensitive.
intro_tokens = [tok.text for tok in nlp(bias_intro.lower())]
disc_tokens = [tok.text for tok in nlp(bias_discrimination.lower())]

intro_total = len(intro_tokens)
intro_total

#### 

In [None]:
# Token count of the second document.
disc_total = len(disc_tokens)
disc_total

#### 

In [None]:
# Per-document token counts.
intro_counts = Counter(intro_tokens)
disc_counts = Counter(disc_tokens)
# Term frequency = occurrences of the term / total tokens in that document.
intro_tf = {'bias': intro_counts['bias'] / intro_total}
disc_tf = {'bias': disc_counts['bias'] / disc_total}
'Term Frequency of "bias" in intro is:{:.4f}'.format(intro_tf['bias'])

#### 

In [None]:
# Same report for the second document.
('Term Frequency of "bias" in discrimination chapter is: {:.4f}'
    .format(disc_tf['bias']))

#### 

In [None]:
# Term frequency of the very common word "and" in both documents.
intro_tf['and'] = intro_counts['and'] / intro_total
disc_tf['and'] = disc_counts['and'] / disc_total
print(
    'Term Frequency of "and" in intro is: {:.4f}'
    .format(intro_tf['and'])
)

#### 

In [None]:
print(
    'Term Frequency of "and" in discrimination chapter is: {:.4f}'
    .format(disc_tf['and'])
)

#### 

In [None]:
# Document frequency: in how many of the two token lists does each term occur?
# The IDF cell below needs all three counts, not only the one for "and".
token_lists = [intro_tokens, disc_tokens]
num_docs_containing_and = sum(
    'and' in toks for toks in token_lists)  # <1>
num_docs_containing_bias = sum(
    'bias' in toks for toks in token_lists)
num_docs_containing_black = sum(
    'black' in toks for toks in token_lists)
# NOTE(review): a count of 0 here would make the IDF division below raise
# ZeroDivisionError; confirm each term occurs in at least one document.

#### 

In [None]:
# Term frequency of "black" in each document (a Counter returns 0 for an absent token).
intro_tf['black'] = intro_counts['black'] / intro_total
disc_tf['black'] = disc_counts['black'] / disc_total

#### 

In [None]:
num_docs = 2
intro_idf = {}
disc_idf = {}
# IDF = total documents / documents containing the term. It is a corpus-level
# value, so both per-document dicts receive the same number.
for term, n_containing in (
        ('and', num_docs_containing_and),
        ('bias', num_docs_containing_bias),
        ('black', num_docs_containing_black)):
    intro_idf[term] = num_docs / n_containing
    disc_idf[term] = num_docs / n_containing

#### 

In [None]:
# TF-IDF = term frequency * inverse document frequency, per term.
intro_tfidf = {
    term: intro_tf[term] * intro_idf[term]
    for term in ('and', 'bias', 'black')
}

#### 

In [None]:
# Same product for the second document.
disc_tfidf = {
    term: disc_tf[term] * disc_idf[term]
    for term in ('and', 'bias', 'black')
}

#### 

In [None]:
# `docs` holds spaCy Doc objects, which have no .lower(); work on their text.
doc_texts = [doc.text.lower() for doc in docs]
doc_token_lists = [
    [token.text for token in nlp(text)] for text in doc_texts]

# Template vector: one zero-valued slot per distinct token in the corpus, so
# every TF-IDF vector has the same keys in the same order.
lexicon = sorted(set(
    tok for toks in doc_token_lists for tok in toks))
zero_vector = dict.fromkeys(lexicon, 0)

doc_tfidf_vectors = []
for tokens in doc_token_lists:  # <1>
    vec = copy.copy(zero_vector)  # <2>
    token_counts = Counter(tokens)

    for key, value in token_counts.items():
        # Substring test on the lower-cased text, matching the query cell.
        # Always >= 1, because the token came from one of these texts.
        docs_containing_key = sum(key in text for text in doc_texts)
        # Normalize by this document's length (not the vocabulary size).
        tf = value / len(tokens)
        idf = len(docs) / docs_containing_key
        vec[key] = tf * idf
    doc_tfidf_vectors.append(vec)

#### 

In [None]:
query = "How long does it take to get to the store?"
query_vec = copy.copy(zero_vector)  # <1>
tokens = [token.text for token in nlp(query.lower())]
token_counts = Counter(tokens)
# `docs` holds spaCy Doc objects, which have no .lower(); lower-case their
# text once, outside the loop.
doc_texts = [_doc.text.lower() for _doc in docs]
for key, value in token_counts.items():
    if key not in query_vec:
        # Out-of-vocabulary token: adding it would give the query more
        # dimensions than the document vectors it is compared against.
        continue
    docs_containing_key = sum(key in text for text in doc_texts)
    if docs_containing_key == 0:  # <1>
        continue
    tf = value / len(tokens)
    idf = len(docs) / docs_containing_key
    query_vec[key] = tf * idf
cosine_sim(query_vec, doc_tfidf_vectors[0])

#### 

In [None]:
# Similarity of the query to the document vector at index 1.
cosine_sim(query_vec, doc_tfidf_vectors[1])

#### 

In [None]:
# Similarity of the query to the document vector at index 2.
cosine_sim(query_vec, doc_tfidf_vectors[2])

#### .Computing TF-IDF matrix using Scikit-Learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# `docs` holds spaCy Doc objects; the vectorizer needs plain strings.
corpus = [doc.text for doc in docs]
vectorizer = TfidfVectorizer(min_df=1) # <1>
vectorizer = vectorizer.fit(corpus)  # <2>
vectors = vectorizer.transform(corpus)  # <3>
print(vectors.todense().round(2))  # <4>

#### 

In [None]:
DS_FAQ_URL = ('https://gitlab.com/tangibleai/qary/-/raw/main/'
    'src/qary/data/faq/faq-python-data-science-cleaned.csv')
qa_dataset = pd.read_csv(DS_FAQ_URL)
# The cells that follow read the FAQ table through the name `df`
# (df['question'], df['answer']), so bind that name to the same table;
# otherwise `df` would still be the token-count table built earlier.
df = qa_dataset

#### 

In [None]:
# Fit on the FAQ questions, then encode those same questions.
questions = df['question']
vectorizer = TfidfVectorizer().fit(questions)
tfidfvectors_sparse = vectorizer.transform(questions)  # <1>
# Dense copy, used for the dot product in bot_reply.
tfidfvectors = tfidfvectors_sparse.todense()  # <2>

#### 

In [None]:
def bot_reply(question):
    """Print the FAQ entry whose question best matches `question`.

    The question is encoded with the module-level `vectorizer`, scored against
    every row of `tfidfvectors` with a dot product, and the highest-scoring
    row of `df` is printed.

    Args:
        question: the user's question as a str.

    Returns:
        None; the reply is printed.
    """
    question_vector = vectorizer.transform([question]).todense()
    idx = question_vector.dot(tfidfvectors.T).argmax() # <1>
    # argmax() yields a row *position*, so select the row positionally; a
    # label lookup would pick the wrong row if df's index is not 0..n-1.
    best_match = df.iloc[idx]

    print(
        f"Your question:\n  {question}\n\n"
        f"Most similar FAQ question:\n  {best_match['question']}\n\n"
        f"Answer to that FAQ question:\n  {best_match['answer']}\n\n"
    )

#### 

In [None]:
# Ask the FAQ bot a first example question.
bot_reply("What's overfitting a model?")

#### 

In [None]:
# A second, more specific example question.
bot_reply('How do I decrease overfitting for Logistic Regression?')