#### Tuesday, April 2, 2024

mamba activate 

This all runs in one pass.

# Embedding with NLKT and Gensim
Copyright 2024, Denis Rothman

Installing libraries

In [1]:
#!pip install --upgrade nltk -qq
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#!pip install gensim -qq

In [3]:
import gensim
print(gensim.__version__)

4.3.2


# 1.Reading the text file

## Downloading the text file

In [None]:
#1.Load Decartes.txt using the Colab file manager
#2.Downloading the file from GitHub
# !curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/master/Chapter11/Descartes.txt --output "Descartes.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    14  100    14    0     0     94      0 --:--:-- --:--:-- --:--:--    94


end of WIP code until book title finalized

## Reading the text file

In [4]:
with open('Descartes.txt', 'r', encoding='utf-8') as file:
    descartes_book = file.read().replace('\n', '')

# 2.Tokenizing the text with punkt

In [5]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(descartes_book)
print(len(tokens))

23605


## Preprocessing the tokens


In [6]:
# applying lowercase, removing punctuation and stopwords
#stemming/lemmatization
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words and token not in string.punctuation]

[nltk_data] Downloading package wordnet to /home/rob/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rob/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
print(len(tokens))

9781


The tokenization process filtered the information that wasn’t meaningful for our tasks. However, duplicates may remain in the vocabulary.

In [8]:
unique_tokens = set(tokens)
print(len(unique_tokens))

3843


In [9]:
print(unique_tokens)

{'resultsan', 'thread', 'musthave', 'said', 'conduct', 'antecedence', 'themin', 'ligature', 'look', 'unite', 'spending', 'opposite.my', 'handshad', 'publishthem', 'felt', 'commencement', 'disposed', 'similar', 'choice', 'iwould', 'security', 'principaldifficulties', 'instant', 'icould', 'sufficientknowledge', 'element', 'us', 'retired', 'designated', 'agreeably', 'suchreflection', 'deteriorate', 'weredependencies', 'admirablethan', 'triangle', 'ofmy', 'use', 'leave', 'freedom', 'heart.but', 'toimitate', 'afraid', 'theirtruth.though', 'ownthoughts', 'mathematics', 'pleasure', 'qualified', 'isnecessary', 'equally', 'thefall', 'desired', 'godis', 'vapor', 'byagain', 'lessastonished', 'havealleged', 'heavenssince', 'aright', 'perfectfrankness', 'nutrition', 'accustoming', 'iought', 'year', 'anatomy', 'publishanything', 'consider', 'ascend', 'certainof', 'wewish', 'compensate', 'report', 'distinctidea', 'became', 'venosa_', 'areyet', 'heis', 'auricle', 'le', 'anticipating', 'preceptor', 'ag

In [10]:
tokens=unique_tokens
#print(len(tokens))

# 3.Embedding with Gensim and Word2Vec

In [11]:
from gensim.models import Word2Vec

# Train a Word2Vec model
model = Word2Vec([tokens],compute_loss=True,vector_size=300,min_count=1)

# Save the model for later use
model.save("descartes_word2vec.model")

# 4.Model description

In [12]:
from IPython.display import display
import ipywidgets as widgets

# Load the model
model = Word2Vec.load("descartes_word2vec.model")

# Widget for the model attribute
attr_widget = widgets.Dropdown(
    options=['wv', 'vector_size', 'train_count', 'total_train_time', 'epochs','sg'],
    value='wv',
    description='Attribute:',
)
display(attr_widget)

# Widget for the number of lines
num_lines_widget = widgets.IntSlider(min=0, max=100, step=1, value=10, description='Lines:')
display(num_lines_widget)

# Button to display the data
display_button = widgets.Button(description='Display')
display(display_button)

# Function to display the data
def display_data(button):
    attr = attr_widget.value
    num_lines = num_lines_widget.value

    if attr == 'wv':
        words = list(model.wv.index_to_key)
        for word in words[:num_lines]:
            print(word, model.wv[word])
    else:
        print(getattr(model, attr))

# Link the function to the button
display_button.on_click(display_data)

Dropdown(description='Attribute:', options=('wv', 'vector_size', 'train_count', 'total_train_time', 'epochs', …

IntSlider(value=10, description='Lines:')

Button(description='Display', style=ButtonStyle())

## Accessing a word and a vector in the saved model

In [13]:
try:
    vector = model.wv['consciousness']
    print('Vector for "consciousness":', vector)
except KeyError:
    print('"consciousness" is not in the dictionary')

"consciousness" is not in the dictionary


In [14]:
try:
    vector = model.wv['conscious']
    print('Vector for "conscious":', vector)
except KeyError:
    print('"conscious" is not in the dictionary')

Vector for "conscious": [ 3.16101382e-03  1.94981709e-04 -2.63873138e-03 -1.93212670e-03
 -1.48083281e-03 -1.08521024e-04  1.24077284e-04 -5.02827694e-04
 -1.90547341e-03  2.59127258e-03 -8.75646831e-04 -2.53247353e-03
 -2.37995759e-03  9.25373286e-04 -2.67503643e-03 -3.75769741e-04
  2.22137920e-03  4.06547115e-05 -8.64673173e-04 -2.96606962e-03
 -1.72955776e-03 -2.92122201e-03 -1.83228974e-03  2.64889537e-03
 -2.39109853e-03 -2.86871777e-03  1.78468763e-03  5.75154729e-04
  2.00890517e-03  2.24741525e-03  1.83215621e-03 -2.93757534e-03
 -1.77681737e-03 -1.68577966e-03 -3.14994220e-04 -1.76216254e-03
  2.67186388e-03 -2.77142817e-06  3.68984911e-05  1.07263657e-03
 -3.17618065e-03 -1.45949330e-03 -3.17235431e-03 -7.13663583e-04
  5.32862672e-04 -3.21333227e-03  2.59234523e-03 -1.95570430e-03
  2.45319400e-03  2.89587188e-03 -8.53235542e-05 -2.11620936e-03
  1.30764383e-03  2.66564800e-03 -9.43589199e-04 -3.29051167e-03
 -1.96186104e-03 -2.23569688e-03  1.22390653e-03  8.17126245e-04
 

Most similar words

In [15]:
try:
    similar_words = model.wv.most_similar('conscious')
    print('Most similar words to "conscious":', similar_words)
except KeyError:
    print('"concious" is not in the dictionary')


Most similar words to "conscious": [('consists', 0.18727317452430725), ('mathematical', 0.17745932936668396), ('vegetative', 0.16930024325847626), ('loftiersuperstructure', 0.1682967245578766), ('theology', 0.16704244911670685), ('designate', 0.16656053066253662), ('living.for', 0.1654047667980194), ('tends', 0.16311410069465637), ('theprincipal', 0.1619839370250702), ('intense', 0.15588311851024628)]


# 6.Exploring Gensim's vector space

## The dictionary of words

In [16]:
for word, index in model.wv.key_to_index.items():
    print(f"Word: {word}, Index: {index}")

Word: bybeing, Index: 0
Word: werefrequently, Index: 1
Word: solong, Index: 2
Word: fixed, Index: 3
Word: ofman, Index: 4
Word: direction, Index: 5
Word: compass, Index: 6
Word: probable, Index: 7
Word: moral, Index: 8
Word: tosurmount, Index: 9
Word: distinction, Index: 10
Word: reasonwhich, Index: 11
Word: value, Index: 12
Word: recondite, Index: 13
Word: artifice, Index: 14
Word: manner, Index: 15
Word: denominated, Index: 16
Word: render, Index: 17
Word: undisturbed, Index: 18
Word: attentive, Index: 19
Word: returning, Index: 20
Word: represents, Index: 21
Word: througha, Index: 22
Word: twenty-three, Index: 23
Word: trunk, Index: 24
Word: seeknothing, Index: 25
Word: health, Index: 26
Word: undertake, Index: 27
Word: lay, Index: 28
Word: growing, Index: 29
Word: grandartery, Index: 30
Word: thequestion, Index: 31
Word: extricate, Index: 32
Word: movementsperformed, Index: 33
Word: bestow, Index: 34
Word: noreason, Index: 35
Word: themathematicians, Index: 36
Word: myown, Index: 3

## Pairs of words and cosine similarity

In [17]:
import numpy as np
from gensim import matutils
import pandas as pd

# Define list of words
words = ["method","reason","truth","rightly", "science","seeking"]

# Initialize list to store results
data = []

# Iterate over all pairs of words
for i in range(len(words)):
    for j in range(len(words)):  # changed this line
        word1 = words[i]
        word2 = words[j]

        # Ensure words are in the model's vocabulary
        if word1 not in model.wv or word2 not in model.wv:
            print(f"One or both words ('{word1}', '{word2}') are not in the model's vocabulary.")
            continue

        # Calculate cosine similarity
        vec1 = model.wv[word1]
        vec2 = model.wv[word2]
        similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))

        # Convert similarity to distance
        distance = 1 - similarity

        # Append to our results
        data.append({'word1': word1, 'word2': word2, 'distance': distance})

# Create DataFrame and display
df = pd.DataFrame(data)
display(df)

Unnamed: 0,word1,word2,distance
0,method,method,0.0
1,method,reason,0.990448
2,method,truth,0.871246
3,method,rightly,1.005896
4,method,science,0.982914
5,method,seeking,0.983999
6,reason,method,0.990448
7,reason,reason,0.0
8,reason,truth,1.115006
9,reason,rightly,1.003871


# 7.Tensorflow Projector

https://projector.tensorflow.org/

To visualize the embeddings with the TensorFlow Embedding Projector, you'll need to create two files: a vector file and a metadata file.

In [20]:
import csv
import os
import numpy as np

# Directory where you want to save the files
# LOG_DIR = '/content'
LOG_DIR = 'content'
os.makedirs(LOG_DIR, exist_ok=True)

# Get the words and vectors
words = list(model.wv.key_to_index.keys())
vectors = [model.wv[word] for word in words]

# Write the vectors to a .tsv file
with open(os.path.join(LOG_DIR, "vecs.tsv"), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(vectors)

# Write the labels (words) to a separate .tsv file
with open(os.path.join(LOG_DIR, "meta.tsv"), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows([[word] for word in words])  # No header row

In [21]:
!echo "Vectors file (vecs.tsv) size:"
!wc -l content/vecs.tsv
!echo "Metadata file (meta.tsv) size:"
!wc -l content/meta.tsv

Vectors file (vecs.tsv) size:
3843 content/vecs.tsv
Metadata file (meta.tsv) size:
3843 content/meta.tsv
