In [58]:
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Define preprocess functions

In [61]:

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))

    text = text.lower()      # Convert to lowercase
    text = re.sub(r'[^\w\s]', 'SKIP', text)   # Remove punctuation and special characters
    if text in stop_words:
        text = 'SKIP'
    return text




In [111]:
sentences = brown.sents()   #list of sentences, each sentence is a list of words

flattened_list = [word for sentence in sentences for word in sentence]
print(f'Number of Tokens before processing: {len(flattened_list):,}')
print()

num_tokens = 0
processed_sents = []
for sent in sentences:
   p_sentence = []
   for word in sent:
     word = preprocess_text(word)
     if word != 'SKIP':
        p_sentence.append(word)
   processed_sents.append(p_sentence)
   num_tokens += len(p_sentence)

print(f'Number of Tokens after processing: {num_tokens:,}')
print(f'Process Sentence Example:')
for i in processed_sents[:3]:
    print(i)


Number of Tokens before processing: 1,161,192

Number of Tokens after processing: 687,794
Process Sentence Example:
['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'atlantas', 'recent', 'primary', 'election', 'produced', '', 'evidence', '', 'irregularities', 'took', 'place', '']
['jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', '', 'overall', 'charge', 'election', '', '', 'deserves', 'praise', 'thanks', 'city', 'atlanta', '', 'manner', 'election', 'conducted', '']
['septemberoctober', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', '', 'irregularities', '', 'hardfought', 'primary', 'mayornominate', 'ivan', 'allen', 'jr', '']


In [112]:
print(sentences[1])
print(processed_sents[1])

['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', '', 'overall', 'charge', 'election', '', '', 'deserves', 'praise', 'thanks', 'city', 'atlanta', '', 'manner', 'election', 'conducted', '']


## Import Word2Vec Model

In [70]:
model = gensim.models.Word2Vec (
    vector_size=100,    # Number of features in word vector

    window=10,   # Context window size (in each direction). Default is 5


    min_count=5, # Words must appear this many times to be in vocab.
                 #   Default is 5

    workers=10,  # Training thread count

    sg=1,        # 0: CBOW, 1: Skip-gram.

    hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
                 #   Default is 0, NS

    negative=5   # Nmber of negative samples
                 #   Default is 5
)

### Build Vocabulary

In [71]:
model.build_vocab(
    processed_sents,
    progress_per=20000  # Tweaks how often progress is reported

)

#### Showcase sample word vector and results layout

In [82]:
word_vectors = model.wv
vector = word_vectors['uncertain']

print('Vector for Uncertain:')
print(f'Length of vector: {len(vector)}')
print(f'{vector}')

Vector for Uncertain:
Length of vector: 100
[-1.3875127e-03 -8.1217596e-03 -4.2979526e-03  5.8132838e-03
 -2.4545563e-03  5.9709321e-03 -7.2090640e-03  7.6677990e-03
  9.2748285e-04  2.6469468e-04  2.2181988e-04  6.4045666e-03
 -4.3719206e-03  2.5470508e-03 -2.4282443e-03  9.7007211e-03
  9.2004649e-03  1.6240835e-04  8.5140290e-03  7.6306891e-03
  8.9184018e-03 -8.3793399e-05  6.6874959e-03 -6.7278384e-03
  3.7678909e-03  8.1698978e-03  6.1578345e-03  4.4980170e-03
  4.6713445e-03 -4.9447883e-03 -7.0395377e-03  1.1095536e-03
  1.2271714e-03 -9.3590729e-03  6.8509304e-03 -8.3757816e-03
  2.0340241e-03  4.6982528e-03  9.1343792e-03  5.8620260e-03
  5.0437334e-04 -1.6658187e-04  3.5977876e-03  9.3553634e-03
  6.6908384e-03  6.3942005e-03 -8.5568856e-03 -9.6110357e-03
 -8.0016017e-04  1.4892805e-03 -7.2910904e-04 -4.7898972e-03
  3.7911772e-03  5.1792930e-03 -9.5618665e-03  1.2392449e-03
  3.8865351e-03 -4.6941852e-03 -1.4235448e-03  2.7174198e-03
 -9.4392328e-03 -2.0232988e-03  1.8521762

In [94]:
vocab = model.wv.key_to_index.keys()
vocab_list = []
for i in vocab:
    vocab_list.append(i)

print(f'Length of Vocab List: {len(vocab_list):,}')
print(vocab_list[20:40])


Length of Vocab List: 15,173
['on', 'be', ';', 'I', 'by', 'had', 'at', '?', 'not', 'are', 'from', 'or', 'this', 'have', 'an', 'which', '--', 'were', 'but', 'He']


In [30]:

for i in range(20:40):
    # Pick a random word.
    word = vocab[i]
    count = model.wv.get_vecattr(word, 'count')
    word_counts.append((word, count))


df = pd.DataFrame(word_counts, columns=['Word', 'Count'])
display(df)

Unnamed: 0,Word,Count
0,geographic,6
1,singer,10
2,beds,12
3,easily,107
4,camera,36
5,yow,5
6,mounts,8
7,duty,61
8,urgently,6
9,st,164


In [95]:
print('Training the model...')

model.train(
    processed_sents,
    total_examples=len(processed_sents),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)

print('  Done.')
print('')

Training the model...
  Done.



In [96]:
similar_words = model.wv.most_similar('uncertain', topn=10)

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    print(f"{word}: Similarity = {similarity:.4f}")

submitting: Similarity = 0.8028
utopian: Similarity = 0.8014
misleading: Similarity = 0.7995
catharsis: Similarity = 0.7920
cliche: Similarity = 0.7907
conclude: Similarity = 0.7903
reproduce: Similarity = 0.7857
decisively: Similarity = 0.7846
anymore: Similarity = 0.7838
alienation: Similarity = 0.7787
