In [2]:
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import string
# nltk.download('punkt') # Download the 'punkt' tokenizer
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
import numpy as np
import gensim
import re
import pandas as pd
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Define preprocess functions

In [21]:

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))

    text = text.lower()      # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)   # Remove punctuation and special characters
    if text in stop_words:                    #remove stop-words based on nltk set
        text = 'SKIP'
    elif text == '':
        text = 'SKIP'
    return text

### Download and Process Brown Corpus

In [22]:
sentences = brown.sents()   #list of sentences, each sentence is a list of words

flattened_list = [word for sentence in sentences for word in sentence]
print(f'Number of Tokens before processing: {len(flattened_list):,}')
print()

num_tokens = 0
processed_sents = []
for sent in sentences:
   p_sentence = []
   for word in sent:
     word = preprocess_text(word)
     if word != 'SKIP':
        p_sentence.append(word)
   processed_sents.append(p_sentence)
   num_tokens += len(p_sentence)

print(f'Number of Tokens after processing: {num_tokens:,}')
print(f'Process Sentence Example:')
for i in processed_sents[:3]:
    print(i)


Number of Tokens before processing: 1,161,192

Number of Tokens after processing: 539,921
Process Sentence Example:
['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'atlantas', 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place']
['jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', 'overall', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted']
['septemberoctober', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', 'irregularities', 'hardfought', 'primary', 'mayornominate', 'ivan', 'allen', 'jr']


In [23]:
#check processing
print(sentences[1])
print(processed_sents[1])

['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', 'overall', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted']


## Import Word2Vec Model

In [24]:
model = gensim.models.Word2Vec (
    vector_size=100,    # Number of features in word vector

    window=10,   # Context window size (in each direction). Default is 5


    min_count=5, # Words must appear this many times to be in vocab.
                 #   Default is 5

    workers=10,  # Training thread count

    sg=1,        # 0: CBOW, 1: Skip-gram.

    hs=0,        # 0: Negative Sampling, 1: Hierarchical Softmax
                 #   Default is 0, NS

    negative=5   # Nmber of negative samples
                 #   Default is 5
)

### Build Vocabulary

In [25]:
model.build_vocab(
    processed_sents,
    progress_per=20000  # Tweaks how often progress is reported

)

### Train the Model

In [26]:
print('Training the model...')

model.train(
    processed_sents,
    total_examples=len(processed_sents),
    epochs=10,        # How many training passes to take.
    report_delay=10.0 # Report progress every 10 seconds.
)

print('  Done.')
print('')

Training the model...
  Done.



#### Showcase sample word vector and results layout

In [27]:
word_vectors = model.wv
vector = word_vectors['uncertain']

print('Vector for Uncertain:')
print(f'Length of vector: {len(vector)}')
print(f'{vector}')

Vector for Uncertain:
Length of vector: 100
[-0.3484722   0.10163841 -0.01252631  0.01786912  0.17679088 -0.30190015
 -0.26260594  0.10047907 -0.07748776  0.09296066 -0.18984024 -0.12112519
 -0.19701016 -0.22183388  0.2633721  -0.20558923  0.28330415 -0.16902506
  0.04455652 -0.38614827 -0.02205837  0.12179832  0.18581161 -0.08096561
  0.12242373  0.04912557 -0.11092206  0.22599871 -0.17253287  0.29855505
  0.3332483   0.0661625  -0.0473337   0.00911532  0.06316879  0.06293564
  0.02696742 -0.0372304  -0.06809297 -0.19113077 -0.05350975  0.00749517
 -0.03151321 -0.09120497  0.21621387 -0.13368987 -0.04868797  0.03340903
 -0.00315683  0.11653571  0.07077207 -0.05241055 -0.21221593  0.09143794
 -0.28332743  0.22272316  0.22338995  0.03178673 -0.30422604  0.28374264
 -0.00108746  0.07090506  0.11222957 -0.04180789 -0.2734059   0.31775865
  0.15461881 -0.08890928 -0.04745176  0.23896833  0.05201704 -0.00560736
  0.24608867 -0.13530016  0.19973059  0.2777233   0.09203767 -0.03666746
 -0.287

In [28]:
vocab = model.wv.key_to_index.keys()
vocab_list = []
for i in vocab:
    vocab_list.append(i)

print(f'Length of Vocab List: {len(vocab_list):,}')
print(vocab_list[20:40])


Length of Vocab List: 14,046
['much', 'way', 'people', 'mr', 'us', 'little', 'state', 'good', 'make', 'world', 'still', 'see', 'men', 'work', 'long', 'get', 'life', 'never', 'day', 'another']


In [29]:
word_counts = []

for i in range(20,40):
    # Pick a random word.
    word = vocab_list[i]
    count = model.wv.get_vecattr(word, 'count')
    word_counts.append((word, count))


df = pd.DataFrame(word_counts, columns=['Word', 'Count'])
display(df)

Unnamed: 0,Word,Count
0,much,937
1,way,909
2,people,847
3,mr,844
4,us,838
5,little,831
6,state,807
7,good,806
8,make,794
9,world,787


In [30]:
similar_words = model.wv.most_similar('uncertain', topn=10)

# Print the most similar words and their similarity scores
for word, similarity in similar_words:
    print(f"{word}: Similarity = {similarity:.4f}")

positivist: Similarity = 0.8871
authenticity: Similarity = 0.8681
incorrect: Similarity = 0.8680
urges: Similarity = 0.8644
experimenter: Similarity = 0.8608
catharsis: Similarity = 0.8605
uniqueness: Similarity = 0.8587
mediums: Similarity = 0.8556
comprehend: Similarity = 0.8552
gabriels: Similarity = 0.8537
