In [1]:
# Imports

import requests

from bs4 import BeautifulSoup

import time

import unicodedata

import string

import contractions

import nltk

import gensim

## Scraping Random Wikipedia URLs

In [2]:
# Number of random articles to scrape
article_count = 150

# Loop to iterate over random articles
for i in range(article_count):

 # Send a GET request to the URL
    response = requests.get('https://en.wikipedia.org/wiki/Special:Random')

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    

    # Nested loop for saving paragraph data from each article to a file
    # Extract the text content from the article
    text_content = ''
    for paragraph in soup.find_all('p'):
        text_content += paragraph.get_text() + '\n'

        # Write the text content to the file
        # Using 'a' too append text from each iteration of the loop
        with open('dataset.txt', 'a', encoding='utf-8') as file:
            file.write(text_content.strip())

    # Adding a delay to the end of the loop so that Wikipedia won't kick me 
    time.sleep(.5)


## Data Pre-Processing

In [3]:
# Downloading a list of stopwords
# nltk.download("stopwords")

# **IMPORTANT** Un-hash the previous line if nltk.download("stopwords") has not previously been run

# Defining a set where each element is a stopword to include in the following translation table
stops = set(nltk.corpus.stopwords.words('english'))

In [4]:
# Defining a function to normalize text
def convert_utf(text):
    text = text.replace('\u2018', "'").replace('\u2019', "'").replace('\u201C', "`").replace('\u201D', "`").replace('\u2013', '-').replace('\u2014', '-')
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore')
    return text.decode('ascii')

In [5]:
# Defining a wider function to turn text into training-ready data 

def clean(data):
    # Converting utf-8 characters to normal characters using previously defined function
    data = convert_utf(data)
    
    # Lowercasing the text data
    data = data.lower()

    # Expanding any contractions
    data = contractions.fix(data)
    
    # Tokenizing the text data by sentence
    sentences = nltk.sent_tokenize(data)

    # Defining an empty list + for loop to append word-level tokens into
    tokenized_data = []

    for sentence in sentences:
        
        # Using word_tokenize() to break up the sentence level tokens into word level tokens
        words = nltk.word_tokenize(sentence)
        
        # Defining another empty list to append non-stopword tokens into. This is to keep track of sentences
        cleaned_sentences = []
        

        # Marking each token as not containing punctuation by default
        for token in words:
            contains_punc = False

            # Iterating over each character in the word to check if any characters are punctuation marks
            for character in token:
                if character in string.punctuation:
                    contains_punc = True
        
                    # Ending the for loop once punctuation is found for optimization
                    break
            
            # Skipping current token if it contains punctuation so as to not append 
            if contains_punc:
                continue

            # Referencing previously defined set of stopwords and a string containing all punctuation marks
            if token not in stops:
                cleaned_sentences.append(token)
                # print(token)
        
        tokenized_data.append(cleaned_sentences)
    
    return tokenized_data

In [6]:
# Loading the written file

with open("./dataset.txt", "r", encoding="utf8") as file:
    data = file.read()

# Cleaning the text with the previously defined function
data0 = clean(data)

## Training


In [7]:
# Creating an empty model

model = gensim.models.Word2Vec(vector_size=500, min_count=1, sg=0)
model.save("./model")

In [8]:
# Training a model with the preprocessed data

model.build_vocab(data0, update=False)
model.train(data0, total_examples=model.corpus_count, epochs=model.epochs)
model.save('./model')

## Testing Inference 

In [9]:
# Load model.
model = gensim.models.Word2Vec.load("./model")

In [10]:
# word_vec = model.wv["long"]

# opposite_word = model.wv.most_similar(negative=[word_vec], topn=5)
# print(opposite_word)

## Adding Additional Data

The model is not performing well, so I am adding full books from Project Gutenberg.

Note that the link to each book is strange, as the url indicates a page number, but the page delivers error 404 unless the two numbers are the same

In [11]:
# Iterating over links to full books on Project Gutenberg
for i in range(50):
      request = requests.get(f'https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt')
      
      # Writing the text content to a file
      # Using 'a' too append text from each iteration of the loop, so all 100 books are in 1 file
      with open('gutenberg_dataset.txt', 'a', encoding='utf-8') as file:
                  file.write(request.text.strip())


In [12]:
# Loading the newly written file
with open("./gutenberg_dataset.txt", "r", encoding="utf8") as file:
    data1 = file.read()

# Cleaning the text with the previously defined function
data1 = clean(data1)

In [13]:
# Re-training the model with the new data

# Setting update=True so that the new data is added to the model's current vocabulary, rather than overwriting it
model.build_vocab(data1, update=True)
model.train(data1, total_examples=model.corpus_count, epochs=model.epochs)
model.save('./model')

In [35]:
reference_pair = ("full", "empty")
target_word = "small"
result_vector = model.wv[target_word] - model.wv[reference_pair[0]] + model.wv[reference_pair[1]]
opposite_words = model.wv.similar_by_vector(result_vector, topn=5)

print(opposite_words)

[('small', 0.6334516406059265), ('communities', 0.48378485441207886), ('elsewhere', 0.47936201095581055), ('larger', 0.4593963921070099), ('indefinite', 0.39229339361190796)]
