In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Function to fetch HTML content from a URL
def get_html_from_url(url):
    response = requests.get(url)
    return response.text

In [19]:
url = 'https://qbi.uq.edu.au/brain/brain-anatomy/what-neuron'
html_content = get_html_from_url(url)

In [4]:
# Extract text from paragraphs
def extract_text_from_html(html):
    soup = BeautifulSoup(html_content, 'html.parser')
    paragraphs = soup.find_all('p')
    text = ' '.join([p.get_text() for p in paragraphs])
    return text

In [5]:
text = extract_text_from_html(html_content)
print(text)

Neurons (also called neurones or nerve cells) are the fundamental units of the brain and nervous system, the cells responsible for receiving sensory input from the external world, for sending motor commands to our muscles, and for transforming and relaying the electrical signals at every step in between. More than that, their interactions define who we are as people. Having said that, our roughly 100 billion neurons do interact closely with other cell types, broadly classified as glia (these may actually outnumber neurons, although it’s not really known). The creation of new neurons in the brain is called neurogenesis, and this can happen even in adults. A useful analogy is to think of a neuron as a tree. A neuron has three main parts: dendrites, an axon, and a cell body or soma (see image below), which can be represented as the branches, roots and trunk of a tree, respectively. A dendrite (tree branch) is where a neuron receives input from other cells. Dendrites branch as they move to

In [6]:
# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word,pos ='v') for word in tokens]
    return tokens



In [14]:
tokens = preprocess_text(text)
print(tokens)

['neurons', 'also', 'call', 'neurones', 'nerve', 'cells', 'fundamental', 'units', 'brain', 'nervous', 'system', 'cells', 'responsible', 'receive', 'sensory', 'input', 'external', 'world', 'send', 'motor', 'command', 'muscle', 'transform', 'relay', 'electrical', 'signal', 'every', 'step', 'interactions', 'define', 'people', 'say', 'roughly', 'billion', 'neurons', 'interact', 'closely', 'cell', 'type', 'broadly', 'classify', 'glia', 'may', 'actually', 'outnumber', 'neurons', 'although', 'really', 'know', 'creation', 'new', 'neurons', 'brain', 'call', 'neurogenesis', 'happen', 'even', 'adults', 'useful', 'analogy', 'think', 'neuron', 'tree', 'neuron', 'three', 'main', 'part', 'dendrites', 'axon', 'cell', 'body', 'soma', 'see', 'image', 'represent', 'branch', 'root', 'trunk', 'tree', 'respectively', 'dendrite', 'tree', 'branch', 'neuron', 'receive', 'input', 'cells', 'dendrites', 'branch', 'move', 'towards', 'tip', 'like', 'tree', 'branch', 'even', 'leaflike', 'structure', 'call', 'spin', 

In [15]:
# Function to get unique words
def get_unique_words(tokens):
    return set(tokens)


In [16]:
unique_words = get_unique_words(tokens)
print( unique_words)


{'synaptic', 'whether', 'cord', 'neurotransmitters', 'nucleus', 'give', 'happen', 'determine', 'command', 'neuron', 'body', 'sensory', 'entire', 'fundamental', 'dr', 'message', 'event', 'postsynaptic', 'neurones', 'project', 'units', 'receive', 'like', 'small', 'make', 'find', 'electrical', 'output', 'classify', 'creation', 'woodruff', 'billion', 'send', 'think', 'synapse', 'tree', 'accord', 'even', 'signal', 'house', 'initiation', 'say', 'analogy', 'active', 'tip', 'trunk', 'throughout', 'structure', 'root', 'long', 'transform', 'axons', 'divide', 'protrusions', 'spine', 'step', 'know', 'also', 'closely', 'nervous', 'potential', 'sum', 'dna', 'different', 'consequent', 'transmit', 'motor', 'muscle', 'responsible', 'external', 'image', 'see', 'interactions', 'dendrites', 'cells', 'transport', 'travel', 'help', 'release', 'may', 'part', 'main', 'communicate', 'transmitter', 'call', 'define', 'site', 'soma', 'proteins', 'orginate', 'leaflike', 'represent', 'respectively', 'axon', 'fire',

In [17]:
# delete tokens with length less than 3
last__= []
small_words = []

for token in list(unique_words):
    if len(token)>=3:
        last__.append (token)
    else:
        small_words.append(token)

print(last__)

['synaptic', 'whether', 'cord', 'neurotransmitters', 'nucleus', 'give', 'happen', 'determine', 'command', 'neuron', 'body', 'sensory', 'entire', 'fundamental', 'message', 'event', 'postsynaptic', 'neurones', 'project', 'units', 'receive', 'like', 'small', 'make', 'find', 'electrical', 'output', 'classify', 'creation', 'woodruff', 'billion', 'send', 'think', 'synapse', 'tree', 'accord', 'even', 'signal', 'house', 'initiation', 'say', 'analogy', 'active', 'tip', 'trunk', 'throughout', 'structure', 'root', 'long', 'transform', 'axons', 'divide', 'protrusions', 'spine', 'step', 'know', 'also', 'closely', 'nervous', 'potential', 'sum', 'dna', 'different', 'consequent', 'transmit', 'motor', 'muscle', 'responsible', 'external', 'image', 'see', 'interactions', 'dendrites', 'cells', 'transport', 'travel', 'help', 'release', 'may', 'part', 'main', 'communicate', 'transmitter', 'call', 'define', 'site', 'soma', 'proteins', 'orginate', 'leaflike', 'represent', 'respectively', 'axon', 'fire', 'anot

In [18]:
print(small_words)

['dr']
