Vectorization

###### Vectorization is the process of converting text into numerical data that machine learning models can process. In natural language processing (NLP), this typically means representing each word or document as a vector of numbers, for example with the one-hot and count-based encodings shown below.

In [1]:
import nltk
# Five short English sentences used as the toy corpus for the preprocessing
# and vectorization steps.
sample_sentences = [
    "I love programming.",
    "The sun is shining today.",
    "She sings beautifully.",
    "The cat is sleeping.",
    "He enjoys playing video games.",
]

In [24]:
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ne_chunk
import string

# Text preprocessing, part 1: tokenization, lemmatization and stop-word removal.
# A single WordNetLemmatizer instance is created once and reused for every token.
lemmatizer = WordNetLemmatizer()

def word_tokenize_sentence(sentence):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for *sentence*."""
    return word_tokenize(sentence)

# Tokenize every sample sentence and lemmatize each resulting token.
# lemmatize() is called with the token only (no part-of-speech argument).
lemmatized_sentences = [
    [lemmatizer.lemmatize(word) for word in word_tokenize_sentence(text)]
    for text in sample_sentences
]

# Drop English stop words. Each token is lowercased before the lookup, so the
# match is case-insensitive while the kept tokens retain their original case.
stop_words = set(stopwords.words('english'))
filtered_sentences = [
    [word for word in words if word.lower() not in stop_words]
    for words in lemmatized_sentences
]

# Show the tokens that survive stop-word removal, one list per sentence.
print(filtered_sentences)
def remove_named_entities(sentence):
    """Return the tokens of *sentence* that are not part of a named entity.

    The sentence is tokenized, POS-tagged and chunked with
    ``ne_chunk(..., binary=True)``. The chunker returns a tree whose children
    are either plain ``(token, pos)`` tuples or subtrees grouping the tokens
    of one named entity; only tokens outside those subtrees are kept.

    Args:
        sentence: The text to process, as a single string.

    Returns:
        A list of token strings, in their original order, with every token
        that belongs to a named-entity chunk removed.
    """
    tokens = word_tokenize_sentence(sentence)
    tagged_tokens = nltk.pos_tag(tokens)
    chunk_tree = ne_chunk(tagged_tokens, binary=True)
    # Named entities appear as nltk.Tree children, never as a POS tag equal to
    # 'NE'. Unpacking every child as (token, pos) therefore either kept the
    # entity (as tuples) or raised ValueError; skip the subtrees explicitly.
    return [
        child[0] for child in chunk_tree if not isinstance(child, nltk.Tree)
    ]

# Run named-entity removal on every stop-word-filtered sentence. Each token
# list is joined back into one string because remove_named_entities takes text
# and tokenizes it again itself.
filtered_sentences_without_entities = [
    remove_named_entities(' '.join(words)) for words in filtered_sentences
]

# Show the tokens that remain after named-entity removal.
print(filtered_sentences_without_entities)




[['love', 'programming', '.'], ['sun', 'shining', 'today', '.'], ['sings', 'beautifully', '.'], ['cat', 'sleeping', '.'], ['enjoys', 'playing', 'video', 'game', '.']]
[['love', 'programming', '.'], ['sun', 'shining', 'today', '.'], ['sings', 'beautifully', '.'], ['cat', 'sleeping', '.'], ['enjoys', 'playing', 'video', 'game', '.']]


In [31]:
# Strip punctuation from every sentence: join the tokens into one string,
# delete all characters listed in string.punctuation, then split on whitespace
# again so tokens that consisted only of punctuation disappear.
_punctuation_table = str.maketrans('', '', string.punctuation)
filtered_sentences_without_punctuation = [
    ' '.join(words).translate(_punctuation_table).split()
    for words in filtered_sentences_without_entities
]

# Show the punctuation-free token lists.
print(filtered_sentences_without_punctuation)

[['love', 'programming'], ['sun', 'shining', 'today'], ['sings', 'beautifully'], ['cat', 'sleeping'], ['enjoys', 'playing', 'video', 'game']]


In [35]:
from sklearn.preprocessing import OneHotEncoder

# Flatten the per-sentence token lists into a list of one-element rows, so
# that every word occurrence becomes its own sample for the encoder.
filtered_sentences_2d = [
    [word]
    for words in filtered_sentences_without_punctuation
    for word in words
]

# Fit a one-hot encoder on those words and convert the result with toarray().
encoder = OneHotEncoder()
encoded_sentences = encoder.fit_transform(filtered_sentences_2d).toarray()

# Each printed row is the one-hot vector of a single word occurrence
# (not of a whole sentence), in the order the words were flattened above.
print(encoded_sentences)

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [37]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer works on strings, so rebuild one space-separated document
# per sentence from the cleaned token lists.
joined_sentences = [
    ' '.join(words) for words in filtered_sentences_without_punctuation
]

# Learn the vocabulary and produce the count matrix in one step.
vectorizer = CountVectorizer()
count_vectors = vectorizer.fit_transform(joined_sentences)

# Show the counts as an array with one row per sentence.
print(count_vectors.toarray())

[[0 0 0 0 1 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0]
 [1 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 1 0 1 0 0 0 0 0 0 1]]
