In [11]:
# General Purpose Operations with Dataset.
import pandas as pd

# Sentiment Analysis.
import nltk

# Tokenization.
from nltk import word_tokenize, sent_tokenize

# Stopwords.
from nltk.corpus import stopwords

# Bag-Of-Words.
from sklearn.feature_extraction.text import CountVectorizer

# TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer

# Stemming.
from nltk.stem import PorterStemmer

# Lemmatization.
from nltk.stem.wordnet import WordNetLemmatizer

In [14]:
def tokenize(text: str) -> list:
    '''
    Tokenizes to words and while tokenizing, the algorithm removes the special characters.

    Parameters:
    text: str
        List of characters to tokenize.

    Returns:
    list
        Tokens have no special characters.
    '''

    # Remove Special Characters.
    text = ''.join(t for t in text if t.isalnum() or t == ' ')

    # Tokenizing.
    return nltk.word_tokenize(text)

def remove_stopwords(words: list) -> list:
    '''
    Removing the stopwords from the given set of words.

    Parameters:
    words: list
        List of words to remove the stopwords from.

    Returns:
    list
        List of words that do not contain any stopwords.
    '''

    return [word.lower() for word in words if word.lower() not in set(stopwords.words('English'))]

def extract_bag_of_words(words: list) -> tuple:
    '''
    Extracting Bag-Of-Words from the given `text`.

    Parameters:
    words: list
        List of `words` to extract the bag of words from.

    Returns:
    tuple
        1. Feature Names.
        2. Bag of Words.
    '''

    # Creating an instance of Count-Vectorizer.
    count = CountVectorizer()

    # Finding bag of words.
    bag_of_word = count.fit_transform(words)

    # Providing feature names along with the bag of words.
    return count.get_feature_names_out(), bag_of_word.toarray()

def find_tf_idf(words: str) -> tuple:
    '''
    Finds the TF-IDF values based on the given words.

    Parameters:
    words: list
        List of `words` to find the TF-IDF from.

    Returns:
    tuple
        1. Feature Matrix.
        2. TF-IDF Vocabulary.
    '''

    # Creating an instance of TF-IDF Vectorizer.
    tfidf = TfidfVectorizer()

    # Finding the TF-IDF Vectors.
    feature_matrix = tfidf.fit_transform(words)

    # Providing feature matrix and its frequency.
    return feature_matrix, tfidf.vocabulary_

def text_stemming(text: str) -> str:
    '''
    Stems the given `text`.

    Parameters:
    text: list
        List of characters to find the stemming text.

    Returns:
    str
        Filters the stemming text.
    '''

    return PorterStemmer().stem(text)

def text_lemmatization(text: str) -> str:
    '''
    Lemmatizes the given `text`.

    Parameters:
    text: list
        List of characters to find the stemming text.

    Returns:
    str
        Filters the lemmatized words.
    '''

    return WordNetLemmatizer().lemmatize(text)

In [15]:
# Unit Testing.
data = ['Feature Extraction aims to reduce the number of features in a dataset by creating new features from the existing ones (and then discarding the original features). These new reduced set of features should then be able to summarize most of the information contained in the original set of features!!!. In this way, a summarised version of the original features can be created from a combination of the original set!!!',
        'Another commonly used technique to reduce the number of features in a dataset is Feature Selection! The difference between Feature Selection and/or Feature Extraction is that feature selection aims instead to $ rank the importance of the existing features in the dataset and discard less important ones (no new features are created)?!. If you are interested in finding out more about Feature Selection, you can find more information about it in my previous article.',
        'In this article, I will walk you through how to apply Feature Extraction techniques using the Kaggle Mushroom Classification Dataset as an example??? Our objective will be to try to predict if a Mushroom is poisonous or not by looking at the given features. All the code used in this post (and more!) is available on Kaggle and on my GitHub Account.']

for d in data:
    # Tokenization.
    tokens = remove_stopwords(words = tokenize(text = d))

    # Stemming.
    print(text_stemming(text = d), '\n')

    # Lemmatization.
    print(text_lemmatization(text = d), '\n')

    # Bag of Words.
    print(extract_bag_of_words(words = tokens), '\n')

    # TF-IDF.
    print(find_tf_idf(words = tokens), '\n')

feature extraction aims to reduce the number of features in a dataset by creating new features from the existing ones (and then discarding the original features). these new reduced set of features should then be able to summarize most of the information contained in the original set of features!!!. in this way, a summarised version of the original features can be created from a combination of the original set!!! 

Feature Extraction aims to reduce the number of features in a dataset by creating new features from the existing ones (and then discarding the original features). These new reduced set of features should then be able to summarize most of the information contained in the original set of features!!!. In this way, a summarised version of the original features can be created from a combination of the original set!!! 

(array(['able', 'aims', 'combination', 'contained', 'created', 'creating',
       'dataset', 'discarding', 'existing', 'extraction', 'feature',
       'features', '