### Library Imports:

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

### NLTK Data Downloads:

In [2]:
# Fetch the NLTK resources the preprocessing pipeline relies on:
# tokenizer models, the English stop-word list, and the WordNet database.
nltk.download("punkt")
# Recent NLTK releases (3.9 and later) load word_tokenize's models from
# "punkt_tab" instead of the pickled "punkt" package; without it a fresh
# environment fails with a LookupError at tokenization time.
nltk.download("punkt_tab")
nltk.download("stopwords")
# Kept as the last expression so the cell still displays the download status.
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Corpus Definition:

The cell below defines a sample corpus (a collection of documents) containing 10 sentences.

In [3]:
# Sample corpus: ten short English sentences, one document per list entry.
# Several entries contain capitals, trailing periods and hyphenated words,
# which the preprocessing step has to deal with.
corpus = [
    "Artificial Intelligence is the future.",
    "AI is changing the world.",
    "AI is a branch of computer science.",
    "Machine learning is a subset of AI.",
    "Deep learning models require large amounts of data.",
    "Natural Language Processing helps computers understand human language.",
    "AI can improve decision-making in many industries.",
    "Ethics in AI is an important area of research.",
    "Computer vision allows machines to interpret visual information.",
    "AI-powered assistants are becoming increasingly common."
]

### Text Preprocessing Pipeline:

In [4]:
def preprocess(text):
    """Normalize one raw document into a space-joined string of lemmas.

    The steps run in this order: lowercase, delete punctuation, delete
    digits, tokenize, drop English stop words, lemmatize, re-join.

    Note:
        Punctuation is deleted rather than replaced by a space, so
        hyphenated words collapse into a single token
        (e.g. "decision-making" becomes "decisionmaking").

    Args:
        text: The raw document as a string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words("english") inside the filter below would rebuild the
    # list for every token and test membership against a list.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Lowercasing
    text_low = text.lower()
    # Punctuation removal (anything that is neither a word character nor whitespace)
    text_rp = re.sub(r'[^\w\s]', '', text_low)
    # Number removal
    text_rp2 = re.sub(r'\d+', '', text_rp)
    # Tokenization
    tokens = word_tokenize(text_rp2)
    # Stop word removal
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization (no part-of-speech tag is passed to lemmatize())
    lemmatize_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatize_tokens)  # Rejoining

#### Corpus Preprocessing:

In [5]:
# Run every raw document through the cleaning pipeline, keeping corpus order.
preprocessed_corpus = list(map(preprocess, corpus))

# Show the cleaned documents, one per line, under a heading.
print("Preprocessed Corpus:")
for text in preprocessed_corpus:
    print(text)

Preprocessed Corpus:
artificial intelligence future
ai changing world
ai branch computer science
machine learning subset ai
deep learning model require large amount data
natural language processing help computer understand human language
ai improve decisionmaking many industry
ethic ai important area research
computer vision allows machine interpret visual information
aipowered assistant becoming increasingly common


### TF-IDF Vectorizer:

In [6]:
# TF-IDF vectorizer with all default settings.
vectorizer = TfidfVectorizer()
# Fit on the cleaned documents and keep the resulting document-term matrix;
# later cells read both `vectorizer` (feature names) and `X` (weights).
X = vectorizer.fit_transform(preprocessed_corpus)

In [7]:
# Display the terms the fitted vectorizer uses as columns; the bare last
# expression is rendered as the cell output.
print("\nTF-IDF Feature Names:")
vectorizer.get_feature_names_out()


TF-IDF Feature Names:


array(['ai', 'aipowered', 'allows', 'amount', 'area', 'artificial',
       'assistant', 'becoming', 'branch', 'changing', 'common',
       'computer', 'data', 'decisionmaking', 'deep', 'ethic', 'future',
       'help', 'human', 'important', 'improve', 'increasingly',
       'industry', 'information', 'intelligence', 'interpret', 'language',
       'large', 'learning', 'machine', 'many', 'model', 'natural',
       'processing', 'require', 'research', 'science', 'subset',
       'understand', 'vision', 'visual', 'world'], dtype=object)

In [8]:
# Display the TF-IDF weights as a dense array: one row per document, with
# columns in the same order as the feature names shown above.
print("\nTF-IDF Array:")
X.toarray()


TF-IDF Array:


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.38714999, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.6519643 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.     