In [1]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents: a tiny four-sentence corpus used throughout this cell
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Preprocessing function (and the cached stop-word lookup it relies on)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built only once.

    The set is identical for every document, so it is created on the first
    call and memoised on this function object instead of being rebuilt each
    time a document is preprocessed. Loading stays lazy: nothing is read from
    the corpus until text is actually preprocessed.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Reduce a raw document to a space-separated string of content words.

    The text is split with ``word_tokenize``; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are dropped; the remaining
    tokens are lower-cased and English stop words are removed.

    Args:
        text: The raw document string.

    Returns:
        The surviving tokens joined by single spaces, or an empty string when
        no token survives the filters.
    """
    stop_words = _english_stop_words()

    # Keep alphabetic tokens only, lower-casing them so that capitalised
    # stop words such as "This" are caught by the membership test below.
    lowered = [token.lower() for token in word_tokenize(text) if token.isalpha()]

    # Remove stopwords
    content_words = [token for token in lowered if token not in stop_words]

    return ' '.join(content_words)

# Apply preprocessing to each document
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Create a TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the preprocessed documents and
# encode each document as one row of a sparse matrix
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Get the feature names (terms); they label the columns of the matrix
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse TF-IDF matrix to a dense array for inspection.
# toarray() yields a regular numpy.ndarray, whereas todense() yields a
# numpy.matrix, a class NumPy no longer recommends. The printed form is the
# same for both.
dense_matrix = tfidf_matrix.toarray()

# Display the preprocessed documents and the TF-IDF feature matrix
print("Preprocessed Documents:")
print(preprocessed_documents)
print("\nTF-IDF Feature Matrix:")
print(dense_matrix)


Preprocessed Documents:
['first document', 'document second document', 'third one', 'first document']

TF-IDF Feature Matrix:
[[0.62922751 0.77722116 0.         0.         0.        ]
 [0.78722298 0.         0.         0.61666846 0.        ]
 [0.         0.         0.70710678 0.         0.70710678]
 [0.62922751 0.77722116 0.         0.         0.        ]]
