In [3]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources this notebook relies on. The calls come after
# the imports so that the import block itself has no side effects.
#   punkt_tab / punkt - tokenizer data for nltk.word_tokenize
#                       (NOTE: which of the two is loaded appears to depend on
#                       the installed NLTK version, so both are fetched)
#   stopwords         - word lists behind stopwords.words('english')
#   wordnet           - data used by WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# English stop words from the NLTK corpus, stored as a set for membership checks
stop_words = set(stopwords.words('english'))
# Shared WordNet lemmatizer instance reused by preprocess()
lemmatizer = WordNetLemmatizer()

# Define a function for preprocessing text
def preprocess(text):
    """Normalize a raw text string into a space-separated string of lemmas.

    The text is split with ``nltk.word_tokenize``. A token is kept only if it
    is purely alphabetic (``str.isalpha``) and its lowercase form is not in
    the module-level ``stop_words`` set. Each kept token is lowercased and
    passed to the module-level ``lemmatizer`` without a part-of-speech
    argument.

    Args:
        text: The raw document text.

    Returns:
        The surviving lemmas joined by single spaces, in their original order.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        # Skip punctuation, numbers and mixed tokens, as well as stop words
        if not token.isalpha() or lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(lemmas)

# Toy corpus: three short sentences that share part of their vocabulary
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The quick brown fox is quick.",
    "Jumping foxes are quick and can jump high."
]

# Clean every sentence with preprocess(), keeping the corpus order
processed_documents = list(map(preprocess, documents))

print(processed_documents)

['quick brown fox jump lazy dog', 'quick brown fox quick', 'jumping fox quick jump high']


In [6]:
# Stop words were already removed in preprocess(), so the vectorizer is
# created with its default settings (no stop_words option)
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the processed documents and build the sparse
# document-term TF-IDF matrix (one row per document)
tfidf_matrix = vectorizer.fit_transform(processed_documents)

# Feature names corresponding to the columns of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# Densify with toarray(), which returns a plain ndarray; todense() would
# return the legacy numpy.matrix type instead. The DataFrame is the same.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print(tfidf_df)

      brown       dog       fox     high      jump  jumping      lazy  \
0  0.387376  0.509353  0.300832  0.00000  0.387376  0.00000  0.509353   
1  0.499037  0.000000  0.387547  0.00000  0.000000  0.00000  0.000000   
2  0.000000  0.000000  0.326310  0.55249  0.420183  0.55249  0.000000   

      quick  
0  0.300832  
1  0.775093  
2  0.326310  
