In [29]:
# Toy corpus: five one-sentence documents used below as the TF-IDF input.
# The sentences mix regular plurals (cats, companies, technologies),
# irregular plurals (mice, children) and inflected verbs (chasing, runs,
# swims, planted).
documents = [
    "The cats are chasing mice in the garden",
    "Children were playing games in the park",
    "She runs every morning and swims in the afternoon",
    "The companies are investing in new technologies",
    "Many trees were planted by volunteers last month",
]


In [59]:
import pandas as pd

In [61]:
import nltk

In [63]:
# Import specific NLTK modules
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.probability import FreqDist
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.util import ngrams

# Import scikit-learn text processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [65]:
# Helper function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_tag):
    """Translate an NLTK POS tag into the WordNet POS constant for lemmatizing.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    # Ordered (tag prefix, WordNet POS) pairs; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if nltk_tag.startswith(prefix):
            return wn_pos

    # No prefix matched: treat the token as a noun.
    return wordnet.NOUN


In [67]:
# Custom preprocessor function that lemmatizes text
def lemmatize_text(text):
    """Clean one raw document and return it as a string of lemmas.

    Written to be passed as the ``preprocessor`` of a scikit-learn
    vectorizer, which is why it returns a single string rather than a
    list of tokens.

    Steps: lowercase the text, replace every character that is not an
    ASCII letter or whitespace with a space, tokenize, POS-tag the tokens,
    then lemmatize each token with the WordNet POS that
    ``get_wordnet_pos`` derives from its tag.

    Relies on the module-level ``re`` and ``nltk`` imports being executed
    before the function is called.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; an empty string
        when no tokens are left after cleaning.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace special characters and digits with a space. Deleting them
    # outright would fuse words that are separated only by punctuation or
    # digits (e.g. "well-known" -> "wellknown", "cats,dogs" -> "catsdogs").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    if not tokens:
        # Empty or non-alphabetic document: nothing to tag or lemmatize.
        return ''

    # Get (word, tag) pairs for all tokens
    tagged_tokens = nltk.pos_tag(tokens)

    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lemmatize each token with its POS tag. The loop variable is called
    # `tag` so it does not reuse the name of the `pos_tag` function.
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_tokens
    ]

    # Return lemmatized text - needs to return a string for the vectorizer
    return ' '.join(lemmatized_tokens)


In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
import re

In [73]:
# Build a TF-IDF vectorizer that uses the lemmatizer as its preprocessing step.
# A custom `preprocessor` replaces scikit-learn's built-in lowercase /
# accent-stripping stage; tokenization, stop-word removal and n-gram
# generation still run afterwards on the lemmatized string.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=lemmatize_text,
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=10,     # keep only ten terms so the table stays readable
)

# Learn the vocabulary and IDF weights and vectorize the corpus in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Terms that made it into the vocabulary, in column order
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense, labelled view of the matrix: one row per document, one column per term
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=feature_names,
    index=[f"Document {n}" for n in range(1, len(documents) + 1)],
)

df_tfidf


Unnamed: 0,afternoon,mouse garden,new,new technology,park,plant,plant volunteer,play,play game,run morning
Document 1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document 2,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.57735,0.0
Document 3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
Document 4,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
Document 5,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0
