In [1]:
# Simple Bag-of-Words implementation
from sklearn.feature_extraction.text import CountVectorizer

# Two-sentence toy corpus; the sentences differ only in "machine" vs "deep".
documents = [
    "I love machine learning",
    "I love deep learning",
]

# Learn the vocabulary and build the document-term count matrix in one pass.
# The pronoun "I" does not become a feature: CountVectorizer's default token
# pattern only keeps tokens of two or more characters.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Vocabulary (the column order of X), followed by the dense count matrix.
print(vectorizer.get_feature_names_out())
print(X.toarray())

['deep' 'learning' 'love' 'machine']
[[0 1 1 1]
 [1 1 1 0]]


In [3]:
import pandas as pd

In [4]:
# TF-IDF implementation
from sklearn.feature_extraction.text import TfidfVectorizer


# Three short documents with overlapping vocabulary.
documents = [
    "I love machine learning because machines can learn",
    "I love deep learning",
    "Deep learning is a subset of machine learning",
]

# Fit the TF-IDF model and keep the sparse document-term matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# One row per document ("Doc 1", "Doc 2", ...), one column per vocabulary term.
df_tfidf = pd.DataFrame(
    data=X.toarray(),
    index=[f"Doc {n}" for n, _ in enumerate(documents, start=1)],
    columns=feature_names,
)

# Bare last expression so the notebook renders the table.
df_tfidf

Unnamed: 0,because,can,deep,is,learn,learning,love,machine,machines,of,subset
Doc 1,0.426184,0.426184,0.0,0.0,0.426184,0.251711,0.324124,0.324124,0.426184,0.0,0.0
Doc 2,0.0,0.0,0.619805,0.0,0.0,0.481334,0.619805,0.0,0.0,0.0,0.0
Doc 3,0.0,0.0,0.322764,0.424396,0.0,0.50131,0.0,0.322764,0.0,0.424396,0.424396


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re  # for regular expressions
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [13]:
# Import specific NLTK modules
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.probability import FreqDist
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.util import ngrams


In [14]:
# Sample documents: five sentences containing inflected forms (plurals such as
# "cats" / "mice" / "companies", verb forms such as "chasing" / "runs" /
# "planted") for the lemmatizing preprocessor defined below to normalize.
documents = [
    "The cats are chasing mice in the garden",
    "Children were playing games in the park",
    "She runs every morning and swims in the afternoon",
    "The companies are investing in new technologies",
    "Many trees were planted by volunteers last month"
]

# Helper function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.

    Args:
        nltk_tag: POS tag string, e.g. the second item of a pair produced by
            ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    # Anything unrecognised is lemmatized as a noun.
    return wordnet.NOUN

# Custom preprocessor function that lemmatizes text
def lemmatize_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    Pipeline: lowercase -> delete every character that is not an ASCII letter
    or whitespace -> tokenize -> POS-tag -> lemmatize each token using the
    WordNet POS derived from its tag.

    Args:
        text: Raw document string.

    Returns:
        A single string with the lemmatized tokens joined by one space; the
        vectorizer that uses this as its preprocessor needs a string back.

    Note:
        Relies on NLTK's tokenizer, tagger and WordNet data being available;
        this function does not download them.
    """
    # Lowercase first, then delete (not space-replace) digits and punctuation.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenize, then tag the whole token sequence in a single call.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(letters_only))

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in tagged_tokens:
        # Map the tagger's label to WordNet's POS so verbs, adjectives and
        # adverbs are not all lemmatized as nouns.
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

    # Hand a plain string (not a token list) back to the vectorizer.
    return ' '.join(lemmas)

# TF-IDF vectorizer whose preprocessor lemmatizes each document first.
# NOTE(review): max_features keeps the terms ranked highest by corpus-wide
# count; in a corpus this small most terms appear to occur only once, so
# which ten survive may be arbitrary - confirm this is acceptable for the demo.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=lemmatize_text,
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=10,     # cap the vocabulary so the table stays readable
)

# Learn vocabulary + IDF weights and produce the sparse document-term matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column labels, in matrix column order.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense, labelled view: rows "Document 1".."Document N", columns are the terms.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=[f"Document {n}" for n, _ in enumerate(documents, start=1)],
    columns=feature_names,
)

# Bare last expression so the notebook renders the table.
df_tfidf



Unnamed: 0,afternoon,mouse garden,new,new technology,park,plant,plant volunteer,play,play game,run morning
Document 1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document 2,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.57735,0.0
Document 3,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107
Document 4,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
Document 5,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0
