In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd

In [2]:
# Toy corpus: three short sentences used throughout the notebook
corpus = [
    "I love machine learning and deep learning",
    "Machine learning is fascinating and powerful",
    "Deep learning is a subset of machine learning",
]

# Strip English stop words from every sentence. Membership is checked on the
# lower-cased token, but the tokens that survive keep their original casing.
texts = [
    " ".join(
        token
        for token in sentence.split()
        if token.lower() not in ENGLISH_STOP_WORDS
    )
    for sentence in corpus
]

print("Clean Corpus:")
texts

Clean Corpus:


['love machine learning deep learning',
 'Machine learning fascinating powerful',
 'Deep learning subset machine learning']

In [3]:
# Build a TF-IDF vectorizer with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the cleaned texts and encode
# them as a sparse document-term matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Terms of the fitted vocabulary, used later as the column labels
feature_names = tfidf_vectorizer.get_feature_names_out()

# Densify for display. toarray() yields a plain numpy.ndarray, whereas
# todense() returns the legacy numpy.matrix type, which NumPy discourages.
tfidf_dense = tfidf_matrix.toarray()

In [4]:
# 2. Displaying the results

# Vocabulary - prints each term next to its column index in the TF-IDF matrix,
# ordered by that index. vocabulary_ maps term -> index, so every item
# unpacks as (term, column_index).
print("\nVocabulary:")
for term, column_index in sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda item: item[1]):
    print(f"{column_index}: {term}")


Vocabulary:
0: deep
1: fascinating
2: learning
3: love
4: machine
5: powerful
6: subset


In [5]:
# Feature names - the terms that label the columns of the TF-IDF matrix
print("\nFeature Names:", feature_names, sep="\n")


Feature Names:
['deep' 'fascinating' 'learning' 'love' 'machine' 'powerful' 'subset']


In [6]:
# Display the stop-word-filtered corpus (the `texts` list) again
texts

['love machine learning deep learning',
 'Machine learning fascinating powerful',
 'Deep learning subset machine learning']

In [7]:
# Tabulate the dense TF-IDF weights, labelling the columns with the feature names
pd.DataFrame(data=tfidf_dense, columns=feature_names)

Unnamed: 0,deep,fascinating,learning,love,machine,powerful,subset
0,0.417233,0.0,0.648038,0.548612,0.324019,0.0,0.0
1,0.0,0.608845,0.359594,0.0,0.359594,0.608845,0.0
2,0.417233,0.0,0.648038,0.0,0.324019,0.0,0.548612
