# TF-IDF and N-Gram Practical Implementation


In [1]:
# Install scikit-learn if not already
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Import required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

### Sample data

In [3]:
# Toy corpus: one sentence per line, each line becomes one document.
# All three sentences contain the phrase "data science".
documents = """\
Data science is fun
I love learning data science
Machine learning is a part of data science
""".splitlines()

### TF-IDF Vectorization (Unigrams)

In [4]:
# Default settings: single-word (unigram) features only. The vectorizer
# lowercases the text and its default tokenizer skips one-character tokens,
# which is why "I" and "a" do not appear as columns in the table below.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)  # sparse matrix, one row per document

# Densify and label the columns with the learned vocabulary so the weights
# can be read as a table.
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,data,fun,is,learning,love,machine,of,part,science
0,0.391484,0.66284,0.504107,0.0,0.0,0.0,0.0,0.0,0.391484
1,0.391484,0.0,0.0,0.504107,0.66284,0.0,0.0,0.0,0.391484
2,0.268062,0.0,0.345179,0.345179,0.0,0.453868,0.453868,0.453868,0.268062


### TF-IDF Vectorization with Bigrams (2-grams)

In [5]:
# ngram_range=(2, 2) keeps only pairs of adjacent tokens; single words are
# not emitted as features.
bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
X2 = bigram_vectorizer.fit_transform(documents)

# One row per document, one column per bigram found in the corpus.
df2 = pd.DataFrame(
    data=X2.toarray(),
    columns=bigram_vectorizer.get_feature_names_out(),
)
df2

Unnamed: 0,data science,is fun,is part,learning data,learning is,love learning,machine learning,of data,part of,science is
0,0.385372,0.652491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.652491
1,0.385372,0.0,0.0,0.652491,0.0,0.652491,0.0,0.0,0.0,0.0
2,0.255374,0.0,0.432385,0.0,0.432385,0.0,0.432385,0.432385,0.432385,0.0


### Custom Example 1: Unigram + Bigram Combined

In [6]:
# ngram_range=(1, 2) emits both single words and adjacent word pairs, so the
# vocabulary is the union of the unigram and bigram features.
combined_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X3 = combined_vectorizer.fit_transform(documents)

# Tabular view: rows are documents, columns are the unigram/bigram terms.
df3 = pd.DataFrame(
    data=X3.toarray(),
    columns=combined_vectorizer.get_feature_names_out(),
)
df3

Unnamed: 0,data,data science,fun,is,is fun,is part,learning,learning data,learning is,love,love learning,machine,machine learning,of,of data,part,part of,science,science is
0,0.274634,0.274634,0.464997,0.353642,0.464997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.274634,0.464997
1,0.274634,0.274634,0.0,0.0,0.0,0.0,0.353642,0.464997,0.0,0.464997,0.464997,0.0,0.0,0.0,0.0,0.0,0.0,0.274634,0.0
2,0.184899,0.184899,0.0,0.238092,0.0,0.313062,0.238092,0.0,0.313062,0.0,0.0,0.313062,0.313062,0.313062,0.313062,0.313062,0.313062,0.184899,0.0


### Custom Example 2: Small dataset

In [7]:
# A second, two-sentence corpus. "Future" and "future" end up in the same
# column because the vectorizer lowercases text by default.
texts = ["AI is the future", "Future of AI is bright"]

# Unigrams and bigrams together, fitted on this corpus only.
vector = TfidfVectorizer(ngram_range=(1, 2))
X4 = vector.fit_transform(texts)

df4 = pd.DataFrame(
    data=X4.toarray(),
    columns=vector.get_feature_names_out(),
)
df4

Unnamed: 0,ai,ai is,bright,future,future of,is,is bright,is the,of,of ai,the,the future
0,0.317404,0.317404,0.0,0.317404,0.0,0.317404,0.0,0.446101,0.0,0.0,0.446101,0.446101
1,0.268446,0.268446,0.377292,0.268446,0.377292,0.268446,0.377292,0.0,0.377292,0.377292,0.0,0.0
