In [1]:
import numpy as np
import pandas as pd

In [8]:
df = pd.DataFrame({
    'text': [
        'people watch campusx',
        'campusx watch campusx',
        'people write comment',
        'campusx write comment'
    ],
})
print(df)

                    text
0   people watch campusx
1  campusx watch campusx
2   people write comment
3  campusx write comment


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
tfidf = TfidfVectorizer()

# Fit on text data and transform into TF-IDF feature vectors
tfidf.fit_transform(df['text']).toarray()


array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [10]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campusx' 'comment' 'people' 'watch' 'write']


In [11]:
# Sample corpus
corpus = [
    "people watch campusx",
    "campusx watch campusx",
    "people write comment",
    "campus write comment"
]

# Step 1: Combine all sentences into one string
all_text = " ".join(corpus)

# Step 2: Split into individual words
words = all_text.split()

# Step 3: Find total number of words
total_words = len(words)

# Step 4: Find total number of unique words (vocabulary)
unique_words = set(words)
vocab_size = len(unique_words)

# Display results
print("Total number of words in the corpus:", total_words)
print("Total number of unique words (vocabulary):", vocab_size)
print("Vocabulary:", unique_words)


Total number of words in the corpus: 12
Total number of unique words (vocabulary): 6
Vocabulary: {'comment', 'campusx', 'write', 'people', 'watch', 'campus'}


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Sample corpus
corpus = [
    "people watch campusx",
    "campusx watch campusx",
    "people write comment",
    "campus write comment"
]

# Bag of Bigrams (2-grams)
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_bigram = bigram_vectorizer.fit_transform(corpus)
bigram_vocab = bigram_vectorizer.get_feature_names_out()

# Bag of Trigrams (3-grams)
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
X_trigram = trigram_vectorizer.fit_transform(corpus)
trigram_vocab = trigram_vectorizer.get_feature_names_out()

# Display results
print("----- BIGRAMS -----")
print("Vocabulary size:", len(bigram_vocab))
print("Vocabulary:", bigram_vocab)

print("\n----- TRIGRAMS -----")
print("Vocabulary size:", len(trigram_vocab))
print("Vocabulary:", trigram_vocab)


----- BIGRAMS -----
Vocabulary size: 6
Vocabulary: ['campus write' 'campusx watch' 'people watch' 'people write'
 'watch campusx' 'write comment']

----- TRIGRAMS -----
Vocabulary size: 4
Vocabulary: ['campus write comment' 'campusx watch campusx' 'people watch campusx'
 'people write comment']
