In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.DataFrame(
    {
        "text" : ["people watch campusx", "campusx watch campusx","people write comments","campusx write comments"],
        "output" : [1,1,0,0]
    }
)
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comments,0
3,campusx write comments,0


### One Hot Encoding

In [3]:
sentences = df["text"]

# Create a vocabulary set
vocab = set()
for sentence in sentences:
	words = sentence.lower().split()
	for word in words:
		vocab.add(word)

# Create a dictionary to map words to integers
word_to_int = {word: i for i, word in enumerate(vocab)}

# Create a binary vector for each word in each sentence
vectors = []
for sentence in sentences:
	words = sentence.lower().split()
	sentence_vectors = []
	for word in words:
		binary_vector = np.zeros(len(vocab))
		binary_vector[word_to_int[word]] = 1
		sentence_vectors.append(binary_vector)
	vectors.append(sentence_vectors)

# Print the one-hot encoded vectors for each word in each sentence
for i in range(len(sentences)):
	print(f"Sentences {i + 1}:")
	for j in range(len(vectors[i])):
		print(f"{sentences[i].split()[j]}: {vectors[i][j]}")


Sentences 1:
people: [0. 0. 1. 0. 0.]
watch: [0. 1. 0. 0. 0.]
campusx: [1. 0. 0. 0. 0.]
Sentences 2:
campusx: [1. 0. 0. 0. 0.]
watch: [0. 1. 0. 0. 0.]
campusx: [1. 0. 0. 0. 0.]
Sentences 3:
people: [0. 0. 1. 0. 0.]
write: [0. 0. 0. 1. 0.]
comments: [0. 0. 0. 0. 1.]
Sentences 4:
campusx: [1. 0. 0. 0. 0.]
write: [0. 0. 0. 1. 0.]
comments: [0. 0. 0. 0. 1.]


### Bag of Words (Uni-grams)

In [4]:
cv = CountVectorizer()

bow = cv.fit_transform(df["text"])

In [5]:
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comments': 1}


In [6]:
print(bow[0].toarray())
print(bow[1].toarray())
print(bow[2].toarray())
print(bow[3].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]
[[0 1 1 0 1]]
[[1 1 0 0 1]]


In [7]:
cv.transform(["campusx write and watch comments of campusx"]).toarray()

array([[2, 1, 0, 1, 1]])

### N-grams (Bi-grams)

In [8]:
cv = CountVectorizer(ngram_range=(2,2))

bow = cv.fit_transform(df["text"])

In [9]:
print(cv.vocabulary_)

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comments': 5, 'campusx write': 1}


In [10]:
print(bow[0].toarray())
print(bow[1].toarray())
print(bow[2].toarray())
print(bow[3].toarray())

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]
[[0 1 0 0 0 1]]


In [11]:
cv = CountVectorizer(ngram_range=(1,2))

bow = cv.fit_transform(df["text"])

In [12]:
print(cv.vocabulary_)

{'people': 4, 'watch': 7, 'campusx': 0, 'people watch': 5, 'watch campusx': 8, 'campusx watch': 1, 'write': 9, 'comments': 3, 'people write': 6, 'write comments': 10, 'campusx write': 2}


In [13]:
print(bow[0].toarray())
print(bow[1].toarray())
print(bow[2].toarray())
print(bow[3].toarray())

[[1 0 0 0 1 1 0 1 1 0 0]]
[[2 1 0 0 0 0 0 1 1 0 0]]
[[0 0 0 1 1 0 1 0 0 1 1]]
[[1 0 1 1 0 0 0 0 0 1 1]]


### TF-IDF

In [14]:
tfidf = TfidfVectorizer()
tfidf.fit_transform(df["text"]).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [15]:
print(tfidf.idf_)
print(tfidf.get_feature_names_out())

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campusx' 'comments' 'people' 'watch' 'write']
