Some common terms to remember:

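1. Corpus – the entire collection of documents being analysed.
2. Vocabulary – the set of unique words that appear in the corpus.
3. Document – a single piece of text in the corpus (here, one row of the DataFrame).
4. Word – a single token within a document.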

## Bag of words

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy corpus: four short documents, each paired with a binary label.
df = pd.DataFrame(
    [
        ('people watch video', 1),
        ('video watch video', 1),
        ('people write comment', 0),
        ('video write comment', 0),
    ],
    columns=['text', 'output'],
)

In [3]:
# Show the four-document corpus together with its labels.
df

Unnamed: 0,text,output
0,people watch video,1
1,video watch video,1
2,people write comment,0
3,video write comment,0


In [4]:
# Bag-of-words vectorizer created with its default settings (not fitted yet).
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [6]:
cv.vocabulary_

{'people': 1, 'watch': 3, 'video': 2, 'write': 4, 'comment': 0}

In [5]:
bow = cv.fit_transform(df['text'])

In [7]:
# Dense view of the count matrix: one row per document, one column per
# vocabulary term (column order follows the indices in cv.vocabulary_).
bow.toarray()

array([[0, 1, 1, 1, 0],
       [0, 0, 2, 1, 0],
       [1, 1, 0, 0, 1],
       [1, 0, 1, 0, 1]])

In [8]:
# Print the count vector of each of the first three documents in turn.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[0 1 1 1 0]]
[[0 0 2 1 0]]
[[1 1 0 0 1]]


In [9]:
# Vectorize an unseen sentence with the already-fitted vocabulary; 'Sneh' is
# not in the vocabulary, so it contributes no count.
cv.transform(['Sneh watch video']).toarray()

array([[0, 0, 1, 1, 0]])

In [10]:
# Feature matrix (dense word counts) and the label column.
X, y = bow.toarray(), df['output']

In [11]:
# Inspect the feature matrix.
X

array([[0, 1, 1, 1, 0],
       [0, 0, 2, 1, 0],
       [1, 1, 0, 0, 1],
       [1, 0, 1, 0, 1]])

In [12]:
# Inspect the label column.
y

Unnamed: 0,output
0,1
1,1
2,0
3,0


## N-grams

N-grams are contiguous sequences of n items (words, symbols, or other tokens) in a document; for example, a bigram is two consecutive words.

In [13]:
# Same four-document toy corpus as used earlier in the notebook, rebuilt here.
df = pd.DataFrame(
    [
        ('people watch video', 1),
        ('video watch video', 1),
        ('people write comment', 0),
        ('video write comment', 0),
    ],
    columns=['text', 'output'],
)

In [14]:
# Show the corpus that the n-gram examples below are built from.
df

Unnamed: 0,text,output
0,people watch video,1
1,video watch video,1
2,people write comment,0
3,video write comment,0


In [15]:
# Rebind cv to a new vectorizer: ngram_range=(2, 2) restricts the features to
# bigrams (pairs of consecutive words).
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [17]:
print(cv.vocabulary_)

{'people watch': 0, 'watch video': 4, 'video watch': 2, 'people write': 1, 'write comment': 5, 'video write': 3}


In [16]:
bow = cv.fit_transform(df['text'])

In [18]:
# Print the bigram count vector of each of the first three documents.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[1 0 0 0 1 0]]
[[0 0 1 0 1 0]]
[[0 1 0 0 0 1]]


In [19]:
# Rebind cv again: ngram_range=(3, 3) restricts the features to trigrams
# (runs of three consecutive words).
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(3,3))

In [20]:
# Fit the trigram vectorizer and build the document-term count matrix.
bow = cv.fit_transform(df['text'])

In [21]:
# Each document is exactly three words long, so it yields a single trigram:
# four documents give four features.
print(cv.vocabulary_)

{'people watch video': 0, 'video watch video': 2, 'people write comment': 1, 'video write comment': 3}


In [22]:
# Print the trigram count vector of each of the first three documents.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[1 0 0 0]]
[[0 0 1 0]]
[[0 1 0 0]]


## TF-IDF (Term Frequency–Inverse Document Frequency)

Term frequency-inverse document frequency (tf-idf) is a measure of how important a word is to a document in a collection of documents.

In [23]:
# Same four-document toy corpus as used earlier in the notebook, rebuilt here.
df = pd.DataFrame(
    [
        ('people watch video', 1),
        ('video watch video', 1),
        ('people write comment', 0),
        ('video write comment', 0),
    ],
    columns=['text', 'output'],
)

In [24]:
# Show the corpus that the TF-IDF example below is built from.
df

Unnamed: 0,text,output
0,people watch video,1
1,video watch video,1
2,people write comment,0
3,video write comment,0


In [25]:
# TF-IDF vectorizer created with its default settings (not fitted yet).
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [27]:
# Fit on the corpus and convert the resulting TF-IDF matrix to a dense array.
arr = tfidf.fit_transform(df['text']).toarray()

In [28]:
# One row per document, one column per term. Each row has unit Euclidean
# length (e.g. the third row is three entries of 1/sqrt(3) ~= 0.577).
arr

array([[0.        , 0.61366674, 0.49681612, 0.61366674, 0.        ],
       [0.        , 0.        , 0.8508161 , 0.52546357, 0.        ],
       [0.57735027, 0.57735027, 0.        , 0.        , 0.57735027],
       [0.61366674, 0.        , 0.49681612, 0.        , 0.61366674]])

In [30]:
# Learned inverse document frequencies, one per vocabulary term. Index 2
# ('video', present in 3 of the 4 documents) gets the lowest weight; every
# other term occurs in 2 documents and shares the same higher value.
print(tfidf.idf_)

[1.51082562 1.51082562 1.22314355 1.51082562 1.51082562]
