In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary with the default settings (single words only).
# fit() returns the vectorizer itself, so construction and fitting are chained.
# The one-letter word "a" does not appear in the resulting vocabulary.
v = CountVectorizer().fit(["Thor Hathodwala is looking for a job"])
v.vocabulary_



{'thor': 5, 'hathodwala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [2]:
# ngram_range=(2, 2) keeps bigrams only.
# "for job" shows up as a bigram because the one-letter word "a" is not kept
# as a token, which makes "for" and "job" neighbours.
v = CountVectorizer(ngram_range=(2, 2)).fit(["Thor Hathodwala is looking for a job"])
v.vocabulary_

{'thor hathodwala': 4,
 'hathodwala is': 1,
 'is looking': 2,
 'looking for': 3,
 'for job': 0}

In [4]:
# ngram_range=(1, 2) keeps both unigrams and bigrams in a single vocabulary.
v = CountVectorizer(ngram_range=(1, 2)).fit(["Thor Hathodwala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodwala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodwala': 10,
 'hathodwala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [5]:
# Toy corpus of three short sentences; preprocessed and vectorized further down.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [8]:
import spacy

# Load spaCy's "en_core_web_sm" English pipeline once and keep it in `nlp`;
# preprocess() below calls this object on every input text.
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    """Strip stop words, punctuation and whitespace tokens, then lemmatize.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token that survives the filter is replaced by its lemma, and the lemmas
    are joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        A space-separated string of lemmas. Empty string if no token survives.
    """
    doc = nlp(text)
    # Whitespace-only tokens (e.g. from repeated spaces or newlines) are neither
    # stop words nor punctuation, so they must be filtered explicitly; otherwise
    # their raw whitespace ends up inside the joined result.
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)
# Quick check on one sentence: "ate" should come back as its lemma "eat".
preprocess("Thor ate pizza")

'thor eat pizza'

In [9]:
# Run every sentence of the corpus through preprocess(), keeping the order.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [10]:
# Build a unigram + bigram vocabulary from the preprocessed corpus.
# Vocabulary keys come out lower-cased ("Loki" -> "loki").
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [12]:
# Encode a sentence made only of known terms: the 1s sit at the vocabulary
# indices of 'eat', 'eat pizza', 'pizza', 'thor' and 'thor eat'.
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [13]:
# 'hulk' was not in the fitted vocabulary, so it and the bigram 'hulk eat'
# contribute nothing; only 'eat', 'eat pizza' and 'pizza' are counted.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)