In [1]:
# Toy corpus used throughout the notebook; the stray "!" and "," are left in
# on purpose so the punctuation handling of the preprocessing step is visible.
corpus = [
    "Ivan cooked the pizza",
    "Ganesh! is tall",
    "Rakesh, is eating Biryani",
]

In [2]:
import spacy

# Load the small English spaCy pipeline once; the resulting `nlp` object is
# reused by preprocess() below for every document.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    Args:
        text: Raw input string; it is parsed with the module-level `nlp` pipeline.

    Returns:
        A single string made of the lemmas of the kept tokens, joined by one
        space and in their original order.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        # Skip anything spaCy flags as a stop word or as punctuation.
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)


In [3]:
# Quick check on one sentence: the stop word "the" should be dropped and
# "cooked" reduced to its lemma "cook".
preprocess("Ivan cooked the pizza")

'Ivan cook pizza'

In [4]:
# Second check: the stop word "is" should be dropped and "eating" reduced to
# its lemma "eat".
preprocess("Venkat is eating Biryani")

'Venkat eat Biryani'

In [5]:
# Run every document of the corpus through preprocess(), keeping the order.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Ivan cook pizza', 'Ganesh tall', 'Rakesh eat Biryani']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: default CountVectorizer fitted on the *raw* corpus, so the
# vocabulary contains un-lemmatized words and stop words such as "the"/"is".
# fit() returns the fitted vectorizer itself, so construction and fitting can
# be chained.
v = CountVectorizer().fit(corpus)
v.vocabulary_

{'ivan': 5,
 'cooked': 1,
 'the': 9,
 'pizza': 6,
 'ganesh': 3,
 'is': 4,
 'tall': 8,
 'rakesh': 7,
 'eating': 2,
 'biryani': 0}

In [7]:
# Bag of n-grams: unigrams and bigrams (ngram_range=(1, 2)) learned from the
# preprocessed corpus. Note that this rebinds `v`, replacing the unigram
# vectorizer fitted on the raw corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'ivan': 7,
 'cook': 1,
 'pizza': 9,
 'ivan cook': 8,
 'cook pizza': 2,
 'ganesh': 5,
 'tall': 12,
 'ganesh tall': 6,
 'rakesh': 10,
 'eat': 3,
 'biryani': 0,
 'rakesh eat': 11,
 'eat biryani': 4}

In [8]:
# `v` was fitted on preprocessed text (stop words removed, tokens lemmatized),
# so a new document has to go through preprocess() before it is vectorized.
# Feeding the raw sentence would leave "cooked" out of vocabulary and break
# both bigrams, so only "ivan" and "pizza" would be counted.
v.transform([preprocess("Ivan cooked the pizza")]).toarray()

array([[0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]])

In [9]:
# Apply the same preprocessing that was used when fitting `v`; with the raw
# sentence, "eating" is out of vocabulary and the bigrams "rakesh eat" and
# "eat biryani" can never match.
v.transform([preprocess("Rakesh is eating Biryani")]).toarray()

array([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]])