In [2]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Toy corpus: four short sentences used to exercise the TF-IDF pipeline.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [4]:
# Load the `en_core_web_sm` spaCy pipeline once at module level so that
# preprocess() can reuse it for every document instead of reloading it.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Tokenize ``text`` with spaCy and return the lemmas of the kept tokens.

    Tokens flagged as stop words, and tokens that are not purely alphabetic
    (e.g. punctuation or numbers), are dropped.

    Parameters
    ----------
    text : str
        Raw document text; it is run through the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        ``token.lemma_`` of every kept token, in document order. Repeated
        words are kept, so the result can be used directly for term counting.
    """
    # Test the boolean flags directly instead of comparing with == True/False.
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and token.is_alpha
    ]

In [6]:
# `preprocess` is the tokenizer, so the vectorizer's built-in regex
# `token_pattern` is never used. Passing token_pattern=None states that
# explicitly and avoids scikit-learn's "token_pattern will not be used"
# UserWarning.
# Note: with the default lowercase=True, text is lowercased before it is
# handed to `preprocess`.
vectorizer = TfidfVectorizer(tokenizer=preprocess, token_pattern=None, min_df=1)

# Densify the sparse result (fine for this tiny corpus): one row per document,
# one column per vocabulary term.
X = vectorizer.fit_transform(corpus).toarray()
X.shape

(4, 2)

In [7]:
# Dense TF-IDF matrix: one row per document in `corpus`, one column per
# vocabulary term. NOTE(review): an all-zero row presumably means every token
# of that document was filtered out by `preprocess` — confirm.
X

array([[1.        , 0.        ],
       [0.78722298, 0.61666846],
       [0.        , 0.        ],
       [1.        , 0.        ]])

In [8]:
# Spot-check the tokenizer on the second document.
preprocess(corpus[1])

['document', 'second', 'document']

In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a list to keep the plain-list display.
vectorizer.get_feature_names_out().tolist()

['document', 'second']

In [10]:
# For each document (row), the column indices ordered from the lowest to the
# highest TF-IDF weight.
X.argsort()

array([[1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=int64)

In [15]:
# Sum each column over all documents: total TF-IDF weight per vocabulary term.
X.sum(axis=0)

array([2.78722298, 0.61666846])

In [16]:
# Redisplay the corpus for reference.
corpus

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [17]:
# The last three documents of the corpus.
corpus[-3:]

['This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']