In [1]:
import nltk

In [2]:
# Sample text that the later cells split into sentences, clean, and vectorize.
# The leading spaces on the continuation lines are part of the string literal.
paragraph = """By 55,000 years ago, the first modern humans, 
            or Homo sapiens, had arrived on the Indian subcontinent from Africa, 
            where they had earlier evolved. The earliest known modern human 
            remains in South Asia date to about 30,000 years ago. After 6500 BCE, 
            evidence for domestication of food crops and animals, construction of permanent structures, 
            and storage of agricultural surplus appeared in Mehrgarh and other sites in what is now 
            Balochistan, Pakistan. These gradually developed into the Indus Valley civilisation, 
            the first urban culture in South Asia, which flourished during 2500–1900 BCE in what is now 
            Pakistan and western India. Centred around cities such as Mohenjo-daro, Harappa, Dholavira, 
            and Kalibangan, and relying on varied forms of subsistence, the civilisation engaged robustly 
            in crafts production and wide-ranging trade."""

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [7]:
# Split the raw paragraph into a list of sentence strings with NLTK's sentence tokenizer.
# NOTE(review): sent_tokenize needs NLTK tokenizer data to be installed; no
# nltk.download(...) call is visible in this notebook — confirm for fresh environments.
sentences = nltk.sent_tokenize(paragraph)

In [8]:
# Single WordNetLemmatizer instance, reused for every word in the cleaning loop.
# NOTE(review): presumably requires the NLTK WordNet corpus to be downloaded — confirm.
lemmatizer = WordNetLemmatizer()

In [9]:
# Build the cleaned corpus: one normalised, space-joined string per sentence.
#
# The English stop-word set is built once, up front. Building it inside the
# comprehension's filter re-read the stop-word list and rebuilt the set for
# every single word, which is loop-invariant work.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter (digits, punctuation,
    # dashes, ...) with a space so that it acts as a word separator.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    # Lower-case before the stop-word check, then split on whitespace.
    words = letters_only.lower().split()
    # Drop stop words, then lemmatize the words that remain.
    lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [10]:
# Display the cleaned sentences (one string per original sentence).
corpus

['year ago first modern human homo sapiens arrived indian subcontinent africa earlier evolved',
 'earliest known modern human remains south asia date year ago',
 'bce evidence domestication food crop animal construction permanent structure storage agricultural surplus appeared mehrgarh site balochistan pakistan',
 'gradually developed indus valley civilisation first urban culture south asia flourished bce pakistan western india',
 'centred around city mohenjo daro harappa dholavira kalibangan relying varied form subsistence civilisation engaged robustly craft production wide ranging trade']

In [11]:
# Build the TF-IDF model over the cleaned sentences.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings; kept in `tfidf` so the fitted model
# remains available after this cell.
tfidf = TfidfVectorizer()

# Fit on the corpus and transform it in one step, then convert the result
# to a dense array.
X = (
    tfidf
    .fit_transform(corpus)
    .toarray()
)

In [12]:
# Display the dense TF-IDF array.
# NOTE(review): this prints the entire matrix; X.shape or a small slice would be
# easier to read if the corpus grows.
X

array([[0.29808172, 0.24049046, 0.        , 0.        , 0.        ,
        0.        , 0.29808172, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.29808172, 0.        , 0.        ,
        0.        , 0.29808172, 0.24049046, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.29808172, 0.24049046,
        0.        , 0.29808172, 0.        , 0.        , 0.        ,
        0.        , 0.24049046, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.29808172, 0.        , 0.        , 0.        , 0.        ,
        0.29808172, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.24049046],
       [0.        , 0.28694451, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28694451, 0. 