In [14]:
import nltk

In [28]:
# Sample text used throughout the notebook as the raw input for tokenization,
# cleaning and vectorization. Because this is a triple-quoted string, the line
# breaks and the leading spaces on the continuation lines are part of its value.
paragraph = '''AI, machine learning and deep learning are common terms in enterprise
                IT and sometimes used interchangeably, especially by companies in their marketing materials.
                But there are distinctions. The term AI, coined in the 1950s, refers to the simulation of human
                intelligence by machines. It covers an ever-changing set of capabilities as new technologies
                are developed. Technologies that come under the umbrella of AI include machine learning and
                deep learning. Machine learning enables software applications to become more accurate at
                predicting outcomes without being explicitly programmed to do so. Machine learning algorithms
                use historical data as input to predict new output values. This approach became vastly more
                effective with the rise of large data sets to train on. Deep learning, a subset of machine
                learning, is based on our understanding of how the brain is structured. Deep learning's
                use of artificial neural networks structure is the underpinning of recent advances in AI,
                including self-driving cars and ChatGPT.'''

In [29]:
# Cleaning the texts
import re  # the re library is used for regular-expression-based text cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [30]:
# Split the raw paragraph into individual sentences.
# NOTE: sent_tokenize needs the NLTK tokenizer data to be installed beforehand
# (see nltk.download); otherwise it raises a LookupError.
sentences = nltk.sent_tokenize(paragraph)

# Word normalizers: the lemmatizer maps words to dictionary forms,
# the Porter stemmer strips suffixes heuristically.
wordnet = WordNetLemmatizer()
ps = PorterStemmer()

In [31]:
# Create an empty list named corpus; the cleaned sentences are stored in it
# once the text has been preprocessed.
corpus = []


In [32]:
# Clean every sentence: keep letters only, lowercase, drop English stop words
# and lemmatize what remains. Each cleaned sentence becomes one string in corpus.

# Build the stop-word set once, outside the loop, instead of once per word.
stop_words = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append
# duplicate sentences to a corpus that was already filled.
corpus = []
for sentence in sentences:
    # Replace every non-letter character (digits, punctuation) with a space.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    # Lowercase before the stop-word check; split() also collapses whitespace runs.
    review = review.lower().split()
    # For stemming instead of lemmatization, use ps.stem(word) in place of
    # wordnet.lemmatize(word).
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [33]:
# Display the cleaned sentences (one string per sentence).
corpus

['ai machine learning deep learning common term enterprise sometimes used interchangeably especially company marketing material',
 'distinction',
 'term ai coined refers simulation human intelligence machine',
 'cover ever changing set capability new technology developed',
 'technology come umbrella ai include machine learning deep learning',
 'machine learning enables software application become accurate predicting outcome without explicitly programmed',
 'machine learning algorithm use historical data input predict new output value',
 'approach became vastly effective rise large data set train',
 'deep learning subset machine learning based understanding brain structured',
 'deep learning use artificial neural network structure underpinning recent advance ai including self driving car chatgpt']

In [34]:
# Bag of Words model, also called a document-term matrix:
# each row is one cleaned sentence and each column counts one vocabulary word.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow_sparse = cv.fit_transform(corpus)  # learn the vocabulary and count occurrences
X_bow = bow_sparse.toarray()  # dense array, convenient for inspection

In [36]:
# Display the dense word-count matrix produced by CountVectorizer.
X_bow

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0,

In [37]:
# TF-IDF features for the same corpus: term counts re-weighted so that words
# occurring in many sentences contribute less than rarer ones.
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tfidf_sparse = tf.fit_transform(corpus)  # learn vocabulary + IDF weights, then transform
X_tf = tfidf_sparse.toarray()  # dense array, convenient for inspection

In [38]:
# Display the dense TF-IDF matrix produced by TfidfVectorizer.
X_tf

array([[0.        , 0.        , 0.19057861, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28821885, 0.28821885, 0.        ,
        0.        , 0.19057861, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28821885, 0.28821885, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28821885, 0.        , 0.30944803,
        0.15472401, 0.28821885, 0.28821885, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.28821885, 0.        ,
        0.        , 0.        , 0.        , 0.24501235, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28821885,
        0.        , 0.        , 0.        ],
   