In [10]:
import nltk

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [11]:

# Sample text for the demo. It is split into sentences further down, and each
# cleaned sentence becomes one entry of `corpus`.
# The continuation lines are indented inside the string literal; that extra
# whitespace is discarded later when each sentence is split on whitespace.
paragraph =  """One problem that we encounter in the bag-of-words approach is that it treats every word equally, 
                but in a document, there is a high chance of particular words being repeated more often than others.
                In a news report about Messi winning the Copa-America tournament, the word Messi would be more frequently repeated. 
                We cannot give Messi the same weight as any other word in that document. 
                In the news report, if we take each sentence as a document, we can count the number of documents each time Messi occurs. 
                This method is called document-frequency."""
               
 

In [12]:
# Text clean-up for the Bag-of-Words model: one cleaned, stemmed string per
# sentence of `paragraph` is collected in `corpus`.
ps = PorterStemmer()
wordnet=WordNetLemmatizer()  # not used in this cell; stemming is applied here
sentences = nltk.sent_tokenize(paragraph)

# Build the stop-word set once. Constructing it inside the comprehension
# re-read the stop-word list and rebuilt the set for every single word.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space, lower-case
    # the result, and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))

In [13]:
    
# Bag of Words: turn each cleaned sentence in `corpus` into a row of term counts.
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary size is capped at 1500 features.
cv = CountVectorizer(max_features=1500)
bow_sparse = cv.fit_transform(corpus)
# Convert the vectorizer's result to a dense array for display.
X = bow_sparse.toarray()


In [14]:
# Display the dense Bag-of-Words count matrix (one row per entry of `corpus`).
X

array([[0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 3, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

## TF-IDF

In [15]:
import nltk

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [16]:

# Sample text for the TF-IDF demo (the same text as in the Bag-of-Words section).
# It is split into sentences further down, and each cleaned sentence becomes
# one entry of `corpus`. The indentation inside the string literal is discarded
# later when each sentence is split on whitespace.
paragraph =  """One problem that we encounter in the bag-of-words approach is that it treats every word equally, 
                but in a document, there is a high chance of particular words being repeated more often than others.
                In a news report about Messi winning the Copa-America tournament, the word Messi would be more frequently repeated. 
                We cannot give Messi the same weight as any other word in that document. 
                In the news report, if we take each sentence as a document, we can count the number of documents each time Messi occurs. 
                This method is called document-frequency."""
               
 

In [17]:
# Text clean-up for the TF-IDF model: one cleaned, lemmatized string per
# sentence of `paragraph` is collected in `corpus`.
ps = PorterStemmer()  # not used in this cell; lemmatization is applied here
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)

# Build the stop-word set once. Constructing it inside the comprehension
# re-read the stop-word list and rebuilt the set for every single word.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space, lower-case
    # the result, and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words, then lemmatize each remaining token with WordNet.
    lemmas = [wordnet.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(lemmas))

In [18]:
# TF-IDF: turn each cleaned sentence in `corpus` into a row of TF-IDF weights.
from sklearn.feature_extraction.text import TfidfVectorizer

# Default vectorizer settings are used; `cv` and `X` are rebound here.
cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)
# Convert the vectorizer's result to a dense array for display.
X = tfidf_sparse.toarray()

In [19]:
# Display the dense TF-IDF matrix (one row per entry of `corpus`).
X

array([[0.        , 0.23566993, 0.23566993, 0.        , 0.        ,
        0.23566993, 0.        , 0.        , 0.13277233, 0.23566993,
        0.23566993, 0.23566993, 0.        , 0.        , 0.        ,
        0.23566993, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.23566993, 0.23566993, 0.23566993, 0.23566993,
        0.23566993, 0.19013701, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.23566993, 0.        , 0.        ,
        0.47349269, 0.        ],
       [0.31318405, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.31318405, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.31318405, 0.        ,
        0.        , 0.41948602, 0.        , 0.25267492, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.25267492, 0.25267492, 0.        , 0.        ,
        0.        , 0.31318405, 0.        , 0.        , 0.31318405,
        0.20974

# Happy Learning!