In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: two short sentences used as the documents to vectorize.
text = [
    "The car is driven on the road.",
    "The truck is driven on the highway",
]

In [None]:
# Create the bag-of-words transformer with its default settings.
vectorizer = CountVectorizer()

# Tokenize the documents and learn the vocabulary. fit() is left as the
# cell's last expression so the notebook displays the fitted estimator's repr.
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# Show the learned vocabulary: a dict mapping each lowercased token to the
# integer index of its column in the encoded count matrix.
print(vectorizer.vocabulary_)

{'the': 6, 'car': 0, 'is': 3, 'driven': 1, 'on': 4, 'road': 5, 'truck': 7, 'highway': 2}


In [None]:
# Encode the documents with the vocabulary learned by fit():
# one row per document, one column per vocabulary term.
newvector = vectorizer.transform(text)

# Print the encoding as a dense array of per-term counts
# (e.g. "the" occurs twice in each sentence, hence the 2 in column 6).
print(newvector.toarray())

[[1 1 0 1 1 1 2 0]
 [0 1 1 1 1 0 2 1]]


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: two short sentences used as the documents to vectorize.
text = [
    "The car is driven on the road.",
    "The truck is driven on the highway",
]

In [None]:
# Create the TF-IDF transformer with its default settings.
# NOTE: this rebinds `vectorizer`, which earlier in the notebook held a
# CountVectorizer; cells that read `vectorizer` see whichever was assigned last.
vectorizer = TfidfVectorizer()

In [None]:
# Tokenize the documents, build the vocabulary and learn the IDF weights.
# fit() is left as the last expression so the fitted estimator's repr is displayed.
vectorizer.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Inspect the learned IDF weights, one value per vocabulary term.
# With this corpus, terms present in both documents get 1.0 and terms
# present in only one document get a larger weight (about 1.405).
print(vectorizer.idf_)

[1.40546511 1.         1.40546511 1.         1.         1.40546511
 1.         1.40546511]


In [None]:
# Show the learned token -> integer index mapping; the positions of the
# idf_ values printed by the previous cell line up with these indices.
print(vectorizer.vocabulary_)

{'the': 6, 'car': 0, 'is': 3, 'driven': 1, 'on': 4, 'road': 5, 'truck': 7, 'highway': 2}


# **TF-IDF (KN)**

In [None]:
# One-time setup: uncomment and run the two lines below to download NLTK's
# 'popular' data collection. The cleaning cell that follows uses NLTK's
# stop-word list, sentence tokenizer and WordNet lemmatizer, which presumably
# need this data to be installed locally.
# import nltk
# nltk.download('popular')

In [None]:
import nltk

paragraph = """I have three visions for India. In 3000 years of our history, people from all over 
               the world have come and invaded us, captured our lands, conquered our minds. 
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               I see four milestones in my career"""


# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()  # not used in this cell; kept in case other cells rely on it
wordnet = WordNetLemmatizer()

# Build the English stop-word set once. The previous version rebuilt it for
# every single word inside the loop, which is loop-invariant work.
stop_words = set(stopwords.words('english'))

# Split the paragraph into sentences; each sentence becomes one document.
sentences = nltk.sent_tokenize(paragraph)

# corpus[k] is the cleaned, space-joined form of sentences[k].
corpus = []
for sentence in sentences:
    # Replace everything except ASCII letters with a space (this also drops
    # digits such as "3000" and all punctuation), lowercase, split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [wordnet.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(lemmas))
    


In [None]:
# Creating the TF-IDF model
# (TfidfVectorizer is also imported earlier in the notebook; the repeat is harmless.)
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: despite the name, `cv` is a TfidfVectorizer, not a CountVectorizer.
cv = TfidfVectorizer()
# Fit on the cleaned sentences and convert the result to a dense array:
# one row per entry of `corpus`.
X = cv.fit_transform(corpus).toarray()

In [None]:
# Display the dense TF-IDF matrix (one row per cleaned sentence in `corpus`).
X

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ],
       [0.        , 0.        , 0.31622777, 0.        , 0.31622777,
        0.31622777, 0.        , 0.        , 0.31622777, 0.        ,
        0.31622777, 0.31622777, 0.        , 0.31622777, 0.        ,
        0.        , 0.31622777, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.31622777, 0.31622777],
       [0.30151134, 0.30151134, 0.        , 0.30151134, 0.        ,
        0.        , 0.30151134, 0.30151134, 0.        , 0.        ,
        0.        , 0.        , 0.30151134, 0.        , 0.30151134,
        0.30151134, 0.        , 0.30151134, 0.30151134, 0.        ,
        0.30151134, 0.        , 0.        , 0.        ]])