In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
# Training corpus: four short product reviews; every vectorizer in this
# notebook is fitted on this list.
reviews = [
    "I bought this product this is good",
    "This product is good",
    "This is not worth",
    "I will recommend this product to everyone",
]

# Held-out reviews: variations of two training reviews that add the words
# "mobile" and "add", neither of which occurs in `reviews`.
test_reviews = [
    "I bought this mobile product this is good product",
    "I will recommend this product to everyone add",
]

## Bag of Words

In [3]:
# Bag of Words: create an (unfitted) CountVectorizer with default settings.
# It is fitted on the training reviews in the next cell.
count_vectorizer = CountVectorizer()

In [4]:
# Learn the vocabulary from the training reviews only. test_reviews are not
# used for fitting, so words that appear only there get no column.
count_vectorizer.fit(reviews)

CountVectorizer()

In [5]:
# vocabulary_ maps each learned word to its column index in the count vectors.
# The indices follow the alphabetical order of the words ('bought' -> 0,
# 'everyone' -> 1, ...); the dict itself is displayed in insertion order, which
# is why the keys in the output look unsorted.
# Words are lowercased, and the one-letter token "I" is not part of the
# vocabulary (neither "I" nor "This" appears as a key).
count_vectorizer.vocabulary_

{'bought': 0,
 'this': 7,
 'product': 5,
 'is': 3,
 'good': 2,
 'not': 4,
 'worth': 10,
 'will': 9,
 'recommend': 6,
 'to': 8,
 'everyone': 1}

In [6]:
# Encode the training reviews as Bag-of-Words count vectors: one row per
# review, one column per vocabulary word (column order = vocabulary_ index).
review_vectors = count_vectorizer.transform(reviews)

# Convert to a plain array so the full count matrix is printed.
print(review_vectors.toarray())

[[1 0 1 1 0 1 0 2 0 0 0]
 [0 0 1 1 0 1 0 1 0 0 0]
 [0 0 0 1 1 0 0 1 0 0 1]
 [0 1 0 0 0 1 1 1 1 1 0]]


In [7]:
# Encode the held-out reviews with the vocabulary learned from `reviews`.
# The column set is fixed at fit time, so the unseen words "mobile" and "add"
# are not counted anywhere in the output.
test_review_vectors = count_vectorizer.transform(test_reviews)

# Convert to a plain array so the full count matrix is printed.
print(test_review_vectors.toarray())

[[1 0 1 1 0 2 0 2 0 0 0]
 [0 1 0 0 0 1 1 1 1 1 0]]


## TF-IDF 

In [8]:
# TF-IDF: create an (unfitted) TfidfVectorizer with default settings.
# No vectors are produced here; fitting and transforming happen in later cells.
tfidf = TfidfVectorizer()

In [9]:
# Fit the TF-IDF vectorizer on the training reviews only, so the held-out
# reviews do not influence the learned vocabulary or word weights.
tfidf.fit(reviews)

TfidfVectorizer()

In [10]:
# Encode the training reviews as TF-IDF vectors: one row per review, one
# column per vocabulary word, holding a weight instead of a raw count.
tfidf_review_vector = tfidf.transform(reviews)

# Convert to a plain array so the full weight matrix is printed.
print(tfidf_review_vector.toarray())

[[0.53257171 0.         0.41988565 0.33993362 0.         0.33993362
  0.         0.55583602 0.         0.         0.        ]
 [0.         0.         0.60313701 0.48829139 0.         0.48829139
  0.         0.39921021 0.         0.         0.        ]
 [0.         0.         0.         0.38991559 0.61087812 0.
  0.         0.31878155 0.         0.         0.61087812]
 [0.         0.46226355 0.         0.         0.         0.29505684
  0.46226355 0.2412283  0.46226355 0.46226355 0.        ]]


In [11]:
# Encode the held-out reviews with the already-fitted TF-IDF vectorizer.
# Words unseen during fit ("mobile", "add") have no column and are ignored.
tfidf_test_review_vector = tfidf.transform(test_reviews)

# Convert to a plain array so the full weight matrix is printed.
print(tfidf_test_review_vector.toarray())

[[0.45893203 0.         0.36182728 0.29293037 0.         0.58586074
  0.         0.47897954 0.         0.         0.        ]
 [0.         0.46226355 0.         0.         0.         0.29505684
  0.46226355 0.2412283  0.46226355 0.46226355 0.        ]]



## TF-IDF with N-gram

In [12]:
# TF-IDF over word n-grams: ngram_range=(2, 3) makes the features two- and
# three-word phrases instead of single words.
tfidf_n = TfidfVectorizer(ngram_range=(2, 3))

# Fit on the training reviews only; the n-gram vocabulary is learned here.
tfidf_n.fit(reviews)

TfidfVectorizer(ngram_range=(2, 3))

In [13]:
# Show the learned n-gram vocabulary: each key is a 2- or 3-word phrase
# (from ngram_range=(2, 3)) mapped to its column index in the TF-IDF vectors.
print(tfidf_n.vocabulary_)

{'bought this': 0, 'this product': 17, 'product this': 8, 'this is': 14, 'is good': 2, 'bought this product': 1, 'this product this': 19, 'product this is': 9, 'this is good': 15, 'product is': 6, 'this product is': 18, 'product is good': 7, 'is not': 3, 'not worth': 5, 'this is not': 16, 'is not worth': 4, 'will recommend': 22, 'recommend this': 12, 'product to': 10, 'to everyone': 21, 'will recommend this': 23, 'recommend this product': 13, 'this product to': 20, 'product to everyone': 11}


In [14]:
# Encode the training reviews as TF-IDF vectors over the n-gram vocabulary:
# one row per review, one column per learned 2- or 3-word phrase.
tfidf_n_review_vector = tfidf_n.transform(reviews)

# Convert to a plain array so the full weight matrix is printed.
print(tfidf_n_review_vector.toarray())

[[0.36153669 0.36153669 0.28503968 0.         0.         0.
  0.         0.         0.36153669 0.36153669 0.         0.
  0.         0.         0.28503968 0.36153669 0.         0.23076418
  0.         0.36153669 0.         0.         0.         0.        ]
 [0.         0.         0.39278432 0.         0.         0.
  0.49819711 0.49819711 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.31799276
  0.49819711 0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.46516193 0.46516193 0.46516193
  0.         0.         0.         0.         0.         0.
  0.         0.         0.36673901 0.         0.46516193 0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.34488069 0.34488069
  0.34488069 0.34488069 0.         0.         0.         0.22013288
  0.         0.         0.34488069 0.34488069 0.34488069 0.34488069]]

In [15]:
# Encode the held-out reviews with the fitted n-gram TF-IDF vectorizer.
# Phrases that were not seen during fit (e.g. "this mobile") have no column
# and are ignored.
tfidf_n_review_test_vector = tfidf_n.transform(test_reviews)

# Convert to a plain array so the full weight matrix is printed.
print(tfidf_n_review_test_vector.toarray())

[[0.43671931 0.         0.34431452 0.         0.         0.
  0.         0.         0.43671931 0.43671931 0.         0.
  0.         0.         0.34431452 0.43671931 0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.34488069 0.34488069
  0.34488069 0.34488069 0.         0.         0.         0.22013288
  0.         0.         0.34488069 0.34488069 0.34488069 0.34488069]]
