In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB,  MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
corpus = [
     'This is the first document.',
    'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',]

In [3]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [4]:
data = ['some of the data are important',
       'data is important in sometimes',
       'is ths data is important',
       'importance of data in calculated']

In [5]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
print(vectorizer.get_feature_names())
print(X.toarray())

['are', 'calculated', 'data', 'importance', 'important', 'in', 'is', 'of', 'some', 'sometimes', 'the', 'ths']
[[1 0 1 0 1 0 0 1 1 0 1 0]
 [0 0 1 0 1 1 1 0 0 1 0 0]
 [0 0 1 0 1 0 2 0 0 0 0 1]
 [0 1 1 1 0 1 0 1 0 0 0 0]]


In [6]:
soil = ['which is one?,',
      'one of these which/.',
      'which of these one?',
      'which one is this?']

In [7]:
vector = CountVectorizer()
X = vector.fit_transform(soil)
print(vector.get_feature_names())
print(X.toarray())

['is', 'of', 'one', 'these', 'this', 'which']
[[1 0 1 0 0 1]
 [0 1 1 1 0 1]
 [0 1 1 1 0 1]
 [1 0 1 0 1 1]]


In [9]:
data =  ["He is ::having a great Time, at the park time?",
       "She, unlike most women, is a big player on the park's grass.",
       "she can't be going"]


In [10]:
count = CountVectorizer(stop_words='english', analyzer='word',ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['big', 'going', 'grass', 'great', 'having', 'park', 'player', 'time', 'unlike', 'women']
Total_array:
 [[0 0 0 1 1 1 0 2 0 0]
 [1 0 1 0 0 1 1 0 1 1]
 [0 1 0 0 0 0 0 0 0 0]]


In [11]:
#print every three features
print('Every 3rd feature:\n',count.get_feature_names()[::3])

Every 3rd feature:
 ['big', 'great', 'player', 'women']


In [12]:
#print vocabulary features 
print('vocabulary_size:\n',len(count.vocabulary_))
print('vocabulary content:\n',count.vocabulary_)

vocabulary_size:
 10
vocabulary content:
 {'having': 4, 'great': 3, 'time': 7, 'park': 5, 'unlike': 8, 'women': 9, 'big': 0, 'player': 6, 'grass': 2, 'going': 1}


In [13]:
#bi_gram
count = CountVectorizer(stop_words='english', analyzer='word',ngram_range=(1, 2), max_df=1.0, min_df=1, max_features=None)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['big', 'big player', 'going', 'grass', 'great', 'great time', 'having', 'having great', 'park', 'park grass', 'park time', 'player', 'player park', 'time', 'time park', 'unlike', 'unlike women', 'women', 'women big']
Total_array:
 [[0 0 0 0 1 1 1 1 1 0 1 0 0 2 1 0 0 0 0]
 [1 1 0 1 0 0 0 0 1 1 0 1 1 0 0 1 1 1 1]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [14]:
#tri_gram
count = CountVectorizer(stop_words='english', analyzer='word',ngram_range=(1, 3), max_df=1.0, min_df=1, max_features=None)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['big', 'big player', 'big player park', 'going', 'grass', 'great', 'great time', 'great time park', 'having', 'having great', 'having great time', 'park', 'park grass', 'park time', 'player', 'player park', 'player park grass', 'time', 'time park', 'time park time', 'unlike', 'unlike women', 'unlike women big', 'women', 'women big', 'women big player']
Total_array:
 [[0 0 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 2 1 1 0 0 0 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [15]:
#min_df
count = CountVectorizer(stop_words='english',analyzer = 'word',ngram_range=(1, 1),max_df=1.0,min_df=0.6,max_features=None)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['park']
Total_array:
 [[1]
 [1]
 [0]]


In [16]:
#max_df
count = CountVectorizer(stop_words='english',analyzer = 'word',ngram_range=(1, 1),max_df=0.50,min_df=1,max_features=None)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['big', 'going', 'grass', 'great', 'having', 'player', 'time', 'unlike', 'women']
Total_array:
 [[0 0 0 1 1 0 2 0 0]
 [1 0 1 0 0 1 0 1 1]
 [0 1 0 0 0 0 0 0 0]]


In [17]:
#max_features
count = CountVectorizer(stop_words='english',analyzer = 'word',ngram_range=(1, 1),max_df=0.50,min_df=1,max_features=4)
X = count.fit_transform(data)
print('Every_feature:\n',count.get_feature_names())
print('Total_array:\n',X.toarray())

Every_feature:
 ['big', 'going', 'grass', 'time']
Total_array:
 [[0 0 0 2]
 [1 0 1 0]
 [0 1 0 0]]


In [18]:
#TfidfVectorizer -- Brief Tutorial

In [19]:
txt1 = ['His smile was not perfect', 'His smile was not not not not perfect', 'she not sang']

In [20]:
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
X = tf.fit_transform(txt1)
print ("The text: ", txt1)

The text:  ['His smile was not perfect', 'His smile was not not not not perfect', 'she not sang']


In [21]:
#vocabulary
print('vocabulary:',tf.vocabulary_)

vocabulary: {'his': 0, 'smile': 5, 'was': 6, 'not': 1, 'perfect': 2, 'she': 4, 'sang': 3}


In [22]:
#idf
idf = tf.idf_
#print(dict(zip(txt_fitted.get_feature_names(), idf)))

#We see that the tokens 'sang','she' have the most idf weight because they are the only tokens that appear in one document only.

#The token 'not' appears 6 times but it is also in all documents, so its idf is the lowest


In [23]:
# get feature names
feature_names = np.array(tf.get_feature_names())
feature_names

array(['his', 'not', 'perfect', 'sang', 'she', 'smile', 'was'],
      dtype='<U7')

In [24]:
sorted_by_idf = np.argsort(tf.idf_)
sorted_by_idf

array([1, 0, 2, 5, 6, 3, 4], dtype=int64)

In [25]:
#sort by idf
print('Features with lower idf:\n',feature_names[sorted_by_idf[:3]])
print('features with higher idf:\n',feature_names[sorted_by_idf[-3:]])

Features with lower idf:
 ['not' 'his' 'perfect']
features with higher idf:
 ['was' 'sang' 'she']


In [26]:
print("The token 'not' has  the largest weight in document #2 because it appears 3 times there. But in document #1\
 its weight is 0 because it does not appear there.")
#txt_transformed.toarray()

The token 'not' has  the largest weight in document #2 because it appears 3 times there. But in document #1 its weight is 0 because it does not appear there.


In [27]:
new1 = tf.transform(txt1)

# find maximum value for each of the features over all of dataset:
max_val = new1.max(axis=0).toarray().ravel()

#sort weights from smallest to biggest and extract their indices 
sort_by_tfidf = max_val.argsort()

print("Features with lowest tfidf:\n{}".format(
      feature_names[sort_by_tfidf[:3]]))

print("\nFeatures with highest tfidf: \n{}".format(
      feature_names[sort_by_tfidf[-3:]]))

Features with lowest tfidf:
['his' 'perfect' 'smile']

Features with highest tfidf: 
['sang' 'she' 'not']


In [28]:
x = np.array([[1,1,1],
            [0,0,0],
            [2,2,2]])
y = np.array(['GOOD','BAD','WORST'])
predict_this = [0,0,1]

In [29]:
#GaussianNB
model = GaussianNB()
model.fit(x,y)
predicted = model.predict([predict_this])
print('GaussianNB:',predicted)

GaussianNB: ['BAD']


In [30]:
#MultinomialNB
model = MultinomialNB()
model.fit(x,y)
predicted = model.predict([predict_this])
print('MultinomialNB:',predicted)

MultinomialNB: ['GOOD']


In [31]:
#BaseNB
model = BernoulliNB()
model.fit(x,y)
predicted = model.predict([predict_this])
print('BaseNB:',predicted)

BaseNB: ['BAD']


In [32]:
#Numerical_prediction
x = np.array([[5,0,1,3],[4,3,1,2]])
y = np.array([1,0])
model = GaussianNB()
model.fit(x,y)
predicted = model.predict([[3,0,3,1]])
print('predicted:',predicted)

predicted: [1]


In [33]:
x = np.array([[5,0,1,3],[4,3,1,2]])
y = np.array([1,0])
model = MultinomialNB()
model.fit(x,y)
predicted = model.predict([[3,0,3,1]])
print('predicted:',predicted)

predicted: [1]


In [34]:
x = np.array([[5,0,1,3],[4,3,1,2]])
y = np.array([1,0])
model = BernoulliNB()
model.fit(x,y)
predicted = model.predict([[3,0,3,1]])
print('predicted:',predicted)

predicted: [1]


In [35]:
x = np.array([[2,2,2,2],[1,3,4,5]])
y = np.array([1,0])
model = GaussianNB()
model.fit(x,y)
predicted = model.predict([[2,4,2,2]])
print('predicted:',predicted)

predicted: [1]


In [36]:
docs=["the house had a tiny little mouse",
      "the cat saw the mouse",
      "the mouse ran away from the house",
      "the cat finally ate the mouse",
      "the end of the mouse story"
     ]

In [37]:
#instantiate CountVectorizer()
cv=CountVectorizer()
 
# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape
print(word_count_vector.toarray())
print(cv.get_feature_names())

[[0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 0 1 0 2 0]
 [0 1 0 0 0 1 0 1 0 1 0 1 0 0 2 0]
 [1 0 1 0 1 0 0 0 0 1 0 0 0 0 2 0]
 [0 0 0 1 0 0 0 0 0 1 1 0 0 1 2 0]]
['ate', 'away', 'cat', 'end', 'finally', 'from', 'had', 'house', 'little', 'mouse', 'of', 'ran', 'saw', 'story', 'the', 'tiny']


In [38]:
transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
transformer.fit(word_count_vector)

NameError: name 'TfidfTransformer' is not defined

In [None]:
tf_idf = pd.DataFrame(transformer.idf_,index=cv.get_feature_names(),columns=["idf_weights"])
tf_idf

In [None]:
count_vector = cv.transform(docs)
tf_idf_vector =transformer.transform(count_vector)
print(count_vector)

In [None]:
feature_names = cv.get_feature_names()
 
#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]
 
#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)
 

In [None]:
tdidf = TfidfVectorizer(use_idf=True)
tf_idf = tdidf.fit_transform(docs)

feature_first_vector = tf_idf[0]
df = pd.DataFrame(feature_first_vector.T.todense(),index =tdidf.get_feature_names(),columns=['tdidf_values'])
df.sort_values(by=['tdidf_values'],ascending=False)



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
text = ['The quick brown fox jumped over the lazy dog']
count = CountVectorizer()
vector = count.fit(text)
print('fit:',count.vocabulary_)
print('feature_names:',vector.get_feature_names())
counts = vector.transform(text)
print('shape:',counts.shape)
print('array_values:',counts.toarray())

In [None]:
count = TfidfVectorizer()
vector = count.fit_transform(text)
print(vector)
print(count.get_feature_names())
print(vector.toarray())