## Prepare textdata for machine learning

## Create TF matrix then convert it to TF-IDF matrix

#### First, we define term frequency vectors

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words=["the", "is"])

a

In [18]:
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=['the', 'is'],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


Train set d1, d2
Test set d3, d4

In [19]:
train_set = ("The sky is blue.", 
             "The sun is bright.")
test_set = ("The sun in the sky is bright.",
            "We can see the shining sun, the bright sun.")


In [20]:
vectorizer.fit_transform(train_set)
print(vectorizer.get_feature_names())

['blue', 'bright', 'sky', 'sun']


In [21]:
print(vectorizer.vocabulary_)

{'sky': 2, 'blue': 0, 'sun': 3, 'bright': 1}


In [43]:
sparse_matrix = vectorizer.transform(test_set)
print(sparse_matrix)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (1, 1)	1
  (1, 3)	2


Create matrix for test set

In [40]:
sparse_matrix.todense()

matrix([[0, 1, 1, 1],
        [0, 1, 0, 2]], dtype=int64)

#### Now we are going to define IDF (inverse document frequency)

Find the TF matrix

In [99]:
count_vectorizer = CountVectorizer(stop_words=["the", "is"])
count_vectorizer.fit_transform(train_set)
print("Vocabulary:", count_vectorizer.vocabulary_)

freq_term_matrix = count_vectorizer.transform(test_set)
freq_term_matrix.todense()

Vocabulary: {'sky': 2, 'blue': 0, 'sun': 3, 'bright': 1}


matrix([[0, 1, 1, 1],
        [0, 1, 0, 2]], dtype=int64)

Transform to TFIDF

In [105]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)

print("IDF:", tfidf.idf_)

IDF: [2.09861229 1.         1.40546511 1.        ]


In [107]:
tfidf_matrix = tfidf.transform(freq_term_matrix)
print(tfidf_matrix.toarray())

[[0.         0.50154891 0.70490949 0.50154891]
 [0.         0.4472136  0.         0.89442719]]


## Using Pipeline to make TFIDF matrix

In [89]:
from sklearn.pipeline import Pipeline

stop_words = ["the", "is"]
vocabulary = ["sky", "birght", "sun", "blue"]

pipe = Pipeline([('tf', CountVectorizer(stop_words = stop_words)),
                 ("tfidf", TfidfTransformer())]).fit(train_set)

freq_term_matrix = pipe['tf'].transform(test_set)
print(freq_term_matrix.toarray())

[[0 1 1 1]
 [0 1 0 2]]


Transform to TFIDF

In [98]:
pipe['tfidf'].fit_transform(freq_term_matrix).toarray()

array([[0.        , 0.50154891, 0.70490949, 0.50154891],
       [0.        , 0.4472136 , 0.        , 0.89442719]])

In [58]:
pipe.transform(train_set).shape

(2, 4)

## Using TfidfVectorizer, an all in one class

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words = stop_words)
vectorizer.fit(train_set)

print("Vocabulary:", vectorizer.vocabulary_)
print("IDF:", vectorizer.idf_)

vector = vectorizer.transform(test_set)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

Vocabulary: {'sky': 2, 'blue': 0, 'sun': 3, 'bright': 1}
IDF: [1.40546511 1.40546511 1.40546511 1.40546511]
(2, 4)
[[0.         0.57735027 0.57735027 0.57735027]
 [0.         0.4472136  0.         0.89442719]]
