# Text Vectorization

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Two tiny Czech sample sentences reused by the following cells.
textSamples = ['Co to je','To je něco']

# Bag-of-words with default settings (word analyzer, lowercasing, tokens of
# two or more word characters -- see the printed params below).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(textSamples)
print(X_train_counts.shape)
print(X_train_counts.toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on old releases;
# .tolist() keeps the printed value a plain list of str in both cases.
if hasattr(count_vect, 'get_feature_names_out'):
    word_feature_names = count_vect.get_feature_names_out().tolist()
else:
    word_feature_names = count_vect.get_feature_names()
print(word_feature_names)
print(count_vect.get_params())

(2, 4)
[[1 1 0 1]
 [0 1 1 1]]
['co', 'je', 'něco', 'to']
{'tokenizer': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'preprocessor': None, 'analyzer': 'word', 'input': 'content', 'ngram_range': (1, 1), 'min_df': 1, 'strip_accents': None, 'encoding': 'utf-8', 'vocabulary': None, 'stop_words': None, 'max_df': 1.0, 'max_features': None, 'dtype': <class 'numpy.int64'>, 'decode_error': 'strict', 'lowercase': True, 'binary': False}


In [2]:
# Dense view of the sparse count matrix: one row per sample, one column per vocabulary term.
print(X_train_counts.toarray())

[[1 1 0 1]
 [0 1 1 1]]


## NWordGrams

In [3]:
# Count word unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(analyzer='word', ngram_range=(1, 2))
X_cv = cv.fit_transform(textSamples)
print(X_cv.toarray())

# get_feature_names() was removed in scikit-learn 1.2; use the replacement API
# when available and fall back on older releases. Output stays a list of str.
if hasattr(cv, 'get_feature_names_out'):
    ngram_feature_names = cv.get_feature_names_out().tolist()
else:
    ngram_feature_names = cv.get_feature_names()
print(ngram_feature_names)

[[1 1 1 0 0 1 1]
 [0 0 1 1 1 1 1]]
['co', 'co to', 'je', 'je něco', 'něco', 'to', 'to je']


In [4]:
# Summing the document-term matrix over its rows gives the total count of
# every n-gram across all samples (a 1 x n_features matrix).
# Approach taken from:
# https://stackoverflow.com/questions/36046180/how-to-selected-vocabulary-in-scikit-countvectorizer
term_freq_matrix = X_cv.sum(axis=0)
term_freq_matrix

matrix([[1, 1, 2, 1, 1, 2, 2]], dtype=int64)

In [5]:
import numpy as np
# Smallest total count found among all n-gram features.
min_freq = np.min(term_freq_matrix)
min_freq

1

In [6]:
# Feature names in column order. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when it exists and fall
# back on older releases; either branch yields a plain list of str.
cv.get_feature_names_out().tolist() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()

['co', 'co to', 'je', 'je něco', 'něco', 'to', 'to je']

## NCharGrams

In [7]:
# Character-level counts: every 1-, 2- and 3-character substring is a feature
# (whitespace included, as the feature list in the next cell shows).
cvch = CountVectorizer(analyzer='char', ngram_range=(1, 3))
X_cvch = cvch.fit_transform(textSamples)
print(X_cvch.toarray(), X_cvch.shape)

[[2 1 1 0 0 1 1 1 1 1 1 0 0 1 1 0 0 0 0 2 2 1 1 1 1 1 0 0 0]
 [2 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 2 1 1 0 1 1 1 1 1 1]] (2, 29)


In [8]:
# Character n-gram feature names in column order. get_feature_names() was
# removed in scikit-learn 1.2, so use get_feature_names_out() when it exists
# and fall back on older releases; either branch yields a plain list of str.
cvch.get_feature_names_out().tolist() if hasattr(cvch, 'get_feature_names_out') else cvch.get_feature_names()

[' ',
 ' j',
 ' je',
 ' n',
 ' ně',
 ' t',
 ' to',
 'c',
 'co',
 'co ',
 'e',
 'e ',
 'e n',
 'j',
 'je',
 'je ',
 'n',
 'ně',
 'něc',
 'o',
 'o ',
 'o j',
 'o t',
 't',
 'to',
 'to ',
 'ě',
 'ěc',
 'ěco']

## Pipeline & Feature Union

`Pipeline` chains a sequence of transformation steps into a single estimator.
`FeatureUnion` runs several transformers on the same input and concatenates their outputs into a single feature matrix.

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
import pandas as pd

http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns ``data[key]`` from a keyed collection.

    The input is expected to be organised by feature first and by sample
    second, so that

    >>> len(data[key]) == n_samples

    This is the reverse of the usual scikit-learn layout, where the first
    index of a feature matrix runs over samples.

    Any container that supports item access (``data[key]``) works: a dict of
    lists, a 2D numpy array, a pandas DataFrame, a numpy record array, etc.

    >>> data = {'a': [1, 5, 2, 5, 2, 8],
    ...         'b': [9, 4, 1, 4, 1, 3]}
    >>> ds = ItemSelector(key='a')
    >>> data['a'] == ds.transform(data)

    Data grouped by sample (for instance a list of dicts) is not supported;
    for that layout consider a transformer in the spirit of
    `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        Key used to look up the desired value in the input collection.
    """

    def __init__(self, key):
        # Stored unchanged under the same name as the constructor argument.
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; the selector has no state to learn."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected

In [11]:
# Wrap the samples in a one-column DataFrame so ItemSelector can pick them by column name.
pdText = pd.DataFrame({'text': textSamples})

# Branch 1: word unigram counts computed on the 'text' column.
word_branch = Pipeline([
    ('wordGrams', ItemSelector(key='text')),
    ('vect', CountVectorizer(analyzer='word', ngram_range=(1, 1))),
])

# Branch 2: character 1- to 3-gram counts computed on the same column.
char_branch = Pipeline([
    ('nGrams', ItemSelector(key='text')),
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 3))),
])

# FeatureUnion fits both branches on the same input and concatenates their outputs.
clf = FeatureUnion(transformer_list=[
    ('word', word_branch),
    ('chars', char_branch),
])
clf_model = clf.fit(pdText)

In [12]:
# Shape of the concatenated features: (n_samples, word features + char n-gram features).
clf_model.transform(pdText).shape

(2, 33)

In [13]:
# Dense view of the concatenated matrix: word-count columns first, char n-gram columns after.
clf_model.transform(pdText).toarray()

array([[1, 1, 0, 1, 2, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1]], dtype=int64)

## HashingVectorizer

In [14]:
from sklearn.feature_extraction.text import HashingVectorizer
# Hash word unigrams into a fixed number of columns (20) and L1-normalise each row.
vectorizer = HashingVectorizer(
    n_features=20,
    norm='l1',
    ngram_range=(1, 1),
    analyzer='word',
)
hashing_samples = [
    'Co to je',
    'Co to není',
    'Co to co to',
    'Samuel zasel do sklepa pro chleba',
]
X_test = vectorizer.fit_transform(hashing_samples)
X_test.shape

(4, 20)

In [15]:
# Dense view of the hashed features; entries can be negative (see 'alternate_sign' in the params below).
X_test.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.33333333,  0.        ,  0.        , -0.33333333,
         0.        ,  0.        ,  0.        ,  0.33333333,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.33333333,  0.        ,  0.        ,  0.        ,
        -0.33333333,  0.        ,  0.        ,  0.33333333,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.5       ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.5       ,  0.        ],
       [ 0.        ,  0.        ,  0.16666667,  0.        ,  0.        ,
         0.        ,  0.        , -0.33333333, -

In [16]:
# Show the full configuration of the hashing vectorizer, including defaults that were not set explicitly.
vectorizer.get_params()

{'alternate_sign': True,
 'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'n_features': 20,
 'ngram_range': (1, 1),
 'non_negative': False,
 'norm': 'l1',
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None}

from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import Perceptron
# Word uni-/bi-gram counts (only terms found in at least two documents are
# kept, capped at 10000 features) followed by term-frequency normalisation;
# use_idf=False switches the IDF re-weighting off.
pipelinek = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('words', CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_features=10000)),
            #('chars', CountVectorizer(analyzer='char',ngram_range=(1,3), min_df = 2, max_features =10000)),
        ],
    )),
    ('tfidf', TfidfTransformer(use_idf=False)),
])

# CountVectorizer expects an iterable of strings. Iterating a DataFrame yields
# its column names, so passing pdText directly would vectorize the single
# "document" 'text' and the min_df=2 requirement could never be met.
# Pass the text column itself so each row is one document.
X_trainVect = pipelinek.fit_transform(pdText['text'])
X_trainVect.toarray()

In [18]:
# (rows, columns) of the sample DataFrame: one row per sentence, a single 'text' column.
pdText.shape

(2, 1)

In [19]:
# Plain-text dump of the sample DataFrame.
print(pdText)

         text
0    Co to je
1  To je něco


# Count Vectorizer and Tfidf

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Larger sample set; `texts` collects one column per vectorization method.
textSamples = ['Co to je','To je něco','Co to co to','To je lev','je','to','co']
texts = pd.DataFrame(textSamples, columns=["texts"])

# Default bag-of-words counts (no min_df / max_features pruning).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(textSamples)
dense_counts = X_train_counts.toarray()
vocabulary = count_vect.vocabulary_

print("Training data shape:")
print(X_train_counts.shape)
print("Sparse metics:")
print(X_train_counts)
print("Array:")
print("{0}".format(dense_counts))
print("Vocabulary list: {0}".format(vocabulary))
print(" Length of the vocabulary: {0}".format(len(vocabulary)))

# Store each sample's count vector next to its text.
texts['CountVectorizer'] = list(dense_counts)
texts

Training data shape:
(7, 5)
Sparse metics:
  (0, 1)	1
  (0, 4)	1
  (0, 0)	1
  (1, 3)	1
  (1, 1)	1
  (1, 4)	1
  (2, 4)	2
  (2, 0)	2
  (3, 2)	1
  (3, 1)	1
  (3, 4)	1
  (4, 1)	1
  (5, 4)	1
  (6, 0)	1
Array:
[[1 1 0 0 1]
 [0 1 0 1 1]
 [2 0 0 0 2]
 [0 1 1 0 1]
 [0 1 0 0 0]
 [0 0 0 0 1]
 [1 0 0 0 0]]
Vocabulary list: {'je': 1, 'něco': 3, 'to': 4, 'co': 0, 'lev': 2}
 Length of the vocabulary: 5


Unnamed: 0,texts,CountVectorizer
0,Co to je,"[1, 1, 0, 0, 1]"
1,To je něco,"[0, 1, 0, 1, 1]"
2,Co to co to,"[2, 0, 0, 0, 2]"
3,To je lev,"[0, 1, 1, 0, 1]"
4,je,"[0, 1, 0, 0, 0]"
5,to,"[0, 0, 0, 0, 1]"
6,co,"[1, 0, 0, 0, 0]"


Apply Tfidf

In [22]:
# With use_idf=False the transformer only normalises the raw counts per row
# (norm='l2' by default, as the repr below shows); no IDF weights are applied.
t = TfidfTransformer(use_idf=False)
t.fit(X_train_counts)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False,
         use_idf=False)

In [23]:
# Apply the fitted transformer to the count matrix and show the result in
# sparse and dense form.
X_train_Tfid = t.transform(X_train_counts)
tfidf_dense = X_train_Tfid.toarray()

print(X_train_Tfid.shape)
print("sparse metric:")
print(X_train_Tfid)
print("Array:")
print(tfidf_dense)

# Store each sample's normalised vector next to its text and counts.
texts['tfidfs'] = list(tfidf_dense)
texts

(7, 5)
sparse metric:
  (0, 1)	0.57735026919
  (0, 4)	0.57735026919
  (0, 0)	0.57735026919
  (1, 3)	0.57735026919
  (1, 1)	0.57735026919
  (1, 4)	0.57735026919
  (2, 4)	0.707106781187
  (2, 0)	0.707106781187
  (3, 2)	0.57735026919
  (3, 1)	0.57735026919
  (3, 4)	0.57735026919
  (4, 1)	1.0
  (5, 4)	1.0
  (6, 0)	1.0
Array:
[[ 0.57735027  0.57735027  0.          0.          0.57735027]
 [ 0.          0.57735027  0.          0.57735027  0.57735027]
 [ 0.70710678  0.          0.          0.          0.70710678]
 [ 0.          0.57735027  0.57735027  0.          0.57735027]
 [ 0.          1.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          1.        ]
 [ 1.          0.          0.          0.          0.        ]]


Unnamed: 0,texts,CountVectorizer,tfidfs
0,Co to je,"[1, 1, 0, 0, 1]","[0.57735026919, 0.57735026919, 0.0, 0.0, 0.577..."
1,To je něco,"[0, 1, 0, 1, 1]","[0.0, 0.57735026919, 0.0, 0.57735026919, 0.577..."
2,Co to co to,"[2, 0, 0, 0, 2]","[0.707106781187, 0.0, 0.0, 0.0, 0.707106781187]"
3,To je lev,"[0, 1, 1, 0, 1]","[0.0, 0.57735026919, 0.57735026919, 0.0, 0.577..."
4,je,"[0, 1, 0, 0, 0]","[0.0, 1.0, 0.0, 0.0, 0.0]"
5,to,"[0, 0, 0, 0, 1]","[0.0, 0.0, 0.0, 0.0, 1.0]"
6,co,"[1, 0, 0, 0, 0]","[1.0, 0.0, 0.0, 0.0, 0.0]"


In [24]:
from sklearn.feature_extraction.text import HashingVectorizer
# Hash word unigrams of the same samples into 15 columns, L1-normalised per row.
vectorizer = HashingVectorizer(
    n_features=15,
    norm='l1',
    ngram_range=(1, 1),
    analyzer='word',
)
X_test = vectorizer.fit_transform(textSamples)
hashed_dense = X_test.toarray()
print(hashed_dense)

# Store each sample's hashed vector next to the other representations.
texts['Hashing'] = list(hashed_dense)
texts

[[ 0.          0.33333333  0.          0.33333333  0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
  -0.33333333]
 [ 0.          0.33333333  0.          0.          0.          0.
   0.33333333  0.          0.          0.          0.          0.          0.
   0.         -0.33333333]
 [ 0.          0.5         0.          0.5         0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.        ]
 [ 0.          0.33333333  0.          0.          0.          0.          0.
   0.          0.          0.33333333  0.          0.          0.          0.
  -0.33333333]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
  -1.        ]
 [ 0.          1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.  

Unnamed: 0,texts,CountVectorizer,tfidfs,Hashing
0,Co to je,"[1, 1, 0, 0, 1]","[0.57735026919, 0.57735026919, 0.0, 0.0, 0.577...","[0.0, 0.333333333333, 0.0, 0.333333333333, 0.0..."
1,To je něco,"[0, 1, 0, 1, 1]","[0.0, 0.57735026919, 0.0, 0.57735026919, 0.577...","[0.0, 0.333333333333, 0.0, 0.0, 0.0, 0.0, 0.33..."
2,Co to co to,"[2, 0, 0, 0, 2]","[0.707106781187, 0.0, 0.0, 0.0, 0.707106781187]","[0.0, 0.5, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,To je lev,"[0, 1, 1, 0, 1]","[0.0, 0.57735026919, 0.57735026919, 0.0, 0.577...","[0.0, 0.333333333333, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,je,"[0, 1, 0, 0, 0]","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,to,"[0, 0, 0, 0, 1]","[0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,co,"[1, 0, 0, 0, 0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Keras Tokenizer

In [25]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import to_categorical

# NOTE: `texts` is rebound here from the comparison DataFrame above to a plain list of sentences.
texts = ['Co to je','Co to není','Co to co to','Samuel zasel do sklepa pro co?']

# num_words caps the vocabulary used when converting texts: only word indices
# below num_words (the most frequent words) are emitted by texts_to_sequences,
# rarer words are dropped. word_index itself still lists every word seen.
tokenizer = Tokenizer(num_words=7)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# maxlen fixes the length of every sequence; shorter sequences are padded with
# zeros at the front, e.g. [1, 2, 3] becomes [0, 0, 0, 1, 2, 3].
data = pad_sequences(sequences, maxlen=6)

print('Shape of data tensor:', data.shape)
data

Using TensorFlow backend.


Found 9 unique tokens.
Shape of data tensor: (4, 6)


array([[0, 0, 0, 1, 2, 3],
       [0, 0, 0, 1, 2, 4],
       [0, 0, 1, 2, 1, 2],
       [0, 0, 0, 5, 6, 1]])

In [26]:
# word_docs: for each word, the number of input texts it occurs in.
print(tokenizer.word_docs)

{'není': 1, 'zasel': 1, 'co': 4, 'pro': 1, 'je': 1, 'samuel': 1, 'to': 3, 'sklepa': 1, 'do': 1}


In [27]:
# word_index: word -> integer id; ids start at 1 and the most frequent word gets the lowest id.
print(tokenizer.word_index)

{'není': 4, 'zasel': 6, 'co': 1, 'pro': 9, 'je': 3, 'samuel': 5, 'to': 2, 'sklepa': 8, 'do': 7}


In [28]:
# document_count: number of texts the tokenizer was fitted on.
print(tokenizer.document_count)

4


In [29]:
# word_counts: total number of occurrences of each word over all texts.
print(tokenizer.word_counts)

OrderedDict([('co', 5), ('to', 4), ('je', 1), ('není', 1), ('samuel', 1), ('zasel', 1), ('do', 1), ('sklepa', 1), ('pro', 1)])


In [30]:
# Integer sequences, then their matrix form: one row per text, one column per
# word id below num_words; by default a cell is 1 if the word occurs at all.
print(sequences)
print(tokenizer.sequences_to_matrix(sequences))

[[1, 2, 3], [1, 2, 4], [1, 2, 1, 2], [5, 6, 1]]
[[ 0.  1.  1.  1.  0.  0.  0.]
 [ 0.  1.  1.  0.  1.  0.  0.]
 [ 0.  1.  1.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  1.  1.]]


In [31]:
# mode='freq': each cell holds the word's share of the tokens in that sequence, so every row sums to 1.
print(tokenizer.sequences_to_matrix(sequences,mode='freq'))

[[ 0.          0.33333333  0.33333333  0.33333333  0.          0.          0.        ]
 [ 0.          0.33333333  0.33333333  0.          0.33333333  0.          0.        ]
 [ 0.          0.5         0.5         0.          0.          0.          0.        ]
 [ 0.          0.33333333  0.          0.          0.          0.33333333
   0.33333333]]


# Character Tokenizer

In [32]:
# Character-level tokenisation: every character (the space included) gets its own id.
# NOTE(review): the recorded word_index still contains 'C' and 'S' although
# lower=True is passed -- confirm how the installed Keras version combines
# `lower` with `char_level`.
tokenizer = Tokenizer(char_level=True, lower=True, num_words=100)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Show the raw texts, their id sequences and the character -> id mapping.
for shown in (texts, sequences, tokenizer.word_index):
    print(shown)

['Co to je', 'Co to není', 'Co to co to', 'Samuel zasel do sklepa pro co?']
[[5, 2, 1, 4, 2, 1, 12, 3], [5, 2, 1, 4, 2, 1, 8, 3, 8, 13], [5, 2, 1, 4, 2, 1, 9, 2, 1, 4, 2], [14, 6, 15, 16, 3, 7, 1, 17, 6, 10, 3, 7, 1, 18, 2, 1, 10, 19, 7, 3, 11, 6, 1, 11, 20, 2, 1, 9, 2, 21]]
{'t': 4, 'S': 14, 'j': 12, 'm': 15, 'o': 2, 'u': 16, 'z': 17, 's': 10, '?': 21, 'í': 13, 'r': 20, 'n': 8, 'c': 9, 'e': 3, 'd': 18, 'p': 11, 'C': 5, 'k': 19, ' ': 1, 'a': 6, 'l': 7}
