## Vectorize text as a bag-of-words

In [None]:
# Print the installed scikit-learn version so readers know which release
# produced the outputs shown in this notebook.
import sklearn

print(sklearn.__version__)

0.22.1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus: seven short sentences from which the vocabulary is learned.
train_text = [
    "A bird in hand is worth two in the bush.",
    "Good things come to those who wait.",
    "These watches cost $1500! ",
    "There are other fish in the sea.",
    "The ball is in your court.",
    "Mr. Smith Goes to Washington ",
    "Doogie Howser M.D.",
]

# Vectorizer created with all-default settings.
count_vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training sentences. The cell output is the
# repr of the fitted vectorizer, which lists the settings in effect.
count_vectorizer.fit(train_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# List the learned tokens in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer releases provide get_feature_names_out() instead. Prefer the new
# method when it exists and fall back for old releases (e.g. 0.22.x).
# .tolist() turns the returned array into a plain list of Python strings so
# the displayed value matches the old method's output.
(
    count_vectorizer.get_feature_names_out().tolist()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)

['1500',
 'are',
 'ball',
 'bird',
 'bush',
 'come',
 'cost',
 'court',
 'doogie',
 'fish',
 'goes',
 'good',
 'hand',
 'howser',
 'in',
 'is',
 'mr',
 'other',
 'sea',
 'smith',
 'the',
 'there',
 'these',
 'things',
 'those',
 'to',
 'two',
 'wait',
 'washington',
 'watches',
 'who',
 'worth',
 'your']

In [None]:
# Show the stop-word list in effect. The vectorizer was built with
# stop_words=None, and the cell displays no output (presumably None).
count_vectorizer.get_stop_words()

In [None]:
# Mapping from each learned token to its column index in the count matrix.
count_vectorizer.vocabulary_

{'bird': 3,
 'in': 14,
 'hand': 12,
 'is': 15,
 'worth': 31,
 'two': 26,
 'the': 20,
 'bush': 4,
 'good': 11,
 'things': 23,
 'come': 5,
 'to': 25,
 'those': 24,
 'who': 30,
 'wait': 27,
 'these': 22,
 'watches': 29,
 'cost': 6,
 '1500': 0,
 'there': 21,
 'are': 1,
 'other': 17,
 'fish': 9,
 'sea': 18,
 'ball': 2,
 'your': 32,
 'court': 7,
 'mr': 16,
 'smith': 19,
 'goes': 10,
 'washington': 28,
 'doogie': 8,
 'howser': 13}

In [None]:
# Look up the column index assigned to a single token.
count_vectorizer.vocabulary_.get('things')

23

In [None]:
# Display the training sentences again for reference before encoding them.
train_text

['A bird in hand is worth two in the bush.',
 'Good things come to those who wait.',
 'These watches cost $1500! ',
 'There are other fish in the sea.',
 'The ball is in your court.',
 'Mr. Smith Goes to Washington ',
 'Doogie Howser M.D.']

In [None]:
# Encode each training sentence as a row of token counts using the fitted
# vocabulary.
transformed_vector = count_vectorizer.transform(train_text)

In [None]:
# One row per sentence, one column per vocabulary entry.
print(transformed_vector.shape)

(7, 33)


In [None]:
# Dense view of the counts. For example, row 0 has a 2 in column 14 because
# 'in' occurs twice in the first sentence.
print(transformed_vector.toarray())

[[0 0 0 1 1 0 0 0 0 0 0 0 1 0 2 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0]
 [1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
# A sentence whose words were not seen during fit: none of its tokens are in
# the vocabulary, so the encoded row is all zeros.
test_text = ["Every cloud has a silver lining."]

count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Refit on the training sentences plus the test sentence. This replaces the
# vocabulary on the same vectorizer object, so token-to-column indices differ
# from the ones shown earlier in the notebook.
count_vectorizer.fit(train_text + test_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
# The vocabulary now also contains 'every', 'cloud', 'has', 'silver' and
# 'lining', and the existing tokens have been renumbered.
print(count_vectorizer.vocabulary_)

{'bird': 3, 'in': 17, 'hand': 14, 'is': 18, 'worth': 36, 'two': 31, 'the': 25, 'bush': 4, 'good': 13, 'things': 28, 'come': 6, 'to': 30, 'those': 29, 'who': 35, 'wait': 32, 'these': 27, 'watches': 34, 'cost': 7, '1500': 0, 'there': 26, 'are': 1, 'other': 21, 'fish': 11, 'sea': 22, 'ball': 2, 'your': 37, 'court': 8, 'mr': 20, 'smith': 24, 'goes': 12, 'washington': 33, 'doogie': 9, 'howser': 16, 'every': 10, 'cloud': 5, 'has': 15, 'silver': 23, 'lining': 19}


In [None]:
# After the refit the test sentence's words are in the vocabulary, so the
# encoded row now has non-zero counts.
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# New sentences, encoded with the vocabulary learned by the latest fit.
text = [
    "That bird is sitting in the bush and this bird is in hand.",
    "Wait and then walk",
    "Watches are cool ",
]

transformed_vector = count_vectorizer.transform(text)

# Bare expression: displays the sparse-matrix summary rather than its contents.
transformed_vector

<3x38 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
# Printing the sparse matrix lists only its stored entries, one per line, as
# "(row, column)  count".
print(transformed_vector)

  (0, 3)	2
  (0, 4)	1
  (0, 14)	1
  (0, 17)	2
  (0, 18)	2
  (0, 25)	1
  (1, 32)	1
  (2, 1)	1
  (2, 34)	1


In [None]:
# 3 sentences by 38 vocabulary columns.
transformed_vector.shape

(3, 38)

In [None]:
# Dense view of the same matrix, including all the zero entries.
print(transformed_vector.toarray())

[[0 0 0 2 1 0 0 0 0 0 0 0 0 0 1 0 0 2 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0]]


### How to read the sparse matrix output
Each printed line has the form `(row, column)	count`. The first number in the tuple is the sentence (row) index. There are 3 sentences, so the row indices are 0, 1 and 2.

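The second number in the tuple is the column index, i.e. the word's index in the vocabulary (`count_vectorizer.vocabulary_`), not the word's position within the sentence. The last number is the count: how many times that word occurs in the sentence.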

`(0, 4)	1` means that in the first sentence (row 0) the word with vocabulary index 4 (`'bush'`) occurs once.

### CountVectorizer on text file

In [None]:
# Read the whole biography file into a single string.
# The encoding is given explicitly so the text decodes the same way on every
# platform; without it, open() uses the locale's default encoding.
with open('biography.txt', 'r', encoding='utf-8') as f:
    file_contents = f.read()

print(file_contents)

Marie Curie was a Polish-born physicist and chemist and one of the most famous scientists of her time.
Together with her husband Pierre, she was awarded the Nobel Prize in 1903, and she went on to win another in 1911.
Marie Sklodowska was born in Warsaw on 7 November 1867, the daughter of a teacher.
In 1891, she went to Paris to study physics and mathematics at the Sorbonne where she met Pierre Curie, professor of the School of Physics.
They were married in 1895.
The Curies worked together investigating radioactivity, building on the work of the German physicist Roentgen and the French physicist Becquerel.
In July 1898, the Curies announced the discovery of a new chemical element, polonium.
At the end of the year, they announced the discovery of another, radium.
The Curies, along with Becquerel, were awarded the Nobel Prize for Physics in 1903.
Pierre's life was cut short in 1906 when he was knocked down and killed by a carriage.
Marie took over his teaching post, becoming the first wo

In [None]:
# Treat each line of the file as one "sentence" (document).
# NOTE(review): split('\n') yields a trailing '' when the text ends with a
# newline, which would become an all-zero row in the count matrix. Consider
# splitlines() -- confirm against the file before changing.
sentences = file_contents.split('\n')

print(sentences)

['Marie Curie was a Polish-born physicist and chemist and one of the most famous scientists of her time.', 'Together with her husband Pierre, she was awarded the Nobel Prize in 1903, and she went on to win another in 1911.', 'Marie Sklodowska was born in Warsaw on 7 November 1867, the daughter of a teacher.', 'In 1891, she went to Paris to study physics and mathematics at the Sorbonne where she met Pierre Curie, professor of the School of Physics.', 'They were married in 1895.', 'The Curies worked together investigating radioactivity, building on the work of the German physicist Roentgen and the French physicist Becquerel.', 'In July 1898, the Curies announced the discovery of a new chemical element, polonium.', 'At the end of the year, they announced the discovery of another, radium.', 'The Curies, along with Becquerel, were awarded the Nobel Prize for Physics in 1903.', "Pierre's life was cut short in 1906 when he was knocked down and killed by a carriage.", 'Marie took over his teac

In [None]:
# Learn a fresh vocabulary from the biography sentences and encode them in
# one step. This replaces the vocabulary learned from the toy sentences.
transformed_vector = count_vectorizer.fit_transform(sentences)

# Rows = sentences, columns = size of the new vocabulary.
transformed_vector.shape

(20, 167)

In [None]:
# Stored (row, column) count entries of the biography matrix; the long
# listing is abbreviated in the middle of the output.
print(transformed_vector)

  (0, 91)	1
  (0, 34)	1
  (0, 153)	1
  (0, 111)	1
  (0, 21)	1
  (0, 108)	1
  (0, 11)	2
  (0, 27)	1
  (0, 103)	1
  (0, 101)	2
  (0, 144)	1
  (0, 96)	1
  (0, 56)	1
  (0, 128)	1
  (0, 72)	1
  (0, 146)	1
  (1, 153)	1
  (1, 11)	1
  (1, 144)	1
  (1, 72)	1
  (1, 148)	1
  (1, 161)	1
  (1, 76)	1
  (1, 110)	1
  (1, 131)	2
  :	:
  (17, 124)	1
  (17, 62)	2
  (17, 42)	1
  (17, 8)	1
  (17, 86)	1
  (17, 25)	1
  (17, 54)	1
  (17, 74)	1
  (17, 51)	1
  (17, 116)	1
  (18, 153)	1
  (18, 11)	1
  (18, 101)	1
  (18, 144)	2
  (18, 99)	1
  (18, 114)	1
  (18, 37)	1
  (18, 35)	1
  (18, 59)	1
  (18, 73)	1
  (18, 28)	1
  (18, 48)	1
  (18, 80)	1
  (18, 127)	1
  (18, 160)	1


In [None]:
# Token-to-column mapping learned from the biography text.
print(count_vectorizer.vocabulary_)

{'marie': 91, 'curie': 34, 'was': 153, 'polish': 111, 'born': 21, 'physicist': 108, 'and': 11, 'chemist': 27, 'one': 103, 'of': 101, 'the': 144, 'most': 96, 'famous': 56, 'scientists': 128, 'her': 72, 'time': 146, 'together': 148, 'with': 161, 'husband': 76, 'pierre': 110, 'she': 131, 'awarded': 15, 'nobel': 99, 'prize': 114, 'in': 77, '1903': 4, 'went': 154, 'on': 102, 'to': 147, 'win': 159, 'another': 13, '1911': 6, 'sklodowska': 134, 'warsaw': 152, 'november': 100, '1867': 0, 'daughter': 37, 'teacher': 140, '1891': 1, 'paris': 107, 'study': 136, 'physics': 109, 'mathematics': 93, 'at': 14, 'sorbonne': 135, 'where': 157, 'met': 95, 'professor': 115, 'school': 126, 'they': 145, 'were': 155, 'married': 92, '1895': 2, 'curies': 35, 'worked': 164, 'investigating': 79, 'radioactivity': 117, 'building': 22, 'work': 163, 'german': 64, 'roentgen': 125, 'french': 61, 'becquerel': 17, 'july': 82, '1898': 3, 'announced': 12, 'discovery': 43, 'new': 98, 'chemical': 26, 'element': 49, 'polonium':

By converting the text to a bag-of-words representation, we lost:

* The meaning of text corpus
* The ordering of the words

In [None]:
# Map each row back to the tokens that have a non-zero count in it. The
# result is one array of tokens per sentence; how often each word occurred
# and the original word order are not recovered.
count_vectorizer.inverse_transform(transformed_vector)

[array(['marie', 'curie', 'was', 'polish', 'born', 'physicist', 'and',
        'chemist', 'one', 'of', 'the', 'most', 'famous', 'scientists',
        'her', 'time'], dtype='<U13'),
 array(['was', 'and', 'the', 'her', 'together', 'with', 'husband',
        'pierre', 'she', 'awarded', 'nobel', 'prize', 'in', '1903', 'went',
        'on', 'to', 'win', 'another', '1911'], dtype='<U13'),
 array(['marie', 'was', 'born', 'of', 'the', 'in', 'on', 'sklodowska',
        'warsaw', 'november', '1867', 'daughter', 'teacher'], dtype='<U13'),
 array(['curie', 'and', 'of', 'the', 'pierre', 'she', 'in', 'went', 'to',
        '1891', 'paris', 'study', 'physics', 'mathematics', 'at',
        'sorbonne', 'where', 'met', 'professor', 'school'], dtype='<U13'),
 array(['in', 'they', 'were', 'married', '1895'], dtype='<U13'),
 array(['physicist', 'and', 'of', 'the', 'together', 'on', 'curies',
        'worked', 'investigating', 'radioactivity', 'building', 'work',
        'german', 'roentgen', 'french', 'becq